* Have 'string' refer to the whitespace-padded string

2025-11-04 01:48:04 +03:00 · 2015-01-24 07:32:38 +11:00 · 2015-01-24 07:32:38 +11:00 · 5fd72bc220
commit 5fd72bc220
parent 706305ee26
2 changed files with 8 additions and 16 deletions
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -65,3 +65,4 @@ cdef class Token:
    cdef readonly attr_t dep
    cdef readonly ndarray repvec
    cdef readonly unicode string
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -235,16 +235,10 @@ cdef class Token:
        self.tag = t.tag
        self.dep = t.dep
        self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
-
+        cdef int next_idx = (t+1).idx
-    def __unicode__(self):
+        if next_idx <= self.idx:
-        cdef const TokenC* t = &self._seq.data[self.i]
+            next_idx = self.idx + self.length
-        cdef int end_idx = t.idx + t.lex.length
+        self.string = tokens._string[self.idx:next_idx]
        if self.i + 1 == self._seq.length:
            return self.string
        if end_idx == t[1].idx:
            return self.string
        else:
            return self.string + ' '
    def __len__(self):
        """The number of unicode code-points in the original string.
@@ -260,13 +254,10 @@ cdef class Token:
            cdef const TokenC* t = &self._seq.data[self.i]
            return Token(self._seq, self.i + t.head)
-    property string:
+    property whitespace:
        def __get__(self):
-            cdef const TokenC* t = &self._seq.data[self.i]
+            cdef int end_idx = self.idx + self.length
-            if t.lex.orth == 0:
+            
                return ''
            cdef unicode py_ustr = self._seq.vocab.strings[t.lex.orth]
            return py_ustr
    property orth_:
        def __get__(self):