* Have 'string' refer to the whitespace-padded string

This commit is contained in:
Matthew Honnibal 2015-01-24 07:32:38 +11:00
parent 706305ee26
commit 5fd72bc220
2 changed files with 8 additions and 16 deletions

View File

@@ -65,3 +65,4 @@ cdef class Token:
cdef readonly attr_t dep cdef readonly attr_t dep
cdef readonly ndarray repvec cdef readonly ndarray repvec
cdef readonly unicode string

View File

@@ -235,16 +235,10 @@ cdef class Token:
self.tag = t.tag self.tag = t.tag
self.dep = t.dep self.dep = t.dep
self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec) self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
cdef int next_idx = (t+1).idx
def __unicode__(self): if next_idx <= self.idx:
cdef const TokenC* t = &self._seq.data[self.i] next_idx = self.idx + self.length
cdef int end_idx = t.idx + t.lex.length self.string = tokens._string[self.idx:next_idx]
if self.i + 1 == self._seq.length:
return self.string
if end_idx == t[1].idx:
return self.string
else:
return self.string + ' '
def __len__(self): def __len__(self):
"""The number of unicode code-points in the original string. """The number of unicode code-points in the original string.
@@ -260,13 +254,10 @@ cdef class Token:
cdef const TokenC* t = &self._seq.data[self.i] cdef const TokenC* t = &self._seq.data[self.i]
return Token(self._seq, self.i + t.head) return Token(self._seq, self.i + t.head)
property string: property whitespace:
def __get__(self): def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i] cdef int end_idx = self.idx + self.length
if t.lex.orth == 0:
return ''
cdef unicode py_ustr = self._seq.vocab.strings[t.lex.orth]
return py_ustr
property orth_: property orth_:
def __get__(self): def __get__(self):