mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Have 'string' refer to the whitespace-padded string
This commit is contained in:
		
							parent
							
								
									706305ee26
								
							
						
					
					
						commit
						5fd72bc220
					
				| 
						 | 
					@ -65,3 +65,4 @@ cdef class Token:
 | 
				
			||||||
    cdef readonly attr_t dep
 | 
					    cdef readonly attr_t dep
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef readonly ndarray repvec
 | 
					    cdef readonly ndarray repvec
 | 
				
			||||||
 | 
					    cdef readonly unicode string
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -235,16 +235,10 @@ cdef class Token:
 | 
				
			||||||
        self.tag = t.tag
 | 
					        self.tag = t.tag
 | 
				
			||||||
        self.dep = t.dep
 | 
					        self.dep = t.dep
 | 
				
			||||||
        self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
 | 
					        self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
 | 
				
			||||||
 | 
					        cdef int next_idx = (t+1).idx
 | 
				
			||||||
    def __unicode__(self):
 | 
					        if next_idx <= self.idx:
 | 
				
			||||||
        cdef const TokenC* t = &self._seq.data[self.i]
 | 
					            next_idx = self.idx + self.length
 | 
				
			||||||
        cdef int end_idx = t.idx + t.lex.length
 | 
					        self.string = tokens._string[self.idx:next_idx]
 | 
				
			||||||
        if self.i + 1 == self._seq.length:
 | 
					 | 
				
			||||||
            return self.string
 | 
					 | 
				
			||||||
        if end_idx == t[1].idx:
 | 
					 | 
				
			||||||
            return self.string
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            return self.string + ' '
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __len__(self):
 | 
					    def __len__(self):
 | 
				
			||||||
        """The number of unicode code-points in the original string.
 | 
					        """The number of unicode code-points in the original string.
 | 
				
			||||||
| 
						 | 
					@ -260,13 +254,10 @@ cdef class Token:
 | 
				
			||||||
            cdef const TokenC* t = &self._seq.data[self.i]
 | 
					            cdef const TokenC* t = &self._seq.data[self.i]
 | 
				
			||||||
            return Token(self._seq, self.i + t.head)
 | 
					            return Token(self._seq, self.i + t.head)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    property string:
 | 
					    property whitespace:
 | 
				
			||||||
        def __get__(self):
 | 
					        def __get__(self):
 | 
				
			||||||
            cdef const TokenC* t = &self._seq.data[self.i]
 | 
					            cdef int end_idx = self.idx + self.length
 | 
				
			||||||
            if t.lex.orth == 0:
 | 
					            
 | 
				
			||||||
                return ''
 | 
					 | 
				
			||||||
            cdef unicode py_ustr = self._seq.vocab.strings[t.lex.orth]
 | 
					 | 
				
			||||||
            return py_ustr
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    property orth_:
 | 
					    property orth_:
 | 
				
			||||||
        def __get__(self):
 | 
					        def __get__(self):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user