spaCy (mirror of https://github.com/explosion/spaCy.git)

Commit 8214b74eec, parent 67641f3b58:

    Restore _py_tokens cache, to handle orphan tokens.
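In short: Doc gains a _py_tokens list with one slot per token; Token.cinit consults that list and returns the already-built Python wrapper when one exists, storing new wrappers back into it; the token-append routine adds a None slot for every token it pushes; and Doc.__dealloc__ walks the cached wrappers and calls take_ownership_of_c_data() on each, so a Token that outlives its Doc (an "orphan" token) ends up owning a copy of its C data rather than pointing into freed memory. Below is a minimal pure-Python sketch of that pattern under hypothetical names (Array, Item, take_ownership); it illustrates the idea only and is not spaCy's API.

# Minimal sketch of the wrapper-cache + ownership hand-off pattern.
# All names here are illustrative (Array, Item), not spaCy's API.

class Item:
    def __init__(self, backing, offset):
        self._backing = backing          # shared storage owned by the Array
        self._offset = offset
        self._own = None                 # private copy, set by take_ownership()

    @property
    def value(self):
        if self._own is not None:
            return self._own
        return self._backing[self._offset]

    def take_ownership(self):
        # Copy the element this wrapper points at, so the wrapper stays
        # valid after the Array throws its storage away.
        self._own = self._backing[self._offset]


class Array:
    def __init__(self, values):
        self._data = list(values)
        self._py_items = [None] * len(self._data)   # one cache slot per element

    def __getitem__(self, i):
        # Cache-on-access: hand back the existing wrapper when there is one.
        if self._py_items[i] is None:
            self._py_items[i] = Item(self._data, i)
        return self._py_items[i]

    def close(self):
        # Analogue of Doc.__dealloc__: give every cached wrapper its own copy
        # before the backing storage disappears.
        for item in self._py_items:
            if item is not None:
                item.take_ownership()
        self._data.clear()


arr = Array(["a", "b", "c"])
item = arr[1]
assert arr[1] is item        # repeated access returns the same wrapper
arr.close()
assert item.value == "b"     # the orphaned wrapper still has its data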
Doc attribute declarations (presumably the .pxd; file paths are not shown in this capture):

@@ -23,6 +23,8 @@ cdef class Doc:
     cdef public bint is_tagged
     cdef public bint is_parsed
 
+    cdef public list _py_tokens
+
     cdef int length
     cdef int max_length
 
Doc implementation (presumably the .pyx):

@@ -71,7 +71,7 @@ cdef class Doc:
     Container class for annotated text.  Constructed via English.__call__ or
     Tokenizer.__call__.
     """
-    def __cinit__(self, Vocab vocab):
+    def __init__(self, Vocab vocab):
         self.vocab = vocab
         size = 20
         self.mem = Pool()
@@ -87,6 +87,13 @@ cdef class Doc:
         self.length = 0
         self.is_tagged = False
         self.is_parsed = False
+        self._py_tokens = []
+
+    def __dealloc__(self):
+        cdef Token token
+        if self._py_tokens is not None:
+            for token in self._py_tokens:
+                token.take_ownership_of_c_data()
 
     def __getitem__(self, object i):
         """Get a token.
@@ -103,7 +110,7 @@ cdef class Doc:
         if i < 0:
             i = self.length + i
         bounds_check(i, self.length, PADDING)
-        return Token.cinit(self.vocab, &self.data[i], i, self.length)
+        return Token.cinit(self.vocab, &self.data[i], i, self.length, self._py_tokens)
 
     def __iter__(self):
         """Iterate over the tokens.
@@ -112,7 +119,7 @@ cdef class Doc:
             token (Token):
         """
         for i in range(self.length):
-            yield Token.cinit(self.vocab, &self.data[i], i, self.length)
+            yield Token.cinit(self.vocab, &self.data[i], i, self.length, self._py_tokens)
 
     def __len__(self):
         return self.length
@@ -187,6 +194,7 @@ cdef class Doc:
             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
         t.spacy = has_space
         self.length += 1
+        self._py_tokens.append(None)
         return t.idx + t.lex.length + t.spacy
 
     @cython.boundscheck(False)
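This hunk is in the routine that appends one token to the Doc (push_back, to judge by the surrounding code): every pushed token now also gets an empty slot in _py_tokens, so the cache always holds exactly one entry per token and Token.cinit can index it by offset. A tiny sketch of that invariant, with made-up names (GrowingDoc is not spaCy code):

# Sketch of the invariant the append routine maintains:
# len(py_cache) == number of tokens, slots filled lazily.
class GrowingDoc:
    def __init__(self):
        self.tokens = []         # stands in for the C token array
        self.py_cache = []       # one wrapper slot per token

    def push_back(self, tok):
        self.tokens.append(tok)
        self.py_cache.append(None)     # reserve a slot; no wrapper built yet
        assert len(self.py_cache) == len(self.tokens)

d = GrowingDoc()
d.push_back("hello")
d.push_back("world")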
@@ -259,7 +267,6 @@ cdef class Doc:
     cdef int set_parse(self, const TokenC* parsed) except -1:
         # TODO: This method is fairly misleading atm. It's used by GreedyParser
         # to actually apply the parse calculated. Need to rethink this.
-        self._py_tokens = [None] * self.length
         self.is_parsed = True
         for i in range(self.length):
             self.data[i] = parsed[i]
@@ -345,8 +352,6 @@ cdef class Doc:
             # ...And, set heads back to a relative position
             self.data[i].head -= i
 
-        # Clear cached Python objects
-        self._py_tokens = [None] * self.length
         # Return the merged Python object
         return self[start]
 
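The removals in the two hunks above, in set_parse and in the merge routine, drop the places where _py_tokens used to be rebuilt as a fresh [None] * self.length list. Now that the cache is created in __init__ and kept in step as tokens are pushed, rebuilding it would only disconnect wrappers that calling code already holds, breaking the same-object guarantee and leaving those wrappers out of the __dealloc__ hand-off. In miniature (plain Python, hypothetical values):

# Rebinding the cache to a fresh list severs wrappers that were handed out earlier.
cache = [None, None, None]
cache[1] = token = object()      # stands in for a cached Token wrapper

cache = [None, None, None]       # what set_parse/merge used to do
assert token not in cache        # the old wrapper would now miss the dealloc hand-off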
Token declarations (presumably the .pxd):

@@ -9,17 +9,22 @@ cdef class Token:
     cdef readonly int i
     cdef int array_len
     cdef bint _owns_c_data
+    cdef list _py_tokens
 
     @staticmethod
-    cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, int array_len):
+    cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, int array_len,
+                            list _py_tokens):
         if offset < 0 or offset >= array_len:
             msg = "Attempt to access token at %d, max length %d"
             raise IndexError(msg % (offset, array_len))
-        
+        if _py_tokens[offset] != None:
+            return _py_tokens[offset]
         cdef Token self = Token.__new__(Token, vocab)
         self.c = token
         self.i = offset
         self.array_len = array_len
+        self._py_tokens = _py_tokens
+        self._py_tokens[offset] = self
         return self
 
     cdef int take_ownership_of_c_data(self) except -1
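The core of the change is in cinit: before constructing a wrapper it now looks the offset up in the shared _py_tokens list and returns the existing Token when there is one; otherwise it builds the wrapper, keeps a reference to the list, and writes itself into the slot. The lookup-or-build step, reduced to plain Python with made-up names:

# Memoised-wrapper lookup, as in Token.cinit but with hypothetical names.
def get_wrapper(cache, offset, build):
    # cache: list with one slot per position, pre-filled with None
    if cache[offset] is not None:
        return cache[offset]          # reuse the wrapper made earlier
    wrapper = build(offset)
    cache[offset] = wrapper           # remember it for the next access
    return wrapper

cache = [None] * 3
first = get_wrapper(cache, 0, lambda i: {"i": i})
second = get_wrapper(cache, 0, lambda i: {"i": i})
assert first is second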
Token implementation (presumably the .pyx):

@@ -23,6 +23,7 @@ cdef class Token:
     """
    def __cinit__(self, Vocab vocab):
        self.vocab = vocab
+        self._py_tokens = []
 
     def __dealloc__(self):
         if self._owns_c_data:
@@ -45,7 +46,7 @@ cdef class Token:
         self._owns_c_data = True
 
     def nbor(self, int i=1):
-        return Token.cinit(self.vocab, self.c, self.i, self.array_len)
+        return Token.cinit(self.vocab, self.c, self.i, self.array_len, self._py_tokens)
 
     property lex_id:
         def __get__(self):
@@ -152,7 +153,7 @@ cdef class Token:
 
                 elif ptr + ptr.head == self.c:
                     yield Token.cinit(self.vocab, ptr, ptr - (self.c - self.i),
-                                      self.array_len)
+                                      self.array_len, self._py_tokens)
                     ptr += 1
                 else:
                     ptr += 1
@@ -171,7 +172,7 @@ cdef class Token:
                     ptr += ptr.head
                 elif ptr + ptr.head == self.c:
                     tokens.append(Token.cinit(self.vocab, ptr, ptr - (self.c - self.i),
-                                  self.array_len))
+                                  self.array_len, self._py_tokens))
                     ptr -= 1
                 else:
                     ptr -= 1
@@ -196,19 +197,19 @@ cdef class Token:
         def __get__(self):
             return Token.cinit(self.vocab,
                                (self.c - self.i) + self.c.l_edge, self.c.l_edge,
-                               self.array_len)
+                               self.array_len, self._py_tokens)
 
     property right_edge:
         def __get__(self):
             return Token.cinit(self.vocab,
                                (self.c - self.i) + self.c.r_edge, self.c.r_edge,
-                               self.array_len)
+                               self.array_len, self._py_tokens)
 
     property head:
         def __get__(self):
             """The token predicted by the parser to be the head of the current token."""
             return Token.cinit(self.vocab, self.c + self.c.head, self.i + self.c.head,
-                               self.array_len)
+                               self.array_len, self._py_tokens)
 
     property conjuncts:
         def __get__(self):