mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-01 00:17:44 +03:00 
			
		
		
		
	* Work on fixing orphaned Token objects bug
This commit is contained in:
		
							parent
							
								
									789a6fe462
								
							
						
					
					
						commit
						cae077b583
					
				|  | @ -61,20 +61,20 @@ cdef class Token: | |||
|     cdef bint _owns_c_data | ||||
| 
 | ||||
|      | ||||
|     cdef list _py | ||||
|     cdef Tokens _seq | ||||
|     cdef tuple _tag_strings | ||||
|     cdef tuple _dep_strings | ||||
| 
 | ||||
|     @staticmethod | ||||
|     cdef inline Token cinit(Vocab vocab, unicode string, | ||||
|                             const TokenC* token, int offset, int array_len, | ||||
|                             list py_tokens, tuple tag_strings, tuple dep_strings): | ||||
|                             Tokens parent_seq, tuple tag_strings, tuple dep_strings): | ||||
|         if offset < 0 or offset >= array_len: | ||||
| 
 | ||||
|             msg = "Attempt to access token at %d, max length %d" | ||||
|             raise IndexError(msg % (offset, array_len)) | ||||
|         if py_tokens[offset] is not None: | ||||
|             return py_tokens[offset] | ||||
|         if parent_seq._py_tokens[offset] is not None: | ||||
|             return parent_seq._py_tokens[offset] | ||||
| 
 | ||||
|         cdef Token self = Token.__new__(Token, vocab, string) | ||||
| 
 | ||||
|  | @ -82,10 +82,10 @@ cdef class Token: | |||
|         self.i = offset | ||||
|         self.array_len = array_len | ||||
| 
 | ||||
|         self._py = py_tokens | ||||
|         self._seq = parent_seq | ||||
|         self._tag_strings = tag_strings | ||||
|         self._dep_strings = dep_strings | ||||
|         py_tokens[offset] = self | ||||
|         self._seq._py_tokens[offset] = self | ||||
|         return self | ||||
| 
 | ||||
|     cdef int take_ownership_of_c_data(self) except -1 | ||||
|  |  | |||
|  | @ -19,7 +19,6 @@ cimport cython | |||
| 
 | ||||
| from cpython.mem cimport PyMem_Malloc, PyMem_Free | ||||
| from libc.string cimport memcpy | ||||
| import sys | ||||
| 
 | ||||
| DEF PADDING = 5 | ||||
| 
 | ||||
|  | @ -95,21 +94,6 @@ cdef class Tokens: | |||
|         self._tag_strings = tuple() # These will be set by the POS tagger and parser | ||||
|         self._dep_strings = tuple() # The strings are arbitrary and model-specific. | ||||
| 
 | ||||
|     def __dealloc__(self): | ||||
|         # The Token object initially only gets a view of the underlying C | ||||
|         # data --- it doesn't own it. But, if we have Token objects that are | ||||
|         # going to outlive this instance, those objects need a copy of the C | ||||
|         # data. | ||||
|         cdef Token token | ||||
|         if self._py_tokens is not None: | ||||
|             for token in self._py_tokens: | ||||
|                 if token is not None: | ||||
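|                     # Why 3? 1 for the entry in the _py_tokens list, | ||||
|                     # and 1 for this local `token` reference. If we have _another_ | ||||
|                     # (third) ref, then the token will live, and needs to own its data. | ||||
|                     # NOTE(review): sys.getrefcount() also counts the temporary | ||||
|                     # reference to its own argument, so the baseline may already | ||||
|                     # be 3 here -- confirm the threshold. | ||||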
|                     if sys.getrefcount(token) >= 3: | ||||
|                         token.take_ownership_of_c_data() | ||||
| 
 | ||||
|     def __getitem__(self, object i): | ||||
|         """Retrieve a token. | ||||
|          | ||||
|  | @ -124,7 +108,7 @@ cdef class Tokens: | |||
|         bounds_check(i, self.length, PADDING) | ||||
|         return Token.cinit(self.vocab, self._string, | ||||
|                            &self.data[i], i, self.length, | ||||
|                            self._py_tokens, self._tag_strings, self._dep_strings) | ||||
|                            self, self._tag_strings, self._dep_strings) | ||||
| 
 | ||||
|     def __iter__(self): | ||||
|         """Iterate over the tokens. | ||||
|  | @ -135,7 +119,7 @@ cdef class Tokens: | |||
|         for i in range(self.length): | ||||
|             yield Token.cinit(self.vocab, self._string, | ||||
|                               &self.data[i], i, self.length, | ||||
|                               self._py_tokens, self._tag_strings, self._dep_strings) | ||||
|                               self, self._tag_strings, self._dep_strings) | ||||
| 
 | ||||
|     def __len__(self): | ||||
|         return self.length | ||||
|  | @ -277,7 +261,7 @@ cdef class Token: | |||
|     def nbor(self, int i=1): | ||||
|         return Token.cinit(self.vocab, self._string, | ||||
|                            self.c, self.i, self.array_len, | ||||
|                            self._py, self._tag_strings, self._dep_strings) | ||||
|                            self._seq, self._tag_strings, self._dep_strings) | ||||
| 
 | ||||
|     property string: | ||||
|         def __get__(self): | ||||
|  | @ -378,7 +362,7 @@ cdef class Token: | |||
|                 elif ptr + ptr.head == self.c: | ||||
|                     yield Token.cinit(self.vocab, self._string, | ||||
|                                       ptr, ptr - (self.c - self.i), self.array_len, | ||||
|                                       self._py, self._tag_strings, self._dep_strings) | ||||
|                                       self._seq, self._tag_strings, self._dep_strings) | ||||
|                     ptr += 1 | ||||
|                 else: | ||||
|                     ptr += 1 | ||||
|  | @ -397,7 +381,7 @@ cdef class Token: | |||
|                 elif ptr + ptr.head == self.c: | ||||
|                     yield Token.cinit(self.vocab, self._string, | ||||
|                                       ptr, ptr - (self.c - self.i), self.array_len, | ||||
|                                       self._py, self._tag_strings, self._dep_strings) | ||||
|                                       self._seq, self._tag_strings, self._dep_strings) | ||||
|                     ptr -= 1 | ||||
|                 else: | ||||
|                     ptr -= 1 | ||||
|  | @ -407,7 +391,7 @@ cdef class Token: | |||
|             """The token predicted by the parser to be the head of the current token.""" | ||||
|             return Token.cinit(self.vocab, self._string, | ||||
|                                self.c + self.c.head, self.i + self.c.head, self.array_len, | ||||
|                                self._py, self._tag_strings, self._dep_strings) | ||||
|                                self._seq, self._tag_strings, self._dep_strings) | ||||
| 
 | ||||
|     property whitespace_: | ||||
|         def __get__(self): | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user