mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
* Work on fixing orphaned Token objects bug
This commit is contained in:
parent
789a6fe462
commit
cae077b583
|
@ -61,20 +61,20 @@ cdef class Token:
|
||||||
cdef bint _owns_c_data
|
cdef bint _owns_c_data
|
||||||
|
|
||||||
|
|
||||||
cdef list _py
|
cdef Tokens _seq
|
||||||
cdef tuple _tag_strings
|
cdef tuple _tag_strings
|
||||||
cdef tuple _dep_strings
|
cdef tuple _dep_strings
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline Token cinit(Vocab vocab, unicode string,
|
cdef inline Token cinit(Vocab vocab, unicode string,
|
||||||
const TokenC* token, int offset, int array_len,
|
const TokenC* token, int offset, int array_len,
|
||||||
list py_tokens, tuple tag_strings, tuple dep_strings):
|
Tokens parent_seq, tuple tag_strings, tuple dep_strings):
|
||||||
if offset < 0 or offset >= array_len:
|
if offset < 0 or offset >= array_len:
|
||||||
|
|
||||||
msg = "Attempt to access token at %d, max length %d"
|
msg = "Attempt to access token at %d, max length %d"
|
||||||
raise IndexError(msg % (offset, array_len))
|
raise IndexError(msg % (offset, array_len))
|
||||||
if py_tokens[offset] is not None:
|
if parent_seq._py_tokens[offset] is not None:
|
||||||
return py_tokens[offset]
|
return parent_seq._py_tokens[offset]
|
||||||
|
|
||||||
cdef Token self = Token.__new__(Token, vocab, string)
|
cdef Token self = Token.__new__(Token, vocab, string)
|
||||||
|
|
||||||
|
@ -82,10 +82,10 @@ cdef class Token:
|
||||||
self.i = offset
|
self.i = offset
|
||||||
self.array_len = array_len
|
self.array_len = array_len
|
||||||
|
|
||||||
self._py = py_tokens
|
self._seq = parent_seq
|
||||||
self._tag_strings = tag_strings
|
self._tag_strings = tag_strings
|
||||||
self._dep_strings = dep_strings
|
self._dep_strings = dep_strings
|
||||||
py_tokens[offset] = self
|
self._seq._py_tokens[offset] = self
|
||||||
return self
|
return self
|
||||||
|
|
||||||
cdef int take_ownership_of_c_data(self) except -1
|
cdef int take_ownership_of_c_data(self) except -1
|
||||||
|
|
|
@ -19,7 +19,6 @@ cimport cython
|
||||||
|
|
||||||
from cpython.mem cimport PyMem_Malloc, PyMem_Free
|
from cpython.mem cimport PyMem_Malloc, PyMem_Free
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
import sys
|
|
||||||
|
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
|
||||||
|
@ -95,21 +94,6 @@ cdef class Tokens:
|
||||||
self._tag_strings = tuple() # These will be set by the POS tagger and parser
|
self._tag_strings = tuple() # These will be set by the POS tagger and parser
|
||||||
self._dep_strings = tuple() # The strings are arbitrary and model-specific.
|
self._dep_strings = tuple() # The strings are arbitrary and model-specific.
|
||||||
|
|
||||||
def __dealloc__(self):
|
|
||||||
# The Token object initially only gets a view of the underlying C
|
|
||||||
# data --- it doesn't own it. But, if we have Token objects that are
|
|
||||||
# going to outlive this instance, those objects need a copy of the C
|
|
||||||
# data.
|
|
||||||
cdef Token token
|
|
||||||
if self._py_tokens is not None:
|
|
||||||
for token in self._py_tokens:
|
|
||||||
if token is not None:
|
|
||||||
# Why 3? 1 for the entry in the _py_tokens list,
|
|
||||||
# and 1 for this reference. If we have _another_ ref, then
|
|
||||||
# the token will live, and needs to own its data.
|
|
||||||
if sys.getrefcount(token) >= 3:
|
|
||||||
token.take_ownership_of_c_data()
|
|
||||||
|
|
||||||
def __getitem__(self, object i):
|
def __getitem__(self, object i):
|
||||||
"""Retrieve a token.
|
"""Retrieve a token.
|
||||||
|
|
||||||
|
@ -124,7 +108,7 @@ cdef class Tokens:
|
||||||
bounds_check(i, self.length, PADDING)
|
bounds_check(i, self.length, PADDING)
|
||||||
return Token.cinit(self.vocab, self._string,
|
return Token.cinit(self.vocab, self._string,
|
||||||
&self.data[i], i, self.length,
|
&self.data[i], i, self.length,
|
||||||
self._py_tokens, self._tag_strings, self._dep_strings)
|
self, self._tag_strings, self._dep_strings)
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
"""Iterate over the tokens.
|
"""Iterate over the tokens.
|
||||||
|
@ -135,7 +119,7 @@ cdef class Tokens:
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
yield Token.cinit(self.vocab, self._string,
|
yield Token.cinit(self.vocab, self._string,
|
||||||
&self.data[i], i, self.length,
|
&self.data[i], i, self.length,
|
||||||
self._py_tokens, self._tag_strings, self._dep_strings)
|
self, self._tag_strings, self._dep_strings)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return self.length
|
return self.length
|
||||||
|
@ -277,7 +261,7 @@ cdef class Token:
|
||||||
def nbor(self, int i=1):
|
def nbor(self, int i=1):
|
||||||
return Token.cinit(self.vocab, self._string,
|
return Token.cinit(self.vocab, self._string,
|
||||||
self.c, self.i, self.array_len,
|
self.c, self.i, self.array_len,
|
||||||
self._py, self._tag_strings, self._dep_strings)
|
self._seq, self._tag_strings, self._dep_strings)
|
||||||
|
|
||||||
property string:
|
property string:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -378,7 +362,7 @@ cdef class Token:
|
||||||
elif ptr + ptr.head == self.c:
|
elif ptr + ptr.head == self.c:
|
||||||
yield Token.cinit(self.vocab, self._string,
|
yield Token.cinit(self.vocab, self._string,
|
||||||
ptr, ptr - (self.c - self.i), self.array_len,
|
ptr, ptr - (self.c - self.i), self.array_len,
|
||||||
self._py, self._tag_strings, self._dep_strings)
|
self._seq, self._tag_strings, self._dep_strings)
|
||||||
ptr += 1
|
ptr += 1
|
||||||
else:
|
else:
|
||||||
ptr += 1
|
ptr += 1
|
||||||
|
@ -397,7 +381,7 @@ cdef class Token:
|
||||||
elif ptr + ptr.head == self.c:
|
elif ptr + ptr.head == self.c:
|
||||||
yield Token.cinit(self.vocab, self._string,
|
yield Token.cinit(self.vocab, self._string,
|
||||||
ptr, ptr - (self.c - self.i), self.array_len,
|
ptr, ptr - (self.c - self.i), self.array_len,
|
||||||
self._py, self._tag_strings, self._dep_strings)
|
self._seq, self._tag_strings, self._dep_strings)
|
||||||
ptr -= 1
|
ptr -= 1
|
||||||
else:
|
else:
|
||||||
ptr -= 1
|
ptr -= 1
|
||||||
|
@ -407,7 +391,7 @@ cdef class Token:
|
||||||
"""The token predicted by the parser to be the head of the current token."""
|
"""The token predicted by the parser to be the head of the current token."""
|
||||||
return Token.cinit(self.vocab, self._string,
|
return Token.cinit(self.vocab, self._string,
|
||||||
self.c + self.c.head, self.i + self.c.head, self.array_len,
|
self.c + self.c.head, self.i + self.c.head, self.array_len,
|
||||||
self._py, self._tag_strings, self._dep_strings)
|
self._seq, self._tag_strings, self._dep_strings)
|
||||||
|
|
||||||
property whitespace_:
|
property whitespace_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user