Work on fixing orphaned Token objects bug

Matthew Honnibal 2015-02-16 15:20:31 -05:00
parent 789a6fe462
commit cae077b583
2 changed files with 12 additions and 28 deletions
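The change below replaces the bare cdef list _py cache held by each Token with a typed cdef Tokens _seq reference to the owning sequence, so a Token view can no longer outlive the object that owns its underlying C data. As a rough pure-Python analogy of the hazard being fixed (the Parent and OrphanView names are illustrative only, not spaCy's API):

    import weakref

    class Parent:
        def __init__(self, data):
            self._buf = list(data)    # stands in for the malloc'd TokenC array

    class OrphanView:
        # Holds only a weak reference, mimicking a borrowed C pointer:
        # nothing here keeps the Parent, or its buffer, alive.
        def __init__(self, parent, i):
            self._parent = weakref.ref(parent)
            self.i = i

        @property
        def value(self):
            parent = self._parent()
            if parent is None:
                raise RuntimeError("orphaned view: parent was deallocated")
            return parent._buf[self.i]

    p = Parent([10, 20, 30])
    v = OrphanView(p, 1)
    assert v.value == 20
    del p                 # the C version would free the buffer here
    # v.value now raises RuntimeError: the view has been orphaned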

View File

@@ -61,20 +61,20 @@ cdef class Token:
     cdef bint _owns_c_data
-    cdef list _py
+    cdef Tokens _seq
     cdef tuple _tag_strings
     cdef tuple _dep_strings
 
     @staticmethod
     cdef inline Token cinit(Vocab vocab, unicode string,
                             const TokenC* token, int offset, int array_len,
-                            list py_tokens, tuple tag_strings, tuple dep_strings):
+                            Tokens parent_seq, tuple tag_strings, tuple dep_strings):
         if offset < 0 or offset >= array_len:
             msg = "Attempt to access token at %d, max length %d"
             raise IndexError(msg % (offset, array_len))
-        if py_tokens[offset] is not None:
-            return py_tokens[offset]
+        if parent_seq._py_tokens[offset] is not None:
+            return parent_seq._py_tokens[offset]
         cdef Token self = Token.__new__(Token, vocab, string)

@@ -82,10 +82,10 @@ cdef class Token:
         self.i = offset
         self.array_len = array_len
-        self._py = py_tokens
+        self._seq = parent_seq
         self._tag_strings = tag_strings
         self._dep_strings = dep_strings
-        py_tokens[offset] = self
+        self._seq._py_tokens[offset] = self
         return self
 
     cdef int take_ownership_of_c_data(self) except -1
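With this hunk, cinit both reads and writes the wrapper cache through parent_seq and stores the sequence on the new Token, so the cache entry and the back-reference are created together. A minimal sketch of that pattern in plain Python, assuming the strong parent reference plus the memoized wrapper are the two properties that matter (toy classes, not the real Cython types):

    class Token:
        def __init__(self, seq, offset):
            self._seq = seq     # strong ref: keeps the owner of the C data alive
            self.i = offset

    class Tokens:
        def __init__(self, words):
            self._data = list(words)               # stands in for the TokenC array
            self._py_tokens = [None] * len(words)  # cache of Python wrappers

        def __getitem__(self, i):
            if not 0 <= i < len(self._data):
                raise IndexError("Attempt to access token at %d, max length %d"
                                 % (i, len(self._data)))
            if self._py_tokens[i] is not None:     # return the cached wrapper
                return self._py_tokens[i]
            token = Token(self, i)
            self._py_tokens[i] = token             # memoize, as cinit now does
            return token

    doc = Tokens(["a", "b", "c"])
    assert doc[1] is doc[1]      # repeated access yields the identical wrapper

Note the resulting Tokens -> _py_tokens -> Token -> _seq -> Tokens reference cycle; in CPython the cyclic garbage collector reclaims the whole group once no outside references remain.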

View File

@@ -19,7 +19,6 @@ cimport cython
 from cpython.mem cimport PyMem_Malloc, PyMem_Free
 from libc.string cimport memcpy
-import sys
 
 DEF PADDING = 5
@@ -95,21 +94,6 @@ cdef class Tokens:
         self._tag_strings = tuple() # These will be set by the POS tagger and parser
         self._dep_strings = tuple() # The strings are arbitrary and model-specific.
 
-    def __dealloc__(self):
-        # The Token object initially only gets a view of the underlying C
-        # data --- it doesn't own it. But, if we have Token objects that are
-        # going to outlive this instance, those objects need a copy of the C
-        # data.
-        cdef Token token
-        if self._py_tokens is not None:
-            for token in self._py_tokens:
-                if token is not None:
-                    # Why 3? 1 for the entry in the _py_tokens list,
-                    # and 1 for this reference. If we have _another_ ref, then
-                    # the token will live, and needs to own its data.
-                    if sys.getrefcount(token) >= 3:
-                        token.take_ownership_of_c_data()
-
     def __getitem__(self, object i):
         """Retrieve a token.
@@ -124,7 +108,7 @@ cdef class Tokens:
         bounds_check(i, self.length, PADDING)
         return Token.cinit(self.vocab, self._string,
                            &self.data[i], i, self.length,
-                           self._py_tokens, self._tag_strings, self._dep_strings)
+                           self, self._tag_strings, self._dep_strings)
 
     def __iter__(self):
         """Iterate over the tokens.
@@ -135,7 +119,7 @@ cdef class Tokens:
         for i in range(self.length):
             yield Token.cinit(self.vocab, self._string,
                               &self.data[i], i, self.length,
-                              self._py_tokens, self._tag_strings, self._dep_strings)
+                              self, self._tag_strings, self._dep_strings)
 
     def __len__(self):
         return self.length
@@ -277,7 +261,7 @@ cdef class Token:
     def nbor(self, int i=1):
         return Token.cinit(self.vocab, self._string,
                            self.c, self.i, self.array_len,
-                           self._py, self._tag_strings, self._dep_strings)
+                           self._seq, self._tag_strings, self._dep_strings)
 
     property string:
         def __get__(self):
@@ -378,7 +362,7 @@ cdef class Token:
             elif ptr + ptr.head == self.c:
                 yield Token.cinit(self.vocab, self._string,
                                   ptr, ptr - (self.c - self.i), self.array_len,
-                                  self._py, self._tag_strings, self._dep_strings)
+                                  self._seq, self._tag_strings, self._dep_strings)
                 ptr += 1
             else:
                 ptr += 1
@@ -397,7 +381,7 @@ cdef class Token:
             elif ptr + ptr.head == self.c:
                 yield Token.cinit(self.vocab, self._string,
                                   ptr, ptr - (self.c - self.i), self.array_len,
-                                  self._py, self._tag_strings, self._dep_strings)
+                                  self._seq, self._tag_strings, self._dep_strings)
                 ptr -= 1
             else:
                 ptr -= 1
@@ -407,7 +391,7 @@ cdef class Token:
             """The token predicted by the parser to be the head of the current token."""
             return Token.cinit(self.vocab, self._string,
                                self.c + self.c.head, self.i + self.c.head, self.array_len,
-                               self._py, self._tag_strings, self._dep_strings)
+                               self._seq, self._tag_strings, self._dep_strings)
 
     property whitespace_:
         def __get__(self):
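Every construction path in this file (__getitem__, __iter__, nbor, the child iterators, head) now threads the owning sequence into cinit, so navigating the parse tree hands out wrappers from one shared cache and keeps a single owner alive. A follow-on sketch in the same toy style as above, with a hypothetical nbor modelled on the diff rather than copied from it:

    class Tokens:
        def __init__(self, n):
            self.length = n
            self._py_tokens = [None] * n   # shared wrapper cache

        def __getitem__(self, i):
            if self._py_tokens[i] is None:
                self._py_tokens[i] = Token(self, i)
            return self._py_tokens[i]

    class Token:
        def __init__(self, seq, i):
            self._seq = seq
            self.i = i

        def nbor(self, i=1):
            # Route through the parent: the neighbour comes from the same
            # cache and carries the same _seq reference.
            return self._seq[self.i + i]

    doc = Tokens(3)
    assert doc[0].nbor() is doc[1]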