* Restore _py_tokens cache, to handle orphan tokens.

This commit is contained in:
Matthew Honnibal 2015-07-13 22:28:10 +02:00
parent 67641f3b58
commit 8214b74eec
4 changed files with 27 additions and 14 deletions

View File

@@ -23,6 +23,8 @@ cdef class Doc:
cdef public bint is_tagged cdef public bint is_tagged
cdef public bint is_parsed cdef public bint is_parsed
cdef public list _py_tokens
cdef int length cdef int length
cdef int max_length cdef int max_length

View File

@@ -71,7 +71,7 @@ cdef class Doc:
Container class for annotated text. Constructed via English.__call__ or Container class for annotated text. Constructed via English.__call__ or
Tokenizer.__call__. Tokenizer.__call__.
""" """
def __cinit__(self, Vocab vocab): def __init__(self, Vocab vocab):
self.vocab = vocab self.vocab = vocab
size = 20 size = 20
self.mem = Pool() self.mem = Pool()
@@ -87,6 +87,13 @@ cdef class Doc:
self.length = 0 self.length = 0
self.is_tagged = False self.is_tagged = False
self.is_parsed = False self.is_parsed = False
self._py_tokens = []
def __dealloc__(self):
cdef Token token
if self._py_tokens is not None:
for token in self._py_tokens:
token.take_ownership_of_c_data()
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a token. """Get a token.
@@ -103,7 +110,7 @@ cdef class Doc:
if i < 0: if i < 0:
i = self.length + i i = self.length + i
bounds_check(i, self.length, PADDING) bounds_check(i, self.length, PADDING)
return Token.cinit(self.vocab, &self.data[i], i, self.length) return Token.cinit(self.vocab, &self.data[i], i, self.length, self._py_tokens)
def __iter__(self): def __iter__(self):
"""Iterate over the tokens. """Iterate over the tokens.
@@ -112,7 +119,7 @@ cdef class Doc:
token (Token): token (Token):
""" """
for i in range(self.length): for i in range(self.length):
yield Token.cinit(self.vocab, &self.data[i], i, self.length) yield Token.cinit(self.vocab, &self.data[i], i, self.length, self._py_tokens)
def __len__(self): def __len__(self):
return self.length return self.length
@@ -187,6 +194,7 @@ cdef class Doc:
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
t.spacy = has_space t.spacy = has_space
self.length += 1 self.length += 1
self._py_tokens.append(None)
return t.idx + t.lex.length + t.spacy return t.idx + t.lex.length + t.spacy
@cython.boundscheck(False) @cython.boundscheck(False)
@@ -259,7 +267,6 @@ cdef class Doc:
cdef int set_parse(self, const TokenC* parsed) except -1: cdef int set_parse(self, const TokenC* parsed) except -1:
# TODO: This method is fairly misleading atm. It's used by GreedyParser # TODO: This method is fairly misleading atm. It's used by GreedyParser
# to actually apply the parse calculated. Need to rethink this. # to actually apply the parse calculated. Need to rethink this.
self._py_tokens = [None] * self.length
self.is_parsed = True self.is_parsed = True
for i in range(self.length): for i in range(self.length):
self.data[i] = parsed[i] self.data[i] = parsed[i]
@@ -345,8 +352,6 @@ cdef class Doc:
# ...And, set heads back to a relative position # ...And, set heads back to a relative position
self.data[i].head -= i self.data[i].head -= i
# Clear cached Python objects
self._py_tokens = [None] * self.length
# Return the merged Python object # Return the merged Python object
return self[start] return self[start]

View File

@@ -9,17 +9,22 @@ cdef class Token:
cdef readonly int i cdef readonly int i
cdef int array_len cdef int array_len
cdef bint _owns_c_data cdef bint _owns_c_data
cdef list _py_tokens
@staticmethod @staticmethod
cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, int array_len): cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, int array_len,
list _py_tokens):
if offset < 0 or offset >= array_len: if offset < 0 or offset >= array_len:
msg = "Attempt to access token at %d, max length %d" msg = "Attempt to access token at %d, max length %d"
raise IndexError(msg % (offset, array_len)) raise IndexError(msg % (offset, array_len))
if _py_tokens[offset] != None:
return _py_tokens[offset]
cdef Token self = Token.__new__(Token, vocab) cdef Token self = Token.__new__(Token, vocab)
self.c = token self.c = token
self.i = offset self.i = offset
self.array_len = array_len self.array_len = array_len
self._py_tokens = _py_tokens
self._py_tokens[offset] = self
return self return self
cdef int take_ownership_of_c_data(self) except -1 cdef int take_ownership_of_c_data(self) except -1

View File

@@ -23,6 +23,7 @@ cdef class Token:
""" """
def __cinit__(self, Vocab vocab): def __cinit__(self, Vocab vocab):
self.vocab = vocab self.vocab = vocab
self._py_tokens = []
def __dealloc__(self): def __dealloc__(self):
if self._owns_c_data: if self._owns_c_data:
@@ -45,7 +46,7 @@ cdef class Token:
self._owns_c_data = True self._owns_c_data = True
def nbor(self, int i=1): def nbor(self, int i=1):
return Token.cinit(self.vocab, self.c, self.i, self.array_len) return Token.cinit(self.vocab, self.c, self.i, self.array_len, self._py_tokens)
property lex_id: property lex_id:
def __get__(self): def __get__(self):
@@ -152,7 +153,7 @@ cdef class Token:
elif ptr + ptr.head == self.c: elif ptr + ptr.head == self.c:
yield Token.cinit(self.vocab, ptr, ptr - (self.c - self.i), yield Token.cinit(self.vocab, ptr, ptr - (self.c - self.i),
self.array_len) self.array_len, self._py_tokens)
ptr += 1 ptr += 1
else: else:
ptr += 1 ptr += 1
@@ -171,7 +172,7 @@ cdef class Token:
ptr += ptr.head ptr += ptr.head
elif ptr + ptr.head == self.c: elif ptr + ptr.head == self.c:
tokens.append(Token.cinit(self.vocab, ptr, ptr - (self.c - self.i), tokens.append(Token.cinit(self.vocab, ptr, ptr - (self.c - self.i),
self.array_len)) self.array_len, self._py_tokens))
ptr -= 1 ptr -= 1
else: else:
ptr -= 1 ptr -= 1
@@ -196,19 +197,19 @@ cdef class Token:
def __get__(self): def __get__(self):
return Token.cinit(self.vocab, return Token.cinit(self.vocab,
(self.c - self.i) + self.c.l_edge, self.c.l_edge, (self.c - self.i) + self.c.l_edge, self.c.l_edge,
self.array_len) self.array_len, self._py_tokens)
property right_edge: property right_edge:
def __get__(self): def __get__(self):
return Token.cinit(self.vocab, return Token.cinit(self.vocab,
(self.c - self.i) + self.c.r_edge, self.c.r_edge, (self.c - self.i) + self.c.r_edge, self.c.r_edge,
self.array_len) self.array_len, self._py_tokens)
property head: property head:
def __get__(self): def __get__(self):
"""The token predicted by the parser to be the head of the current token.""" """The token predicted by the parser to be the head of the current token."""
return Token.cinit(self.vocab, self.c + self.c.head, self.i + self.c.head, return Token.cinit(self.vocab, self.c + self.c.head, self.i + self.c.head,
self.array_len) self.array_len, self._py_tokens)
property conjuncts: property conjuncts:
def __get__(self): def __get__(self):