From 8214b74eec57547c9ed6c8ee16cd742cbc56950e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 13 Jul 2015 22:28:10 +0200
Subject: [PATCH] * Restore _py_tokens cache, to handle orphan tokens.

---
 spacy/tokens/doc.pxd   |  2 ++
 spacy/tokens/doc.pyx   | 17 +++++++++++------
 spacy/tokens/token.pxd |  9 +++++++--
 spacy/tokens/token.pyx | 13 +++++++------
 4 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index a19c387ba..94f8cf993 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -23,6 +23,8 @@ cdef class Doc:
     cdef public bint is_tagged
     cdef public bint is_parsed
 
+    cdef public list _py_tokens
+
     cdef int length
     cdef int max_length
 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 1daef2c05..d8585e92d 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -71,7 +71,7 @@ cdef class Doc:
     Container class for annotated text. Constructed via English.__call__ or
     Tokenizer.__call__.
     """
-    def __cinit__(self, Vocab vocab):
+    def __init__(self, Vocab vocab):
         self.vocab = vocab
         size = 20
         self.mem = Pool()
@@ -87,6 +87,13 @@ cdef class Doc:
         self.length = 0
         self.is_tagged = False
         self.is_parsed = False
+        self._py_tokens = []
+
+    def __dealloc__(self):
+        cdef Token token
+        if self._py_tokens is not None:
+            for token in self._py_tokens:
+                token.take_ownership_of_c_data()
 
     def __getitem__(self, object i):
         """Get a token.
@@ -103,7 +110,7 @@ cdef class Doc:
         if i < 0:
             i = self.length + i
         bounds_check(i, self.length, PADDING)
-        return Token.cinit(self.vocab, &self.data[i], i, self.length)
+        return Token.cinit(self.vocab, &self.data[i], i, self.length, self._py_tokens)
 
     def __iter__(self):
         """Iterate over the tokens.
@@ -112,7 +119,7 @@ cdef class Doc:
         token (Token):
         """
         for i in range(self.length):
-            yield Token.cinit(self.vocab, &self.data[i], i, self.length)
+            yield Token.cinit(self.vocab, &self.data[i], i, self.length, self._py_tokens)
 
     def __len__(self):
         return self.length
@@ -187,6 +194,7 @@ cdef class Doc:
             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
         t.spacy = has_space
         self.length += 1
+        self._py_tokens.append(None)
         return t.idx + t.lex.length + t.spacy
 
     @cython.boundscheck(False)
@@ -259,7 +267,6 @@ cdef class Doc:
     cdef int set_parse(self, const TokenC* parsed) except -1:
         # TODO: This method is fairly misleading atm. It's used by GreedyParser
         # to actually apply the parse calculated. Need to rethink this.
-        self._py_tokens = [None] * self.length
         self.is_parsed = True
         for i in range(self.length):
             self.data[i] = parsed[i]
@@ -345,8 +352,6 @@ cdef class Doc:
             # ...And, set heads back to a relative position
             self.data[i].head -= i
 
-        # Clear cached Python objects
-        self._py_tokens = [None] * self.length
         # Return the merged Python object
         return self[start]
 
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index a921ba080..433804fc2 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -9,17 +9,22 @@ cdef class Token:
     cdef readonly int i
     cdef int array_len
     cdef bint _owns_c_data
+    cdef list _py_tokens
 
     @staticmethod
-    cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, int array_len):
+    cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, int array_len,
+                            list _py_tokens):
         if offset < 0 or offset >= array_len:
             msg = "Attempt to access token at %d, max length %d"
             raise IndexError(msg % (offset, array_len))
-
+        if _py_tokens[offset] != None:
+            return _py_tokens[offset]
         cdef Token self = Token.__new__(Token, vocab)
         self.c = token
         self.i = offset
         self.array_len = array_len
+        self._py_tokens = _py_tokens
+        self._py_tokens[offset] = self
         return self
 
     cdef int take_ownership_of_c_data(self) except -1
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index b1bde6a13..583a8c11f 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -23,6 +23,7 @@ cdef class Token:
     """
     def __cinit__(self, Vocab vocab):
         self.vocab = vocab
+        self._py_tokens = []
 
     def __dealloc__(self):
         if self._owns_c_data:
@@ -45,7 +46,7 @@ cdef class Token:
         self._owns_c_data = True
 
     def nbor(self, int i=1):
-        return Token.cinit(self.vocab, self.c, self.i, self.array_len)
+        return Token.cinit(self.vocab, self.c, self.i, self.array_len, self._py_tokens)
 
     property lex_id:
         def __get__(self):
@@ -152,7 +153,7 @@ cdef class Token:
                     ptr += ptr.head
                 elif ptr + ptr.head == self.c:
                     yield Token.cinit(self.vocab, ptr, ptr - (self.c - self.i),
-                                      self.array_len)
+                                      self.array_len, self._py_tokens)
                     ptr += 1
                 else:
                     ptr += 1
@@ -171,7 +172,7 @@ cdef class Token:
                     ptr += ptr.head
                 elif ptr + ptr.head == self.c:
                     tokens.append(Token.cinit(self.vocab, ptr, ptr - (self.c - self.i),
-                                              self.array_len))
+                                              self.array_len, self._py_tokens))
                     ptr -= 1
                 else:
                     ptr -= 1
@@ -196,19 +197,19 @@ cdef class Token:
         def __get__(self):
             return Token.cinit(self.vocab, (self.c - self.i) + self.c.l_edge,
                                self.c.l_edge,
-                               self.array_len)
+                               self.array_len, self._py_tokens)
 
     property right_edge:
         def __get__(self):
             return Token.cinit(self.vocab, (self.c - self.i) + self.c.r_edge,
                                self.c.r_edge,
-                               self.array_len)
+                               self.array_len, self._py_tokens)
 
     property head:
         def __get__(self):
             """The token predicted by the parser to be the head of the current token."""
             return Token.cinit(self.vocab, self.c + self.c.head, self.i + self.c.head,
-                               self.array_len)
+                               self.array_len, self._py_tokens)
 
     property conjuncts:
         def __get__(self):
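
Note on the pattern this patch restores: Doc keeps one cache slot per token in _py_tokens, Token.cinit returns the cached Token for an offset when one exists (creating and caching it otherwise), and Doc.__dealloc__ tells every still-live Token to take ownership of its TokenC data, so a Token that outlives its Doc (an "orphan" token) never points at freed memory. Below is a minimal pure-Python sketch of that caching-plus-ownership-transfer idea; the names (MiniDoc, MiniToken, take_ownership_of_data, close) are illustrative stand-ins, not spaCy's API.

    class MiniToken:
        def __init__(self, doc, i):
            self._data = doc._data     # borrowed reference to the container's storage
            self.i = i
            self._owns_data = False

        def take_ownership_of_data(self):
            # Copy the borrowed storage so this token survives its container;
            # mirrors the intent of Token.take_ownership_of_c_data.
            self._data = list(self._data)
            self._owns_data = True


    class MiniDoc:
        def __init__(self, words):
            self._data = list(words)
            self._py_tokens = [None] * len(words)   # one cache slot per token

        def __getitem__(self, i):
            # Return the cached token if one exists, else create and cache it,
            # so repeated indexing yields the *same* Python object.
            if self._py_tokens[i] is None:
                self._py_tokens[i] = MiniToken(self, i)
            return self._py_tokens[i]

        def close(self):
            # Analogue of Doc.__dealloc__: hand data ownership to any tokens
            # that were materialized, so they don't dangle afterwards.
            for token in self._py_tokens:
                if token is not None:
                    token.take_ownership_of_data()


    doc = MiniDoc(["Give", "it", "back"])
    tok = doc[0]
    assert doc[0] is tok       # the cache returns the same object twice
    doc.close()
    del doc
    print(tok._data[tok.i])    # prints "Give": the orphan token remains valid

One detail the sketch glosses over: in the patch, Doc.__init__ starts _py_tokens empty and push_back appends a None slot per token, so the cache always matches the Doc's length as tokens are added.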