* Restore _py_tokens cache, to handle orphan tokens.

This commit is contained in:
Matthew Honnibal 2015-07-13 22:28:10 +02:00
parent 67641f3b58
commit 8214b74eec
4 changed files with 27 additions and 14 deletions

View File

@@ -23,6 +23,8 @@ cdef class Doc:
cdef public bint is_tagged cdef public bint is_tagged
cdef public bint is_parsed cdef public bint is_parsed
cdef public list _py_tokens
cdef int length cdef int length
cdef int max_length cdef int max_length

View File

@@ -71,7 +71,7 @@ cdef class Doc:
Container class for annotated text. Constructed via English.__call__ or Container class for annotated text. Constructed via English.__call__ or
Tokenizer.__call__. Tokenizer.__call__.
""" """
def __cinit__(self, Vocab vocab): def __init__(self, Vocab vocab):
self.vocab = vocab self.vocab = vocab
size = 20 size = 20
self.mem = Pool() self.mem = Pool()
@@ -87,6 +87,13 @@ cdef class Doc:
self.length = 0 self.length = 0
self.is_tagged = False self.is_tagged = False
self.is_parsed = False self.is_parsed = False
self._py_tokens = []
def __dealloc__(self):
cdef Token token
if self._py_tokens is not None:
for token in self._py_tokens:
token.take_ownership_of_c_data()
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a token. """Get a token.
@@ -103,7 +110,7 @@ cdef class Doc:
if i < 0: if i < 0:
i = self.length + i i = self.length + i
bounds_check(i, self.length, PADDING) bounds_check(i, self.length, PADDING)
return Token.cinit(self.vocab, &self.data[i], i, self.length) return Token.cinit(self.vocab, &self.data[i], i, self.length, self._py_tokens)
def __iter__(self): def __iter__(self):
"""Iterate over the tokens. """Iterate over the tokens.
@@ -112,7 +119,7 @@ cdef class Doc:
token (Token): token (Token):
""" """
for i in range(self.length): for i in range(self.length):
yield Token.cinit(self.vocab, &self.data[i], i, self.length) yield Token.cinit(self.vocab, &self.data[i], i, self.length, self._py_tokens)
def __len__(self): def __len__(self):
return self.length return self.length
@@ -187,6 +194,7 @@ cdef class Doc:
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
t.spacy = has_space t.spacy = has_space
self.length += 1 self.length += 1
self._py_tokens.append(None)
return t.idx + t.lex.length + t.spacy return t.idx + t.lex.length + t.spacy
@cython.boundscheck(False) @cython.boundscheck(False)
@@ -259,7 +267,6 @@ cdef class Doc:
cdef int set_parse(self, const TokenC* parsed) except -1: cdef int set_parse(self, const TokenC* parsed) except -1:
# TODO: This method is fairly misleading atm. It's used by GreedyParser # TODO: This method is fairly misleading atm. It's used by GreedyParser
# to actually apply the parse calculated. Need to rethink this. # to actually apply the parse calculated. Need to rethink this.
self._py_tokens = [None] * self.length
self.is_parsed = True self.is_parsed = True
for i in range(self.length): for i in range(self.length):
self.data[i] = parsed[i] self.data[i] = parsed[i]
@@ -345,8 +352,6 @@ cdef class Doc:
# ...And, set heads back to a relative position # ...And, set heads back to a relative position
self.data[i].head -= i self.data[i].head -= i
# Clear cached Python objects
self._py_tokens = [None] * self.length
# Return the merged Python object # Return the merged Python object
return self[start] return self[start]

View File

@@ -9,17 +9,22 @@ cdef class Token:
cdef readonly int i cdef readonly int i
cdef int array_len cdef int array_len
cdef bint _owns_c_data cdef bint _owns_c_data
cdef list _py_tokens
@staticmethod @staticmethod
cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, int array_len): cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, int array_len,
list _py_tokens):
if offset < 0 or offset >= array_len: if offset < 0 or offset >= array_len:
msg = "Attempt to access token at %d, max length %d" msg = "Attempt to access token at %d, max length %d"
raise IndexError(msg % (offset, array_len)) raise IndexError(msg % (offset, array_len))
if _py_tokens[offset] != None:
return _py_tokens[offset]
cdef Token self = Token.__new__(Token, vocab) cdef Token self = Token.__new__(Token, vocab)
self.c = token self.c = token
self.i = offset self.i = offset
self.array_len = array_len self.array_len = array_len
self._py_tokens = _py_tokens
self._py_tokens[offset] = self
return self return self
cdef int take_ownership_of_c_data(self) except -1 cdef int take_ownership_of_c_data(self) except -1

View File

@@ -23,6 +23,7 @@ cdef class Token:
""" """
def __cinit__(self, Vocab vocab): def __cinit__(self, Vocab vocab):
self.vocab = vocab self.vocab = vocab
self._py_tokens = []
def __dealloc__(self): def __dealloc__(self):
if self._owns_c_data: if self._owns_c_data:
@@ -45,7 +46,7 @@ cdef class Token:
self._owns_c_data = True self._owns_c_data = True
def nbor(self, int i=1): def nbor(self, int i=1):
return Token.cinit(self.vocab, self.c, self.i, self.array_len) return Token.cinit(self.vocab, self.c, self.i, self.array_len, self._py_tokens)
property lex_id: property lex_id:
def __get__(self): def __get__(self):
@@ -152,7 +153,7 @@ cdef class Token:
elif ptr + ptr.head == self.c: elif ptr + ptr.head == self.c:
yield Token.cinit(self.vocab, ptr, ptr - (self.c - self.i), yield Token.cinit(self.vocab, ptr, ptr - (self.c - self.i),
self.array_len) self.array_len, self._py_tokens)
ptr += 1 ptr += 1
else: else:
ptr += 1 ptr += 1
@@ -171,7 +172,7 @@ cdef class Token:
ptr += ptr.head ptr += ptr.head
elif ptr + ptr.head == self.c: elif ptr + ptr.head == self.c:
tokens.append(Token.cinit(self.vocab, ptr, ptr - (self.c - self.i), tokens.append(Token.cinit(self.vocab, ptr, ptr - (self.c - self.i),
self.array_len)) self.array_len, self._py_tokens))
ptr -= 1 ptr -= 1
else: else:
ptr -= 1 ptr -= 1
@@ -196,19 +197,19 @@ cdef class Token:
def __get__(self): def __get__(self):
return Token.cinit(self.vocab, return Token.cinit(self.vocab,
(self.c - self.i) + self.c.l_edge, self.c.l_edge, (self.c - self.i) + self.c.l_edge, self.c.l_edge,
self.array_len) self.array_len, self._py_tokens)
property right_edge: property right_edge:
def __get__(self): def __get__(self):
return Token.cinit(self.vocab, return Token.cinit(self.vocab,
(self.c - self.i) + self.c.r_edge, self.c.r_edge, (self.c - self.i) + self.c.r_edge, self.c.r_edge,
self.array_len) self.array_len, self._py_tokens)
property head: property head:
def __get__(self): def __get__(self):
"""The token predicted by the parser to be the head of the current token.""" """The token predicted by the parser to be the head of the current token."""
return Token.cinit(self.vocab, self.c + self.c.head, self.i + self.c.head, return Token.cinit(self.vocab, self.c + self.c.head, self.i + self.c.head,
self.array_len) self.array_len, self._py_tokens)
property conjuncts: property conjuncts:
def __get__(self): def __get__(self):