Mirror of https://github.com/explosion/spaCy.git
* Large refactor of Token objects, making them much thinner. This is to support fast parse-tree navigation.
parent 88170e6295
commit 77d62d0179
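Note: the diff below replaces the old eagerly-copied Token fields with a single borrowed `const TokenC*` plus an offset, created lazily through `Token.cinit` and cached per document. A minimal pure-Python sketch of that pattern (the names `ThinToken`, `make_token`, `py_cache` and `data` are illustrative stand-ins, not spaCy APIs):

    # Sketch only: mirrors the cache-then-construct logic of Token.cinit below.
    # ThinToken, make_token, py_cache and data are hypothetical names.
    class ThinToken:
        def __init__(self, data, offset, array_len):
            self.data = data            # borrowed view of the whole token array
            self.i = offset             # this token's position in that array
            self.array_len = array_len  # bounds for neighbour navigation

    def make_token(py_cache, data, offset, array_len):
        assert 0 <= offset < array_len
        if py_cache[offset] is not None:   # already built once: reuse it
            return py_cache[offset]
        token = ThinToken(data, offset, array_len)
        py_cache[offset] = token           # cache, so doc[i] is doc[i]
        return token

The cache is what keeps repeated indexing cheap and identity-stable; the real implementation does the same check-construct-store dance in Cython.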
spacy/tokens.pxd

@@ -37,8 +37,8 @@ cdef class Tokens:
     cdef list _py_tokens
     cdef unicode _string
-    cdef list _tag_strings
-    cdef list _dep_strings
+    cdef tuple _tag_strings
+    cdef tuple _dep_strings

     cdef public bint is_tagged
     cdef public bint is_parsed
@@ -52,24 +52,35 @@ cdef class Tokens:


 cdef class Token:
-    cdef readonly Tokens _seq
-    cdef readonly int i
-
-    cdef readonly attr_t idx
-    cdef readonly attr_t cluster
-    cdef readonly attr_t length
-    cdef readonly attr_t orth
-    cdef readonly attr_t lower
-    cdef readonly attr_t norm
-    cdef readonly attr_t shape
-    cdef readonly attr_t prefix
-    cdef readonly attr_t suffix
-    cdef readonly float prob
-    cdef readonly float sentiment
-    cdef readonly attr_t flags
-    cdef readonly attr_t lemma
-    cdef readonly univ_pos_t pos
-    cdef readonly attr_t tag
-    cdef readonly attr_t dep
-    cdef readonly ndarray repvec
-    cdef readonly unicode string
+    cdef Vocab vocab
+    cdef Pool mem
+    cdef unicode _string
+
+    cdef const TokenC* c
+    cdef int i
+    cdef int array_len
+
+    cdef list _py
+    cdef tuple _tag_strings
+    cdef tuple _dep_strings
+
+    @staticmethod
+    cdef inline Token cinit(Pool mem, Vocab vocab, unicode string,
+                            const TokenC* token, int offset, int array_len,
+                            list py_tokens, tuple tag_strings, tuple dep_strings):
+        assert offset >= 0 and offset < array_len
+        if py_tokens[offset] is not None:
+            return py_tokens[offset]
+
+        cdef Token self = Token.__new__(Token, mem, vocab, string)
+        self.c = token
+        self.i = offset
+        self.array_len = array_len
+
+        self._py = py_tokens
+        self._tag_strings = tag_strings
+        self._dep_strings = dep_strings
+        py_tokens[offset] = self
+        return self

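Note: this .pxd hunk is the heart of the refactor. Token now stores only the pointer `c`, its offset `i`, the array length, and shared metadata, and `cinit` is a `cdef inline` static factory, so the fast path avoids Python-level construction. Because `TokenC.head` holds a relative offset, finding a head is plain pointer arithmetic; an index-based sketch (the `heads` list and `head_index` helper are hypothetical):

    # heads[i] holds the offset from token i to its head, as TokenC.head does.
    # Mirrors `self.c + self.c.head` in the new code below.
    def head_index(heads, i):
        return i + heads[i]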
spacy/tokens.pyx
@@ -88,31 +88,11 @@ cdef class Tokens:
         self.length = 0
         self.is_tagged = False
         self.is_parsed = False
-        self._py_tokens = [None] * self.length
-        self._tag_strings = [] # These will be set by the POS tagger and parser
-        self._dep_strings = [] # The strings are arbitrary and model-specific.
+        self._py_tokens = []
+        self._tag_strings = tuple() # These will be set by the POS tagger and parser
+        self._dep_strings = tuple() # The strings are arbitrary and model-specific.

-    def sentences(self):
-        cdef int i
-        sentences = []
-        cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
-        cdef attr_t period = self.vocab.strings['.']
-        cdef attr_t question = self.vocab.strings['?']
-        cdef attr_t exclamation = self.vocab.strings['!']
-        spans = []
-        start = None
-        for i in range(self.length):
-            if start is None:
-                start = i
-            if self.data[i].lex.orth == period or self.data[i].lex.orth == exclamation or \
-                    self.data[i].lex.orth == question:
-                spans.append((start, i+1))
-                start = None
-        if start is not None:
-            spans.append((start, self.length))
-        return spans
-
-    def __getitem__(self, i):
+    def __getitem__(self, object i):
         """Retrieve a token.

         The Python Token objects are created lazily from internal C data, and
@@ -124,9 +104,9 @@ cdef class Tokens:
         if i < 0:
             i = self.length - i
         bounds_check(i, self.length, PADDING)
-        if self._py_tokens[i] is None:
-            self._py_tokens[i] = Token(self, i)
-        return self._py_tokens[i]
+        return Token.cinit(self.mem, self.vocab, self._string,
+                           &self.data[i], i, self.length,
+                           self._py_tokens, self._tag_strings, self._dep_strings)

     def __iter__(self):
         """Iterate over the tokens.
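Note: with `__getitem__` delegating to `Token.cinit`, indexing is idempotent: the first access builds the wrapper, later accesses return the cached object, and `__iter__` in the next hunk goes through the same path. Reusing the hypothetical `make_token` sketch from above:

    n = 4
    py_cache = [None] * n
    data = object()                      # stands in for the C token array
    t0 = make_token(py_cache, data, 0, n)
    assert make_token(py_cache, data, 0, n) is t0   # same object both times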
@@ -135,7 +115,9 @@ cdef class Tokens:
         token (Token):
         """
         for i in range(self.length):
-            yield self[i]
+            yield Token.cinit(self.mem, self.vocab, self._string,
+                              &self.data[i], i, self.length,
+                              self._py_tokens, self._tag_strings, self._dep_strings)

     def __len__(self):
         return self.length
@@ -157,6 +139,8 @@ cdef class Tokens:
             self._py_tokens.append(None)
         return idx + t.lex.length

+
+
     @cython.boundscheck(False)
     cpdef long[:,:] to_array(self, object py_attr_ids):
         """Given a list of M attribute IDs, export the tokens to a numpy ndarray
@@ -224,64 +208,125 @@ cdef class Tokens:
         for i in range(self.length, self.max_length + PADDING):
             self.data[i].lex = &EMPTY_LEXEME

+    @property
+    def sents(self):
+        """This is really only a place-holder for a proper solution."""
+        cdef int i
+        sentences = []
+        cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
+        cdef attr_t period = self.vocab.strings['.']
+        cdef attr_t question = self.vocab.strings['?']
+        cdef attr_t exclamation = self.vocab.strings['!']
+        spans = []
+        start = None
+        for i in range(self.length):
+            if start is None:
+                start = i
+            if self.data[i].lex.orth == period or self.data[i].lex.orth == exclamation or \
+                    self.data[i].lex.orth == question:
+                spans.append((start, i+1))
+                start = None
+        if start is not None:
+            spans.append((start, self.length))
+        return spans
+

-@cython.freelist(64)
 cdef class Token:
     """An individual token."""
-    def __cinit__(self, Tokens tokens, int i):
-        self._seq = tokens
-        self.i = i
-        cdef const TokenC* t = &tokens.data[i]
-        self.idx = t.idx
-        self.cluster = t.lex.cluster
-        self.length = t.lex.length
-        self.orth = t.lex.orth
-        self.lower = t.lex.lower
-        self.norm = t.lex.norm
-        self.shape = t.lex.shape
-        self.prefix = t.lex.prefix
-        self.suffix = t.lex.suffix
-        self.prob = t.lex.prob
-        self.sentiment = t.lex.sentiment
-        self.flags = t.lex.flags
-        self.lemma = t.lemma
-        self.pos = t.pos
-        self.tag = t.tag
-        self.dep = t.dep
-        self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
-        cdef int next_idx = (t+1).idx
-        if next_idx <= self.idx:
-            next_idx = self.idx + self.length
-        self.string = tokens._string[self.idx:next_idx]
+    def __cinit__(self, Pool mem, Vocab vocab, unicode string):
+        self.mem = mem
+        self.vocab = vocab
+        self._string = string

     def __len__(self):
-        return self._seq.data[self.i].lex.length
+        return self.c.lex.length

     def nbor(self, int i=1):
-        return Token(self._seq, self.i + i)
+        return Token.cinit(self.mem, self.vocab, self._string,
+                           self.c, self.i, self.array_len,
+                           self._py, self._tag_strings, self._dep_strings)

+    @property
+    def string(self):
+        cdef int next_idx = (self.c + 1).idx
+        if next_idx < self.c.idx:
+            next_idx = self.c.idx + self.c.lex.length
+        return self._string[self.c.idx:next_idx]
+
+    @property
+    def idx(self):
+        return self.c.idx
+
+    @property
+    def cluster(self):
+        return self.c.lex.cluster
+
+    @property
+    def orth(self):
+        return self.c.lex.orth
+
+    @property
+    def lower(self):
+        return self.c.lex.lower
+
+    @property
+    def norm(self):
+        return self.c.lex.norm
+
+    @property
+    def shape(self):
+        return self.c.lex.shape
+
+    @property
+    def prefix(self):
+        return self.c.lex.prefix
+
+    @property
+    def suffix(self):
+        return self.c.lex.suffix
+
+    @property
+    def lemma(self):
+        return self.c.lemma
+
+    @property
+    def pos(self):
+        return self.c.pos
+
+    @property
+    def tag(self):
+        return self.c.tag
+
+    @property
+    def dep(self):
+        return self.c.dep
+
+    @property
+    def repvec(self):
+        return numpy.asarray(<float[:300,]> self.c.lex.repvec)
+
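Note: every attribute that used to be a `readonly` field copied in `__cinit__` is now read through `self.c` on demand, so a Token can be created before tagging or parsing and never goes stale. A dict-based sketch of that read-through behaviour (hypothetical names, not spaCy APIs):

    class ThinTokenView:
        def __init__(self, data, i):
            self.data = data
            self.i = i

        @property
        def tag(self):
            return self.data[self.i]["tag"]   # read on access, never copied

    tokens = [{"tag": 0}]
    view = ThinTokenView(tokens, 0)
    tokens[0]["tag"] = 42                     # e.g. the tagger runs later
    assert view.tag == 42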
     @property
     def n_lefts(self):
-        if not self._seq.is_parsed:
-            msg = _parse_unset_error
-            raise AttributeError(msg)
-        cdef const TokenC* tokens = self._seq.data
         cdef int n = 0
-        for i in range(self.i):
-            if i + tokens[i].head == self.i:
-                n += 1
+        cdef const TokenC* ptr = self.c - self.i
+        while ptr != self.c:
+            if ptr + ptr.head == self.c:
+                n += 1
+            ptr += 1
         return n

     @property
     def n_rights(self):
-        if not self._seq.is_parsed:
-            msg = _parse_unset_error
-            raise AttributeError(msg)
-        cdef const TokenC* tokens = self._seq.data
         cdef int n = 0
-        for i in range(self.i+1, self._seq.length):
-            if (i + tokens[i].head) == self.i:
-                n += 1
+        cdef const TokenC* ptr = self.c + (self.array_len - self.i)
+        while ptr != self.c:
+            if ptr + ptr.head == self.c:
+                n += 1
+            ptr -= 1
         return n

     @property
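Note: `n_lefts` and `n_rights` now walk the raw token array with pointers instead of indexing through `self._seq` (and drop the old `is_parsed` guard). The quantity computed is unchanged; in index form (hypothetical `heads` list):

    # A left child of i is any j < i whose head offset lands exactly on i.
    def n_lefts(heads, i):
        return sum(1 for j in range(i) if j + heads[j] == i)

    def n_rights(heads, i):
        return sum(1 for j in range(i + 1, len(heads)) if j + heads[j] == i)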
@@ -289,99 +334,94 @@ cdef class Token:
         """The leftward immediate children of the word, in the syntactic
         dependency parse.
         """
-        if not self._seq.is_parsed:
-            msg = _parse_unset_error
-            raise AttributeError(msg)
-        cdef const TokenC* tokens = self._seq.data
-        cdef int i
-        for i in range(self.i):
-            if i + tokens[i].head == self.i:
-                yield Token(self._seq, i)
+        cdef const TokenC* ptr = self.c - self.i
+        while ptr < self.c:
+            # If this head is still to the right of us, we can skip to it
+            # No token that's between this token and this head could be our
+            # child.
+            if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
+                ptr += ptr.head
+            elif ptr + ptr.head == self.c:
+                yield Token.cinit(self.mem, self.vocab, self._string,
+                                  ptr, self.i, self.array_len,
+                                  self._py, self._tag_strings, self._dep_strings)
+                ptr += 1
+            else:
+                ptr += 1

     @property
     def rights(self):
         """The rightward immediate children of the word, in the syntactic
         dependency parse."""
-        if not self._seq.is_parsed:
-            msg = _parse_unset_error
-            raise AttributeError(msg)
-        cdef const TokenC* tokens = self._seq.data
-        cdef int i
-        for i in range(self.i, self._seq.length):
-            if i + tokens[i].head == self.i:
-                yield Token(self._seq, i)
+        cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
+        while ptr > self.c:
+            # If this head is still to the right of us, we can skip to it
+            # No token that's between this token and this head could be our
+            # child.
+            if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
+                ptr += ptr.head
+            elif ptr + ptr.head == self.c:
+                yield Token.cinit(self.mem, self.vocab, self._string,
+                                  ptr, self.i, self.array_len,
+                                  self._py, self._tag_strings, self._dep_strings)
+                ptr -= 1
+            else:
+                ptr -= 1

-    property head:
-        """The token predicted by the parser to be the head of the current token."""
-        def __get__(self):
-            if not self._seq.is_parsed:
-                msg = _parse_unset_error
-                raise AttributeError(msg)
-            cdef const TokenC* t = &self._seq.data[self.i]
-            return self._seq[self.i + t.head]
+    @property
+    def head(self):
+        """The token predicted by the parser to be the head of the current token."""
+        return Token.cinit(self.mem, self.vocab, self._string,
+                           self.c + self.c.head, self.i, self.array_len,
+                           self._py, self._tag_strings, self._dep_strings)

-    property whitespace_:
-        def __get__(self):
-            return self.string[self.length:]
+    @property
+    def whitespace_(self):
+        return self.string[self.c.lex.length:]

-    property orth_:
-        def __get__(self):
-            return self._seq.vocab.strings[self.orth]
+    @property
+    def orth_(self):
+        return self.vocab.strings[self.c.lex.orth]

-    property lower_:
-        def __get__(self):
-            return self._seq.vocab.strings[self.lower]
+    @property
+    def lower_(self):
+        return self.vocab.strings[self.c.lex.lower]

-    property norm_:
-        def __get__(self):
-            return self._seq.vocab.strings[self.norm]
+    @property
+    def norm_(self):
+        return self.vocab.strings[self.c.lex.norm]

-    property shape_:
-        def __get__(self):
-            return self._seq.vocab.strings[self.shape]
+    @property
+    def shape_(self):
+        return self.vocab.strings[self.c.lex.shape]

-    property prefix_:
-        def __get__(self):
-            return self._seq.vocab.strings[self.prefix]
+    @property
+    def prefix_(self):
+        return self.vocab.strings[self.c.lex.prefix]

-    property suffix_:
-        def __get__(self):
-            return self._seq.vocab.strings[self.suffix]
+    @property
+    def suffix_(self):
+        return self.vocab.strings[self.c.lex.suffix]

-    property lemma_:
-        def __get__(self):
-            cdef const TokenC* t = &self._seq.data[self.i]
-            if t.lemma == 0:
-                return self.string
-            cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
-            return py_ustr
+    @property
+    def lemma_(self):
+        return self.vocab.strings[self.c.lemma]

-    property pos_:
-        def __get__(self):
-            id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
-            return id_to_string[self.pos]
+    @property
+    def pos_(self):
+        return _pos_id_to_string[self.c.pos]

-    property tag_:
-        def __get__(self):
-            return self._seq._tag_strings[self.tag]
+    @property
+    def tag_(self):
+        return self._tag_strings[self.c.tag]

-    property dep_:
-        def __get__(self):
-            return self._seq._dep_strings[self.dep]
+    @property
+    def dep_(self):
+        return self._dep_strings[self.c.dep]


-cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
-    cdef int i
-    for i in range(32):
-        if bits & (1 << i):
-            n -= 1
-            if n < 1:
-                return i
-    return 0
+_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}


 _parse_unset_error = """Text has not been parsed, so cannot be accessed.
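Note: the new `lefts` and `rights` generators add the skip optimization the inline comments describe: assuming a projective parse, if a candidate token's head lies strictly between the candidate and this token, no token in that span can be our child, so the walk jumps straight to the head. An index-based sketch of the leftward walk (hypothetical `heads` list):

    def lefts(heads, i):
        j = 0
        while j < i:
            if heads[j] >= 1 and j + heads[j] < i:
                j += heads[j]          # skip the sealed span up to j's head
            else:
                if j + heads[j] == i:
                    yield j            # immediate left child of i
                j += 1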