* Refactor tokenizer to set the 'spacy' field on TokenC instead of passing a string

2025-10-29 23:17:59 +03:00 · 2015-07-13 21:46:02 +02:00 · 2015-07-13 21:46:02 +02:00 · 67641f3b58
commit 67641f3b58
parent 6eef0bf9ab
4 changed files with 49 additions and 34 deletions
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@ -29,7 +29,7 @@ cdef class Tokenizer:

    cpdef Doc tokens_from_list(self, list strings)

-    cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1
+    cdef int _try_cache(self, hash_t key, Doc tokens) except -1
    cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1
    cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes) except NULL
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -39,16 +39,17 @@ cdef class Tokenizer:
        return cls(vocab, rules, prefix_re, suffix_re, infix_re)

    cpdef Doc tokens_from_list(self, list strings):
-        cdef int length = sum([len(s) for s in strings])
-        cdef Doc tokens = Doc(self.vocab, ' '.join(strings))
-        if length == 0:
+        cdef Doc tokens = Doc(self.vocab)
+        if sum([len(s) for s in strings]) == 0:
            return tokens
        cdef UniStr string_struct
        cdef unicode py_string
        cdef int idx = 0
        for i, py_string in enumerate(strings):
            slice_unicode(&string_struct, py_string, 0, len(py_string))
-            tokens.push_back(idx, <const LexemeC*>self.vocab.get(tokens.mem, &string_struct))
+            # Note that we pass tokens.mem here --- the Doc object has ownership
+            tokens.push_back(
+                <const LexemeC*>self.vocab.get(tokens.mem, &string_struct), True)
            idx += len(py_string) + 1
        return tokens

@ -73,7 +74,7 @@ cdef class Tokenizer:
            tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
        """
        cdef int length = len(string)
-        cdef Doc tokens = Doc(self.vocab, string)
+        cdef Doc tokens = Doc(self.vocab)
        if length == 0:
            return tokens
        cdef int i = 0
@ -86,32 +87,39 @@ cdef class Tokenizer:
            if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                if start < i:
                    slice_unicode(&span, chars, start, i)
-                    cache_hit = self._try_cache(start, span.key, tokens)
+                    cache_hit = self._try_cache(span.key, tokens)
                    if not cache_hit:
                        self._tokenize(tokens, &span, start, i)
                in_ws = not in_ws
                start = i
                if chars[i] == ' ':
+                    tokens.data[tokens.length - 1].spacy = True
                    start += 1
        i += 1
        if start < i:
            slice_unicode(&span, chars, start, i)
-            cache_hit = self._try_cache(start, span.key, tokens)
+            cache_hit = self._try_cache(span.key, tokens)
            if not cache_hit:
                self._tokenize(tokens, &span, start, i)
+
+            tokens.data[tokens.length - 1].spacy = string[-1] == ' '
        return tokens

-    cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1:
+    cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
        cached = <_Cached*>self._cache.get(key)
        if cached == NULL:
            return False
        cdef int i
+        cdef int less_one = cached.length-1
        if cached.is_lex:
-            for i in range(cached.length):
-                idx = tokens.push_back(idx, cached.data.lexemes[i])
+            for i in range(less_one):
+                # There's a space at the end of the chunk.
+                tokens.push_back(cached.data.lexemes[i], False)
+            tokens.push_back(cached.data.lexemes[less_one], False)
        else:
-            for i in range(cached.length):
-                idx = tokens.push_back(idx, &cached.data.tokens[i])
+            for i in range(less_one):
+                tokens.push_back(&cached.data.tokens[i], False)
+            tokens.push_back(&cached.data.tokens[less_one], False)
        return True

    cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
@ -171,36 +179,39 @@ cdef class Tokenizer:
                            vector[const LexemeC*] *prefixes,
                            vector[const LexemeC*] *suffixes) except -1:
        cdef bint cache_hit
+        cdef bint is_spacy
        cdef int split
        cdef const LexemeC* const* lexemes
-        cdef LexemeC* lexeme
+        cdef const LexemeC* lexeme
        cdef UniStr span
        cdef int i
+        # Have to calculate is_spacy here, i.e. does the token have a trailing
+        # space. There are no spaces *between* the tokens we attach
+        # here, and there *is* a space after the last token.
        if prefixes.size():
            for i in range(prefixes.size()):
-                idx = tokens.push_back(idx, prefixes[0][i])
+                tokens.push_back(prefixes[0][i], False)
        if string.n != 0:
-            cache_hit = self._try_cache(idx, string.key, tokens)
+            cache_hit = self._try_cache(string.key, tokens)
            if cache_hit:
-                # Get last idx
-                idx = tokens.data[tokens.length - 1].idx
-                # Increment by last length
-                idx += tokens.data[tokens.length - 1].lex.length
+                pass
            else:
                split = self._find_infix(string.chars, string.n)
                if split == 0 or split == -1:
-                    idx = tokens.push_back(idx, self.vocab.get(tokens.mem, string))
+                    tokens.push_back(self.vocab.get(tokens.mem, string), False)
                else:
+                    # Append the beginning, affix, and end of the infix token
                    slice_unicode(&span, string.chars, 0, split)
-                    idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
+                    tokens.push_back(self.vocab.get(tokens.mem, &span), False)
                    slice_unicode(&span, string.chars, split, split+1)
-                    idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
+                    tokens.push_back(self.vocab.get(tokens.mem, &span), False)
                    slice_unicode(&span, string.chars, split + 1, string.n)
-                    idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
+                    tokens.push_back(self.vocab.get(tokens.mem, &span), False)
        cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
        while it != suffixes.rend():
-            idx = tokens.push_back(idx, deref(it))
+            lexeme = deref(it)
            preinc(it)
+            tokens.push_back(lexeme, False)

    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
        cdef int i
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@ -26,7 +26,7 @@ cdef class Doc:
    cdef int length
    cdef int max_length

-    cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
+    cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1

    cpdef np.ndarray to_array(self, object features)

--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -173,7 +173,7 @@ cdef class Doc:
                start = i
        yield Span(self, start, self.length)

-    cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
+    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
        if self.length == self.max_length:
            self._realloc(self.length * 2)
        cdef TokenC* t = &self.data[self.length]
@ -181,9 +181,13 @@ cdef class Doc:
            t[0] = lex_or_tok[0]
        else:
            t.lex = lex_or_tok
-        t.idx = idx
+        if self.length == 0:
+            t.idx = 0
+        else:
+            t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
+        t.spacy = has_space
        self.length += 1
-        return idx + t.lex.length
+        return t.idx + t.lex.length + t.spacy

    @cython.boundscheck(False)
    cpdef np.ndarray to_array(self, object py_attr_ids):
@ -375,11 +379,11 @@ cdef class Doc:
            string += vocab.strings[lex.orth]
            if space:
                string += u' '
-        cdef Doc doc = Doc(vocab, string)
+        cdef Doc doc = Doc(vocab)
+        cdef bint has_space = False
        cdef int idx = 0
        for i, id_ in enumerate(ids):
-            doc.push_back(idx, vocab.lexemes[id_])
-            idx += vocab.lexemes[id_].length
-            if spaces[i]:
-                idx += 1
+            lex = vocab.lexemes[id_]
+            has_space = spaces[i]
+            doc.push_back(lex, has_space)
        return doc