* Fix tokenizer

Matthew Honnibal 2015-07-14 00:10:51 +02:00
parent 2ae0b439b2
commit 3b5baa660f


@@ -110,16 +110,12 @@ cdef class Tokenizer:
         if cached == NULL:
             return False
         cdef int i
-        cdef int less_one = cached.length-1
         if cached.is_lex:
-            for i in range(less_one):
-                # There's a space at the end of the chunk.
+            for i in range(cached.length):
                 tokens.push_back(cached.data.lexemes[i], False)
-            tokens.push_back(cached.data.lexemes[less_one], False)
         else:
-            for i in range(less_one):
+            for i in range(cached.length):
                 tokens.push_back(&cached.data.tokens[i], False)
-            tokens.push_back(&cached.data.tokens[less_one], False)
         return True
 
     cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
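Read side by side, the change removes the special handling of the final cached element: both branches now copy every entry with the same loop and the same False space flag, which the old code already passed for the last element as well, so the separate push of `lexemes[less_one]` / `tokens[less_one]` had no effect. As a rough illustration only, here is a pure-Python sketch of the cache-replay logic after this commit; the `Cached` dataclass and `push_back` callback are simplified, hypothetical stand-ins for the C-level `_Cached` struct and `Doc.push_back` in the real Cython code, not the actual implementation.

    from dataclasses import dataclass, field
    from typing import Callable, List


    @dataclass
    class Cached:
        # Simplified stand-in for the C-level _Cached struct shown in the diff.
        is_lex: bool
        length: int
        lexemes: List[str] = field(default_factory=list)  # used when is_lex is True
        tokens: List[str] = field(default_factory=list)   # used when is_lex is False


    def try_cache(cached: Cached, push_back: Callable[[str, bool], None]) -> bool:
        # After the fix: every cached entry is copied by the same loop,
        # always with space=False; no special case for the last entry.
        if cached is None:
            return False
        if cached.is_lex:
            for i in range(cached.length):
                push_back(cached.lexemes[i], False)
        else:
            for i in range(cached.length):
                push_back(cached.tokens[i], False)
        return True


    # Example: replaying a cached two-piece analysis of "don't".
    out = []
    try_cache(Cached(is_lex=True, length=2, lexemes=["do", "n't"]),
              lambda form, space: out.append((form, space)))
    assert out == [("do", False), ("n't", False)]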