* Pass only the tokens vector to _tokenize instead of the whole Python object.

2025-12-13 21:24:22 +03:00 · 2014-09-15 04:01:38 +02:00 · 2014-09-15 04:01:38 +02:00 · e68a431e5e
commit e68a431e5e
parent 08cef75ffd
2 changed files with 11 additions and 11 deletions
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -42,5 +42,5 @@ cdef class Language:
    cpdef Tokens tokenize(self, unicode text)
    cpdef Lexeme lookup(self, unicode text)

-    cdef int _tokenize(self, Tokens tokens, String* string)
+    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string)
    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -182,25 +182,25 @@ cdef class Language:
            if Py_UNICODE_ISSPACE(c) == 1:
                if start < i:
                    string_from_slice(&span, chars, start, i)
-                    self._tokenize(tokens, &span)
+                    self._tokenize(tokens.v, &span)
                start = i + 1
        i += 1
        if start < i:
            string_from_slice(&span, chars, start, i)
-            self._tokenize(tokens, &span)
+            self._tokenize(tokens.v, &span)
        return tokens

-    cdef int _tokenize(self, Tokens tokens, String* string):
+    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string):
        cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
        cdef size_t i
        if lexemes != NULL:
            i = 0
            while lexemes[i] != NULL:
-                tokens.v.push_back(lexemes[i])
+                tokens_v.push_back(lexemes[i])
                i += 1
            return 0
        cdef uint64_t key = string.key 
-        cdef size_t first_token = len(tokens)
+        cdef size_t first_token = tokens_v.size()
        cdef int split
        cdef int remaining = string.n
        cdef String prefix
@@ -212,14 +212,14 @@ cdef class Language:
            if lexemes != NULL:
                i = 0
                while lexemes[i] != NULL:
-                    tokens.v.push_back(lexemes[i])
+                    tokens_v.push_back(lexemes[i])
                    i += 1
            else:
-                tokens.v.push_back(<LexemeC*>self.lexicon.get(&prefix))
-        lexemes = <LexemeC**>calloc(len(tokens) - first_token, sizeof(LexemeC*))
+                tokens_v.push_back(<LexemeC*>self.lexicon.get(&prefix))
+        lexemes = <LexemeC**>calloc(tokens_v.size() - first_token, sizeof(LexemeC*))
        cdef size_t j
-        for i, j in enumerate(range(first_token, tokens.v.size())):
-            lexemes[i] = tokens.v[0][j]
+        for i, j in enumerate(range(first_token, tokens_v.size())):
+            lexemes[i] = tokens_v[0][j]
        self.cache.set(key, lexemes)

    cdef int _split_one(self, Py_UNICODE* characters, size_t length):