* Moving to storing LexemeC structs internally

2025-11-06 19:07:30 +03:00 · 2014-09-11 21:44:58 +02:00 · 2014-09-11 21:44:58 +02:00 · bf9c60c31c
commit bf9c60c31c
parent 563047e90f
1 changed file with 17 additions and 14 deletions
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -95,20 +95,21 @@ cdef class Language:
        return tokens
    cdef _tokenize(self, Tokens tokens, unicode string):
-        cdef list lexemes
+        cdef LexemeC** lexemes
-        if len(string) == 1:
+        if string in self.cache:
-            lexemes = [self.lookup(string)]
+            lexemes = <LexemeC**><size_t>self.cache[string]
        elif string in self.cache:
            lexemes = self.cache[string]
        else:
            lexemes = []
            substrings = self._split(string)
            lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
            for i, substring in enumerate(substrings):
-                lexemes.append(self.lexicon.lookup(substring))
+                lexemes[i] = self.lexicon.lookup(substring)._c
-            self.cache[string] = lexemes
+            lexemes[i + 1] = NULL
-        cdef Lexeme lexeme
+            self.cache[string] = <size_t>lexemes
-        for lexeme in lexemes:
+        cdef LexemeC* lexeme
-            tokens.append(lexeme)
+        i = 0
        while lexemes[i] != NULL:
            tokens.push_back(lexemes[i])
            i += 1
    cdef list _split(self, unicode string):
        """Find how to split a contiguous span of non-space characters into substrings.
@@ -147,11 +148,13 @@ cdef class Language:
            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                a string and tokens is a list of strings.
        '''
        cdef LexemeC** lexemes
        for string, substrings in token_rules:
-            lexemes = []
+            lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
            for i, substring in enumerate(substrings):
-                lexemes.append(self.lexicon.lookup(substring))
+                lexemes[i] = self.lexicon.lookup(substring)._c
-            self.cache[string] = lexemes
+            lexemes[i + 1] = NULL
            self.cache[string] = <size_t>lexemes
 cdef class Lexicon: