* More performance fiddling, particularly moving the specials into the cache, so that we can just look up the cache in _tokenize

2025-10-26 21:51:24 +03:00 · 2014-09-13 00:59:34 +02:00 · 2014-09-13 00:59:34 +02:00 · afdc9b7ac2
commit afdc9b7ac2
parent 7d239df4c8
1 changed files with 12 additions and 13 deletions
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@ -110,11 +110,14 @@ cdef class Language:
        return tokens
    cdef int _tokenize(self, Tokens tokens, String* string):
-        cdef LexemeC** lexemes = <LexemeC**>self.specials[string.key]
+        cdef LexemeC** lexemes = <LexemeC**>self.cache[string.key]
-        if lexemes == NULL:
+        lexemes = <LexemeC**>self.cache[string.key]
-            lexemes = <LexemeC**>self.cache[string.key]
+        cdef size_t i
        if lexemes != NULL:
-            _extend_tokens(tokens, lexemes)
+            i = 0
            while lexemes[i] != NULL:
                tokens.push_back(lexemes[i])
                i += 1
            return 0
        cdef uint64_t hashed = string.key
@ -128,11 +131,13 @@ cdef class Language:
            string_slice_prefix(string, &prefix, split)
            lexemes = <LexemeC**>self.specials[prefix.key]
            if lexemes != NULL:
-                _extend_tokens(tokens, lexemes)
+                i = 0
                while lexemes[i] != NULL:
                    tokens.push_back(lexemes[i])
                    i += 1
            else:
                tokens.push_back(<LexemeC*>self.lexicon.get(&prefix))
        lexemes = <LexemeC**>calloc(tokens.length - first_token, sizeof(LexemeC*))
        cdef size_t i
        cdef size_t j
        for i, j in enumerate(range(first_token, tokens.length)):
            lexemes[i] = tokens.lexemes[j]
@ -164,13 +169,7 @@ cdef class Language:
            lexemes[i + 1] = NULL
            string_from_unicode(&string, uni_string)
            self.specials[string.key] = <size_t>lexemes
-
+            self.cache[string.key] = <size_t>lexemes
 cdef void _extend_tokens(Tokens tokens, LexemeC** lexemes):
    # Append every entry of `lexemes` to `tokens`, in order, via push_back.
    # `lexemes` is read as a NULL-terminated array: iteration stops at the
    # first NULL entry, which is itself not pushed.
    cdef size_t i = 0
    while lexemes[i] != NULL:
        tokens.push_back(lexemes[i])
        i += 1
 cdef class Lexicon: