💫 Small efficiency fixes to tokenizer (#2587)

This patch improves tokenizer speed by about 10%, and reduces memory usage in the `Vocab` by removing a redundant index. The `vocab._by_orth` and `vocab._by_hash` indexed on different data in v1, but in v2 the orth and the hash are identical.

The patch also fixes an uninitialized variable in the tokenizer, the `has_special` flag. This flag checks whether a chunk we're tokenizing triggers a special-case rule. If it does, then we avoid caching within the chunk. Because the flag was uninitialized, this check incorrectly rejected some chunks from the cache.

With the `en_core_web_md` model, we now tokenize the IMDB train data at 503,104 words per second. Prior to this patch, we had 465,764 words per second.

Before switching to the regex library and supporting more languages, we had 1.3m words per second for the tokenizer. In order to recover the missing speed, we need to:

* Fix the variable-length lookarounds in the suffix, infix and `token_match` rules
* Improve the performance of the `token_match` regex
* Switch back from the `regex` library to the `re` library.

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
This commit is contained in:
Matthew Honnibal 2018-07-24 23:35:54 +02:00 committed by Ines Montani
parent 3c30d1763c
commit 82277f63a3
3 changed files with 11 additions and 21 deletions

View File

@ -150,7 +150,7 @@ cdef class Tokenizer:
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef int orig_size
cdef int has_special
cdef int has_special = 0
orig_size = tokens.length
span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
&has_special)
@ -272,7 +272,7 @@ cdef class Tokenizer:
int has_special, int n) except -1:
cdef int i
for i in range(n):
if self.vocab._by_hash.get(tokens[i].lex.orth) == NULL:
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
return 0
# See https://github.com/explosion/spaCy/issues/1250
if has_special:

View File

@ -42,5 +42,4 @@ cdef class Vocab:
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
cdef PreshMap _by_hash
cdef PreshMap _by_orth

View File

@ -48,7 +48,6 @@ cdef class Vocab:
lemmatizer = Lemmatizer({}, {}, {})
self.cfg = {'oov_prob': oov_prob}
self.mem = Pool()
self._by_hash = PreshMap()
self._by_orth = PreshMap()
self.strings = StringStore()
self.length = 0
@ -118,13 +117,12 @@ cdef class Vocab:
return &EMPTY_LEXEME
cdef LexemeC* lex
cdef hash_t key = hash_string(string)
lex = <LexemeC*>self._by_hash.get(key)
lex = <LexemeC*>self._by_orth.get(key)
cdef size_t addr
if lex != NULL:
if lex.orth != self.strings[string]:
if lex.orth != key:
raise KeyError(Errors.E064.format(string=lex.orth,
orth=self.strings[string],
orth_id=string))
orth=key, orth_id=string))
return lex
else:
return self._new_lexeme(mem, string)
@ -165,14 +163,12 @@ cdef class Vocab:
elif value is not None:
Lexeme.set_struct_attr(lex, attr, value)
if not is_oov:
key = hash_string(string)
self._add_lex_to_vocab(key, lex)
self._add_lex_to_vocab(lex.orth, lex)
if lex == NULL:
raise ValueError(Errors.E085.format(string=string))
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
self._by_hash.set(key, <void*>lex)
self._by_orth.set(lex.orth, <void*>lex)
self.length += 1
@ -189,7 +185,7 @@ cdef class Vocab:
int_key = hash_string(key)
else:
int_key = key
lex = self._by_hash.get(int_key)
lex = self._by_orth.get(int_key)
return lex is not NULL
def __iter__(self):
@ -461,7 +457,7 @@ cdef class Vocab:
cdef LexemeC* lexeme = NULL
cdef SerializedLexemeC lex_data
cdef int size = 0
for key, addr in self._by_hash.items():
for key, addr in self._by_orth.items():
if addr == 0:
continue
size += sizeof(lex_data.data)
@ -469,7 +465,7 @@ cdef class Vocab:
byte_ptr = <unsigned char*>byte_string
cdef int j
cdef int i = 0
for key, addr in self._by_hash.items():
for key, addr in self._by_orth.items():
if addr == 0:
continue
lexeme = <LexemeC*>addr
@ -504,17 +500,12 @@ cdef class Vocab:
raise ValueError(Errors.E086.format(string=py_str,
orth_id=lexeme.orth,
hash_id=self.strings[py_str]))
key = hash_string(py_str)
self._by_hash.set(key, lexeme)
self._by_orth.set(lexeme.orth, lexeme)
self.length += 1
def _reset_cache(self, keys, strings):
for k in keys:
del self._by_hash[k]
if len(strings) != 0:
self._by_orth = PreshMap()
# I'm not sure this made sense. Disable it for now.
raise NotImplementedError
def pickle_vocab(vocab):