💫 Small efficiency fixes to tokenizer (#2587)

This patch improves tokenizer speed by about 10%, and reduces memory usage in the `Vocab` by removing a redundant index. The `vocab._by_orth` and `vocab._by_hash` indexed on different data in v1, but in v2 the orth and the hash are identical.

The patch also fixes an uninitialized variable in the tokenizer, the `has_special` flag. This flag checks whether a chunk we're tokenizing triggers a special-case rule. If it does, then we avoid caching within the chunk. Because the flag was uninitialized, this check incorrectly rejected some chunks from the cache.

With the `en_core_web_md` model, we now tokenize the IMDB train data at 503,104 words per second. Prior to this patch, we had 465,764 words per second.

Before switching to the regex library and supporting more languages, we had 1.3m words per second for the tokenizer. In order to recover the missing speed, we need to:

* Fix the variable-length lookarounds in the suffix, infix and `token_match` rules
* Improve the performance of the `token_match` regex
* Switch back from the `regex` library to the `re` library.

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
This commit is contained in:
Matthew Honnibal 2018-07-24 23:35:54 +02:00 committed by Ines Montani
parent 3c30d1763c
commit 82277f63a3
3 changed files with 11 additions and 21 deletions

View File

@ -150,7 +150,7 @@ cdef class Tokenizer:
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef int orig_size
cdef int has_special
cdef int has_special = 0
orig_size = tokens.length
span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
&has_special)
@ -272,7 +272,7 @@ cdef class Tokenizer:
int has_special, int n) except -1:
cdef int i
for i in range(n):
if self.vocab._by_hash.get(tokens[i].lex.orth) == NULL:
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
return 0
# See https://github.com/explosion/spaCy/issues/1250
if has_special:

View File

@ -42,5 +42,4 @@ cdef class Vocab:
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
cdef PreshMap _by_hash
cdef PreshMap _by_orth

View File

@ -48,7 +48,6 @@ cdef class Vocab:
lemmatizer = Lemmatizer({}, {}, {})
self.cfg = {'oov_prob': oov_prob}
self.mem = Pool()
self._by_hash = PreshMap()
self._by_orth = PreshMap()
self.strings = StringStore()
self.length = 0
@ -118,13 +117,12 @@ cdef class Vocab:
return &EMPTY_LEXEME
cdef LexemeC* lex
cdef hash_t key = hash_string(string)
lex = <LexemeC*>self._by_hash.get(key)
lex = <LexemeC*>self._by_orth.get(key)
cdef size_t addr
if lex != NULL:
if lex.orth != self.strings[string]:
if lex.orth != key:
raise KeyError(Errors.E064.format(string=lex.orth,
orth=self.strings[string],
orth_id=string))
orth=key, orth_id=string))
return lex
else:
return self._new_lexeme(mem, string)
@ -165,14 +163,12 @@ cdef class Vocab:
elif value is not None:
Lexeme.set_struct_attr(lex, attr, value)
if not is_oov:
key = hash_string(string)
self._add_lex_to_vocab(key, lex)
self._add_lex_to_vocab(lex.orth, lex)
if lex == NULL:
raise ValueError(Errors.E085.format(string=string))
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
self._by_hash.set(key, <void*>lex)
self._by_orth.set(lex.orth, <void*>lex)
self.length += 1
@ -189,7 +185,7 @@ cdef class Vocab:
int_key = hash_string(key)
else:
int_key = key
lex = self._by_hash.get(int_key)
lex = self._by_orth.get(int_key)
return lex is not NULL
def __iter__(self):
@ -461,7 +457,7 @@ cdef class Vocab:
cdef LexemeC* lexeme = NULL
cdef SerializedLexemeC lex_data
cdef int size = 0
for key, addr in self._by_hash.items():
for key, addr in self._by_orth.items():
if addr == 0:
continue
size += sizeof(lex_data.data)
@ -469,7 +465,7 @@ cdef class Vocab:
byte_ptr = <unsigned char*>byte_string
cdef int j
cdef int i = 0
for key, addr in self._by_hash.items():
for key, addr in self._by_orth.items():
if addr == 0:
continue
lexeme = <LexemeC*>addr
@ -504,17 +500,12 @@ cdef class Vocab:
raise ValueError(Errors.E086.format(string=py_str,
orth_id=lexeme.orth,
hash_id=self.strings[py_str]))
key = hash_string(py_str)
self._by_hash.set(key, lexeme)
self._by_orth.set(lexeme.orth, lexeme)
self.length += 1
def _reset_cache(self, keys, strings):
for k in keys:
del self._by_hash[k]
if len(strings) != 0:
self._by_orth = PreshMap()
# I'm not sure this made sense. Disable it for now.
raise NotImplementedError
def pickle_vocab(vocab):