Completely clean up tokenizer cache

The tokenizer cache can contain keys that are not among the strings saved in the StringStore (e.g. hashed prefixes and suffixes), so it cannot be cleaned up selectively by key.

This change can slow the tokenizer down, and its impact needs to be measured.
Roman Domrachev 2017-11-15 17:55:48 +03:00
parent 3e21680814
commit 505c6a2f2f
3 changed files with 11 additions and 10 deletions
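
For context: the tokenizer cache is keyed by hashes of every chunk it has processed, including prefix/suffix-stripped substrings that never become StringStore entries, so no list of StringStore keys covers the whole cache. A toy, dict-based illustration of that mismatch (illustrative only, not spaCy internals):

    # Toy model: cache keys come from raw substrings seen during tokenization,
    # StringStore keys only from strings that ended up in Doc objects.
    tokenizer_cache = {hash(s): s.split("-") for s in ('"Hello', 'Hello', 'world."', 'world')}
    string_store = {hash(s) for s in ('"', 'Hello', 'world', '.')}

    # Selective cleanup driven by StringStore keys leaves the substring entries behind.
    for key in string_store:
        tokenizer_cache.pop(key, None)
    assert hash('"Hello') in tokenizer_cache and hash('world."') in tokenizer_cache

    # Hence the full reset in this commit: throw the map away and let it refill lazily.
    tokenizer_cache = {}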


@@ -561,7 +561,7 @@ class Language(object):
                 old_refs, recent_refs = recent_refs, old_refs
                 keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data)
                 self.vocab._reset_cache(keys, strings)
-                self.tokenizer._reset_cache(keys)
+                self.tokenizer._reset_cache()
                 nr_seen = 0

     def to_disk(self, path, disable=tuple()):
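
The variable names in this hunk (old_refs, recent_refs, nr_seen) suggest a two-window weak-reference scheme: cleanup only fires once every doc from the previous window has been garbage collected. A generic sketch of that pattern, not the actual Language.pipe code; in the hunk, the cleanup callback corresponds to _cleanup_stale_strings, vocab._reset_cache and the now argument-free tokenizer._reset_cache():

    # Generic two-window weakref pattern; illustrative only, not spaCy source.
    import weakref

    def stream_with_cleanup(make_item, texts, cleanup, window=10000):
        old_refs, recent_refs = weakref.WeakSet(), weakref.WeakSet()
        nr_seen = 0
        for text in texts:
            item = make_item(text)   # items must support weak references
            yield item
            recent_refs.add(item)
            if nr_seen < window:
                old_refs.add(item)
                nr_seen += 1
            elif len(old_refs) == 0:
                # Everything from the previous window has been garbage collected,
                # so state kept only for those items can safely be dropped.
                old_refs, recent_refs = recent_refs, old_refs
                cleanup()
                nr_seen = 0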


@@ -11,19 +11,19 @@ def test_issue1506():
     def string_generator():
         for _ in range(10001):
-            yield "It's sentence produced by that bug."
+            yield u"It's sentence produced by that bug."
         for _ in range(10001):
-            yield "I erase some hbdsaj lemmas."
+            yield u"I erase some hbdsaj lemmas."
         for _ in range(10001):
-            yield "I erase lemmas."
+            yield u"I erase lemmas."
         for _ in range(10001):
-            yield "It's sentence produced by that bug."
+            yield u"It's sentence produced by that bug."
         for _ in range(10001):
-            yield "It's sentence produced by that bug."
+            yield u"It's sentence produced by that bug."

     for i, d in enumerate(nlp.pipe(string_generator())):
         # We should run cleanup more than one time to actually cleanup data.
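
The hunk cuts off at the loop header; for the cleanup to be observable, the loop body (outside the visible lines) has to read data back from the docs. A hypothetical body along those lines, assuming the test's nlp and string_generator, not the actual test code:

    # Hypothetical loop body, for illustration only.
    for i, d in enumerate(nlp.pipe(string_generator())):
        # We should run cleanup more than one time to actually cleanup data.
        if i % 10000 == 0:
            # If cleanup dropped a string that is still referenced by a live Doc,
            # reading its text or lemmas back would fail here.
            assert d.text.startswith(("It's", "I erase"))
            _ = [t.lemma_ for t in d]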


@@ -132,10 +132,11 @@ cdef class Tokenizer:
         for text in texts:
             yield self(text)

-    def _reset_cache(self, keys):
-        for k in keys:
-            del self._cache[k]
-            del self._specials[k]
+    def _reset_cache(self):
+        # We cannot do selective cache cleanup because the cache can hold keys
+        # other than the words saved in the StringStore (prefixes/suffixes/etc.).
+        self._cache = PreshMap()
+        self._specials = PreshMap()

     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
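
The commit message flags that the full reset may slow tokenization down and needs to be measured. A rough way to check, assuming a spaCy build where Tokenizer._reset_cache has the new zero-argument signature:

    # Warm-vs-cold timing of the tokenizer cache; rough measurement sketch.
    import timeit
    from spacy.lang.en import English

    nlp = English()
    texts = ["Sentence number %d, with a token-%d suffix..." % (i, i) for i in range(5000)]

    def tokenize_all():
        for text in texts:
            nlp.tokenizer(text)

    tokenize_all()                                # warm the cache once
    warm = timeit.timeit(tokenize_all, number=1)  # every substring already cached
    nlp.tokenizer._reset_cache()                  # the full reset introduced here
    cold = timeit.timeit(tokenize_all, number=1)  # cache is rebuilt lazily
    print("warm: %.3fs  after reset: %.3fs" % (warm, cold))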