Completely clean up tokenizer cache
The tokenizer cache can contain different keys than the strings in the StringStore (prefixes, suffixes, etc.), so selective cleanup is not possible and the whole cache is cleared instead. This modification can slow down the tokenizer and needs to be measured.
parent 3e21680814
commit 505c6a2f2f
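
Why a full reset is needed, illustrated with a toy model: the tokenizer caches entries keyed by every chunk it processes, including the prefixes and suffixes it splits off, while the string cleanup only knows about the strings that actually appear in documents. The sketch below is a deliberately simplified, hypothetical illustration (plain Python sets and hash(), not spaCy's PreshMap or StringStore) of why deleting only the reported keys would leave stale cache entries behind.

# Toy stand-ins for the tokenizer cache and the StringStore cleanup; all
# names and behaviour here are simplified assumptions, not spaCy code.

def toy_cache_keys(chunk):
    # The tokenizer caches the chunk it sees *and* the pieces left after
    # peeling affixes off, so there can be several cache keys per chunk.
    pieces = {chunk, chunk.rstrip(".,!?"), chunk.lstrip('"(')}
    return {hash(p) for p in pieces}

seen_chunks = [u"bug.", u'"quoted', u"lemmas."]
cache = set()
for chunk in seen_chunks:
    cache |= toy_cache_keys(chunk)

# A StringStore-style cleanup only knows about the chunks that appeared in
# documents, so it can only report their keys:
stale_keys = {hash(chunk) for chunk in seen_chunks}
cache -= stale_keys

# Entries keyed by the stripped pieces ("bug", "quoted", "lemmas") survive,
# which is why the commit replaces selective deletion with a full reset.
assert cache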
@@ -561,7 +561,7 @@ class Language(object):
                 old_refs, recent_refs = recent_refs, old_refs
                 keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data)
                 self.vocab._reset_cache(keys, strings)
-                self.tokenizer._reset_cache(keys)
+                self.tokenizer._reset_cache()
                 nr_seen = 0

     def to_disk(self, path, disable=tuple()):
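
For context, the hunk above sits on the periodic cleanup path inside Language.pipe, which swaps its old/recent document reference sets and, once the old set has expired, cleans stale strings out of the StringStore, resets the vocab cache with the stale keys, and (after this commit) wipes the tokenizer cache wholesale. A rough usage sketch follows; it assumes a spaCy 2.x-era install, an available en_core_web_sm model, and the cleanup keyword on pipe() from that era, so treat it as illustrative rather than exact.

# Illustrative only: spaCy 2.x, the en_core_web_sm model, and the `cleanup`
# keyword on Language.pipe are all assumptions, not taken from this diff.
import spacy

nlp = spacy.load("en_core_web_sm")

def string_generator():
    for _ in range(10001):
        yield u"It's sentence produced by that bug."

# With cleanup enabled, pipe() swaps its old/recent reference sets and, once
# the old batch has expired, runs the cleanup code shown in the hunk above.
for doc in nlp.pipe(string_generator(), cleanup=True):
    pass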
@@ -11,19 +11,19 @@ def test_issue1506():

     def string_generator():
         for _ in range(10001):
-            yield "It's sentence produced by that bug."
+            yield u"It's sentence produced by that bug."

         for _ in range(10001):
-            yield "I erase some hbdsaj lemmas."
+            yield u"I erase some hbdsaj lemmas."

         for _ in range(10001):
-            yield "I erase lemmas."
+            yield u"I erase lemmas."

         for _ in range(10001):
-            yield "It's sentence produced by that bug."
+            yield u"It's sentence produced by that bug."

         for _ in range(10001):
-            yield "It's sentence produced by that bug."
+            yield u"It's sentence produced by that bug."

     for i, d in enumerate(nlp.pipe(string_generator())):
         # We should run cleanup more than one time to actually cleanup data.
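
The test hunk only adds explicit u prefixes so the yielded literals are unicode under Python 2 as well (spaCy expects unicode input); under Python 3 the spelling makes no difference. For example:

# Python 2: "..." is bytes while u"..." is unicode; Python 3: both are str.
text = u"It's sentence produced by that bug."
assert isinstance(text, type(u""))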
@@ -132,10 +132,11 @@ cdef class Tokenizer:
         for text in texts:
             yield self(text)

-    def _reset_cache(self, keys):
-        for k in keys:
-            del self._cache[k]
-            del self._specials[k]
+    def _reset_cache(self):
+        # We cannot do selective cache cleanup because cache can be different than words
+        # saved in StringStore (prefixes/suffixes/etc).
+        self._cache = PreshMap()
+        self._specials = PreshMap()

     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
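
To make the behavioural change in _reset_cache explicit, here is a plain-Python contrast of the two strategies, using ordinary dicts as stand-ins for the _cache and _specials PreshMaps (a simplification; the real maps hold C-level _Cached entries):

# Simplified stand-ins for the tokenizer's _cache and _specials PreshMaps.

def reset_cache_selective(cache, specials, keys):
    # Old behaviour: drop only the entries for the given keys. This misses
    # entries keyed by prefixes/suffixes that the cleanup pass never saw.
    for k in keys:
        cache.pop(k, None)
        specials.pop(k, None)

def reset_cache_full():
    # New behaviour: throw both maps away and start empty, trading some
    # re-tokenization work (the slowdown mentioned in the commit message)
    # for a cache that is guaranteed to hold no stale entries.
    return {}, {}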