Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-30 11:14:08 +03:00)
Improve cache flushing in tokenizer
* Separate cache and specials memory (temporarily)
* Flush cache when adding special cases
* Repeated `self._cache = PreshMap()` and `self._specials = PreshMap()` are necessary due to this bug: https://github.com/explosion/preshed/issues/21
parent ae52c5eb52
commit d277b6bc68
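The second bullet is the behavioral core of the change: once a string has been tokenized, its split is cached by hash, so a special case added afterwards would stay invisible for that string unless the cache is flushed. A minimal pure-Python sketch of that effect (a toy model, not spaCy's Cython implementation; all names below are made up for illustration):

# Toy tokenizer: caches results by hash, consults special cases on a miss.
class ToyTokenizer:
    def __init__(self):
        self._cache = {}      # hash(text) -> cached token split
        self._specials = {}   # exact string -> forced token split

    def __call__(self, text):
        key = hash(text)
        if key in self._cache:
            return self._cache[key]
        tokens = self._specials.get(text, text.split())
        self._cache[key] = tokens
        return tokens

    def add_special_case(self, string, tokens):
        self._specials[string] = tokens
        self._cache = {}      # flush: drop splits computed under the old rules


tok = ToyTokenizer()
assert tok("gimme") == ["gimme"]        # cached under the default rules
tok.add_special_case("gimme", ["gim", "me"])
assert tok("gimme") == ["gim", "me"]    # only works because the cache was flushed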
spacy/tokenizer.pyx
@@ -191,22 +191,24 @@ cdef class Tokenizer:
             yield self(text)
 
     def _flush_cache(self):
-        self._reset_cache([key for key in self._cache if not key in self._specials])
+        self._reset_cache([key for key in self._cache])
 
     def _reset_cache(self, keys):
         for k in keys:
-            del self._cache[k]
-            if not k in self._specials:
-                cached = <_Cached*>self._cache.get(k)
-                if cached is not NULL:
-                    self.mem.free(cached)
+            cached = <_Cached*>self._cache.get(k)
+            del self._cache[k]
+            if cached is not NULL:
+                self.mem.free(cached)
+        if len(self._cache) == 0:
+            self._cache = PreshMap()
 
-    def _reset_specials(self):
+    def _flush_specials(self):
         for k in self._specials:
             cached = <_Cached*>self._specials.get(k)
             del self._specials[k]
             if cached is not NULL:
                 self.mem.free(cached)
+        self._specials = PreshMap()
 
     cdef int _apply_special_cases(self, Doc doc):
         """Retokenize doc according to special cases.
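One reading of this hunk, following the first bullet of the commit message: the old `if not k in self._specials` guard existed because a cache slot could hold the very pointer owned by the specials table, so freeing it from both sides would be a double free; with the two tables' memory separated, each one can free everything it holds. A toy model of that hazard in plain Python (the dicts and the `free` helper are illustrative stand-ins, not spaCy code):

freed = set()

def free(entry):
    # Stand-in for self.mem.free(); freeing the same pointer twice would
    # corrupt the heap in the real Cython code.
    assert id(entry) not in freed, "double free"
    freed.add(id(entry))

# Old layout: the cache could hold the very entry owned by the specials table,
# so the cache loop had to skip keys that specials also knows about.
shared, plain = object(), object()
cache = {1: shared, 2: plain}
specials = {1: shared}
for k in list(cache):
    if k not in specials:      # the guard dropped in this hunk
        free(cache[k])
    del cache[k]
for k in list(specials):
    free(specials[k])
    del specials[k]

# New layout: cache and specials each hold their own entry, so both loops can
# free unconditionally, which is what _reset_cache and _flush_specials now do.
cache_entry, special_entry = object(), object()
cache = {1: cache_entry}
specials = {1: special_entry}
for k in list(cache):
    free(cache[k])
    del cache[k]
for k in list(specials):
    free(specials[k])
    del specials[k]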
@@ -466,14 +468,11 @@ cdef class Tokenizer:
         cached.data.tokens = self.vocab.make_fused_token(substrings)
         key = hash_string(string)
         stale_special = <_Cached*>self._specials.get(key)
-        stale_cached = <_Cached*>self._cache.get(key)
-        self._flush_cache()
         self._specials.set(key, cached)
         if stale_special is not NULL:
             self.mem.free(stale_special)
-        if stale_special != stale_cached and stale_cached is not NULL:
-            self.mem.free(stale_cached)
         self._rules[string] = substrings
+        self._flush_cache()
         self._special_matcher.add(string, None, [{ORTH: token.text} for token in self._tokenize_affixes(string)])
 
     def _reload_special_cases(self):
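In this hunk the stale-entry bookkeeping inside the special-case setter is replaced by a single `_flush_cache()` call after the rule is stored. At the user level this is the documented behavior of `Tokenizer.add_special_case`; a small usage sketch (assuming a spaCy install of roughly this vintage; exact default splits may vary by version):

import spacy
from spacy.symbols import ORTH

nlp = spacy.blank("en")

# Tokenize once so the result lands in the tokenizer cache.
print([t.text for t in nlp("gimme that")])   # expected: ['gimme', 'that']

# Adding a special case flushes the cache, so the old cached split
# cannot shadow the new rule on the next call.
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
print([t.text for t in nlp("gimme that")])   # expected: ['gim', 'me', 'that']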
@@ -484,10 +483,8 @@ cdef class Tokenizer:
         # only reload if all 4 of prefix, suffix, infix, token_match have
         # have been initialized
         if self.vocab is not None and self._property_init_count >= 4:
-            self._reset_cache([key for key in self._cache])
-            self._reset_specials()
-            self._cache = PreshMap()
-            self._specials = PreshMap()
+            self._flush_cache()
+            self._flush_specials()
             self._load_special_cases(self._rules)
 
     def to_disk(self, path, **kwargs):
@@ -570,10 +567,8 @@ cdef class Tokenizer:
         if data.get("rules"):
             # make sure to hard reset the cache to remove data from the default exceptions
             self._rules = {}
-            self._reset_cache([key for key in self._cache])
-            self._reset_specials()
-            self._cache = PreshMap()
-            self._specials = PreshMap()
+            self._flush_cache()
+            self._flush_specials()
             self._load_special_cases(data.get("rules", {}))
 
         return self
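The last two hunks route `_reload_special_cases` and deserialization through the same pair of flush helpers. A quick round-trip that exercises the deserialization path (standard spaCy API, shown only to indicate where the hard reset runs; expected output hedged, it depends on the English defaults):

import spacy

nlp = spacy.blank("en")
tok_bytes = nlp.tokenizer.to_bytes()

# from_bytes() sees the serialized "rules" (the special cases) and takes the
# hard-reset path above: flush the cache and the specials table, then reload.
nlp2 = spacy.blank("en")
nlp2.tokenizer.from_bytes(tok_bytes)
print([t.text for t in nlp2("Don't do that")])   # expected: ['Do', "n't", 'do', 'that']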
|
|