Reload special cases when necessary

Reload special cases when affixes or token_match are modified. Skip
reloading during initialization.
Adriane Boyd 2019-09-08 22:40:08 +02:00
parent 64f86b7e97
commit 5eeaffe14f
2 changed files with 25 additions and 6 deletions

Changed file 1 of 2:

@@ -22,6 +22,7 @@ cdef class Tokenizer:
     cdef object _infix_finditer
     cdef object _rules
     cdef object _special_matcher
+    cdef int _property_init_count
 
     cpdef Doc tokens_from_list(self, list strings)

Changed file 2 of 2:

@@ -62,6 +62,7 @@ cdef class Tokenizer:
         self._rules = {}
         self._special_matcher = Matcher(self.vocab)
         self._load_special_cases(rules)
+        self._property_init_count = 0
 
     property token_match:
         def __get__(self):
@@ -69,7 +70,8 @@ cdef class Tokenizer:
 
         def __set__(self, token_match):
             self._token_match = token_match
-            self._flush_cache()
+            self._reload_special_cases()
+            self._property_init_count += 1
 
     property prefix_search:
         def __get__(self):
@@ -77,7 +79,8 @@ cdef class Tokenizer:
 
         def __set__(self, prefix_search):
             self._prefix_search = prefix_search
-            self._flush_cache()
+            self._reload_special_cases()
+            self._property_init_count += 1
 
     property suffix_search:
        def __get__(self):
@@ -85,7 +88,8 @@ cdef class Tokenizer:
 
         def __set__(self, suffix_search):
             self._suffix_search = suffix_search
-            self._flush_cache()
+            self._reload_special_cases()
+            self._property_init_count += 1
 
     property infix_finditer:
         def __get__(self):
@@ -93,7 +97,8 @@ cdef class Tokenizer:
 
         def __set__(self, infix_finditer):
             self._infix_finditer = infix_finditer
-            self._flush_cache()
+            self._reload_special_cases()
+            self._property_init_count += 1
 
     def __reduce__(self):
         args = (self.vocab,
@@ -466,6 +471,20 @@ cdef class Tokenizer:
         self._rules[string] = substrings
         self._special_matcher.add(string, None, [{ORTH: token.text} for token in self._tokenize_affixes(string)])
 
+    def _reload_special_cases(self):
+        try:
+            self._property_init_count
+        except AttributeError:
+            return
+        # only reload if all 4 of prefix, suffix, infix and token_match
+        # have been initialized
+        if self.vocab is not None and self._property_init_count >= 4:
+            self._reset_cache([key for key in self._cache])
+            self._reset_specials()
+            self._cache = PreshMap()
+            self._specials = PreshMap()
+            self._load_special_cases(self._rules)
+
     def to_disk(self, path, **kwargs):
         """Save the current state to a directory.
@@ -550,8 +569,7 @@ cdef class Tokenizer:
         self._reset_specials()
         self._cache = PreshMap()
         self._specials = PreshMap()
-        for string, substrings in data.get("rules", {}).items():
-            self.add_special_case(string, substrings)
+        self._load_special_cases(data.get("rules", {}))
         return self
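
A minimal usage sketch (not part of the commit) of the behavior this change targets; the blank "en" pipeline, the "dont" special case, and the "$" prefix regex are illustrative assumptions:

    import re
    import spacy
    from spacy.symbols import ORTH

    nlp = spacy.blank("en")
    # register a special case while the default affix rules are active
    nlp.tokenizer.add_special_case("dont", [{ORTH: "do"}, {ORTH: "nt"}])

    # setting an affix property now triggers _reload_special_cases(), which
    # re-tokenizes every rule in _rules against the new affixes instead of
    # only flushing the token cache
    nlp.tokenizer.prefix_search = re.compile(r"^\$").search

    # the special case still splits as expected after the affix change
    print([t.text for t in nlp.tokenizer("dont")])  # ['do', 'nt']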