	Flush tokenizer cache when necessary (#4258)
Flush tokenizer cache when affixes, token_match, or special cases are modified. Fixes #4238, same issue as in #1250.
parent d03401f532
commit 3780e2ff50
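For context, this is the class of bug the commit fixes (#4238): the tokenizer memoizes each whitespace-delimited chunk in _cache, so customizing affixes, token_match, or special cases after the tokenizer had already run could silently return stale tokenizations. A minimal repro sketch against the spaCy v2.x API (the added suffix is illustrative, not taken from the commit):

    import spacy
    from spacy.util import compile_suffix_regex

    nlp = spacy.blank("en")
    nlp("cache(this)")                      # first call populates the tokenizer cache

    suffixes = nlp.Defaults.suffixes + (r"\)",)
    nlp.tokenizer.suffix_search = compile_suffix_regex(suffixes).search
    doc = nlp("cache(this)")                # with this commit, the new suffix rules actually apply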
spacy/tests/regression/test_issue1001-1500.py
@@ -13,7 +13,6 @@ from spacy.lemmatizer import Lemmatizer
 from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
 
 
-@pytest.mark.xfail
 def test_issue1061():
     '''Test special-case works after tokenizing. Was caching problem.'''
     text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'
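Only the head of test_issue1061 appears in this hunk; the xfail marker is removed because the behavior now passes. An illustrative reconstruction of the scenario the test exercises (a sketch, not the committed test body):

    from spacy.lang.en import English
    from spacy.symbols import ORTH

    def check_issue1061():
        # Illustrative reconstruction; the real test body is not shown in this hunk.
        nlp = English()
        text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'
        nlp.tokenizer(text)                                        # populate the cache first
        nlp.tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
        doc = nlp.tokenizer(text)
        assert '_MATH_' in [w.text for w in doc]                   # rule applies despite the earlier cache fill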
spacy/tokenizer.pxd
@@ -16,10 +16,10 @@ cdef class Tokenizer:
     cdef PreshMap _specials
     cpdef readonly Vocab vocab
 
-    cdef public object token_match
-    cdef public object prefix_search
-    cdef public object suffix_search
-    cdef public object infix_finditer
+    cdef object _token_match
+    cdef object _prefix_search
+    cdef object _suffix_search
+    cdef object _infix_finditer
     cdef object _rules
 
     cpdef Doc tokens_from_list(self, list strings)
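Background on the .pxd change: a cdef public attribute compiles to plain attribute access, so assignment cannot trigger side effects. Renaming the fields to private _token_match etc. and re-exposing them as properties (next hunk) is what lets the setters flush the cache. The plain-Python shape of the pattern, as a sketch:

    class ToyTokenizer:
        def __init__(self, token_match=None):
            self._token_match = token_match   # backing field, like the new cdef object _token_match
            self._cache = {}

        @property
        def token_match(self):
            return self._token_match

        @token_match.setter
        def token_match(self, value):
            self._token_match = value
            self._cache.clear()               # stand-in for _flush_cache()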
spacy/tokenizer.pyx
@@ -61,6 +61,38 @@ cdef class Tokenizer:
             for chunk, substrings in sorted(rules.items()):
                 self.add_special_case(chunk, substrings)
 
+    property token_match:
+        def __get__(self):
+            return self._token_match
+
+        def __set__(self, token_match):
+            self._token_match = token_match
+            self._flush_cache()
+
+    property prefix_search:
+        def __get__(self):
+            return self._prefix_search
+
+        def __set__(self, prefix_search):
+            self._prefix_search = prefix_search
+            self._flush_cache()
+
+    property suffix_search:
+        def __get__(self):
+            return self._suffix_search
+
+        def __set__(self, suffix_search):
+            self._suffix_search = suffix_search
+            self._flush_cache()
+
+    property infix_finditer:
+        def __get__(self):
+            return self._infix_finditer
+
+        def __set__(self, infix_finditer):
+            self._infix_finditer = infix_finditer
+            self._flush_cache()
+
     def __reduce__(self):
         args = (self.vocab,
                 self._rules,
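With these properties in place, rebinding any of the four attributes transparently invalidates previously cached chunks. A hedged usage sketch (v2.x API; the URL pattern is illustrative):

    import re
    import spacy

    nlp = spacy.blank("en")
    nlp("see https://example.com now")                              # fills the chunk cache
    nlp.tokenizer.token_match = re.compile(r"https?://\S+").match   # setter triggers _flush_cache()
    doc = nlp("see https://example.com now")                        # re-tokenized with the new matcher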
@@ -141,9 +173,23 @@ cdef class Tokenizer:
         for text in texts:
             yield self(text)
 
+    def _flush_cache(self):
+        self._reset_cache([key for key in self._cache if not key in self._specials])
+
     def _reset_cache(self, keys):
         for k in keys:
             del self._cache[k]
+            if not k in self._specials:
+                cached = <_Cached*>self._cache.get(k)
+                if cached is not NULL:
+                    self.mem.free(cached)
+
+    def _reset_specials(self):
+        for k in self._specials:
+            cached = <_Cached*>self._specials.get(k)
+            del self._specials[k]
+            if cached is not NULL:
+                self.mem.free(cached)
 
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
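_flush_cache drops every cached chunk except registered special cases, whose memory belongs to _specials and is released separately by _reset_specials. A toy dict-based analogue of the selection logic (a sketch, not the Cython code):

    def flush_cache(cache, specials):
        """Toy analogue: remove every cache entry that is not a special case."""
        for key in [k for k in cache if k not in specials]:
            del cache[key]              # the Cython version also frees the owned memory

    cache = {"hello": 1, "_MATH_": 2}
    specials = {"_MATH_": 2}
    flush_cache(cache, specials)
    assert cache == {"_MATH_": 2}       # special-case entries survive the flush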
@@ -183,6 +229,9 @@ cdef class Tokenizer:
         while string and len(string) != last_size:
             if self.token_match and self.token_match(string):
                 break
+            if self._specials.get(hash_string(string)) != NULL:
+                has_special[0] = 1
+                break
             last_size = len(string)
             pre_len = self.find_prefix(string)
             if pre_len != 0:
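The new check makes the affix-stripping loop stop as soon as the remaining substring is a registered special case, and it raises the has_special flag so the caller will not cache a result that a later rule change could invalidate. A toy Python rendering of the idea (strip_affixes and the parenthesis-stripping stand-in are invented for illustration):

    def strip_affixes(chunk, specials):
        """Toy version of the loop: peel affix characters, but stop and flag
        the chunk if the remainder is a registered special case."""
        has_special = False
        last_size = -1
        while chunk and len(chunk) != last_size:
            if chunk in specials:          # the commit's new early exit
                has_special = True
                break
            last_size = len(chunk)
            chunk = chunk.strip("()")      # stand-in for find_prefix/find_suffix
        return chunk, has_special

    print(strip_affixes("(_MATH_)", {"_MATH_"}))   # ('_MATH_', True): do not cache this chunk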
@@ -360,8 +409,15 @@ cdef class Tokenizer:
         cached.is_lex = False
         cached.data.tokens = self.vocab.make_fused_token(substrings)
         key = hash_string(string)
+        stale_special = <_Cached*>self._specials.get(key)
+        stale_cached = <_Cached*>self._cache.get(key)
+        self._flush_cache()
         self._specials.set(key, cached)
         self._cache.set(key, cached)
+        if stale_special is not NULL:
+            self.mem.free(stale_special)
+        if stale_special != stale_cached and stale_cached is not NULL:
+            self.mem.free(stale_cached)
         self._rules[string] = substrings
 
     def to_disk(self, path, **kwargs):
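The ordering here is deliberate: the old entries are fetched before being overwritten and freed only afterwards, and since _cache and _specials may hold the same pointer for a key, the two stale pointers are compared to avoid a double free. A toy demonstration of that guard in plain Python (free and freed are illustrative stand-ins for the allocator):

    freed = set()

    def free(ptr):
        assert ptr not in freed, "double free"
        freed.add(ptr)

    specials = {"k": 1}
    cache = {"k": 1}                     # same entry aliased in both tables
    stale_special, stale_cached = specials["k"], cache["k"]
    specials["k"] = cache["k"] = 2       # install the replacement entry
    free(stale_special)
    if stale_special != stale_cached:    # aliased: skip the second free
        free(stale_cached)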
@@ -444,7 +500,10 @@ cdef class Tokenizer:
         if data.get("rules"):
             # make sure to hard reset the cache to remove data from the default exceptions
             self._rules = {}
+            self._reset_cache([key for key in self._cache])
+            self._reset_specials()
             self._cache = PreshMap()
+            self._specials = PreshMap()
             for string, substrings in data.get("rules", {}).items():
                 self.add_special_case(string, substrings)
 
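On deserialization, both tables are now rebuilt from scratch whenever custom rules are present, so cached entries for the default exceptions cannot leak into the loaded tokenizer. A hedged round-trip sketch (v2.x API):

    import spacy
    from spacy.symbols import ORTH

    nlp = spacy.blank("en")
    nlp.tokenizer.add_special_case("o.k.", [{ORTH: "o.k."}])     # a custom rule to serialize
    data = nlp.tokenizer.to_bytes()

    nlp2 = spacy.blank("en")
    nlp2.tokenizer.from_bytes(data)   # "rules" present: cache and specials are hard reset, then rebuilt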