Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)
	Fix tokenizer cache flushing (#7836)
* Fix tokenizer cache flushing
Fix/simplify tokenizer init detection in order to fix cache flushing
when properties are modified.
* Remove init reloading logic
* Remove logic disabling `_reload_special_cases` on init
  * Setting `rules` last in `__init__` (as before) means that setting
    other properties doesn't reload any special cases
  * Reset `rules` first in `from_bytes` so that setting other properties
    during deserialization doesn't reload any special cases
    unnecessarily
* Reset all properties in `Tokenizer.from_bytes` to allow any settings
  to be `None`
* Also reset special matcher when special cache is flushed
* Remove duplicate special case validation
* Add test for special cases flushing
* Extend test for tokenizer deserialization of None values
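The net effect, mirroring the tests added in this commit: assigning to any of the tokenizer's affix or rule properties now unconditionally flushes the token cache and the special-cases matcher, so the next call retokenizes under the new settings. Below is a minimal sketch of that behaviour; it assumes spaCy v3.x with this fix applied and uses a bare `Vocab()` in place of the test fixtures' English vocab, with illustrative regexes and example strings.

import re

from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab

# A bare tokenizer whose only rule is a suffix pattern that splits off a
# trailing period.
suffix_re = re.compile(r"\.$")
tokenizer = Tokenizer(Vocab(), suffix_search=suffix_re.search)
assert [t.text for t in tokenizer("a.")] == ["a", "."]

# Changing a property flushes the cached analysis, so "a." is retokenized
# without any suffix splitting.
tokenizer.suffix_search = None
assert [t.text for t in tokenizer("a.")] == ["a."]

# The special-case matcher is flushed as well: dropping the rules removes
# the merged "a a" token from subsequent tokenizations.
tokenizer.rules = {"a a": [{"ORTH": "a a"}]}
assert [t.text for t in tokenizer("a a")] == ["a a"]
tokenizer.rules = {}
assert [t.text for t in tokenizer("a a")] == ["a", "a"]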
			
			
commit f4339f9bff
parent cfad7e21d5
@@ -26,10 +26,14 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
     assert tokenizer.rules != {}
     assert tokenizer.token_match is not None
     assert tokenizer.url_match is not None
+    assert tokenizer.prefix_search is not None
+    assert tokenizer.infix_finditer is not None
     tokenizer.from_bytes(tokenizer_bytes)
     assert tokenizer.rules == {}
     assert tokenizer.token_match is None
     assert tokenizer.url_match is None
+    assert tokenizer.prefix_search is None
+    assert tokenizer.infix_finditer is None
 
     tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
     tokenizer.rules = {}
@@ -1,4 +1,5 @@
 import pytest
+import re
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.util import ensure_path
@@ -186,3 +187,31 @@ def test_tokenizer_special_cases_spaces(tokenizer):
     assert [t.text for t in tokenizer("a b c")] == ["a", "b", "c"]
     tokenizer.add_special_case("a b c", [{"ORTH": "a b c"}])
     assert [t.text for t in tokenizer("a b c")] == ["a b c"]
+
+
+def test_tokenizer_flush_cache(en_vocab):
+    suffix_re = re.compile(r"[\.]$")
+    tokenizer = Tokenizer(
+        en_vocab,
+        suffix_search=suffix_re.search,
+    )
+    assert [t.text for t in tokenizer("a.")] == ["a", "."]
+    tokenizer.suffix_search = None
+    assert [t.text for t in tokenizer("a.")] == ["a."]
+
+
+def test_tokenizer_flush_specials(en_vocab):
+    suffix_re = re.compile(r"[\.]$")
+    rules = {"a a": [{"ORTH": "a a"}]}
+    tokenizer1 = Tokenizer(
+        en_vocab,
+        suffix_search=suffix_re.search,
+        rules=rules,
+    )
+    tokenizer2 = Tokenizer(
+        en_vocab,
+        suffix_search=suffix_re.search,
+    )
+    assert [t.text for t in tokenizer1("a a.")] == ["a a", "."]
+    tokenizer1.rules = {}
+    assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."]
@@ -23,8 +23,8 @@ cdef class Tokenizer:
     cdef object _infix_finditer
     cdef object _rules
     cdef PhraseMatcher _special_matcher
-    cdef int _property_init_count
-    cdef int _property_init_max
+    cdef int _property_init_count  # TODO: unused, remove in v3.1
+    cdef int _property_init_max    # TODO: unused, remove in v3.1
 
     cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
@@ -69,8 +69,6 @@ cdef class Tokenizer:
         self._rules = {}
         self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
-        self._property_init_count = 0
-        self._property_init_max = 4
 
     property token_match:
         def __get__(self):
@@ -79,8 +77,6 @@ cdef class Tokenizer:
         def __set__(self, token_match):
             self._token_match = token_match
             self._reload_special_cases()
-            if self._property_init_count <= self._property_init_max:
-                self._property_init_count += 1
 
     property url_match:
         def __get__(self):
@@ -88,7 +84,7 @@ cdef class Tokenizer:
 
         def __set__(self, url_match):
             self._url_match = url_match
-            self._flush_cache()
+            self._reload_special_cases()
 
     property prefix_search:
         def __get__(self):
@@ -97,8 +93,6 @@ cdef class Tokenizer:
         def __set__(self, prefix_search):
             self._prefix_search = prefix_search
             self._reload_special_cases()
-            if self._property_init_count <= self._property_init_max:
-                self._property_init_count += 1
 
     property suffix_search:
         def __get__(self):
@@ -107,8 +101,6 @@ cdef class Tokenizer:
         def __set__(self, suffix_search):
             self._suffix_search = suffix_search
             self._reload_special_cases()
-            if self._property_init_count <= self._property_init_max:
-                self._property_init_count += 1
 
     property infix_finditer:
         def __get__(self):
@@ -117,8 +109,6 @@ cdef class Tokenizer:
         def __set__(self, infix_finditer):
             self._infix_finditer = infix_finditer
             self._reload_special_cases()
-            if self._property_init_count <= self._property_init_max:
-                self._property_init_count += 1
 
     property rules:
         def __get__(self):
@@ -126,7 +116,7 @@ cdef class Tokenizer:
 
         def __set__(self, rules):
             self._rules = {}
-            self._reset_cache([key for key in self._cache])
+            self._flush_cache()
             self._flush_specials()
             self._cache = PreshMap()
             self._specials = PreshMap()
@@ -226,6 +216,7 @@ cdef class Tokenizer:
                 self.mem.free(cached)
 
     def _flush_specials(self):
+        self._special_matcher = PhraseMatcher(self.vocab)
         for k in self._specials:
             cached = <_Cached*>self._specials.get(k)
             del self._specials[k]
@@ -568,7 +559,6 @@ cdef class Tokenizer:
         """Add special-case tokenization rules."""
         if special_cases is not None:
             for chunk, substrings in sorted(special_cases.items()):
-                self._validate_special_case(chunk, substrings)
                 self.add_special_case(chunk, substrings)
 
     def _validate_special_case(self, chunk, substrings):
@@ -616,16 +606,9 @@ cdef class Tokenizer:
             self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
 
     def _reload_special_cases(self):
-        try:
-            self._property_init_count
-        except AttributeError:
-            return
-        # only reload if all 4 of prefix, suffix, infix, token_match have
-        # have been initialized
-        if self.vocab is not None and self._property_init_count >= self._property_init_max:
-            self._flush_cache()
-            self._flush_specials()
-            self._load_special_cases(self._rules)
+        self._flush_cache()
+        self._flush_specials()
+        self._load_special_cases(self._rules)
 
     def explain(self, text):
         """A debugging tokenizer that provides information about which
@@ -811,6 +794,15 @@ cdef class Tokenizer:
             "url_match": lambda b: data.setdefault("url_match", b),
             "exceptions": lambda b: data.setdefault("rules", b)
         }
+        # reset all properties and flush all caches (through rules),
+        # reset rules first so that _reload_special_cases is trivial/fast as
+        # the other properties are reset
+        self.rules = {}
+        self.prefix_search = None
+        self.suffix_search = None
+        self.infix_finditer = None
+        self.token_match = None
+        self.url_match = None
         msg = util.from_bytes(bytes_data, deserializers, exclude)
         if "prefix_search" in data and isinstance(data["prefix_search"], str):
             self.prefix_search = re.compile(data["prefix_search"]).search
@@ -818,22 +810,12 @@ cdef class Tokenizer:
             self.suffix_search = re.compile(data["suffix_search"]).search
         if "infix_finditer" in data and isinstance(data["infix_finditer"], str):
             self.infix_finditer = re.compile(data["infix_finditer"]).finditer
-        # for token_match and url_match, set to None to override the language
-        # defaults if no regex is provided
         if "token_match" in data and isinstance(data["token_match"], str):
             self.token_match = re.compile(data["token_match"]).match
-        else:
-            self.token_match = None
         if "url_match" in data and isinstance(data["url_match"], str):
             self.url_match = re.compile(data["url_match"]).match
-        else:
-            self.url_match = None
         if "rules" in data and isinstance(data["rules"], dict):
-            # make sure to hard reset the cache to remove data from the default exceptions
-            self._rules = {}
-            self._flush_cache()
-            self._flush_specials()
-            self._load_special_cases(data["rules"])
+            self.rules = data["rules"]
         return self
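For reference, a short sketch of the deserialization behaviour exercised by the extended serializer test above: after this change, `Tokenizer.from_bytes` resets every property before loading, so settings that are absent or `None` in the serialized data end up as `None`/empty rather than keeping the previous values. This assumes spaCy v3.x with this commit applied; the bare `Vocab()` and the example patterns are illustrative stand-ins for the test fixtures.

import re

from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab

vocab = Vocab()

# Settings of a tokenizer with no affix rules and no special cases.
plain_bytes = Tokenizer(vocab).to_bytes()

# A customized tokenizer: one special case plus a suffix rule.
tokenizer = Tokenizer(
    vocab,
    rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]},
    suffix_search=re.compile(r"\.$").search,
)
assert tokenizer.rules != {}
assert tokenizer.suffix_search is not None

# Deserializing the plain settings replaces everything instead of merging.
tokenizer.from_bytes(plain_bytes)
assert tokenizer.rules == {}
assert tokenizer.suffix_search is None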