fix loading custom tokenizer rules/exceptions from file

svlandeg 2019-08-28 14:17:44 +02:00
parent 7bec0ebbcb
commit c54aabc3cd

spacy/tokenizer.pyx

@@ -441,8 +441,13 @@ cdef class Tokenizer:
             self.infix_finditer = re.compile(data["infix_finditer"]).finditer
         if data.get("token_match"):
             self.token_match = re.compile(data["token_match"]).match
+        if data.get("rules"):
+            # make sure to hard reset the cache to remove data from the default exceptions
+            self._rules = {}
+            self._cache = PreshMap()
         for string, substrings in data.get("rules", {}).items():
             self.add_special_case(string, substrings)
         return self
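
A minimal sketch of the round trip this commit fixes, assuming the spaCy v2.x public API (spacy.blank, the Tokenizer constructor's rules argument, to_bytes/from_bytes). The scenario and identifiers beyond those in the diff are illustrative, not taken from the commit: serialized custom rules should replace the default exceptions of the tokenizer they are loaded into, not be merged on top of them.

import spacy
from spacy.symbols import ORTH
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")

# A custom tokenizer that carries exactly one special-case rule.
custom = Tokenizer(nlp.vocab, rules={"don't": [{ORTH: "don't"}]})
data = custom.to_bytes()

# Load into a tokenizer that was initialized with the default English
# exceptions. With the hard reset above, a non-empty "rules" entry clears
# self._rules and self._cache first, so the stored rules replace the
# defaults instead of being merged into them.
nlp2 = spacy.blank("en")
nlp2.tokenizer.from_bytes(data)

print([t.text for t in nlp2("gonna")])  # ["gonna"]: the default "gon"/"na" exception is gone
print([t.text for t in nlp2("don't")])  # ["don't"]: the serialized custom rule applies

Before this fix, the loaded rules were added on top of whatever the target tokenizer already had, so default exceptions (and stale cache entries) survived deserialization even when the serialized tokenizer did not contain them.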