Restore empty tokenizer properties (#5026)

* Restore empty tokenizer properties

* Check for types in tokenizer.from_bytes()

* Add test for setting empty tokenizer rules
Author: adrianeboyd, 2020-03-02 11:55:02 +01:00 (committed by GitHub)
Parent: c6b12ab02a
Commit: 2281c4708c
2 changed files with 16 additions and 9 deletions
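
In practical terms, the fix means that tokenizer properties a user has explicitly emptied now survive a to_bytes()/from_bytes() round trip instead of being silently replaced by the language defaults. A minimal sketch of that round trip (assuming a spaCy 2.2.x build that includes this commit; the variable names are illustrative):

    import spacy

    nlp = spacy.blank("en")
    nlp.tokenizer.rules = {}  # deliberately clear all special-case rules
    tokenizer_bytes = nlp.tokenizer.to_bytes()

    nlp2 = spacy.blank("en")
    nlp2.tokenizer.from_bytes(tokenizer_bytes)
    # Before this commit the default exceptions reappeared here; now the
    # explicitly empty rules are restored.
    assert nlp2.tokenizer.rules == {}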


@@ -15,12 +15,19 @@ def load_tokenizer(b):
 def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
-    """Test that custom tokenizer with not all functions defined can be
-    serialized and deserialized correctly (see #2494)."""
+    """Test that custom tokenizer with not all functions defined or empty
+    properties can be serialized and deserialized correctly (see #2494,
+    #4991)."""
     tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
     tokenizer_bytes = tokenizer.to_bytes()
     Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
+
+    tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
+    tokenizer.rules = {}
+    tokenizer_bytes = tokenizer.to_bytes()
+    tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
+    assert tokenizer_reloaded.rules == {}
 
 
 @pytest.mark.skip(reason="Currently unreliable across platforms")
 @pytest.mark.parametrize("text", ["I💜you", "theyre", "“hello”"])


@@ -14,7 +14,7 @@ import re
 from .tokens.doc cimport Doc
 from .strings cimport hash_string
-from .compat import unescape_unicode
+from .compat import unescape_unicode, basestring_
 from .attrs import intify_attrs
 from .symbols import ORTH

@@ -568,22 +568,22 @@ cdef class Tokenizer:
         for key in ["prefix_search", "suffix_search", "infix_finditer"]:
             if key in data:
                 data[key] = unescape_unicode(data[key])
-        if data.get("prefix_search"):
+        if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):
             self.prefix_search = re.compile(data["prefix_search"]).search
-        if data.get("suffix_search"):
+        if "suffix_search" in data and isinstance(data["suffix_search"], basestring_):
             self.suffix_search = re.compile(data["suffix_search"]).search
-        if data.get("infix_finditer"):
+        if "infix_finditer" in data and isinstance(data["infix_finditer"], basestring_):
             self.infix_finditer = re.compile(data["infix_finditer"]).finditer
-        if data.get("token_match"):
+        if "token_match" in data and isinstance(data["token_match"], basestring_):
             self.token_match = re.compile(data["token_match"]).match
-        if data.get("rules"):
+        if "rules" in data and isinstance(data["rules"], dict):
             # make sure to hard reset the cache to remove data from the default exceptions
             self._rules = {}
             self._reset_cache([key for key in self._cache])
             self._reset_specials()
             self._cache = PreshMap()
             self._specials = PreshMap()
-            self._load_special_tokenization(data.get("rules", {}))
+            self._load_special_tokenization(data["rules"])
         return self
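
The substantive change in from_bytes() is swapping truthiness checks for membership-plus-type checks: data.get("rules") is falsy for an empty dict (and data.get("prefix_search") for an empty string), so an explicitly cleared property used to be skipped on deserialization, leaving the previously loaded defaults in place. The isinstance check still skips properties that were serialized as None because they were never set. A standalone sketch of the difference (plain Python; the dict below merely stands in for the deserialized state and is not spaCy's internal API):

    # Illustrative only: "data" mimics the deserialized tokenizer state.
    data = {"rules": {}}  # the user explicitly removed all special-case rules

    # Old check: an empty dict is falsy, so the branch is skipped and the
    # previously loaded default exceptions survive.
    if data.get("rules"):
        print("old check would restore the rules")

    # New check: restore the property whenever the key is present with the
    # expected type, even when the value is empty.
    if "rules" in data and isinstance(data["rules"], dict):
        print("new check restores the rules (possibly empty)")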