mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Restore empty tokenizer properties (#5026)
* Restore empty tokenizer properties
* Check for types in tokenizer.from_bytes()
* Add test for setting empty tokenizer rules
This commit is contained in:
parent
c6b12ab02a
commit
2281c4708c
|
@ -15,12 +15,19 @@ def load_tokenizer(b):
|
|||
|
||||
|
||||
def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
    """Test that custom tokenizer with not all functions defined or empty
    properties can be serialized and deserialized correctly (see #2494,
    #4991)."""
    # Case 1: only suffix_search is supplied; the remaining callbacks are
    # left unset and the round trip must still succeed.
    tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
    tokenizer_bytes = tokenizer.to_bytes()
    Tokenizer(en_vocab).from_bytes(tokenizer_bytes)

    # Case 2: rules are set and then explicitly emptied; the reloaded
    # tokenizer must come back with empty rules as well.
    # Each sub-token needs its own dict: a single dict literal with "ORTH"
    # given twice silently collapses to {"ORTH": "."}.
    tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
    tokenizer.rules = {}
    tokenizer_bytes = tokenizer.to_bytes()
    tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
    assert tokenizer_reloaded.rules == {}
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Currently unreliable across platforms")
|
||||
@pytest.mark.parametrize("text", ["I💜you", "they’re", "“hello”"])
|
||||
|
|
|
@ -14,7 +14,7 @@ import re
|
|||
|
||||
from .tokens.doc cimport Doc
|
||||
from .strings cimport hash_string
|
||||
from .compat import unescape_unicode
|
||||
from .compat import unescape_unicode, basestring_
|
||||
from .attrs import intify_attrs
|
||||
from .symbols import ORTH
|
||||
|
||||
|
@ -568,22 +568,22 @@ cdef class Tokenizer:
|
|||
for key in ["prefix_search", "suffix_search", "infix_finditer"]:
|
||||
if key in data:
|
||||
data[key] = unescape_unicode(data[key])
|
||||
if data.get("prefix_search"):
|
||||
if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):
|
||||
self.prefix_search = re.compile(data["prefix_search"]).search
|
||||
if data.get("suffix_search"):
|
||||
if "suffix_search" in data and isinstance(data["suffix_search"], basestring_):
|
||||
self.suffix_search = re.compile(data["suffix_search"]).search
|
||||
if data.get("infix_finditer"):
|
||||
if "infix_finditer" in data and isinstance(data["infix_finditer"], basestring_):
|
||||
self.infix_finditer = re.compile(data["infix_finditer"]).finditer
|
||||
if data.get("token_match"):
|
||||
if "token_match" in data and isinstance(data["token_match"], basestring_):
|
||||
self.token_match = re.compile(data["token_match"]).match
|
||||
if data.get("rules"):
|
||||
if "rules" in data and isinstance(data["rules"], dict):
|
||||
# make sure to hard reset the cache to remove data from the default exceptions
|
||||
self._rules = {}
|
||||
self._reset_cache([key for key in self._cache])
|
||||
self._reset_specials()
|
||||
self._cache = PreshMap()
|
||||
self._specials = PreshMap()
|
||||
self._load_special_tokenization(data.get("rules", {}))
|
||||
self._load_special_tokenization(data["rules"])
|
||||
|
||||
return self
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user