Mirror of https://github.com/explosion/spaCy.git
Restore empty tokenizer properties (#5026)
* Restore empty tokenizer properties
* Check for types in tokenizer.from_bytes()
* Add test for setting empty tokenizer rules
This commit is contained in:
parent c6b12ab02a
commit 2281c4708c
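In practice, the restored behavior means that explicitly emptied tokenizer properties now survive serialization. A minimal sketch of the round trip, mirroring the new test below (the blank Vocab and standalone Tokenizer are stand-ins for the test fixtures):

from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab

vocab = Vocab()
# Create a tokenizer with one special-case rule, then explicitly empty it.
tokenizer = Tokenizer(vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
tokenizer.rules = {}

# With this fix, the serialized empty rules are applied on load instead of
# being skipped, so the reloaded tokenizer also has no special-case rules.
reloaded = Tokenizer(vocab).from_bytes(tokenizer.to_bytes())
assert reloaded.rules == {}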
spacy/tests/serialize/test_serialize_tokenizer.py
@@ -15,12 +15,19 @@ def load_tokenizer(b):
 
 
 def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
-    """Test that custom tokenizer with not all functions defined can be
-    serialized and deserialized correctly (see #2494)."""
+    """Test that custom tokenizer with not all functions defined or empty
+    properties can be serialized and deserialized correctly (see #2494,
+    #4991)."""
     tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
     tokenizer_bytes = tokenizer.to_bytes()
     Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
+
+    tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
+    tokenizer.rules = {}
+    tokenizer_bytes = tokenizer.to_bytes()
+    tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
+    assert tokenizer_reloaded.rules == {}
 
 
 @pytest.mark.skip(reason="Currently unreliable across platforms")
 @pytest.mark.parametrize("text", ["I💜you", "they’re", "“hello”"])
spacy/tokenizer.pyx
@@ -14,7 +14,7 @@ import re
 
 from .tokens.doc cimport Doc
 from .strings cimport hash_string
-from .compat import unescape_unicode
+from .compat import unescape_unicode, basestring_
 from .attrs import intify_attrs
 from .symbols import ORTH
 
@@ -568,22 +568,22 @@ cdef class Tokenizer:
         for key in ["prefix_search", "suffix_search", "infix_finditer"]:
             if key in data:
                 data[key] = unescape_unicode(data[key])
-        if data.get("prefix_search"):
+        if "prefix_search" in data and isinstance(data["prefix_search"], basestring_):
             self.prefix_search = re.compile(data["prefix_search"]).search
-        if data.get("suffix_search"):
+        if "suffix_search" in data and isinstance(data["suffix_search"], basestring_):
             self.suffix_search = re.compile(data["suffix_search"]).search
-        if data.get("infix_finditer"):
+        if "infix_finditer" in data and isinstance(data["infix_finditer"], basestring_):
             self.infix_finditer = re.compile(data["infix_finditer"]).finditer
-        if data.get("token_match"):
+        if "token_match" in data and isinstance(data["token_match"], basestring_):
             self.token_match = re.compile(data["token_match"]).match
-        if data.get("rules"):
+        if "rules" in data and isinstance(data["rules"], dict):
             # make sure to hard reset the cache to remove data from the default exceptions
             self._rules = {}
             self._reset_cache([key for key in self._cache])
             self._reset_specials()
             self._cache = PreshMap()
             self._specials = PreshMap()
-            self._load_special_tokenization(data.get("rules", {}))
+            self._load_special_tokenization(data["rules"])
 
         return self
 
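The key change in Tokenizer.from_bytes() is moving from truthiness checks (data.get(...)) to presence-plus-type checks, so that an explicitly serialized empty value such as {} is applied rather than silently ignored; basestring_ from spacy.compat is the Python 2/3 string-compatibility alias imported above. A standalone sketch of the difference between the two check styles (illustrative helpers only, not spaCy's own code):

def restore_rules_old(current_rules, data):
    # Truthiness check: an empty dict is falsy, so rules == {} is skipped
    # and the tokenizer silently keeps whatever rules it already had.
    if data.get("rules"):
        current_rules = dict(data["rules"])
    return current_rules


def restore_rules_new(current_rules, data):
    # Presence + type check: an empty dict passes, so explicitly emptied
    # rules survive the to_bytes()/from_bytes() round trip.
    if "rules" in data and isinstance(data["rules"], dict):
        current_rules = dict(data["rules"])
    return current_rules


serialized = {"rules": {}}  # tokenizer.rules was set to {} before serializing
assert restore_rules_old({"ABC.": [{"ORTH": "ABC"}]}, serialized) != {}
assert restore_rules_new({"ABC.": [{"ORTH": "ABC"}]}, serialized) == {}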