mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Override language defaults for null token and URL match (#6705)
* Override language defaults for null token and URL match

  When the serialized `token_match` or `url_match` is `None`, override the language defaults to preserve `None` on deserialization.

* Fix fixtures in tests
This commit is contained in:
parent
29c3ca7e34
commit
9957ed7897
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
import re
|
||||
from spacy.util import get_lang_class
|
||||
from spacy.tokenizer import Tokenizer
|
||||
|
||||
|
@ -22,6 +23,17 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
|
|||
tokenizer_bytes = tokenizer.to_bytes()
|
||||
Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
|
||||
|
||||
# test that empty/unset values are set correctly on deserialization
|
||||
tokenizer = get_lang_class("en").Defaults.create_tokenizer()
|
||||
tokenizer.token_match = re.compile("test").match
|
||||
assert tokenizer.rules != {}
|
||||
assert tokenizer.token_match is not None
|
||||
assert tokenizer.url_match is not None
|
||||
tokenizer.from_bytes(tokenizer_bytes)
|
||||
assert tokenizer.rules == {}
|
||||
assert tokenizer.token_match is None
|
||||
assert tokenizer.url_match is None
|
||||
|
||||
tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
|
||||
tokenizer.rules = {}
|
||||
tokenizer_bytes = tokenizer.to_bytes()
|
||||
|
|
|
@ -608,10 +608,16 @@ cdef class Tokenizer:
|
|||
self.suffix_search = re.compile(data["suffix_search"]).search
|
||||
if "infix_finditer" in data and isinstance(data["infix_finditer"], basestring_):
|
||||
self.infix_finditer = re.compile(data["infix_finditer"]).finditer
|
||||
# for token_match and url_match, set to None to override the language
|
||||
# defaults if no regex is provided
|
||||
if "token_match" in data and isinstance(data["token_match"], basestring_):
|
||||
self.token_match = re.compile(data["token_match"]).match
|
||||
else:
|
||||
self.token_match = None
|
||||
if "url_match" in data and isinstance(data["url_match"], basestring_):
|
||||
self.url_match = re.compile(data["url_match"]).match
|
||||
else:
|
||||
self.url_match = None
|
||||
if "rules" in data and isinstance(data["rules"], dict):
|
||||
# make sure to hard reset the cache to remove data from the default exceptions
|
||||
self._rules = {}
|
||||
|
|
Loading…
Reference in New Issue
Block a user