mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Override language defaults for null token and URL match (#6705)
* Override language defaults for null token and URL match

  When the serialized `token_match` or `url_match` is `None`, override the language defaults to preserve `None` on deserialization.

* Fix fixtures in tests
This commit is contained in:
parent
29c3ca7e34
commit
9957ed7897
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import re
|
||||||
from spacy.util import get_lang_class
|
from spacy.util import get_lang_class
|
||||||
from spacy.tokenizer import Tokenizer
|
from spacy.tokenizer import Tokenizer
|
||||||
|
|
||||||
|
@ -22,6 +23,17 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
|
||||||
tokenizer_bytes = tokenizer.to_bytes()
|
tokenizer_bytes = tokenizer.to_bytes()
|
||||||
Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
|
Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
|
||||||
|
|
||||||
|
# test that empty/unset values are set correctly on deserialization
|
||||||
|
tokenizer = get_lang_class("en").Defaults.create_tokenizer()
|
||||||
|
tokenizer.token_match = re.compile("test").match
|
||||||
|
assert tokenizer.rules != {}
|
||||||
|
assert tokenizer.token_match is not None
|
||||||
|
assert tokenizer.url_match is not None
|
||||||
|
tokenizer.from_bytes(tokenizer_bytes)
|
||||||
|
assert tokenizer.rules == {}
|
||||||
|
assert tokenizer.token_match is None
|
||||||
|
assert tokenizer.url_match is None
|
||||||
|
|
||||||
tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
|
tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
|
||||||
tokenizer.rules = {}
|
tokenizer.rules = {}
|
||||||
tokenizer_bytes = tokenizer.to_bytes()
|
tokenizer_bytes = tokenizer.to_bytes()
|
||||||
|
|
|
@ -608,10 +608,16 @@ cdef class Tokenizer:
|
||||||
self.suffix_search = re.compile(data["suffix_search"]).search
|
self.suffix_search = re.compile(data["suffix_search"]).search
|
||||||
if "infix_finditer" in data and isinstance(data["infix_finditer"], basestring_):
|
if "infix_finditer" in data and isinstance(data["infix_finditer"], basestring_):
|
||||||
self.infix_finditer = re.compile(data["infix_finditer"]).finditer
|
self.infix_finditer = re.compile(data["infix_finditer"]).finditer
|
||||||
|
# for token_match and url_match, set to None to override the language
|
||||||
|
# defaults if no regex is provided
|
||||||
if "token_match" in data and isinstance(data["token_match"], basestring_):
|
if "token_match" in data and isinstance(data["token_match"], basestring_):
|
||||||
self.token_match = re.compile(data["token_match"]).match
|
self.token_match = re.compile(data["token_match"]).match
|
||||||
|
else:
|
||||||
|
self.token_match = None
|
||||||
if "url_match" in data and isinstance(data["url_match"], basestring_):
|
if "url_match" in data and isinstance(data["url_match"], basestring_):
|
||||||
self.url_match = re.compile(data["url_match"]).match
|
self.url_match = re.compile(data["url_match"]).match
|
||||||
|
else:
|
||||||
|
self.url_match = None
|
||||||
if "rules" in data and isinstance(data["rules"], dict):
|
if "rules" in data and isinstance(data["rules"], dict):
|
||||||
# make sure to hard reset the cache to remove data from the default exceptions
|
# make sure to hard reset the cache to remove data from the default exceptions
|
||||||
self._rules = {}
|
self._rules = {}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user