Override language defaults for null token and URL match (#6705)

* Override language defaults for null token and URL match

When the serialized `token_match` or `url_match` is `None`, override the
language defaults to preserve `None` on deserialization.

* Fix fixtures in tests
This commit is contained in:
Adriane Boyd 2021-01-14 07:31:29 +01:00 committed by GitHub
parent 29c3ca7e34
commit 9957ed7897
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 18 additions and 0 deletions

View File

@@ -2,6 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
import re
from spacy.util import get_lang_class from spacy.util import get_lang_class
from spacy.tokenizer import Tokenizer from spacy.tokenizer import Tokenizer
@@ -22,6 +23,17 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
tokenizer_bytes = tokenizer.to_bytes() tokenizer_bytes = tokenizer.to_bytes()
Tokenizer(en_vocab).from_bytes(tokenizer_bytes) Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
# test that empty/unset values are set correctly on deserialization
tokenizer = get_lang_class("en").Defaults.create_tokenizer()
tokenizer.token_match = re.compile("test").match
assert tokenizer.rules != {}
assert tokenizer.token_match is not None
assert tokenizer.url_match is not None
tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer.rules == {}
assert tokenizer.token_match is None
assert tokenizer.url_match is None
tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]}) tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
tokenizer.rules = {} tokenizer.rules = {}
tokenizer_bytes = tokenizer.to_bytes() tokenizer_bytes = tokenizer.to_bytes()

View File

@@ -608,10 +608,16 @@ cdef class Tokenizer:
self.suffix_search = re.compile(data["suffix_search"]).search self.suffix_search = re.compile(data["suffix_search"]).search
if "infix_finditer" in data and isinstance(data["infix_finditer"], basestring_): if "infix_finditer" in data and isinstance(data["infix_finditer"], basestring_):
self.infix_finditer = re.compile(data["infix_finditer"]).finditer self.infix_finditer = re.compile(data["infix_finditer"]).finditer
# for token_match and url_match, set to None to override the language
# defaults if no regex is provided
if "token_match" in data and isinstance(data["token_match"], basestring_): if "token_match" in data and isinstance(data["token_match"], basestring_):
self.token_match = re.compile(data["token_match"]).match self.token_match = re.compile(data["token_match"]).match
else:
self.token_match = None
if "url_match" in data and isinstance(data["url_match"], basestring_): if "url_match" in data and isinstance(data["url_match"], basestring_):
self.url_match = re.compile(data["url_match"]).match self.url_match = re.compile(data["url_match"]).match
else:
self.url_match = None
if "rules" in data and isinstance(data["rules"], dict): if "rules" in data and isinstance(data["rules"], dict):
# make sure to hard reset the cache to remove data from the default exceptions # make sure to hard reset the cache to remove data from the default exceptions
self._rules = {} self._rules = {}