mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Merge pull request #4207 from svlandeg/bugfix/serialize-tok-exc
Bugfix for serializing tokenizer rules/exceptions
This commit is contained in:
commit
516650f58f
57
spacy/tests/regression/test_issue4190.py
Normal file
57
spacy/tests/regression/test_issue4190.py
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from spacy.lang.en import English
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
from spacy.tokenizer import Tokenizer
|
||||||
|
|
||||||
|
from spacy.tests.util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4190():
|
||||||
|
test_string = "Test c."
|
||||||
|
|
||||||
|
# Load default language
|
||||||
|
nlp_1 = English()
|
||||||
|
doc_1a = nlp_1(test_string)
|
||||||
|
result_1a = [token.text for token in doc_1a]
|
||||||
|
|
||||||
|
# Modify tokenizer
|
||||||
|
customize_tokenizer(nlp_1)
|
||||||
|
doc_1b = nlp_1(test_string)
|
||||||
|
result_1b = [token.text for token in doc_1b]
|
||||||
|
|
||||||
|
# Save and Reload
|
||||||
|
with make_tempdir() as model_dir:
|
||||||
|
nlp_1.to_disk(model_dir)
|
||||||
|
nlp_2 = spacy.load(model_dir)
|
||||||
|
|
||||||
|
# This should be the modified tokenizer
|
||||||
|
doc_2 = nlp_2(test_string)
|
||||||
|
result_2 = [token.text for token in doc_2]
|
||||||
|
|
||||||
|
assert result_1b == result_2
|
||||||
|
|
||||||
|
|
||||||
|
def customize_tokenizer(nlp):
|
||||||
|
prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
|
||||||
|
suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
|
||||||
|
infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes)
|
||||||
|
|
||||||
|
# remove all exceptions where a single letter is followed by a period (e.g. 'h.')
|
||||||
|
exceptions = {
|
||||||
|
k: v
|
||||||
|
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
|
||||||
|
if not (len(k) == 2 and k[1] == ".")
|
||||||
|
}
|
||||||
|
new_tokenizer = Tokenizer(
|
||||||
|
nlp.vocab,
|
||||||
|
exceptions,
|
||||||
|
prefix_search=prefix_re.search,
|
||||||
|
suffix_search=suffix_re.search,
|
||||||
|
infix_finditer=infix_re.finditer,
|
||||||
|
token_match=nlp.tokenizer.token_match,
|
||||||
|
)
|
||||||
|
|
||||||
|
nlp.tokenizer = new_tokenizer
|
|
@ -441,8 +441,13 @@ cdef class Tokenizer:
|
||||||
self.infix_finditer = re.compile(data["infix_finditer"]).finditer
|
self.infix_finditer = re.compile(data["infix_finditer"]).finditer
|
||||||
if data.get("token_match"):
|
if data.get("token_match"):
|
||||||
self.token_match = re.compile(data["token_match"]).match
|
self.token_match = re.compile(data["token_match"]).match
|
||||||
|
if data.get("rules"):
|
||||||
|
# make sure to hard reset the cache to remove data from the default exceptions
|
||||||
|
self._rules = {}
|
||||||
|
self._cache = PreshMap()
|
||||||
for string, substrings in data.get("rules", {}).items():
|
for string, substrings in data.get("rules", {}).items():
|
||||||
self.add_special_case(string, substrings)
|
self.add_special_case(string, substrings)
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user