Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-11 04:08:09 +03:00)
failing unit test for Issue 4190
This commit is contained in:
parent b91425f803
commit 7bec0ebbcb
57	spacy/tests/regression/test_issue4190.py	Normal file
@@ -0,0 +1,57 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.lang.en import English

import spacy
from spacy.tokenizer import Tokenizer

from spacy.tests.util import make_tempdir


def test_issue4190():
    test_string = "Test c."

    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]

    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]

    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = spacy.load(model_dir)

    # This should be the modified tokenizer
    doc_2 = nlp_2(test_string)
    result_2 = [token.text for token in doc_2]

    assert result_1b == result_2


def customize_tokenizer(nlp):
    prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes)

    # remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {
        k: v
        for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
        if not (len(k) == 2 and k[1] == ".")
    }
    new_tokenizer = Tokenizer(
        nlp.vocab,
        exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )

    nlp.tokenizer = new_tokenizer
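For context, the save/reload round trip the test exercises can also be run outside pytest. The following is a minimal, illustrative sketch (not part of the commit), assuming spaCy is installed from this source tree so that the test module above is importable as spacy.tests.regression.test_issue4190; while Issue 4190 is unfixed, the second printout is expected to differ from the first, presumably because the customized tokenizer exceptions are not serialized with the pipeline.

    # Standalone sketch: reuse the helper defined in the test file above.
    import spacy
    from spacy.lang.en import English
    from spacy.tests.regression.test_issue4190 import customize_tokenizer
    from spacy.tests.util import make_tempdir

    nlp = English()
    customize_tokenizer(nlp)                 # drop the single-letter + "." tokenizer exceptions
    print([t.text for t in nlp("Test c.")])  # customized tokenizer splits off the final "."

    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)               # serialize the whole pipeline, tokenizer included
        nlp_reloaded = spacy.load(model_dir)
        print([t.text for t in nlp_reloaded("Test c.")])  # should match the line above once the bug is fixed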