diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py
index 9b6a011c9..5d1ac4c92 100644
--- a/spacy/tests/serialize/test_serialize_language.py
+++ b/spacy/tests/serialize/test_serialize_language.py
@@ -3,8 +3,10 @@ from __future__ import unicode_literals
 
 from ..util import make_tempdir
 from ...language import Language
+from ...tokenizer import Tokenizer
 
 import pytest
+import re
 
 
 @pytest.fixture
@@ -27,3 +29,24 @@ def test_serialize_language_meta_disk(meta_data):
     language.to_disk(d)
     new_language = Language().from_disk(d)
     assert new_language.meta == language.meta
+
+
+def test_serialize_with_custom_tokenizer():
+    """Test that serialization with custom tokenizer works without token_match.
+    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
+    """
+    prefix_re = re.compile(r'''1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:''')
+    suffix_re = re.compile(r'''''')
+    infix_re = re.compile(r'''[~]''')
+
+    def custom_tokenizer(nlp):
+        return Tokenizer(nlp.vocab,
+                         {},
+                         prefix_search=prefix_re.search,
+                         suffix_search=suffix_re.search,
+                         infix_finditer=infix_re.finditer)
+
+    nlp = Language()
+    nlp.tokenizer = custom_tokenizer(nlp)
+    with make_tempdir() as d:
+        nlp.to_disk(d)