# coding: utf-8
from __future__ import unicode_literals

import pytest


# (text, expected token texts) pairs for the issue 801 regression test. Every
# input has punctuation immediately followed by a double hyphen, with no
# whitespace anywhere in the string.
_ISSUE801_CASES = [
    ('"deserve,"--and', ['"', "deserve", ",", '"', "--", "and"]),
    ("exception;--exclusive", ["exception", ";", "--", "exclusive"]),
    ("day.--Is", ["day", ".", "--", "Is"]),
    ("refinement:--just", ["refinement", ":", "--", "just"]),
    ("memories?--To", ["memories", "?", "--", "To"]),
    ("Useful.=--Therefore", ["Useful", ".", "=", "--", "Therefore"]),
    ("=Hope.=--Pandora", ["=", "Hope", ".", "=", "--", "Pandora"]),
]


# Marked xfail: pytest reports a failing assertion here as an expected
# failure instead of failing the run.
@pytest.mark.xfail
@pytest.mark.parametrize('text,tokens', _ISSUE801_CASES)
def test_issue801(en_tokenizer, text, tokens):
    """Check that punctuation followed by a double hyphen splits as expected.

    ``en_tokenizer`` is a fixture supplied outside this module; it is called
    with ``text`` and its result is iterated for objects exposing ``.text``.
    ``tokens`` is the expected list of token strings for ``text``.
    """
    tokenized = en_tokenizer(text)
    # Compare lengths first so a wrong token count is reported on its own
    # before the element-wise comparison.
    assert len(tokenized) == len(tokens)
    assert [token.text for token in tokenized] == tokens