From 13a4ab37e0f2221a563074554100ad5f2bfe05d6 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 2 Feb 2017 15:33:52 +0100
Subject: [PATCH] Add regression test for #801

---
 spacy/tests/regression/test_issue801.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue801.py

diff --git a/spacy/tests/regression/test_issue801.py b/spacy/tests/regression/test_issue801.py
new file mode 100644
index 000000000..df765830a
--- /dev/null
+++ b/spacy/tests/regression/test_issue801.py
@@ -0,0 +1,20 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize('text,tokens', [
+    ('"deserve,"--and', ['"', "deserve", ",", '"', "--", "and"]),
+    ("exception;--exclusive", ["exception", ";", "--", "exclusive"]),
+    ("day.--Is", ["day", ".", "--", "Is"]),
+    ("refinement:--just", ["refinement", ":", "--", "just"]),
+    ("memories?--To", ["memories", "?", "--", "To"]),
+    ("Useful.=--Therefore", ["Useful", ".", "=", "--", "Therefore"]),
+    ("=Hope.=--Pandora", ["=", "Hope", ".", "=", "--", "Pandora"])])
+def test_issue801(en_tokenizer, text, tokens):
+    """Test that special characters + hyphens are split correctly."""
+    doc = en_tokenizer(text)
+    assert len(doc) == len(tokens)
+    assert [t.text for t in doc] == tokens
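
Note (not part of the patch): the test is marked xfail, so at this commit the
tokenizer is not yet expected to produce these splits; the test documents the
desired behaviour for #801. A minimal standalone sketch of the same check,
assuming a spaCy 1.x install with English data available (spacy.load('en') is
an assumption of this sketch, not something the patch itself uses):

    # Sketch of the check in test_issue801, run outside pytest.
    # Assumes spaCy 1.x with English model data installed.
    import spacy

    nlp = spacy.load('en')

    text = "day.--Is"
    expected = ["day", ".", "--", "Is"]

    # Tokenize only; the full pipeline is not needed for this check.
    doc = nlp.tokenizer(text)
    actual = [t.text for t in doc]

    # The patch marks the test xfail: a mismatch here is the known bug (#801).
    print(actual, "==", expected, "->", actual == expected)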