Add regression test for #801

This commit is contained in:
Ines Montani 2017-02-02 15:33:52 +01:00
parent 16ce7409e4
commit 13a4ab37e0

View File

@@ -0,0 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.xfail
@pytest.mark.parametrize(
    'text,tokens',
    [
        ('"deserve,"--and', ['"', "deserve", ",", '"', "--", "and"]),
        ("exception;--exclusive", ["exception", ";", "--", "exclusive"]),
        ("day.--Is", ["day", ".", "--", "Is"]),
        ("refinement:--just", ["refinement", ":", "--", "just"]),
        ("memories?--To", ["memories", "?", "--", "To"]),
        ("Useful.=--Therefore", ["Useful", ".", "=", "--", "Therefore"]),
        ("=Hope.=--Pandora", ["=", "Hope", ".", "=", "--", "Pandora"]),
    ],
)
def test_issue801(en_tokenizer, text, tokens):
    """Regression test for issue #801.

    Each case is a string in which punctuation or other special characters
    sit directly before a double hyphen; the tokenizer output must match
    the expected token texts exactly. Marked ``xfail``, so a failure here
    is currently reported as an expected failure.
    """
    parsed = en_tokenizer(text)
    observed = [token.text for token in parsed]
    # Compare lengths first so a token-count mismatch is reported on its
    # own, before the element-wise comparison.
    assert len(parsed) == len(tokens)
    assert observed == tokens