# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.xfail
@pytest.mark.parametrize(
    "text,tokens",
    [
        ('"deserve,"--and', ['"', "deserve", ",", '"', "--", "and"]),
        ("exception;--exclusive", ["exception", ";", "--", "exclusive"]),
        ("day.--Is", ["day", ".", "--", "Is"]),
        ("refinement:--just", ["refinement", ":", "--", "just"]),
        ("memories?--To", ["memories", "?", "--", "To"]),
        ("Useful.=--Therefore", ["Useful", ".", "=", "--", "Therefore"]),
        ("=Hope.=--Pandora", ["=", "Hope", ".", "=", "--", "Pandora"]),
    ],
)
def test_issue801(en_tokenizer, text, tokens):
    """Check tokenization of punctuation directly followed by a double hyphen.

    Each case pairs a raw string with the exact token texts expected from
    ``en_tokenizer``: the punctuation marks, the ``--`` and the surrounding
    words must each come out as separate tokens.

    The test carries ``pytest.mark.xfail``, so a failing case is reported as
    an expected failure rather than as an error.
    """
    tokenized = en_tokenizer(text)
    # Compare the token count on its own before comparing the token texts.
    assert len(tokenized) == len(tokens)
    observed = [token.text for token in tokenized]
    assert observed == tokens