mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-24 00:04:15 +03:00
20 lines
734 B
Python
20 lines
734 B
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text,tokens",
    [
        ('"deserve,"--and', ['"', "deserve", ',"--', "and"]),
        ("exception;--exclusive", ["exception", ";--", "exclusive"]),
        ("day.--Is", ["day", ".--", "Is"]),
        ("refinement:--just", ["refinement", ":--", "just"]),
        ("memories?--To", ["memories", "?--", "To"]),
        ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]),
        ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]),
    ],
)
def test_issue801(en_tokenizer, text, tokens):
    """Regression test for issue 801.

    Each case is a string in which punctuation is directly followed by a
    double hyphen (``--``). The ``en_tokenizer`` fixture is called on
    ``text`` and the texts of the resulting tokens must equal ``tokens``.
    """
    doc = en_tokenizer(text)
    # Check the token count first, before comparing the individual texts.
    assert len(doc) == len(tokens)
    observed = [token.text for token in doc]
    assert observed == tokens
|