spaCy/spacy/tests/en/tokenizer/test_exceptions.py

21 lines
512 B
Python
Raw Normal View History

# coding: utf-8
"""Test that tokenizer exceptions are handled correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
def test_tokenizer_handles_abbr(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
def test_tokenizer_handles_exc_in_text(en_tokenizer):
text = "It's mediocre i.e. bad."
tokens = en_tokenizer(text)
assert len(tokens) == 6
assert tokens[3].text == "i.e."