mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
28 lines
764 B
Python
28 lines
764 B
Python
|
# coding: utf-8
|
||
|
"""Test that tokenizer exceptions and emoticons are handled correctly."""
|
||
|
|
||
|
|
||
|
from __future__ import unicode_literals
|
||
|
|
||
|
import pytest
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
def test_tokenizer_splits_contractions(de_tokenizer, text):
    """Each apostrophe contraction must come out as exactly two tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 2
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
def test_tokenizer_handles_abbr(de_tokenizer, text):
    """Each listed abbreviation must be kept together as a single token."""
    doc = de_tokenizer(text)
    assert len(doc) == 1
|
||
|
|
||
|
|
||
|
def test_tokenizer_handles_exc_in_text(de_tokenizer):
    """Check an abbreviation exception embedded in a full sentence.

    The sentence must yield six tokens, and the third one must be the
    unsplit abbreviation "z.Zt." carrying the lemma "zur Zeit".
    """
    doc = de_tokenizer("Ich bin z.Zt. im Urlaub.")
    assert len(doc) == 6
    # Only index into the result after the length check has passed.
    abbreviation = doc[2]
    assert abbreviation.text == "z.Zt."
    assert abbreviation.lemma_ == "zur Zeit"
|