mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
968aff2f6a
<!--- Provide a general summary of your changes in the title. --> ## Description - [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize)) - [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here) ### Types of change <!-- What type of change does your PR cover? Is it a bug fix, an enhancement or new feature, or a change to the documentation? --> ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
40 lines
1.2 KiB
Python
40 lines
1.2 KiB
Python
# coding: utf-8
|
|
"""Test that tokenizer exceptions and emoticons are handled correctly."""
|
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.mark.parametrize(
    'text',
    ["auf'm", "du's", "über'm", "wir's"]
)
def test_de_tokenizer_splits_contractions(de_tokenizer, text):
    """Check that every parametrized contraction yields exactly two tokens.

    de_tokenizer: tokenizer fixture, called directly on the input string.
    text: one apostrophe contraction from the parameter list.
    """
    doc = de_tokenizer(text)
    token_count = len(doc)
    assert token_count == 2
|
|
|
|
|
|
@pytest.mark.parametrize(
    'text',
    ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."]
)
def test_de_tokenizer_handles_abbr(de_tokenizer, text):
    """Check that every parametrized abbreviation stays a single token.

    de_tokenizer: tokenizer fixture, called directly on the input string.
    text: one dotted abbreviation from the parameter list.
    """
    doc = de_tokenizer(text)
    token_count = len(doc)
    assert token_count == 1
|
|
|
|
|
|
def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
    """Check an exception token embedded in a full sentence.

    The sentence must produce six tokens; the third one must keep the
    surface form "z.Zt." and report "zur Zeit" as its lemma.
    """
    doc = de_tokenizer("Ich bin z.Zt. im Urlaub.")
    assert len(doc) == 6
    # Only index into the result once the length check has passed.
    abbreviation = doc[2]
    assert abbreviation.text == "z.Zt."
    assert abbreviation.lemma_ == "zur Zeit"
|
|
|
|
|
|
@pytest.mark.parametrize(
    'text,norms',
    [
        ("vor'm", ["vor", "dem"]),
        ("du's", ["du", "es"]),
    ]
)
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
    """Check the per-token ``norm_`` values of a tokenized contraction.

    de_tokenizer: tokenizer fixture, called directly on the input string.
    text: contraction to tokenize.
    norms: expected ``norm_`` of each resulting token, in order.
    """
    doc = de_tokenizer(text)
    observed = [tok.norm_ for tok in doc]
    assert observed == norms
|
|
|
|
|
|
@pytest.mark.parametrize(
    'text,norm',
    [
        ("daß", "dass"),
    ]
)
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
    """Check the ``norm_`` value of the first token of the input.

    de_tokenizer: tokenizer fixture, called directly on the input string.
    text: word to tokenize.
    norm: expected ``norm_`` of the first resulting token.
    """
    doc = de_tokenizer(text)
    first_token = doc[0]
    assert first_token.norm_ == norm
|