mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
db55577c45
* Remove unicode declarations
* Remove Python 3.5 and 2.7 from CI
* Don't require pathlib
* Replace compat helpers
* Remove OrderedDict
* Use f-strings
* Set Cython compiler language level
* Fix typo
* Re-add OrderedDict for Table
* Update setup.cfg
* Revert CONTRIBUTING.md
* Revert lookups.md
* Revert top-level.md
* Small adjustments and docs [ci skip]
36 lines
1.1 KiB
Python
36 lines
1.1 KiB
Python
import pytest
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text",
    ["auf'm", "du's", "über'm", "wir's"],
)
def test_de_tokenizer_splits_contractions(de_tokenizer, text):
    """Each apostrophe contraction must come out of the tokenizer as 2 tokens."""
    doc = de_tokenizer(text)
    assert len(doc) == 2
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text",
    ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."],
)
def test_de_tokenizer_handles_abbr(de_tokenizer, text):
    """Each dotted abbreviation must come out of the tokenizer as 1 token."""
    doc = de_tokenizer(text)
    assert len(doc) == 1
|
|
|
|
|
|
def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
    """Check an abbreviation embedded in a full sentence.

    The sentence is expected to yield six tokens, with the third one being
    the unsplit abbreviation "z.Zt." carrying the lemma "zur Zeit".
    """
    doc = de_tokenizer("Ich bin z.Zt. im Urlaub.")
    assert len(doc) == 6
    # Index 2 is the position of the abbreviation within the sentence.
    abbreviation = doc[2]
    assert abbreviation.text == "z.Zt."
    assert abbreviation.lemma_ == "zur Zeit"
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text,norms",
    [
        ("vor'm", ["vor", "dem"]),
        ("du's", ["du", "es"]),
    ],
)
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
    """The per-token ``norm_`` values must equal the expected list, in order."""
    doc = de_tokenizer(text)
    observed_norms = [tok.norm_ for tok in doc]
    assert observed_norms == norms
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text,norm",
    [("daß", "dass")],
)
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
    """The first token's ``norm_`` must equal the expected normalized form."""
    doc = de_tokenizer(text)
    first_token = doc[0]
    assert first_token.norm_ == norm
|