mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 21:57:15 +03:00
b665a32b95
<!--- Provide a general summary of your changes in the title. --> ## Description See #3079. Here I'm merging into `develop` instead of `master`. ### Types of change <!-- What type of change does your PR cover? Is it a bug fix, an enhancement or new feature, or a change to the documentation? --> Bug fix. ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
79 lines
2.5 KiB
Python
79 lines
2.5 KiB
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
from spacy.lang.ru import Russian
|
|
|
|
from ...util import get_doc
|
|
|
|
|
|
@pytest.fixture
|
|
def ru_lemmatizer():
|
|
pytest.importorskip("pymorphy2")
|
|
return Russian.Defaults.create_lemmatizer()
|
|
|
|
|
|
def test_ru_doc_lemmatization(ru_tokenizer):
|
|
words = ["мама", "мыла", "раму"]
|
|
tags = [
|
|
"NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
|
|
"VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
|
|
"NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
|
|
]
|
|
doc = get_doc(ru_tokenizer.vocab, words=words, tags=tags)
|
|
lemmas = [token.lemma_ for token in doc]
|
|
assert lemmas == ["мама", "мыть", "рама"]
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"text,lemmas",
|
|
[
|
|
("гвоздики", ["гвоздик", "гвоздика"]),
|
|
("люди", ["человек"]),
|
|
("реки", ["река"]),
|
|
("кольцо", ["кольцо"]),
|
|
("пепперони", ["пепперони"]),
|
|
],
|
|
)
|
|
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
|
|
assert sorted(ru_lemmatizer.noun(text)) == lemmas
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"text,pos,morphology,lemma",
|
|
[
|
|
("рой", "NOUN", None, "рой"),
|
|
("рой", "VERB", None, "рыть"),
|
|
("клей", "NOUN", None, "клей"),
|
|
("клей", "VERB", None, "клеить"),
|
|
("три", "NUM", None, "три"),
|
|
("кос", "NOUN", {"Number": "Sing"}, "кос"),
|
|
("кос", "NOUN", {"Number": "Plur"}, "коса"),
|
|
("кос", "ADJ", None, "косой"),
|
|
("потом", "NOUN", None, "пот"),
|
|
("потом", "ADV", None, "потом"),
|
|
],
|
|
)
|
|
def test_ru_lemmatizer_works_with_different_pos_homonyms(
|
|
ru_lemmatizer, text, pos, morphology, lemma
|
|
):
|
|
assert ru_lemmatizer(text, pos, morphology) == [lemma]
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"text,morphology,lemma",
|
|
[
|
|
("гвоздики", {"Gender": "Fem"}, "гвоздика"),
|
|
("гвоздики", {"Gender": "Masc"}, "гвоздик"),
|
|
("вина", {"Gender": "Fem"}, "вина"),
|
|
("вина", {"Gender": "Neut"}, "вино"),
|
|
],
|
|
)
|
|
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
|
|
assert ru_lemmatizer.noun(text, morphology) == [lemma]
|
|
|
|
|
|
def test_ru_lemmatizer_punct(ru_lemmatizer):
|
|
assert ru_lemmatizer.punct("«") == ['"']
|
|
assert ru_lemmatizer.punct("»") == ['"']
|