diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 0fc74243d..3a5c8e451 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -337,17 +337,17 @@ def ru_tokenizer():
     return get_lang_class("ru")().tokenizer
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def ru_lemmatizer():
     pytest.importorskip("pymorphy3")
     return get_lang_class("ru")().add_pipe("lemmatizer")
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def ru_lookup_lemmatizer():
-    pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy3")
     return get_lang_class("ru")().add_pipe(
-        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+        "lemmatizer", config={"mode": "pymorphy3_lookup"}
     )
 
 
@@ -423,19 +423,19 @@ def uk_tokenizer():
     return get_lang_class("uk")().tokenizer
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def uk_lemmatizer():
     pytest.importorskip("pymorphy3")
     pytest.importorskip("pymorphy3_dicts_uk")
     return get_lang_class("uk")().add_pipe("lemmatizer")
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def uk_lookup_lemmatizer():
-    pytest.importorskip("pymorphy2")
-    pytest.importorskip("pymorphy2_dicts_uk")
+    pytest.importorskip("pymorphy3")
+    pytest.importorskip("pymorphy3_dicts_uk")
     return get_lang_class("uk")().add_pipe(
-        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+        "lemmatizer", config={"mode": "pymorphy3_lookup"}
     )
 
 
diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py
index e82fd4f8c..d28f98044 100644
--- a/spacy/tests/lang/ru/test_lemmatizer.py
+++ b/spacy/tests/lang/ru/test_lemmatizer.py
@@ -81,6 +81,7 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
 
 
 def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
+    assert ru_lookup_lemmatizer.mode == "pymorphy3_lookup"
     words = ["мама", "мыла", "раму"]
     pos = ["NOUN", "VERB", "NOUN"]
     morphs = [
@@ -92,3 +93,17 @@ def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
     doc = ru_lookup_lemmatizer(doc)
     lemmas = [token.lemma_ for token in doc]
     assert lemmas == ["мама", "мыла", "раму"]
+
+
+@pytest.mark.parametrize(
+    "word,lemma",
+    (
+        ("бременем", "бремя"),
+        ("будешь", "быть"),
+        ("какая-то", "какой-то"),
+    ),
+)
+def test_ru_lookup_lemmatizer(ru_lookup_lemmatizer, word, lemma):
+    assert ru_lookup_lemmatizer.mode == "pymorphy3_lookup"
+    doc = Doc(ru_lookup_lemmatizer.vocab, words=[word])
+    assert ru_lookup_lemmatizer(doc)[0].lemma_ == lemma
diff --git a/spacy/tests/lang/uk/test_lemmatizer.py b/spacy/tests/lang/uk/test_lemmatizer.py
index 788744aa1..616c36f48 100644
--- a/spacy/tests/lang/uk/test_lemmatizer.py
+++ b/spacy/tests/lang/uk/test_lemmatizer.py
@@ -8,12 +8,22 @@ pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
 def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
+    assert uk_lemmatizer.mode == "pymorphy3"
     uk_lemmatizer(doc)
     assert [token.lemma for token in doc]
 
 
-def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
-    """Check that the lookup uk lemmatizer runs."""
-    doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
-    uk_lookup_lemmatizer(doc)
-    assert [token.lemma for token in doc]
+@pytest.mark.parametrize(
+    "word,lemma",
+    (
+        ("якийсь", "якийсь"),
+        pytest.param("зеленої", "зелений", marks=pytest.mark.xfail()),
+        ("розповідають", "розповідати"),
+        ("розповіси", "розповісти"),
+        pytest.param("телятові", "теля", marks=pytest.mark.xfail()),
+    ),
+)
+def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer, word, lemma):
+    assert uk_lookup_lemmatizer.mode == "pymorphy3_lookup"
+    doc = Doc(uk_lookup_lemmatizer.vocab, words=[word])
+    assert uk_lookup_lemmatizer(doc)[0].lemma_ == lemma
diff --git a/spacy/tests/pipeline/test_pymorphy_lemmatizer.py b/spacy/tests/pipeline/test_pymorphy_lemmatizer.py
deleted file mode 100644
index a1a5f03f2..000000000
--- a/spacy/tests/pipeline/test_pymorphy_lemmatizer.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import pytest
-import pickle
-from spacy import util, registry
-from spacy.lang.uk import Ukrainian
-from spacy.lang.ru import Russian
-from spacy.lookups import Lookups
-
-from ..util import make_tempdir
-
-
-def test_lookup_lemmatizer_uk():
-    nlp = Ukrainian()
-    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2_lookup"})
-    assert isinstance(lemmatizer.lookups, Lookups)
-    assert not lemmatizer.lookups.tables
-    assert lemmatizer.mode == "pymorphy2_lookup"
-    nlp.initialize()
-    assert nlp("якась")[0].lemma_ == "якийсь"
-    assert nlp("якийсь")[0].lemma_ == "якийсь"
-    assert nlp("зеленої")[0].lemma_ == 'зелений'
-    assert nlp("розповідають")[0].lemma_ == 'розповідати'
-    assert nlp("розповіси")[0].lemma_ == 'розповісти'
-    # assert nlp("телятові")[0].lemma_ == 'теля' # pymorph2 fails
-
-def test_lookup_lemmatizer_ru():
-    nlp = Russian()
-    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2_lookup"})
-    assert isinstance(lemmatizer.lookups, Lookups)
-    assert not lemmatizer.lookups.tables
-    assert lemmatizer.mode == "pymorphy2_lookup"
-    nlp.initialize()
-    assert nlp("бременем")[0].lemma_ == 'бремя'
-    assert nlp("будешь")[0].lemma_ == "быть"
-    # assert nlp("какая-то")[0].lemma_ == "какой-то" # fails due to faulty word splitting
-    assert nlp("зелёной")[0].lemma_ == 'зелёный'