mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-07 13:44:55 +03:00
Move and update tests for ru/uk lookup lemmatizer modes
This commit is contained in:
parent
4f3f291cca
commit
c06a02904f
|
@ -337,17 +337,17 @@ def ru_tokenizer():
|
||||||
return get_lang_class("ru")().tokenizer
|
return get_lang_class("ru")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope="session")
|
||||||
def ru_lemmatizer():
|
def ru_lemmatizer():
|
||||||
pytest.importorskip("pymorphy3")
|
pytest.importorskip("pymorphy3")
|
||||||
return get_lang_class("ru")().add_pipe("lemmatizer")
|
return get_lang_class("ru")().add_pipe("lemmatizer")
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope="session")
|
||||||
def ru_lookup_lemmatizer():
|
def ru_lookup_lemmatizer():
|
||||||
pytest.importorskip("pymorphy2")
|
pytest.importorskip("pymorphy3")
|
||||||
return get_lang_class("ru")().add_pipe(
|
return get_lang_class("ru")().add_pipe(
|
||||||
"lemmatizer", config={"mode": "pymorphy2_lookup"}
|
"lemmatizer", config={"mode": "pymorphy3_lookup"}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -423,19 +423,19 @@ def uk_tokenizer():
|
||||||
return get_lang_class("uk")().tokenizer
|
return get_lang_class("uk")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope="session")
|
||||||
def uk_lemmatizer():
|
def uk_lemmatizer():
|
||||||
pytest.importorskip("pymorphy3")
|
pytest.importorskip("pymorphy3")
|
||||||
pytest.importorskip("pymorphy3_dicts_uk")
|
pytest.importorskip("pymorphy3_dicts_uk")
|
||||||
return get_lang_class("uk")().add_pipe("lemmatizer")
|
return get_lang_class("uk")().add_pipe("lemmatizer")
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope="session")
|
||||||
def uk_lookup_lemmatizer():
|
def uk_lookup_lemmatizer():
|
||||||
pytest.importorskip("pymorphy2")
|
pytest.importorskip("pymorphy3")
|
||||||
pytest.importorskip("pymorphy2_dicts_uk")
|
pytest.importorskip("pymorphy3_dicts_uk")
|
||||||
return get_lang_class("uk")().add_pipe(
|
return get_lang_class("uk")().add_pipe(
|
||||||
"lemmatizer", config={"mode": "pymorphy2_lookup"}
|
"lemmatizer", config={"mode": "pymorphy3_lookup"}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -81,6 +81,7 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
|
||||||
|
|
||||||
|
|
||||||
def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
|
def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
|
||||||
|
assert ru_lookup_lemmatizer.mode == "pymorphy3_lookup"
|
||||||
words = ["мама", "мыла", "раму"]
|
words = ["мама", "мыла", "раму"]
|
||||||
pos = ["NOUN", "VERB", "NOUN"]
|
pos = ["NOUN", "VERB", "NOUN"]
|
||||||
morphs = [
|
morphs = [
|
||||||
|
@ -92,3 +93,17 @@ def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
|
||||||
doc = ru_lookup_lemmatizer(doc)
|
doc = ru_lookup_lemmatizer(doc)
|
||||||
lemmas = [token.lemma_ for token in doc]
|
lemmas = [token.lemma_ for token in doc]
|
||||||
assert lemmas == ["мама", "мыла", "раму"]
|
assert lemmas == ["мама", "мыла", "раму"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"word,lemma",
|
||||||
|
(
|
||||||
|
("бременем", "бремя"),
|
||||||
|
("будешь", "быть"),
|
||||||
|
("какая-то", "какой-то"),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
def test_ruk_lookup_lemmatizer(ru_lookup_lemmatizer, word, lemma):
|
||||||
|
assert ru_lookup_lemmatizer.mode == "pymorphy3_lookup"
|
||||||
|
doc = Doc(ru_lookup_lemmatizer.vocab, words=[word])
|
||||||
|
assert ru_lookup_lemmatizer(doc)[0].lemma_ == lemma
|
||||||
|
|
|
@ -8,12 +8,22 @@ pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
|
||||||
def test_uk_lemmatizer(uk_lemmatizer):
|
def test_uk_lemmatizer(uk_lemmatizer):
|
||||||
"""Check that the default uk lemmatizer runs."""
|
"""Check that the default uk lemmatizer runs."""
|
||||||
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
|
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
|
||||||
|
assert uk_lemmatizer.mode == "pymorphy3"
|
||||||
uk_lemmatizer(doc)
|
uk_lemmatizer(doc)
|
||||||
assert [token.lemma for token in doc]
|
assert [token.lemma for token in doc]
|
||||||
|
|
||||||
|
|
||||||
def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
|
@pytest.mark.parametrize(
|
||||||
"""Check that the lookup uk lemmatizer runs."""
|
"word,lemma",
|
||||||
doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
|
(
|
||||||
uk_lookup_lemmatizer(doc)
|
("якийсь", "якийсь"),
|
||||||
assert [token.lemma for token in doc]
|
pytest.param("зеленої", "зелений", marks=pytest.mark.xfail()),
|
||||||
|
("розповідають", "розповідати"),
|
||||||
|
("розповіси", "розповісти"),
|
||||||
|
pytest.param("телятові", "теля", marks=pytest.mark.xfail()),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer, word, lemma):
|
||||||
|
assert uk_lookup_lemmatizer.mode == "pymorphy3_lookup"
|
||||||
|
doc = Doc(uk_lookup_lemmatizer.vocab, words=[word])
|
||||||
|
assert uk_lookup_lemmatizer(doc)[0].lemma_ == lemma
|
||||||
|
|
|
@ -1,35 +0,0 @@
|
||||||
import pytest
|
|
||||||
import pickle
|
|
||||||
from spacy import util, registry
|
|
||||||
from spacy.lang.uk import Ukrainian
|
|
||||||
from spacy.lang.ru import Russian
|
|
||||||
from spacy.lookups import Lookups
|
|
||||||
|
|
||||||
from ..util import make_tempdir
|
|
||||||
|
|
||||||
|
|
||||||
def test_lookup_lemmatizer_uk():
|
|
||||||
nlp = Ukrainian()
|
|
||||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2_lookup"})
|
|
||||||
assert isinstance(lemmatizer.lookups, Lookups)
|
|
||||||
assert not lemmatizer.lookups.tables
|
|
||||||
assert lemmatizer.mode == "pymorphy2_lookup"
|
|
||||||
nlp.initialize()
|
|
||||||
assert nlp("якась")[0].lemma_ == "якийсь"
|
|
||||||
assert nlp("якийсь")[0].lemma_ == "якийсь"
|
|
||||||
assert nlp("зеленої")[0].lemma_ == 'зелений'
|
|
||||||
assert nlp("розповідають")[0].lemma_ == 'розповідати'
|
|
||||||
assert nlp("розповіси")[0].lemma_ == 'розповісти'
|
|
||||||
# assert nlp("телятові")[0].lemma_ == 'теля' # pymorph2 fails
|
|
||||||
|
|
||||||
def test_lookup_lemmatizer_ru():
|
|
||||||
nlp = Russian()
|
|
||||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2_lookup"})
|
|
||||||
assert isinstance(lemmatizer.lookups, Lookups)
|
|
||||||
assert not lemmatizer.lookups.tables
|
|
||||||
assert lemmatizer.mode == "pymorphy2_lookup"
|
|
||||||
nlp.initialize()
|
|
||||||
assert nlp("бременем")[0].lemma_ == 'бремя'
|
|
||||||
assert nlp("будешь")[0].lemma_ == "быть"
|
|
||||||
# assert nlp("какая-то")[0].lemma_ == "какой-то" # fails due to faulty word splitting
|
|
||||||
assert nlp("зелёной")[0].lemma_ == 'зелёный'
|
|
Loading…
Reference in New Issue
Block a user