diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index c37a3a91a..f4a35de38 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -28,34 +28,39 @@ class RussianLemmatizer(Lemmatizer):
                 from pymorphy2 import MorphAnalyzer
             except ImportError:
                 raise ImportError(
-                    "The Russian lemmatizer mode 'pymorphy2' requires the "
-                    "pymorphy2 library. Install it with: pip install pymorphy2"
+                    "The lemmatizer mode 'pymorphy2' requires the "
+                    "pymorphy2 library and dictionaries. Install them with: "
+                    "pip install pymorphy2"
+                    "# for Ukrainian dictionaries:"
+                    "pip install pymorphy2-dicts-uk"
                 ) from None
             if getattr(self, "_morph", None) is None:
-                self._morph = MorphAnalyzer()
-        elif mode == "pymorphy3":
+                self._morph = MorphAnalyzer(lang="ru")
+        elif mode in {"pymorphy3", "pymorphy3_lookup"}:
             try:
                 from pymorphy3 import MorphAnalyzer
             except ImportError:
                 raise ImportError(
-                    "The Russian lemmatizer mode 'pymorphy3' requires the "
-                    "pymorphy3 library. Install it with: pip install pymorphy3"
+                    "The lemmatizer mode 'pymorphy3' requires the "
+                    "pymorphy3 library and dictionaries. Install them with: "
+                    "pip install pymorphy3"
+                    "# for Ukrainian dictionaries:"
+                    "pip install pymorphy3-dicts-uk"
                 ) from None
             if getattr(self, "_morph", None) is None:
-                self._morph = MorphAnalyzer()
+                self._morph = MorphAnalyzer(lang="ru")
         super().__init__(
             vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
         )

-    def pymorphy2_lemmatize(self, token: Token) -> List[str]:
+    def _pymorphy_lemmatize(self, token: Token) -> List[str]:
         string = token.text
         univ_pos = token.pos_
         morphology = token.morph.to_dict()
         if univ_pos == "PUNCT":
             return [PUNCT_RULES.get(string, string)]
         if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
-            # Skip unchangeable pos
-            return [string.lower()]
+            return self._pymorphy_lookup_lemmatize(token)
         analyses = self._morph.parse(string)
         filtered_analyses = []
         for analysis in analyses:
@@ -63,8 +68,10 @@ class RussianLemmatizer(Lemmatizer):
                 # Skip suggested parse variant for unknown word for pymorphy
                 continue
             analysis_pos, _ = oc2ud(str(analysis.tag))
-            if analysis_pos == univ_pos or (
-                analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
+            if (
+                analysis_pos == univ_pos
+                or (analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN"))
+                or ((analysis_pos == "PRON") and (univ_pos == "DET"))
             ):
                 filtered_analyses.append(analysis)
         if not len(filtered_analyses):
@@ -107,15 +114,27 @@ class RussianLemmatizer(Lemmatizer):
             dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
         )

-    def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
+    def _pymorphy_lookup_lemmatize(self, token: Token) -> List[str]:
         string = token.text
         analyses = self._morph.parse(string)
-        if len(analyses) == 1:
-            return [analyses[0].normal_form]
+        # often multiple forms would derive from the same normal form
+        # thus check _unique_ normal forms
+        normal_forms = set([an.normal_form for an in analyses])
+        if len(normal_forms) == 1:
+            return [next(iter(normal_forms))]
         return [string]

+    def pymorphy2_lemmatize(self, token: Token) -> List[str]:
+        return self._pymorphy_lemmatize(token)
+
+    def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
+        return self._pymorphy_lookup_lemmatize(token)
+
     def pymorphy3_lemmatize(self, token: Token) -> List[str]:
-        return self.pymorphy2_lemmatize(token)
+        return self._pymorphy_lemmatize(token)
+
+    def pymorphy3_lookup_lemmatize(self, token: Token) -> List[str]:
+        return self._pymorphy_lookup_lemmatize(token)


 def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index 8337e7328..37015cc2a 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -29,7 +29,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
                 ) from None
             if getattr(self, "_morph", None) is None:
                 self._morph = MorphAnalyzer(lang="uk")
-        elif mode == "pymorphy3":
+        elif mode in {"pymorphy3", "pymorphy3_lookup"}:
             try:
                 from pymorphy3 import MorphAnalyzer
             except ImportError:
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 0fc74243d..3a5c8e451 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -337,17 +337,17 @@ def ru_tokenizer():
     return get_lang_class("ru")().tokenizer


-@pytest.fixture
+@pytest.fixture(scope="session")
 def ru_lemmatizer():
     pytest.importorskip("pymorphy3")
     return get_lang_class("ru")().add_pipe("lemmatizer")


-@pytest.fixture
+@pytest.fixture(scope="session")
 def ru_lookup_lemmatizer():
-    pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy3")
     return get_lang_class("ru")().add_pipe(
-        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+        "lemmatizer", config={"mode": "pymorphy3_lookup"}
     )


@@ -423,19 +423,19 @@ def uk_tokenizer():
     return get_lang_class("uk")().tokenizer


-@pytest.fixture
+@pytest.fixture(scope="session")
 def uk_lemmatizer():
     pytest.importorskip("pymorphy3")
     pytest.importorskip("pymorphy3_dicts_uk")
     return get_lang_class("uk")().add_pipe("lemmatizer")


-@pytest.fixture
+@pytest.fixture(scope="session")
 def uk_lookup_lemmatizer():
-    pytest.importorskip("pymorphy2")
-    pytest.importorskip("pymorphy2_dicts_uk")
+    pytest.importorskip("pymorphy3")
+    pytest.importorskip("pymorphy3_dicts_uk")
     return get_lang_class("uk")().add_pipe(
-        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+        "lemmatizer", config={"mode": "pymorphy3_lookup"}
     )


diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py
index e82fd4f8c..9a5a9ad68 100644
--- a/spacy/tests/lang/ru/test_lemmatizer.py
+++ b/spacy/tests/lang/ru/test_lemmatizer.py
@@ -81,6 +81,7 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):


 def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
+    assert ru_lookup_lemmatizer.mode == "pymorphy3_lookup"
     words = ["мама", "мыла", "раму"]
     pos = ["NOUN", "VERB", "NOUN"]
     morphs = [
@@ -92,3 +93,17 @@ def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
     doc = ru_lookup_lemmatizer(doc)
     lemmas = [token.lemma_ for token in doc]
     assert lemmas == ["мама", "мыла", "раму"]
+
+
+@pytest.mark.parametrize(
+    "word,lemma",
+    (
+        ("бременем", "бремя"),
+        ("будешь", "быть"),
+        ("какая-то", "какой-то"),
+    ),
+)
+def test_ru_lookup_lemmatizer(ru_lookup_lemmatizer, word, lemma):
+    assert ru_lookup_lemmatizer.mode == "pymorphy3_lookup"
+    doc = Doc(ru_lookup_lemmatizer.vocab, words=[word])
+    assert ru_lookup_lemmatizer(doc)[0].lemma_ == lemma
diff --git a/spacy/tests/lang/uk/test_lemmatizer.py b/spacy/tests/lang/uk/test_lemmatizer.py
index 788744aa1..a65bb25e5 100644
--- a/spacy/tests/lang/uk/test_lemmatizer.py
+++ b/spacy/tests/lang/uk/test_lemmatizer.py
@@ -8,12 +8,20 @@ pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
 def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
+    assert uk_lemmatizer.mode == "pymorphy3"
     uk_lemmatizer(doc)
     assert [token.lemma for token in doc]


-def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
-    """Check that the lookup uk lemmatizer runs."""
-    doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
-    uk_lookup_lemmatizer(doc)
-    assert [token.lemma for token in doc]
+@pytest.mark.parametrize(
+    "word,lemma",
+    (
+        ("якийсь", "якийсь"),
+        ("розповідають", "розповідати"),
+        ("розповіси", "розповісти"),
+    ),
+)
+def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer, word, lemma):
+    assert uk_lookup_lemmatizer.mode == "pymorphy3_lookup"
+    doc = Doc(uk_lookup_lemmatizer.vocab, words=[word])
+    assert uk_lookup_lemmatizer(doc)[0].lemma_ == lemma