mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Update Russian and Ukrainian lemmatizers (#11811)
* pymorphy2 issues #11620, #11626, #11625:
  - #11620: pymorphy2_lookup
  - #11626: handle multiple forms pointing to the same normal form + handling empty POS tag
  - #11625: matching DET that are labelled as PRON by pymorphy2
* Move lemmatizer algorithm changes back into RussianLemmatizer
* Fix uk pymorphy3_lookup mode init
* Move and update tests for ru/uk lookup lemmatizer modes
* Fix typo
* Remove traces of previous behavior for uninflected POS
* Refactor to private generic-looking pymorphy methods
* Remove xfailed uk lemmatizer cases
* Update spacy/lang/ru/lemmatizer.py

Co-authored-by: Richard Hudson <richard@explosion.ai>
Co-authored-by: Dmytro S Lituiev <d.lituiev@gmail.com>
Co-authored-by: Richard Hudson <richard@explosion.ai>
This commit is contained in:
parent
8f062b849c
commit
30d31fd335
|
@ -28,34 +28,39 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
from pymorphy2 import MorphAnalyzer
|
from pymorphy2 import MorphAnalyzer
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"The Russian lemmatizer mode 'pymorphy2' requires the "
|
"The lemmatizer mode 'pymorphy2' requires the "
|
||||||
"pymorphy2 library. Install it with: pip install pymorphy2"
|
"pymorphy2 library and dictionaries. Install them with: "
|
||||||
|
"pip install pymorphy2"
|
||||||
|
"# for Ukrainian dictionaries:"
|
||||||
|
"pip install pymorphy2-dicts-uk"
|
||||||
) from None
|
) from None
|
||||||
if getattr(self, "_morph", None) is None:
|
if getattr(self, "_morph", None) is None:
|
||||||
self._morph = MorphAnalyzer()
|
self._morph = MorphAnalyzer(lang="ru")
|
||||||
elif mode == "pymorphy3":
|
elif mode in {"pymorphy3", "pymorphy3_lookup"}:
|
||||||
try:
|
try:
|
||||||
from pymorphy3 import MorphAnalyzer
|
from pymorphy3 import MorphAnalyzer
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"The Russian lemmatizer mode 'pymorphy3' requires the "
|
"The lemmatizer mode 'pymorphy3' requires the "
|
||||||
"pymorphy3 library. Install it with: pip install pymorphy3"
|
"pymorphy3 library and dictionaries. Install them with: "
|
||||||
|
"pip install pymorphy3"
|
||||||
|
"# for Ukrainian dictionaries:"
|
||||||
|
"pip install pymorphy3-dicts-uk"
|
||||||
) from None
|
) from None
|
||||||
if getattr(self, "_morph", None) is None:
|
if getattr(self, "_morph", None) is None:
|
||||||
self._morph = MorphAnalyzer()
|
self._morph = MorphAnalyzer(lang="ru")
|
||||||
super().__init__(
|
super().__init__(
|
||||||
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
)
|
)
|
||||||
|
|
||||||
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
|
def _pymorphy_lemmatize(self, token: Token) -> List[str]:
|
||||||
string = token.text
|
string = token.text
|
||||||
univ_pos = token.pos_
|
univ_pos = token.pos_
|
||||||
morphology = token.morph.to_dict()
|
morphology = token.morph.to_dict()
|
||||||
if univ_pos == "PUNCT":
|
if univ_pos == "PUNCT":
|
||||||
return [PUNCT_RULES.get(string, string)]
|
return [PUNCT_RULES.get(string, string)]
|
||||||
if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
|
if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
|
||||||
# Skip unchangeable pos
|
return self._pymorphy_lookup_lemmatize(token)
|
||||||
return [string.lower()]
|
|
||||||
analyses = self._morph.parse(string)
|
analyses = self._morph.parse(string)
|
||||||
filtered_analyses = []
|
filtered_analyses = []
|
||||||
for analysis in analyses:
|
for analysis in analyses:
|
||||||
|
@ -63,8 +68,10 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
# Skip suggested parse variant for unknown word for pymorphy
|
# Skip suggested parse variant for unknown word for pymorphy
|
||||||
continue
|
continue
|
||||||
analysis_pos, _ = oc2ud(str(analysis.tag))
|
analysis_pos, _ = oc2ud(str(analysis.tag))
|
||||||
if analysis_pos == univ_pos or (
|
if (
|
||||||
analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
|
analysis_pos == univ_pos
|
||||||
|
or (analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN"))
|
||||||
|
or ((analysis_pos == "PRON") and (univ_pos == "DET"))
|
||||||
):
|
):
|
||||||
filtered_analyses.append(analysis)
|
filtered_analyses.append(analysis)
|
||||||
if not len(filtered_analyses):
|
if not len(filtered_analyses):
|
||||||
|
@ -107,15 +114,27 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
|
dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
|
||||||
)
|
)
|
||||||
|
|
||||||
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
|
def _pymorphy_lookup_lemmatize(self, token: Token) -> List[str]:
|
||||||
string = token.text
|
string = token.text
|
||||||
analyses = self._morph.parse(string)
|
analyses = self._morph.parse(string)
|
||||||
if len(analyses) == 1:
|
# often multiple forms would derive from the same normal form
|
||||||
return [analyses[0].normal_form]
|
# thus check _unique_ normal forms
|
||||||
|
normal_forms = set([an.normal_form for an in analyses])
|
||||||
|
if len(normal_forms) == 1:
|
||||||
|
return [next(iter(normal_forms))]
|
||||||
return [string]
|
return [string]
|
||||||
|
|
||||||
|
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
|
||||||
|
return self._pymorphy_lemmatize(token)
|
||||||
|
|
||||||
|
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
|
||||||
|
return self._pymorphy_lookup_lemmatize(token)
|
||||||
|
|
||||||
def pymorphy3_lemmatize(self, token: Token) -> List[str]:
|
def pymorphy3_lemmatize(self, token: Token) -> List[str]:
|
||||||
return self.pymorphy2_lemmatize(token)
|
return self._pymorphy_lemmatize(token)
|
||||||
|
|
||||||
|
def pymorphy3_lookup_lemmatize(self, token: Token) -> List[str]:
|
||||||
|
return self._pymorphy_lookup_lemmatize(token)
|
||||||
|
|
||||||
|
|
||||||
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
|
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
|
||||||
|
|
|
@ -29,7 +29,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
|
||||||
) from None
|
) from None
|
||||||
if getattr(self, "_morph", None) is None:
|
if getattr(self, "_morph", None) is None:
|
||||||
self._morph = MorphAnalyzer(lang="uk")
|
self._morph = MorphAnalyzer(lang="uk")
|
||||||
elif mode == "pymorphy3":
|
elif mode in {"pymorphy3", "pymorphy3_lookup"}:
|
||||||
try:
|
try:
|
||||||
from pymorphy3 import MorphAnalyzer
|
from pymorphy3 import MorphAnalyzer
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
|
|
@ -337,17 +337,17 @@ def ru_tokenizer():
|
||||||
return get_lang_class("ru")().tokenizer
|
return get_lang_class("ru")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope="session")
|
||||||
def ru_lemmatizer():
|
def ru_lemmatizer():
|
||||||
pytest.importorskip("pymorphy3")
|
pytest.importorskip("pymorphy3")
|
||||||
return get_lang_class("ru")().add_pipe("lemmatizer")
|
return get_lang_class("ru")().add_pipe("lemmatizer")
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope="session")
|
||||||
def ru_lookup_lemmatizer():
|
def ru_lookup_lemmatizer():
|
||||||
pytest.importorskip("pymorphy2")
|
pytest.importorskip("pymorphy3")
|
||||||
return get_lang_class("ru")().add_pipe(
|
return get_lang_class("ru")().add_pipe(
|
||||||
"lemmatizer", config={"mode": "pymorphy2_lookup"}
|
"lemmatizer", config={"mode": "pymorphy3_lookup"}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -423,19 +423,19 @@ def uk_tokenizer():
|
||||||
return get_lang_class("uk")().tokenizer
|
return get_lang_class("uk")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope="session")
|
||||||
def uk_lemmatizer():
|
def uk_lemmatizer():
|
||||||
pytest.importorskip("pymorphy3")
|
pytest.importorskip("pymorphy3")
|
||||||
pytest.importorskip("pymorphy3_dicts_uk")
|
pytest.importorskip("pymorphy3_dicts_uk")
|
||||||
return get_lang_class("uk")().add_pipe("lemmatizer")
|
return get_lang_class("uk")().add_pipe("lemmatizer")
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture(scope="session")
|
||||||
def uk_lookup_lemmatizer():
|
def uk_lookup_lemmatizer():
|
||||||
pytest.importorskip("pymorphy2")
|
pytest.importorskip("pymorphy3")
|
||||||
pytest.importorskip("pymorphy2_dicts_uk")
|
pytest.importorskip("pymorphy3_dicts_uk")
|
||||||
return get_lang_class("uk")().add_pipe(
|
return get_lang_class("uk")().add_pipe(
|
||||||
"lemmatizer", config={"mode": "pymorphy2_lookup"}
|
"lemmatizer", config={"mode": "pymorphy3_lookup"}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -81,6 +81,7 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
|
||||||
|
|
||||||
|
|
||||||
def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
|
def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
|
||||||
|
assert ru_lookup_lemmatizer.mode == "pymorphy3_lookup"
|
||||||
words = ["мама", "мыла", "раму"]
|
words = ["мама", "мыла", "раму"]
|
||||||
pos = ["NOUN", "VERB", "NOUN"]
|
pos = ["NOUN", "VERB", "NOUN"]
|
||||||
morphs = [
|
morphs = [
|
||||||
|
@ -92,3 +93,17 @@ def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
|
||||||
doc = ru_lookup_lemmatizer(doc)
|
doc = ru_lookup_lemmatizer(doc)
|
||||||
lemmas = [token.lemma_ for token in doc]
|
lemmas = [token.lemma_ for token in doc]
|
||||||
assert lemmas == ["мама", "мыла", "раму"]
|
assert lemmas == ["мама", "мыла", "раму"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"word,lemma",
|
||||||
|
(
|
||||||
|
("бременем", "бремя"),
|
||||||
|
("будешь", "быть"),
|
||||||
|
("какая-то", "какой-то"),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
def test_ru_lookup_lemmatizer(ru_lookup_lemmatizer, word, lemma):
|
||||||
|
assert ru_lookup_lemmatizer.mode == "pymorphy3_lookup"
|
||||||
|
doc = Doc(ru_lookup_lemmatizer.vocab, words=[word])
|
||||||
|
assert ru_lookup_lemmatizer(doc)[0].lemma_ == lemma
|
||||||
|
|
|
@ -8,12 +8,20 @@ pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
|
||||||
def test_uk_lemmatizer(uk_lemmatizer):
|
def test_uk_lemmatizer(uk_lemmatizer):
|
||||||
"""Check that the default uk lemmatizer runs."""
|
"""Check that the default uk lemmatizer runs."""
|
||||||
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
|
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
|
||||||
|
assert uk_lemmatizer.mode == "pymorphy3"
|
||||||
uk_lemmatizer(doc)
|
uk_lemmatizer(doc)
|
||||||
assert [token.lemma for token in doc]
|
assert [token.lemma for token in doc]
|
||||||
|
|
||||||
|
|
||||||
def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
|
@pytest.mark.parametrize(
|
||||||
"""Check that the lookup uk lemmatizer runs."""
|
"word,lemma",
|
||||||
doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
|
(
|
||||||
uk_lookup_lemmatizer(doc)
|
("якийсь", "якийсь"),
|
||||||
assert [token.lemma for token in doc]
|
("розповідають", "розповідати"),
|
||||||
|
("розповіси", "розповісти"),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer, word, lemma):
|
||||||
|
assert uk_lookup_lemmatizer.mode == "pymorphy3_lookup"
|
||||||
|
doc = Doc(uk_lookup_lemmatizer.vocab, words=[word])
|
||||||
|
assert uk_lookup_lemmatizer(doc)[0].lemma_ == lemma
|
||||||
|
|
Loading…
Reference in New Issue
Block a user