Strip diacritics for pymorphy lemmatizer

2026-03-05 20:31:30 +03:00 · 2023-04-20 11:14:43 +02:00 · 2023-04-20 11:14:43 +02:00 · b1e47b50b9
commit b1e47b50b9
parent 8e6a3d58d8
2 changed files with 6 additions and 1 deletion
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -1,7 +1,9 @@
 from typing import Optional, List, Dict, Tuple, Callable
+import re

 from thinc.api import Model

+from ..char_classes import COMBINING_DIACRITICS
 from ...pipeline import Lemmatizer
 from ...pipeline.lemmatizer import lemmatizer_score
 from ...symbols import POS
@@ -52,9 +54,10 @@ class RussianLemmatizer(Lemmatizer):
        super().__init__(
            vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
        )
+        self._diacritics_re = re.compile(f"[{COMBINING_DIACRITICS}]")

    def _pymorphy_lemmatize(self, token: Token) -> List[str]:
-        string = token.text
+        string = self._diacritics_re.sub("", token.text)
        univ_pos = token.pos_
        morphology = token.morph.to_dict()
        if univ_pos == "PUNCT":
--- a/spacy/tests/lang/ru/test_lemmatizer.py
+++ b/spacy/tests/lang/ru/test_lemmatizer.py
@@ -65,6 +65,8 @@ def test_ru_lemmatizer_works_with_different_pos_homonyms(
        ("гвоздики", "Gender=Masc", "гвоздик"),
        ("вина", "Gender=Fem", "вина"),
        ("вина", "Gender=Neut", "вино"),
+        ("жену", "Gender=Fem", "жена"),
+        ("жену́", "Gender=Fem", "жена"),
    ],
 )
 def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma):