Strip diacritics for pymorphy lemmatizer

This commit is contained in:
Adriane Boyd 2023-04-20 11:14:43 +02:00
parent 8e6a3d58d8
commit b1e47b50b9
2 changed files with 6 additions and 1 deletion

View File

@@ -1,7 +1,9 @@
from typing import Optional, List, Dict, Tuple, Callable from typing import Optional, List, Dict, Tuple, Callable
import re
from thinc.api import Model from thinc.api import Model
from ..char_classes import COMBINING_DIACRITICS
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
from ...pipeline.lemmatizer import lemmatizer_score from ...pipeline.lemmatizer import lemmatizer_score
from ...symbols import POS from ...symbols import POS
@@ -52,9 +54,10 @@ class RussianLemmatizer(Lemmatizer):
super().__init__( super().__init__(
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
) )
self._diacritics_re = re.compile(f"[{COMBINING_DIACRITICS}]")
def _pymorphy_lemmatize(self, token: Token) -> List[str]: def _pymorphy_lemmatize(self, token: Token) -> List[str]:
string = token.text string = self._diacritics_re.sub("", token.text)
univ_pos = token.pos_ univ_pos = token.pos_
morphology = token.morph.to_dict() morphology = token.morph.to_dict()
if univ_pos == "PUNCT": if univ_pos == "PUNCT":

View File

@@ -65,6 +65,8 @@ def test_ru_lemmatizer_works_with_different_pos_homonyms(
("гвоздики", "Gender=Masc", "гвоздик"), ("гвоздики", "Gender=Masc", "гвоздик"),
("вина", "Gender=Fem", "вина"), ("вина", "Gender=Fem", "вина"),
("вина", "Gender=Neut", "вино"), ("вина", "Gender=Neut", "вино"),
("жену", "Gender=Fem", "жена"),
("жену́", "Gender=Fem", "жена"),
], ],
) )
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma): def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma):