mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-14 15:12:15 +03:00
Strip diacritics for pymorphy lemmatizer
This commit is contained in:
parent
8e6a3d58d8
commit
b1e47b50b9
|
@ -1,7 +1,9 @@
|
|||
from typing import Optional, List, Dict, Tuple, Callable
|
||||
import re
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from ..char_classes import COMBINING_DIACRITICS
|
||||
from ...pipeline import Lemmatizer
|
||||
from ...pipeline.lemmatizer import lemmatizer_score
|
||||
from ...symbols import POS
|
||||
|
@ -52,9 +54,10 @@ class RussianLemmatizer(Lemmatizer):
|
|||
super().__init__(
|
||||
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
self._diacritics_re = re.compile(f"[{COMBINING_DIACRITICS}]")
|
||||
|
||||
def _pymorphy_lemmatize(self, token: Token) -> List[str]:
|
||||
string = token.text
|
||||
string = self._diacritics_re.sub("", token.text)
|
||||
univ_pos = token.pos_
|
||||
morphology = token.morph.to_dict()
|
||||
if univ_pos == "PUNCT":
|
||||
|
|
|
@ -65,6 +65,8 @@ def test_ru_lemmatizer_works_with_different_pos_homonyms(
|
|||
("гвоздики", "Gender=Masc", "гвоздик"),
|
||||
("вина", "Gender=Fem", "вина"),
|
||||
("вина", "Gender=Neut", "вино"),
|
||||
("жену", "Gender=Fem", "жена"),
|
||||
("жену́", "Gender=Fem", "жена"),
|
||||
],
|
||||
)
|
||||
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma):
|
||||
|
|
Loading…
Reference in New Issue
Block a user