From b1e47b50b9a0cc423c77eec2d131dcd08d23b372 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Apr 2023 11:14:43 +0200 Subject: [PATCH 1/2] Strip diacritics for pymorphy lemmatizer --- spacy/lang/ru/lemmatizer.py | 5 ++++- spacy/tests/lang/ru/test_lemmatizer.py | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index f4a35de38..35a6ccc7f 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -1,7 +1,9 @@ from typing import Optional, List, Dict, Tuple, Callable +import re from thinc.api import Model +from ..char_classes import COMBINING_DIACRITICS from ...pipeline import Lemmatizer from ...pipeline.lemmatizer import lemmatizer_score from ...symbols import POS @@ -52,9 +54,10 @@ class RussianLemmatizer(Lemmatizer): super().__init__( vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer ) + self._diacritics_re = re.compile(f"[{COMBINING_DIACRITICS}]") def _pymorphy_lemmatize(self, token: Token) -> List[str]: - string = token.text + string = self._diacritics_re.sub("", token.text) univ_pos = token.pos_ morphology = token.morph.to_dict() if univ_pos == "PUNCT": diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index 9a5a9ad68..f7a51af16 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -65,6 +65,8 @@ def test_ru_lemmatizer_works_with_different_pos_homonyms( ("гвоздики", "Gender=Masc", "гвоздик"), ("вина", "Gender=Fem", "вина"), ("вина", "Gender=Neut", "вино"), + ("жену", "Gender=Fem", "жена"), + ("жену́", "Gender=Fem", "жена"), ], ) def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma): From 3cc6c5068874273dc26dc9d70b8cb9b575406100 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Aug 2023 16:21:42 +0200 Subject: [PATCH 2/2] isort --- spacy/lang/ru/lemmatizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index ad2e36b0e..340bd911b 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -1,14 +1,14 @@ -from typing import Callable, Dict, List, Optional, Tuple import re +from typing import Callable, Dict, List, Optional, Tuple from thinc.api import Model -from ..char_classes import COMBINING_DIACRITICS from ...pipeline import Lemmatizer from ...pipeline.lemmatizer import lemmatizer_score from ...symbols import POS from ...tokens import Token from ...vocab import Vocab +from ..char_classes import COMBINING_DIACRITICS PUNCT_RULES = {"«": '"', "»": '"'}