diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index e9fbdebaf..b78a81f22 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -2,8 +2,8 @@ from typing import Optional, List, Dict, Tuple, Callable from thinc.api import Model +from ...pipeline import Lemmatizer from ...pipeline.lemmatizer import lemmatizer_score -from ...pipeline.pymorphy_lemmatizer import PyMorhpyLemmatizer from ...symbols import POS from ...tokens import Token from ...vocab import Vocab @@ -11,7 +11,8 @@ from ...vocab import Vocab PUNCT_RULES = {"«": '"', "»": '"'} -class RussianLemmatizer(PyMorhpyLemmatizer): + +class RussianLemmatizer(Lemmatizer): def __init__( self, vocab: Vocab, @@ -22,6 +23,192 @@ class RussianLemmatizer(PyMorhpyLemmatizer): overwrite: bool = False, scorer: Optional[Callable] = lemmatizer_score, ) -> None: + if mode in {"pymorphy2", "pymorphy2_lookup"}: + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The lemmatizer mode 'pymorphy2' requires the " + "pymorphy2 library and dictionaries. Install them with: " + "pip install pymorphy2" + "# for Ukrainian dictionaries:" + "pip install pymorphy2-dicts-uk" + ) from None + if getattr(self, "_morph", None) is None: + self._morph = MorphAnalyzer(lang="ru") + elif mode in {"pymorphy3", "pymorphy3_lookup"}: + try: + from pymorphy3 import MorphAnalyzer + except ImportError: + raise ImportError( + "The lemmatizer mode 'pymorphy3' requires the " + "pymorphy3 library and dictionaries. Install them with: " + "pip install pymorphy3" + "# for Ukrainian dictionaries:" + "pip install pymorphy3-dicts-uk" + ) from None + if getattr(self, "_morph", None) is None: + self._morph = MorphAnalyzer(lang="ru") super().__init__( - vocab, model, "ru", name=name, mode=mode, overwrite=overwrite, scorer=scorer, + vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer ) + + def pymorphy2_lemmatize(self, token: Token) -> List[str]: + string = token.text + univ_pos = token.pos_ + morphology = token.morph.to_dict() + if univ_pos == "PUNCT": + return [PUNCT_RULES.get(string, string)] + if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"): + # Skip unchangeable pos + return self.pymorphy2_lookup_lemmatize(token) + return [string.lower()] + analyses = self._morph.parse(string) + filtered_analyses = [] + for analysis in analyses: + if not analysis.is_known: + # Skip suggested parse variant for unknown word for pymorphy + continue + analysis_pos, _ = oc2ud(str(analysis.tag)) + if ( + analysis_pos == univ_pos + or (analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")) + or ((analysis_pos == "PRON") and (univ_pos == "DET")) + ): + filtered_analyses.append(analysis) + if not len(filtered_analyses): + return [string.lower()] + if morphology is None or (len(morphology) == 1 and POS in morphology): + return list( + dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]) + ) + if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"): + features_to_compare = ["Case", "Number", "Gender"] + elif univ_pos == "NUM": + features_to_compare = ["Case", "Gender"] + elif univ_pos == "PRON": + features_to_compare = ["Case", "Number", "Gender", "Person"] + else: # VERB + features_to_compare = [ + "Aspect", + "Gender", + "Mood", + "Number", + "Tense", + "VerbForm", + "Voice", + ] + analyses, filtered_analyses = filtered_analyses, [] + for analysis in analyses: + _, analysis_morph = oc2ud(str(analysis.tag)) + for feature in features_to_compare: + if ( + feature in morphology + and feature in analysis_morph + and morphology[feature].lower() != analysis_morph[feature].lower() + ): + break + else: + filtered_analyses.append(analysis) + if not len(filtered_analyses): + return [string.lower()] + return list( + dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]) + ) + + def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]: + string = token.text + analyses = self._morph.parse(string) + # often multiple forms would derive from the same normal form + # thus check _unique_ normal forms + normal_forms = set([an.normal_form for an in analyses]) + if len(normal_forms) == 1: + return [next(iter(normal_forms))] + return [string] + + def pymorphy3_lemmatize(self, token: Token) -> List[str]: + return self.pymorphy2_lemmatize(token) + + def pymorphy3_lookup_lemmatize(self, token: Token) -> List[str]: + return self.pymorphy2_lookup_lemmatize(token) + + +def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]: + gram_map = { + "_POS": { + "ADJF": "ADJ", + "ADJS": "ADJ", + "ADVB": "ADV", + "Apro": "DET", + "COMP": "ADJ", # Can also be an ADV - unchangeable + "CONJ": "CCONJ", # Can also be a SCONJ - both unchangeable ones + "GRND": "VERB", + "INFN": "VERB", + "INTJ": "INTJ", + "NOUN": "NOUN", + "NPRO": "PRON", + "NUMR": "NUM", + "NUMB": "NUM", + "PNCT": "PUNCT", + "PRCL": "PART", + "PREP": "ADP", + "PRTF": "VERB", + "PRTS": "VERB", + "VERB": "VERB", + }, + "Animacy": {"anim": "Anim", "inan": "Inan"}, + "Aspect": {"impf": "Imp", "perf": "Perf"}, + "Case": { + "ablt": "Ins", + "accs": "Acc", + "datv": "Dat", + "gen1": "Gen", + "gen2": "Gen", + "gent": "Gen", + "loc2": "Loc", + "loct": "Loc", + "nomn": "Nom", + "voct": "Voc", + }, + "Degree": {"COMP": "Cmp", "Supr": "Sup"}, + "Gender": {"femn": "Fem", "masc": "Masc", "neut": "Neut"}, + "Mood": {"impr": "Imp", "indc": "Ind"}, + "Number": {"plur": "Plur", "sing": "Sing"}, + "NumForm": {"NUMB": "Digit"}, + "Person": {"1per": "1", "2per": "2", "3per": "3", "excl": "2", "incl": "1"}, + "Tense": {"futr": "Fut", "past": "Past", "pres": "Pres"}, + "Variant": {"ADJS": "Brev", "PRTS": "Brev"}, + "VerbForm": { + "GRND": "Conv", + "INFN": "Inf", + "PRTF": "Part", + "PRTS": "Part", + "VERB": "Fin", + }, + "Voice": {"actv": "Act", "pssv": "Pass"}, + "Abbr": {"Abbr": "Yes"}, + } + pos = "X" + morphology = dict() + unmatched = set() + grams = oc_tag.replace(" ", ",").split(",") + for gram in grams: + match = False + for categ, gmap in sorted(gram_map.items()): + if gram in gmap: + match = True + if categ == "_POS": + pos = gmap[gram] + else: + morphology[categ] = gmap[gram] + if not match: + unmatched.add(gram) + while len(unmatched) > 0: + gram = unmatched.pop() + if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"): + pos = "PROPN" + elif gram == "Auxt": + pos = "AUX" + elif gram == "Pltm": + morphology["Number"] = "Ptan" + return pos, morphology diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 66e1ebc14..8337e7328 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -2,12 +2,12 @@ from typing import Optional, Callable from thinc.api import Model +from ..ru.lemmatizer import RussianLemmatizer from ...pipeline.lemmatizer import lemmatizer_score -from ...pipeline.pymorphy_lemmatizer import PyMorhpyLemmatizer from ...vocab import Vocab -class UkrainianLemmatizer(PyMorhpyLemmatizer): +class UkrainianLemmatizer(RussianLemmatizer): def __init__( self, vocab: Vocab, @@ -18,6 +18,28 @@ class UkrainianLemmatizer(PyMorhpyLemmatizer): overwrite: bool = False, scorer: Optional[Callable] = lemmatizer_score, ) -> None: + if mode in {"pymorphy2", "pymorphy2_lookup"}: + try: + from pymorphy2 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Ukrainian lemmatizer mode 'pymorphy2' requires the " + "pymorphy2 library and dictionaries. Install them with: " + "pip install pymorphy2 pymorphy2-dicts-uk" + ) from None + if getattr(self, "_morph", None) is None: + self._morph = MorphAnalyzer(lang="uk") + elif mode == "pymorphy3": + try: + from pymorphy3 import MorphAnalyzer + except ImportError: + raise ImportError( + "The Ukrainian lemmatizer mode 'pymorphy3' requires the " + "pymorphy3 library and dictionaries. Install them with: " + "pip install pymorphy3 pymorphy3-dicts-uk" + ) from None + if getattr(self, "_morph", None) is None: + self._morph = MorphAnalyzer(lang="uk") super().__init__( - vocab, model, "uk", name=name, mode=mode, overwrite=overwrite, scorer=scorer, + vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer ) diff --git a/spacy/pipeline/pymorphy_lemmatizer.py b/spacy/pipeline/pymorphy_lemmatizer.py deleted file mode 100644 index 99f4a0224..000000000 --- a/spacy/pipeline/pymorphy_lemmatizer.py +++ /dev/null @@ -1,223 +0,0 @@ -from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple -from thinc.api import Model - -import warnings - -from .lemmatizer import Lemmatizer -from .lemmatizer import lemmatizer_score -from ..tokens import Doc, Token -from ..vocab import Vocab -from ..symbols import POS - - -PUNCT_RULES = {"«": '"', "»": '"'} - -class PyMorhpyLemmatizer(Lemmatizer): - """A wrapper around pymorphy3.MorphAnalyzer and pymorphy2.MorphAnalyzer for Russian and Ukrainian - Input: - - vocab: Vocab - - model: Optional[Model] - - lang: str in {"ru", "uk"} - - name: str -- pipe name - ... - - mode: str in {"pymorphy2", "pymorphy2_lookup", "pymorphy3", "pymorphy3_lookup"} - ... - """ - def __init__( - self, - vocab: Vocab, - model: Optional[Model], - lang: str, - name: str = "lemmatizer", - *, - mode: str = "pymorphy3", - overwrite: bool = False, - scorer: Optional[Callable] = lemmatizer_score, - ) -> None: - if mode in {"pymorphy2", "pymorphy2_lookup"}: - try: - from pymorphy2 import MorphAnalyzer - except ImportError: - raise ImportError( - "The lemmatizer mode 'pymorphy2' requires the " - "pymorphy2 library and dictionaries. Install them with: " - "pip install pymorphy2" - "# for Ukrainian dictionaries:" - "pip install pymorphy2-dicts-uk" - ) from None - if getattr(self, "_morph", None) is None: - self._morph = MorphAnalyzer(lang=lang) - elif mode in {"pymorphy3", "pymorphy3_lookup"}: - try: - from pymorphy3 import MorphAnalyzer - except ImportError: - raise ImportError( - "The lemmatizer mode 'pymorphy3' requires the " - "pymorphy3 library and dictionaries. Install them with: " - "pip install pymorphy3" - "# for Ukrainian dictionaries:" - "pip install pymorphy3-dicts-uk" - ) from None - if getattr(self, "_morph", None) is None: - self._morph = MorphAnalyzer(lang=lang) - super().__init__( - vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer - ) - - def pymorphy2_lemmatize(self, token: Token) -> List[str]: - string = token.text - univ_pos = token.pos_ - morphology = token.morph.to_dict() - if univ_pos == "PUNCT": - return [PUNCT_RULES.get(string, string)] - if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"): - # Skip unchangeable pos - return self.pymorphy2_lookup_lemmatize(token) - return [string.lower()] - analyses = self._morph.parse(string) - filtered_analyses = [] - for analysis in analyses: - if not analysis.is_known: - # Skip suggested parse variant for unknown word for pymorphy - continue - analysis_pos, _ = oc2ud(str(analysis.tag)) - if analysis_pos == univ_pos or ( - analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN") - ) or ((analysis_pos=="PRON") and (univ_pos=="DET")): - filtered_analyses.append(analysis) - if not len(filtered_analyses): - return [string.lower()] - if morphology is None or (len(morphology) == 1 and POS in morphology): - return list( - dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]) - ) - if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"): - features_to_compare = ["Case", "Number", "Gender"] - elif univ_pos == "NUM": - features_to_compare = ["Case", "Gender"] - elif univ_pos == "PRON": - features_to_compare = ["Case", "Number", "Gender", "Person"] - else: # VERB - features_to_compare = [ - "Aspect", - "Gender", - "Mood", - "Number", - "Tense", - "VerbForm", - "Voice", - ] - analyses, filtered_analyses = filtered_analyses, [] - for analysis in analyses: - _, analysis_morph = oc2ud(str(analysis.tag)) - for feature in features_to_compare: - if ( - feature in morphology - and feature in analysis_morph - and morphology[feature].lower() != analysis_morph[feature].lower() - ): - break - else: - filtered_analyses.append(analysis) - if not len(filtered_analyses): - return [string.lower()] - return list( - dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]) - ) - - def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]: - string = token.text - analyses = self._morph.parse(string) - # often multiple forms would derive from the same normal form - # thus check _unique_ normal forms - normal_forms = set([an.normal_form for an in analyses]) - if len(normal_forms) == 1: - return [next(iter(normal_forms))] - return [string] - - def pymorphy3_lemmatize(self, token: Token) -> List[str]: - return self.pymorphy2_lemmatize(token) - - def pymorphy3_lookup_lemmatize(self, token: Token) -> List[str]: - return self.pymorphy2_lookup_lemmatize(token) - - -def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]: - gram_map = { - "_POS": { - "ADJF": "ADJ", - "ADJS": "ADJ", - "ADVB": "ADV", - "Apro": "DET", - "COMP": "ADJ", # Can also be an ADV - unchangeable - "CONJ": "CCONJ", # Can also be a SCONJ - both unchangeable ones - "GRND": "VERB", - "INFN": "VERB", - "INTJ": "INTJ", - "NOUN": "NOUN", - "NPRO": "PRON", - "NUMR": "NUM", - "NUMB": "NUM", - "PNCT": "PUNCT", - "PRCL": "PART", - "PREP": "ADP", - "PRTF": "VERB", - "PRTS": "VERB", - "VERB": "VERB", - }, - "Animacy": {"anim": "Anim", "inan": "Inan"}, - "Aspect": {"impf": "Imp", "perf": "Perf"}, - "Case": { - "ablt": "Ins", - "accs": "Acc", - "datv": "Dat", - "gen1": "Gen", - "gen2": "Gen", - "gent": "Gen", - "loc2": "Loc", - "loct": "Loc", - "nomn": "Nom", - "voct": "Voc", - }, - "Degree": {"COMP": "Cmp", "Supr": "Sup"}, - "Gender": {"femn": "Fem", "masc": "Masc", "neut": "Neut"}, - "Mood": {"impr": "Imp", "indc": "Ind"}, - "Number": {"plur": "Plur", "sing": "Sing"}, - "NumForm": {"NUMB": "Digit"}, - "Person": {"1per": "1", "2per": "2", "3per": "3", "excl": "2", "incl": "1"}, - "Tense": {"futr": "Fut", "past": "Past", "pres": "Pres"}, - "Variant": {"ADJS": "Brev", "PRTS": "Brev"}, - "VerbForm": { - "GRND": "Conv", - "INFN": "Inf", - "PRTF": "Part", - "PRTS": "Part", - "VERB": "Fin", - }, - "Voice": {"actv": "Act", "pssv": "Pass"}, - "Abbr": {"Abbr": "Yes"}, - } - pos = "X" - morphology = dict() - unmatched = set() - grams = oc_tag.replace(" ", ",").split(",") - for gram in grams: - match = False - for categ, gmap in sorted(gram_map.items()): - if gram in gmap: - match = True - if categ == "_POS": - pos = gmap[gram] - else: - morphology[categ] = gmap[gram] - if not match: - unmatched.add(gram) - while len(unmatched) > 0: - gram = unmatched.pop() - if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"): - pos = "PROPN" - elif gram == "Auxt": - pos = "AUX" - elif gram == "Pltm": - morphology["Number"] = "Ptan" - return pos, morphology