# coding: utf8
from __future__ import unicode_literals

from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
from ...lemmatizer import Lemmatizer
from ...compat import unicode_


# Replacements applied to PUNCT tokens; anything not listed is returned as-is.
PUNCT_RULES = {"«": '"', "»": '"'}

# Coarse POS tags that are sent through pymorphy2. Every other tag is treated
# as unchangeable and simply lower-cased.
_INFLECTED_POS = ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB")

# Symbol ID -> tag name, used by RussianLemmatizer.normalize_univ_pos.
_SYMBOLS_TO_STR = {
    ADJ: "ADJ",
    DET: "DET",
    NOUN: "NOUN",
    NUM: "NUM",
    PRON: "PRON",
    PROPN: "PROPN",
    PUNCT: "PUNCT",
    VERB: "VERB",
}

# Morphological features that must not contradict the requested morphology,
# keyed by the normalized POS tag.
_FEATURES_TO_COMPARE = {
    "ADJ": ("Case", "Number", "Gender"),
    "DET": ("Case", "Number", "Gender"),
    "NOUN": ("Case", "Number", "Gender"),
    "PROPN": ("Case", "Number", "Gender"),
    "NUM": ("Case", "Gender"),
    "PRON": ("Case", "Number", "Gender", "Person"),
    "VERB": (
        "Aspect",
        "Gender",
        "Mood",
        "Number",
        "Tense",
        "VerbForm",
        "Voice",
    ),
}

# Grammemes that turn the resulting POS into PROPN in oc2ud.
_PROPN_GRAMS = ("Name", "Patr", "Surn", "Geox", "Orgn")

# Grammeme -> value tables used by oc2ud. "_POS" maps to the POS tag; every
# other key is a morphological feature name. Built once at import time; it is
# only ever read.
_GRAM_MAP = {
    "_POS": {
        "ADJF": "ADJ",
        "ADJS": "ADJ",
        "ADVB": "ADV",
        "Apro": "DET",
        "COMP": "ADJ",  # Can also be an ADV - unchangeable
        "CONJ": "CCONJ",  # Can also be a SCONJ - both unchangeable ones
        "GRND": "VERB",
        "INFN": "VERB",
        "INTJ": "INTJ",
        "NOUN": "NOUN",
        "NPRO": "PRON",
        "NUMR": "NUM",
        "NUMB": "NUM",
        "PNCT": "PUNCT",
        "PRCL": "PART",
        "PREP": "ADP",
        "PRTF": "VERB",
        "PRTS": "VERB",
        "VERB": "VERB",
    },
    "Animacy": {"anim": "Anim", "inan": "Inan"},
    "Aspect": {"impf": "Imp", "perf": "Perf"},
    "Case": {
        "ablt": "Ins",
        "accs": "Acc",
        "datv": "Dat",
        "gen1": "Gen",
        "gen2": "Gen",
        "gent": "Gen",
        "loc2": "Loc",
        "loct": "Loc",
        "nomn": "Nom",
        "voct": "Voc",
    },
    "Degree": {"COMP": "Cmp", "Supr": "Sup"},
    "Gender": {"femn": "Fem", "masc": "Masc", "neut": "Neut"},
    "Mood": {"impr": "Imp", "indc": "Ind"},
    "Number": {"plur": "Plur", "sing": "Sing"},
    "NumForm": {"NUMB": "Digit"},
    "Person": {"1per": "1", "2per": "2", "3per": "3", "excl": "2", "incl": "1"},
    "Tense": {"futr": "Fut", "past": "Past", "pres": "Pres"},
    "Variant": {"ADJS": "Brev", "PRTS": "Brev"},
    "VerbForm": {
        "GRND": "Conv",
        "INFN": "Inf",
        "PRTF": "Part",
        "PRTS": "Part",
        "VERB": "Fin",
    },
    "Voice": {"actv": "Act", "pssv": "Pass"},
    "Abbr": {"Abbr": "Yes"},
}


def _unique_normal_forms(analyses):
    """Return the distinct normal forms of `analyses`, in first-seen order.

    Keeping the analyser's order (instead of round-tripping through a set)
    makes the returned lemma list deterministic across runs.
    """
    seen = set()
    lemmas = []
    for analysis in analyses:
        lemma = analysis.normal_form
        if lemma not in seen:
            seen.add(lemma)
            lemmas.append(lemma)
    return lemmas


class RussianLemmatizer(Lemmatizer):
    """Lemmatizer that delegates morphological analysis to pymorphy2."""

    # One analyser shared by all instances; created on first construction.
    _morph = None

    def __init__(self, lookups=None):
        """Initialise the base lemmatizer and the shared pymorphy2 analyser.

        lookups: Passed through unchanged to `Lemmatizer.__init__`.
        Raises ImportError if pymorphy2 cannot be imported.
        """
        super(RussianLemmatizer, self).__init__(lookups)
        try:
            from pymorphy2 import MorphAnalyzer
        except ImportError:
            raise ImportError(
                "The Russian lemmatizer requires the pymorphy2 library: "
                'try to fix it with "pip install pymorphy2==0.8" '
                'or "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk" '
                "if you need Ukrainian too"
            )

        if RussianLemmatizer._morph is None:
            RussianLemmatizer._morph = MorphAnalyzer()

    def __call__(self, string, univ_pos, morphology=None):
        """Return the candidate lemmas of `string` as a list of strings.

        string: The surface form to lemmatize.
        univ_pos: POS tag, either as a string or as a symbol ID.
        morphology: Optional mapping of feature name -> value used to
            discard analyses that contradict it.

        Falls back to `[string.lower()]` whenever no suitable analysis exists.
        """
        univ_pos = self.normalize_univ_pos(univ_pos)
        if univ_pos == "PUNCT":
            return [PUNCT_RULES.get(string, string)]

        if univ_pos not in _INFLECTED_POS:
            # Skip unchangeable pos
            return [string.lower()]

        candidates = []
        for analysis in self._morph.parse(string):
            if not analysis.is_known:
                # Skip suggested parse variant for unknown word for pymorphy
                continue
            analysis_pos, _ = oc2ud(str(analysis.tag))
            # NOUN and PROPN are treated as interchangeable when matching.
            if analysis_pos == univ_pos or (
                analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
            ):
                candidates.append(analysis)

        if not candidates:
            return [string.lower()]
        if morphology is None or (len(morphology) == 1 and POS in morphology):
            # Nothing beyond the POS to filter on.
            return _unique_normal_forms(candidates)

        features = _FEATURES_TO_COMPARE[univ_pos]
        compatible = []
        for analysis in candidates:
            _, analysis_morph = oc2ud(str(analysis.tag))
            # A feature only disqualifies an analysis when both sides specify
            # it and the values differ; missing features never conflict.
            conflicts = any(
                feature in morphology
                and feature in analysis_morph
                and morphology[feature] != analysis_morph[feature]
                for feature in features
            )
            if not conflicts:
                compatible.append(analysis)

        if not compatible:
            return [string.lower()]
        return _unique_normal_forms(compatible)

    @staticmethod
    def normalize_univ_pos(univ_pos):
        """Convert `univ_pos` to an upper-case tag string.

        Strings are upper-cased. Known symbol IDs are mapped to their tag
        name. Anything else yields None.
        """
        if isinstance(univ_pos, unicode_):
            return univ_pos.upper()
        return _SYMBOLS_TO_STR.get(univ_pos)

    def lookup(self, string, orth=None):
        """Return the normal form of `string` if its analysis is unambiguous.

        When pymorphy2 returns anything other than exactly one analysis,
        `string` is returned unchanged. `orth` is accepted but not used.
        """
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return analyses[0].normal_form
        return string


def oc2ud(oc_tag):
    """Convert a pymorphy2 tag string into a `(pos, morphology)` pair.

    oc_tag: Tag string whose grammemes are separated by commas and/or spaces.

    Returns the POS tag ("X" if no grammeme determines one) and a dict of
    feature name -> value.
    """
    pos = "X"
    morphology = dict()
    unmatched = []

    for gram in oc_tag.replace(" ", ",").split(","):
        matched = False
        # A grammeme may appear in several tables (e.g. "VERB" sets both the
        # POS and VerbForm). Each table writes a distinct key, so the
        # iteration order of the tables does not affect the result.
        for categ, gmap in _GRAM_MAP.items():
            if gram in gmap:
                matched = True
                if categ == "_POS":
                    pos = gmap[gram]
                else:
                    morphology[categ] = gmap[gram]
        if not matched:
            unmatched.append(gram)

    # Grammemes outside the tables are applied last so they override the
    # table-derived values. They are processed in tag order to keep the
    # result deterministic.
    for gram in unmatched:
        if gram in _PROPN_GRAMS:
            pos = "PROPN"
        elif gram == "Auxt":
            pos = "AUX"
        elif gram == "Pltm":
            morphology["Number"] = "Ptan"

    return pos, morphology