mirror of https://github.com/explosion/spaCy.git
synced 2025-10-31 16:07:41 +03:00
Updated Russian language: added lemmatizer, norm exceptions and lex attrs
This commit is contained in:
parent a0739a06d4
commit 52ee1f9bf9
spacy/lang/ru/__init__.py
@@ -1,64 +1,36 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function

-from ..language import Language
-from ..attrs import LANG
-from ..tokens import Doc
-from .language_data import *
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
+from .lemmatizer import RussianLemmatizer


-class RussianTokenizer(object):
-    _morph = None
-
-    def __init__(self, spacy_tokenizer, cls, nlp=None):
-        try:
-            from pymorphy2 import MorphAnalyzer
-        except ImportError:
-            raise ImportError(
-                "The Russian tokenizer requires the pymorphy2 library: "
-                "try to fix it with "
-                "pip install pymorphy2==0.8")
-
-        RussianTokenizer._morph = RussianTokenizer._create_morph(MorphAnalyzer)
-
-        self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp)
-        self._spacy_tokenizer = spacy_tokenizer
-
-    def __call__(self, text):
-        words = [self._normalize(RussianTokenizer._get_word(token))
-                 for token in self._spacy_tokenizer(text)]
-
-        return Doc(self.vocab, words, [False] * len(words))
-
-    @staticmethod
-    def _get_word(token):
-        return token.lemma_ if len(token.lemma_) > 0 else token.text
-
-    @classmethod
-    def _normalize(cls, word):
-        return cls._morph.parse(word)[0].normal_form
-
-    @classmethod
-    def _create_morph(cls, morph_analyzer_class):
-        if not cls._morph:
-            cls._morph = morph_analyzer_class()
-        return cls._morph
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
+from ...util import update_exc, add_lookups
+from ...language import Language
+from ...attrs import LANG, LIKE_NUM, NORM


 class RussianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: 'ru'

-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
+                                         BASE_NORMS, NORM_EXCEPTIONS)
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS

     @classmethod
-    def create_tokenizer(cls, nlp=None):
-        tokenizer = super(RussianDefaults, cls).create_tokenizer(nlp)
-        return RussianTokenizer(tokenizer, cls, nlp)
+    def create_lemmatizer(cls, nlp=None):
+        return RussianLemmatizer()


 class Russian(Language):
     lang = 'ru'

     Defaults = RussianDefaults


 __all__ = ['Russian']
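After this change, the entry point is still the same Language subclass; below is a minimal usage sketch, not part of the commit. It assumes pymorphy2 is installed, since creating the vocab instantiates RussianLemmatizer, and that NORM falls back to the lowercase form when no lookup entry matches.

from spacy.lang.ru import Russian

nlp = Russian()
doc = nlp('прив, как дела?')
print([token.text for token in doc])  # tokenization alone needs no pipeline
print(doc[0].norm_)                   # expected: 'привет', via NORM_EXCEPTIONS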
spacy/lang/ru/language_data.py (deleted)
@@ -1,18 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-
-
-STOP_WORDS = set(STOP_WORDS)
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-
-
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-
-
-__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"]
spacy/lang/ru/lemmatizer.py (new file, 232 lines)
@@ -0,0 +1,232 @@
# coding: utf8
from __future__ import unicode_literals

from ...symbols import (
    ADJ, DET, NOUN, NUM, PRON, PROPN, VERB
)
from ...lemmatizer import Lemmatizer


class RussianLemmatizer(Lemmatizer):
    _morph = None

    def __init__(self):
        super(RussianLemmatizer, self).__init__()
        try:
            from pymorphy2 import MorphAnalyzer
        except ImportError:
            raise ImportError(
                'The Russian lemmatizer requires the pymorphy2 library: '
                'try to fix it with "pip install pymorphy2"')

        # Share a single MorphAnalyzer across instances: it is expensive to load.
        if RussianLemmatizer._morph is None:
            RussianLemmatizer._morph = MorphAnalyzer()

    def __call__(self, string, univ_pos, morphology=None):
        univ_pos = self.normalize_univ_pos(univ_pos)
        if univ_pos not in ('ADJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PROPN', 'VERB'):
            # Skip unchangeable POS tags
            return [string.lower()]

        analyses = self._morph.parse(string)
        filtered_analyses = []
        for analysis in analyses:
            if not analysis.is_known:
                # Skip parses that pymorphy2 guesses for out-of-vocabulary words
                continue
            analysis_pos, _ = oc2ud(str(analysis.tag))
            if analysis_pos == univ_pos \
                    or (analysis_pos in ('NOUN', 'PROPN') and univ_pos in ('NOUN', 'PROPN')):
                filtered_analyses.append(analysis)

        if not len(filtered_analyses):
            return [string.lower()]
        if morphology is None:
            return list(set([analysis.normal_form for analysis in filtered_analyses]))

        if univ_pos in ('ADJ', 'DET', 'NOUN', 'PROPN'):
            features_to_compare = ['Case', 'Number', 'Gender']
        elif univ_pos == 'NUM':
            features_to_compare = ['Case', 'Gender']
        elif univ_pos == 'PRON':
            features_to_compare = ['Case', 'Number', 'Gender', 'Person']
        else:  # VERB
            features_to_compare = ['Aspect', 'Gender', 'Mood', 'Number', 'Tense', 'VerbForm', 'Voice']

        analyses, filtered_analyses = filtered_analyses, []
        for analysis in analyses:
            _, analysis_morph = oc2ud(str(analysis.tag))
            for feature in features_to_compare:
                # Use .get() so an analysis that lacks a requested feature is
                # filtered out instead of raising KeyError.
                if feature in morphology and morphology[feature] != analysis_morph.get(feature):
                    break
            else:
                filtered_analyses.append(analysis)

        if not len(filtered_analyses):
            return [string.lower()]
        return list(set([analysis.normal_form for analysis in filtered_analyses]))

    @staticmethod
    def normalize_univ_pos(univ_pos):
        if isinstance(univ_pos, str):
            return univ_pos.upper()

        symbols_to_str = {
            ADJ: 'ADJ',
            DET: 'DET',
            NOUN: 'NOUN',
            NUM: 'NUM',
            PRON: 'PRON',
            PROPN: 'PROPN',
            VERB: 'VERB'
        }
        if univ_pos in symbols_to_str:
            return symbols_to_str[univ_pos]
        return None

    def is_base_form(self, univ_pos, morphology=None):
        # TODO
        raise NotImplementedError

    # The base Lemmatizer already exposes noun/verb/adj hooks; add entry points
    # for the remaining changeable POS tags here.
    def det(self, string, morphology=None):
        return self(string, 'det', morphology)

    def num(self, string, morphology=None):
        return self(string, 'num', morphology)

    def pron(self, string, morphology=None):
        return self(string, 'pron', morphology)

    def lookup(self, string):
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return analyses[0].normal_form
        return string


def oc2ud(oc_tag):
    gram_map = {
        '_POS': {
            'ADJF': 'ADJ',
            'ADJS': 'ADJ',
            'ADVB': 'ADV',
            'Apro': 'DET',
            'COMP': 'ADJ',  # Can also be an ADV - unchangeable
            'CONJ': 'CCONJ',  # Can also be a SCONJ - both unchangeable ones
            'GRND': 'VERB',
            'INFN': 'VERB',
            'INTJ': 'INTJ',
            'NOUN': 'NOUN',
            'NPRO': 'PRON',
            'NUMR': 'NUM',
            'NUMB': 'NUM',
            'PNCT': 'PUNCT',
            'PRCL': 'PART',
            'PREP': 'ADP',
            'PRTF': 'VERB',
            'PRTS': 'VERB',
            'VERB': 'VERB',
        },
        'Animacy': {
            'anim': 'Anim',
            'inan': 'Inan',
        },
        'Aspect': {
            'impf': 'Imp',
            'perf': 'Perf',
        },
        'Case': {
            'ablt': 'Ins',
            'accs': 'Acc',
            'datv': 'Dat',
            'gen1': 'Gen',
            'gen2': 'Gen',
            'gent': 'Gen',
            'loc2': 'Loc',
            'loct': 'Loc',
            'nomn': 'Nom',
            'voct': 'Voc',
        },
        'Degree': {
            'COMP': 'Cmp',
            'Supr': 'Sup',
        },
        'Gender': {
            'femn': 'Fem',
            'masc': 'Masc',
            'neut': 'Neut',
        },
        'Mood': {
            'impr': 'Imp',
            'indc': 'Ind',
        },
        'Number': {
            'plur': 'Plur',
            'sing': 'Sing',
        },
        'NumForm': {
            'NUMB': 'Digit',
        },
        'Person': {
            '1per': '1',
            '2per': '2',
            '3per': '3',
            'excl': '2',
            'incl': '1',
        },
        'Tense': {
            'futr': 'Fut',
            'past': 'Past',
            'pres': 'Pres',
        },
        'Variant': {
            'ADJS': 'Brev',
            'PRTS': 'Brev',
        },
        'VerbForm': {
            'GRND': 'Conv',
            'INFN': 'Inf',
            'PRTF': 'Part',
            'PRTS': 'Part',
            'VERB': 'Fin',
        },
        'Voice': {
            'actv': 'Act',
            'pssv': 'Pass',
        },
        'Abbr': {
            'Abbr': 'Yes'
        }
    }

    pos = 'X'
    morphology = dict()
    unmatched = set()

    grams = oc_tag.replace(' ', ',').split(',')
    for gram in grams:
        match = False
        for categ, gmap in sorted(gram_map.items()):
            if gram in gmap:
                match = True
                if categ == '_POS':
                    pos = gmap[gram]
                else:
                    morphology[categ] = gmap[gram]
        if not match:
            unmatched.add(gram)

    while len(unmatched) > 0:
        gram = unmatched.pop()
        if gram in ('Name', 'Patr', 'Surn', 'Geox', 'Orgn'):
            pos = 'PROPN'
        elif gram == 'Auxt':
            pos = 'AUX'
        elif gram == 'Pltm':
            morphology['Number'] = 'Ptan'

    return pos, morphology


if __name__ == '__main__':
    # Ad-hoc smoke test: 'гвоздики' is ambiguous between 'гвоздик' (small nail)
    # and 'гвоздика' (carnation); the Gender feature disambiguates.
    lemmatizer = RussianLemmatizer()
    print(lemmatizer.noun('гвоздики', {'Gender': 'Fem'}))
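For reference, a short sketch of how this lemmatizer behaves, not part of the commit. It assumes pymorphy2 and its Russian dictionaries are installed; outputs are indicative, since they depend on the dictionary version and set ordering.

from spacy.lang.ru.lemmatizer import RussianLemmatizer, oc2ud

lemmatizer = RussianLemmatizer()
# Without morphology, all matching normal forms are returned.
print(lemmatizer('гвоздики', 'NOUN'))                     # e.g. ['гвоздик', 'гвоздика']
# Morphological features narrow the candidates down.
print(lemmatizer('гвоздики', 'NOUN', {'Gender': 'Fem'}))  # e.g. ['гвоздика']
# oc2ud converts a pymorphy2 (OpenCorpora) tag string to a UD POS plus features.
print(oc2ud('NOUN,inan,femn sing,nomn'))
# e.g. ('NOUN', {'Animacy': 'Inan', 'Gender': 'Fem', 'Number': 'Sing', 'Case': 'Nom'})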
spacy/lang/ru/lex_attrs.py (new file, 35 lines)
@@ -0,0 +1,35 @@
# coding: utf8
from __future__ import unicode_literals

from ...attrs import LIKE_NUM


_num_words = [
    'ноль', 'один', 'два', 'три', 'четыре', 'пять', 'шесть', 'семь', 'восемь', 'девять',

    'десять', 'одиннадцать', 'двенадцать', 'тринадцать', 'четырнадцать',
    'пятнадцать', 'шестнадцать', 'семнадцать', 'восемнадцать', 'девятнадцать',

    'двадцать', 'тридцать', 'сорок', 'пятьдесят', 'шестьдесят', 'семьдесят', 'восемьдесят', 'девяносто',

    'сто', 'двести', 'триста', 'четыреста', 'пятьсот', 'шестьсот', 'семьсот', 'восемьсот', 'девятьсот',

    'тысяча', 'миллион', 'миллиард', 'триллион', 'квадриллион', 'квинтиллион']


def like_num(text):
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    return False


LEX_ATTRS = {
    LIKE_NUM: like_num
}
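A quick sketch of what like_num accepts, not part of the commit. Note the word-list check is case-sensitive, so a capitalized 'Пять' would not match.

from spacy.lang.ru.lex_attrs import like_num

print(like_num('10'))     # True: plain digits
print(like_num('10,5'))   # True: commas and periods are stripped first
print(like_num('1/2'))    # True: simple fraction with digit parts
print(like_num('пять'))   # True: listed number word
print(like_num('слово'))  # False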
spacy/lang/ru/norm_exceptions.py (new file, 34 lines)
@@ -0,0 +1,34 @@
# coding: utf8
from __future__ import unicode_literals


_exc = {
    # Slang
    'прив': 'привет',

    # Weekday abbreviations
    'пн.': 'понедельник',
    'вт.': 'вторник',
    'ср.': 'среда',
    'чт.': 'четверг',
    'пт.': 'пятница',
    'сб.': 'суббота',
    'вс.': 'воскресенье',

    # Month abbreviations
    'янв.': 'январь',
    'фев.': 'февраль',
    'мар.': 'март',
    'апр.': 'апрель',
}


NORM_EXCEPTIONS = {}

# Register each exception in its title-cased and (for abbreviations)
# period-stripped variants as well, so lookups also cover 'Пн.', 'пн' and 'Пн'.
for string, norm in _exc.items():
    NORM_EXCEPTIONS[string] = norm
    NORM_EXCEPTIONS[string.title()] = norm
    if string.endswith('.'):
        NORM_EXCEPTIONS[string[:-1]] = norm
        NORM_EXCEPTIONS[string.title()[:-1]] = norm
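A sketch of the variants the loop above generates for a single entry, not part of the commit:

from spacy.lang.ru.norm_exceptions import NORM_EXCEPTIONS

for variant in ('пн.', 'Пн.', 'пн', 'Пн'):
    print(variant, '->', NORM_EXCEPTIONS[variant])  # all map to 'понедельник'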
spacy/lang/ru/tokenizer_exceptions.py
@@ -1,29 +1,9 @@
 # encoding: utf8
 from __future__ import unicode_literals

-from ..symbols import *
+from ...symbols import ORTH, LEMMA


 TOKENIZER_EXCEPTIONS = {
-    "Пн.": [
-        {ORTH: "Пн.", LEMMA: "Понедельник"}
-    ],
-    "Вт.": [
-        {ORTH: "Вт.", LEMMA: "Вторник"}
-    ],
-    "Ср.": [
-        {ORTH: "Ср.", LEMMA: "Среда"}
-    ],
-    "Чт.": [
-        {ORTH: "Чт.", LEMMA: "Четверг"}
-    ],
-    "Пт.": [
-        {ORTH: "Пт.", LEMMA: "Пятница"}
-    ],
-    "Сб.": [
-        {ORTH: "Сб.", LEMMA: "Суббота"}
-    ],
-    "Вс.": [
-        {ORTH: "Вс.", LEMMA: "Воскресенье"}
-    ],
-}
+
+}
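The weekday entries removed here now live in norm_exceptions.py, so normalization comes from the NORM lexical attribute instead of tokenizer exceptions. A hedged sketch of the net effect, not part of the commit; it assumes pymorphy2 is installed and that, without the exception, the default punctuation rules split the trailing period off:

from spacy.lang.ru import Russian

nlp = Russian()
doc = nlp('пн. и вт.')
# The period-stripped NORM_EXCEPTIONS entries still normalize the bare tokens.
print([(token.text, token.norm_) for token in doc])
# expected to include ('пн', 'понедельник') and ('вт', 'вторник')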