spaCy/spacy/ru/__init__.py

79 lines
2.3 KiB
Python
Raw Normal View History

2017-10-12 18:24:20 +03:00
# encoding: utf8
from __future__ import unicode_literals, print_function
from ..language import Language
from ..attrs import LANG
from ..tokens import Doc
from .language_data import *
class RussianTokenizer(object):
    """Tokenizer wrapper that replaces every token with its normal form.

    The text is first split by the wrapped spaCy tokenizer.  Each resulting
    token is then reduced to the ``normal_form`` reported by a pymorphy2
    ``MorphAnalyzer`` and a new ``Doc`` is built from those normalized words.
    """

    # Analyzer shared by all instances; created by the first instantiation.
    _morph = None

    def __init__(self, spacy_tokenizer, cls, nlp=None):
        """Set up the wrapper and make sure the shared analyzer exists.

        spacy_tokenizer: callable that splits a text into tokens exposing
            ``idx``, ``text`` and ``lemma_``.
        cls: object whose ``create_vocab(nlp)`` is used to build the
            vocabulary when ``nlp`` is falsy.
        nlp: optional object providing a ``vocab`` attribute.

        Raises ImportError if pymorphy2 cannot be imported.
        """
        try:
            from pymorphy2 import MorphAnalyzer
        except ImportError:
            raise ImportError(
                "The Russian tokenizer requires the pymorphy2 library: "
                "try to fix it with "
                "pip install pymorphy2==0.8")
        # _create_morph stores the analyzer on the class itself, so no
        # additional assignment is needed here.
        RussianTokenizer._create_morph(MorphAnalyzer)

        self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp)
        self._spacy_tokenizer = spacy_tokenizer

    def __call__(self, text):
        """Tokenize ``text`` and return a ``Doc`` of normalized words.

        A text that yields no tokens (e.g. an empty string) produces a
        ``Doc`` built from empty word and space lists instead of raising.
        """
        # Looked up on RussianTokenizer explicitly (not type(self)), which
        # matches how the helpers were resolved before.
        words, spaces = RussianTokenizer._words_and_spaces(
            self._spacy_tokenizer(text), text)
        return Doc(self.vocab, words, spaces)

    @classmethod
    def _words_and_spaces(cls, tokens, text):
        """Return ``(words, spaces)`` lists for ``tokens`` taken from ``text``.

        ``words`` holds the normalized form of every token and ``spaces``
        the matching trailing-space flags.  Both lists are empty when
        ``tokens`` is empty; unpacking ``zip(*pairs)`` would fail there.
        """
        words = []
        spaces = []
        for token in tokens:
            words.append(cls._get_norm(token))
            spaces.append(cls._has_space(token, text))
        return words, spaces

    @staticmethod
    def _get_word(token):
        """Return the token's lemma, or its raw text when the lemma is empty."""
        return token.lemma_ or token.text

    @staticmethod
    def _has_space(token, text):
        """Return True if the character right after ``token`` in ``text`` is ' '.

        Only a plain space character counts; a token that ends the text has
        no trailing space.
        """
        pos_after_token = token.idx + len(token.text)
        return pos_after_token < len(text) and text[pos_after_token] == ' '

    @classmethod
    def _get_norm(cls, token):
        """Return the normal form of the word chosen by ``_get_word``."""
        return cls._normalize(cls._get_word(token))

    @classmethod
    def _normalize(cls, word):
        """Return ``normal_form`` of the first parse the analyzer gives for ``word``."""
        return cls._morph.parse(word)[0].normal_form

    @classmethod
    def _create_morph(cls, morph_analyzer_class):
        """Create the shared analyzer on first use and return it.

        ``morph_analyzer_class`` is instantiated without arguments, and only
        if no analyzer has been stored on the class yet.
        """
        # Identity check: an already created analyzer is reused regardless
        # of how it behaves in a boolean context.
        if cls._morph is None:
            cls._morph = morph_analyzer_class()
        return cls._morph
2017-10-12 18:24:20 +03:00
class RussianDefaults(Language.Defaults):
    """Defaults for Russian: language id getter, exceptions, stop words
    and a tokenizer wrapped in ``RussianTokenizer``.
    """

    stop_words = STOP_WORDS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS

    # Work on a copy so the getters mapping of Language.Defaults itself is
    # left as it is; only the LANG entry is replaced here.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update({LANG: lambda text: 'ru'})

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Return the parent class's tokenizer wrapped in ``RussianTokenizer``."""
        base_tokenizer = super(RussianDefaults, cls).create_tokenizer(nlp)
        return RussianTokenizer(base_tokenizer, cls, nlp)
class Russian(Language):
    """``Language`` subclass for Russian, configured by ``RussianDefaults``."""

    Defaults = RussianDefaults
    lang = 'ru'