updated Russian tokenizer

Moved the attempt to import pymorphy2 into __init__
This commit is contained in:
yuukos 2017-10-13 13:57:29 +07:00 committed by Vadim Mazaev
parent 3aad66cf00
commit 7401152289

View File

@ -8,17 +8,19 @@ from .language_data import *
class RussianTokenizer(object): class RussianTokenizer(object):
try: _morph = None
from pymorphy2 import MorphAnalyzer
except ImportError:
raise ImportError(
"The Russian tokenizer requires the pymorphy2 library: "
"try to fix it with "
"pip install pymorphy2==0.8")
_morph = MorphAnalyzer()
def __init__(self, spacy_tokenizer, cls, nlp=None): def __init__(self, spacy_tokenizer, cls, nlp=None):
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
raise ImportError(
"The Russian tokenizer requires the pymorphy2 library: "
"try to fix it with "
"pip install pymorphy2==0.8")
RussianTokenizer._morph = RussianTokenizer._create_morph(MorphAnalyzer)
self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp) self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp)
self._spacy_tokenizer = spacy_tokenizer self._spacy_tokenizer = spacy_tokenizer
@ -36,6 +38,12 @@ class RussianTokenizer(object):
def _normalize(cls, word): def _normalize(cls, word):
return cls._morph.parse(word)[0].normal_form return cls._morph.parse(word)[0].normal_form
@classmethod
def _create_morph(cls, morph_analyzer_class):
    """Return the shared morphological analyzer, creating it on first use.

    The analyzer is cached in ``cls._morph`` so that later calls reuse the
    same object instead of building a new one.

    morph_analyzer_class: Callable invoked with no arguments to build the
        analyzer when none is cached yet.
    RETURNS: The analyzer stored in ``cls._morph``.
    """
    # Compare against None explicitly: the cache counts as unset only while
    # it still holds the None placeholder, regardless of how the analyzer
    # object itself evaluates in a boolean context.
    if cls._morph is None:
        cls._morph = morph_analyzer_class()
    return cls._morph
class RussianDefaults(Language.Defaults): class RussianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)