fixed Russian Tokenizer

- added trailing space flags for tokens
yuukos 2017-10-16 13:37:05 +07:00
parent a229b6e0de
commit 241d19a3e6
4 changed files with 21 additions and 6 deletions


@@ -25,15 +25,29 @@ class RussianTokenizer(object):
         self._spacy_tokenizer = spacy_tokenizer

     def __call__(self, text):
-        words = [self._normalize(RussianTokenizer._get_word(token))
-                 for token in self._spacy_tokenizer(text)]
-        return Doc(self.vocab, words, [False] * len(words))
+        get_norm = RussianTokenizer._get_norm
+        has_space = RussianTokenizer._has_space
+        words_with_space_flags = [(get_norm(token), has_space(token, text))
+                                  for token in self._spacy_tokenizer(text)]
+        words, spaces = map(lambda s: list(s), zip(*words_with_space_flags))
+        return Doc(self.vocab, words, spaces)

     @staticmethod
     def _get_word(token):
         return token.lemma_ if len(token.lemma_) > 0 else token.text

+    @staticmethod
+    def _has_space(token, text):
+        pos_after_token = token.idx + len(token.text)
+        return pos_after_token < len(text) and text[pos_after_token] == ' '
+
+    @classmethod
+    def _get_norm(cls, token):
+        return cls._normalize(cls._get_word(token))
+
     @classmethod
     def _normalize(cls, word):
         return cls._morph.parse(word)[0].normal_form
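
For context, a minimal standalone sketch of the space-flag technique that the new _has_space helper implements. This is illustrative only, not part of the commit: it borrows a blank English pipeline just to get a tokenizer and a vocab, and the helper name tokens_with_space_flags is made up. The point is that passing real per-token space flags to Doc lets doc.text reproduce the input string, whereas the old [False] * len(words) dropped all whitespace.

import spacy
from spacy.tokens import Doc

nlp = spacy.blank('en')  # stand-in pipeline; the repo wires this up for Russian

def tokens_with_space_flags(text):
    # Hypothetical helper mirroring the commit's _has_space logic.
    words, spaces = [], []
    for token in nlp.tokenizer(text):
        words.append(token.text)
        pos_after_token = token.idx + len(token.text)
        # True when the character right after the token is a space
        # (like the commit, this checks ' ' only, not tabs or newlines).
        spaces.append(pos_after_token < len(text) and text[pos_after_token] == ' ')
    return words, spaces

words, spaces = tokens_with_space_flags('Привет, мир!')
doc = Doc(nlp.vocab, words=words, spaces=spaces)
assert doc.text == 'Привет, мир!'  # whitespace survives the round trip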


@@ -27,3 +27,4 @@ TOKENIZER_EXCEPTIONS = {
+        {ORTH: "Вс.", LEMMA: "Воскресенье"}
     ],
 }
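
And a hedged sketch of how an exception entry like the one above is typically consumed, via spaCy's special-case mechanism (spaCy 1.x/2.x-era API; spaCy v3 no longer accepts LEMMA in special cases). "Вс." abbreviates "Воскресенье" ("Sunday"); the blank English pipeline is again only a stand-in.

import spacy
from spacy.symbols import ORTH, LEMMA

nlp = spacy.blank('en')
# Keep "Вс." as a single token and attach its lemma, instead of
# letting the tokenizer split off the trailing period.
nlp.tokenizer.add_special_case('Вс.', [{ORTH: 'Вс.', LEMMA: 'Воскресенье'}])

doc = nlp('Вс. выходной')
print([t.text for t in doc])  # ['Вс.', 'выходной']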