fixed Russian Tokenizer

- added trailing space flags for tokens
This commit is contained in:
yuukos 2017-10-16 13:37:05 +07:00
parent a229b6e0de
commit 241d19a3e6
4 changed files with 21 additions and 6 deletions

View File

@ -25,15 +25,29 @@ class RussianTokenizer(object):
self._spacy_tokenizer = spacy_tokenizer
def __call__(self, text):
    """Tokenize *text* and return a ``Doc`` of normalized words.

    Every token produced by the wrapped tokenizer is reduced to its
    normal form and paired with a flag telling whether a single space
    character directly follows it in *text*.

    :param text: the raw string to tokenize.
    :returns: ``Doc(self.vocab, words, spaces)`` where ``words`` and
        ``spaces`` are parallel lists with one entry per token.
    """
    get_norm = RussianTokenizer._get_norm
    has_space = RussianTokenizer._has_space

    # Fill both lists in a single pass.  Unpacking ``zip(*pairs)`` would
    # raise ValueError when the tokenizer yields no tokens (e.g. empty
    # text), so the lists are built explicitly instead.
    words = []
    spaces = []
    for token in self._spacy_tokenizer(text):
        words.append(get_norm(token))
        spaces.append(has_space(token, text))
    return Doc(self.vocab, words, spaces)
@staticmethod
def _get_word(token):
return token.lemma_ if len(token.lemma_) > 0 else token.text
@staticmethod
def _has_space(token, text):
pos_after_token = token.idx + len(token.text)
return pos_after_token < len(text) and text[pos_after_token] == ' '
@classmethod
def _get_norm(cls, token):
    """Return the normalized form of the word chosen for *token*.

    The word is picked by ``cls._get_word`` and then passed through
    ``cls._normalize``.

    :param token: token object accepted by ``_get_word``.
    :returns: whatever ``cls._normalize`` returns for that word.
    """
    word = cls._get_word(token)
    return cls._normalize(word)
@classmethod
def _normalize(cls, word):
    """Return the normal form of *word* according to ``cls._morph``.

    Only the first entry returned by ``cls._morph.parse(word)`` is used.

    :param word: the string to normalize.
    :returns: the ``normal_form`` attribute of the first parse.
    """
    # NOTE(review): presumably parse() orders results by likelihood, so
    # index 0 is the best guess -- confirm against the analyzer in use.
    first_parse = cls._morph.parse(word)[0]
    return first_parse.normal_form

View File

@ -15,4 +15,4 @@ TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
# Convert the emoticon strings from `base` with strings_to_exc() and pass
# the result to update_exc() together with the exceptions table
# (presumably this merges them into TOKENIZER_EXCEPTIONS -- confirm against
# the helper's definition).
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
# Public names of this module.
# NOTE(review): the assignment appears twice with identical contents; the
# second one simply rebinds the same value and looks redundant -- confirm
# whether the repetition is intentional.
__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"]
__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"]

View File

@ -51,4 +51,4 @@ STOP_WORDS = set("""
эта эти этим этими этих это этого этой этом этому этот этою эту
я
""".split())
""".split())

View File

@ -26,4 +26,5 @@ TOKENIZER_EXCEPTIONS = {
"Вс.": [
{ORTH: "Вс.", LEMMA: "Воскресенье"}
],
}
}