mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Merge branch 'russian_language'
This commit is contained in:
commit
92931a2efd
|
@ -25,15 +25,29 @@ class RussianTokenizer(object):
|
|||
self._spacy_tokenizer = spacy_tokenizer
|
||||
|
||||
def __call__(self, text):
    """Tokenize *text* and return a ``Doc`` of normalized words.

    The wrapped spaCy tokenizer splits the text; every token is mapped to
    its normal form and paired with a flag telling whether a single space
    character directly follows it in *text*.

    :param text: the raw string to tokenize.
    :return: ``Doc(self.vocab, words, spaces)`` where ``words`` and
        ``spaces`` are parallel lists (both empty when the tokenizer
        yields no tokens).
    """
    get_norm = RussianTokenizer._get_norm
    has_space = RussianTokenizer._has_space

    # Build both lists in a single pass instead of unzipping a list of
    # pairs: ``words, spaces = zip(*pairs)`` fails with ValueError when
    # there are no tokens (e.g. empty input), because there is nothing
    # to unpack.
    words = []
    spaces = []
    for token in self._spacy_tokenizer(text):
        words.append(get_norm(token))
        spaces.append(has_space(token, text))

    return Doc(self.vocab, words, spaces)
|
||||
|
||||
@staticmethod
|
||||
def _get_word(token):
|
||||
return token.lemma_ if len(token.lemma_) > 0 else token.text
|
||||
|
||||
@staticmethod
|
||||
def _has_space(token, text):
|
||||
pos_after_token = token.idx + len(token.text)
|
||||
return pos_after_token < len(text) and text[pos_after_token] == ' '
|
||||
|
||||
@classmethod
|
||||
def _get_norm(cls, token):
|
||||
return cls._normalize(cls._get_word(token))
|
||||
|
||||
@classmethod
|
||||
def _normalize(cls, word):
|
||||
return cls._morph.parse(word)[0].normal_form
|
||||
|
|
|
@ -15,4 +15,4 @@ TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
|
|||
# Convert the shared emoticon strings with strings_to_exc and merge the
# result into this language's TOKENIZER_EXCEPTIONS.
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))


# Public names of this module (declared once; the repeated assignment was
# redundant).
__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"]
|
||||
|
|
|
@ -51,4 +51,4 @@ STOP_WORDS = set("""
|
|||
эта эти этим этими этих это этого этой этом этому этот этою эту
|
||||
|
||||
я
|
||||
""".split())
|
||||
""".split())
|
||||
|
|
|
@ -26,4 +26,5 @@ TOKENIZER_EXCEPTIONS = {
|
|||
"Вс.": [
|
||||
{ORTH: "Вс.", LEMMA: "Воскресенье"}
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user