Merge branch 'russian_language'

yuukos 2017-10-16 13:46:28 +07:00
commit 92931a2efd
4 changed files with 21 additions and 6 deletions


@@ -25,15 +25,29 @@ class RussianTokenizer(object):
         self._spacy_tokenizer = spacy_tokenizer
 
     def __call__(self, text):
-        words = [self._normalize(RussianTokenizer._get_word(token))
-                 for token in self._spacy_tokenizer(text)]
-        return Doc(self.vocab, words, [False] * len(words))
+        get_norm = RussianTokenizer._get_norm
+        has_space = RussianTokenizer._has_space
+        words_with_space_flags = [(get_norm(token), has_space(token, text))
+                                  for token in self._spacy_tokenizer(text)]
+        words, spaces = map(lambda s: list(s), zip(*words_with_space_flags))
+        return Doc(self.vocab, words, spaces)
 
     @staticmethod
     def _get_word(token):
        return token.lemma_ if len(token.lemma_) > 0 else token.text
 
+    @staticmethod
+    def _has_space(token, text):
+        pos_after_token = token.idx + len(token.text)
+        return pos_after_token < len(text) and text[pos_after_token] == ' '
+
+    @classmethod
+    def _get_norm(cls, token):
+        return cls._normalize(cls._get_word(token))
+
     @classmethod
     def _normalize(cls, word):
         return cls._morph.parse(word)[0].normal_form
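
For readers skimming the diff, a minimal standalone sketch of the new pairing logic, showing the spaces reconstruction in isolation. FakeToken and the sample text are hypothetical stand-ins for spaCy tokens; the has_space body mirrors RussianTokenizer._has_space above.

# Sketch only: FakeToken stands in for a spaCy token (has .text and .idx).
class FakeToken:
    def __init__(self, text, idx):
        self.text = text  # surface form of the token
        self.idx = idx    # character offset of the token in the source text

def has_space(token, text):
    # Mirrors RussianTokenizer._has_space in the diff above: the token is
    # followed by a space iff the character right after it is ' '.
    pos_after_token = token.idx + len(token.text)
    return pos_after_token < len(text) and text[pos_after_token] == ' '

text = "привет мир!"
tokens = [FakeToken("привет", 0), FakeToken("мир", 7), FakeToken("!", 10)]

# Same shape as words_with_space_flags: (word, trailing-space flag) pairs,
# then unzipped into the two parallel lists Doc expects.
pairs = [(token.text, has_space(token, text)) for token in tokens]
words, spaces = map(list, zip(*pairs))
print(words)   # ['привет', 'мир', '!']
print(spaces)  # [True, False, False]

Compared with the old [False] * len(words), the per-token spaces list lets the Doc reproduce the spacing of the original text.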


@@ -15,4 +15,4 @@ TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 
-__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"]
\ No newline at end of file
+__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"]


@@ -51,4 +51,4 @@ STOP_WORDS = set("""
 эта эти этим этими этих это этого этой этом этому этот этою эту
 я
-""".split())
\ No newline at end of file
+""".split())


@@ -26,4 +26,5 @@ TOKENIZER_EXCEPTIONS = {
     "Вс.": [
         {ORTH: "Вс.", LEMMA: "Воскресенье"}
     ],
-}
\ No newline at end of file
+}
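
For context on the exception format: a standalone sketch of how a lookup against an exceptions dict like the one above could resolve an abbreviation to its lemma. ORTH and LEMMA here are plain strings standing in for spaCy's attribute symbols, and lemma_for is a hypothetical helper, not spaCy's actual lookup code.

# Sketch only: string keys stand in for spaCy's ORTH/LEMMA symbols.
ORTH, LEMMA = "orth", "lemma"

TOKENIZER_EXCEPTIONS = {
    "Вс.": [
        {ORTH: "Вс.", LEMMA: "Воскресенье"}  # "Вс." abbreviates "Воскресенье" (Sunday)
    ],
}

def lemma_for(token_text):
    # Hypothetical helper: use the registered analysis if one exists,
    # otherwise fall back to the surface form itself.
    analyses = TOKENIZER_EXCEPTIONS.get(token_text)
    return analyses[0][LEMMA] if analyses else token_text

print(lemma_for("Вс."))  # Воскресенье
print(lemma_for("мир"))  # мир (no exception registered)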