fixed Russian Tokenizer

- added trailing space flags for tokens
yuukos 2017-10-16 13:37:05 +07:00
parent a229b6e0de
commit 241d19a3e6
4 changed files with 21 additions and 6 deletions


@@ -25,15 +25,29 @@ class RussianTokenizer(object):
         self._spacy_tokenizer = spacy_tokenizer

     def __call__(self, text):
-        words = [self._normalize(RussianTokenizer._get_word(token))
-                 for token in self._spacy_tokenizer(text)]
-        return Doc(self.vocab, words, [False] * len(words))
+        get_norm = RussianTokenizer._get_norm
+        has_space = RussianTokenizer._has_space
+        words_with_space_flags = [(get_norm(token), has_space(token, text))
+                                  for token in self._spacy_tokenizer(text)]
+        words, spaces = map(lambda s: list(s), zip(*words_with_space_flags))
+        return Doc(self.vocab, words, spaces)

     @staticmethod
     def _get_word(token):
         return token.lemma_ if len(token.lemma_) > 0 else token.text

+    @staticmethod
+    def _has_space(token, text):
+        pos_after_token = token.idx + len(token.text)
+        return pos_after_token < len(text) and text[pos_after_token] == ' '
+
+    @classmethod
+    def _get_norm(cls, token):
+        return cls._normalize(cls._get_word(token))
+
     @classmethod
     def _normalize(cls, word):
         return cls._morph.parse(word)[0].normal_form
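
For context, a minimal standalone sketch of the space-flag technique that the new _has_space helper implements. This is illustrative only, not part of the commit: it borrows a blank English pipeline just to get a tokenizer and a vocab, and the helper name tokens_with_space_flags is made up. The point is that passing real per-token space flags to Doc lets doc.text reproduce the input string, whereas the old [False] * len(words) dropped all whitespace.

import spacy
from spacy.tokens import Doc

nlp = spacy.blank('en')  # stand-in pipeline; the repo wires this up for Russian

def tokens_with_space_flags(text):
    # Hypothetical helper mirroring the commit's _has_space logic.
    words, spaces = [], []
    for token in nlp.tokenizer(text):
        words.append(token.text)
        pos_after_token = token.idx + len(token.text)
        # True when the character right after the token is a space
        # (like the commit, this checks ' ' only, not tabs or newlines).
        spaces.append(pos_after_token < len(text) and text[pos_after_token] == ' ')
    return words, spaces

words, spaces = tokens_with_space_flags('Привет, мир!')
doc = Doc(nlp.vocab, words=words, spaces=spaces)
assert doc.text == 'Привет, мир!'  # whitespace survives the round trip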


@@ -27,3 +27,4 @@ TOKENIZER_EXCEPTIONS = {
+        {ORTH: "Вс.", LEMMA: "Воскресенье"}
     ],
 }
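
And a hedged sketch of how an exception entry like the one above is typically consumed, via spaCy's special-case mechanism (spaCy 1.x/2.x-era API; spaCy v3 no longer accepts LEMMA in special cases). "Вс." abbreviates "Воскресенье" ("Sunday"); the blank English pipeline is again only a stand-in.

import spacy
from spacy.symbols import ORTH, LEMMA

nlp = spacy.blank('en')
# Keep "Вс." as a single token and attach its lemma, instead of
# letting the tokenizer split off the trailing period.
nlp.tokenizer.add_special_case('Вс.', [{ORTH: 'Вс.', LEMMA: 'Воскресенье'}])

doc = nlp('Вс. выходной')
print([t.text for t in doc])  # ['Вс.', 'выходной']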