Fix half-width space handling in JA (#4284) (closes #4262)

Before this patch, half-width spaces between words were simply lost in
Japanese text. This wasn't immediately noticeable because much Japanese
text never uses spaces at all.
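
The effect is easiest to see on mixed text. A rough illustration of the intended behaviour (a hypothetical snippet, not part of this commit; assumes spaCy with mecab-python3 and a UniDic-style dictionary installed):

# Hypothetical usage sketch, not part of this commit.
import spacy

nlp = spacy.blank("ja")
doc = nlp("I like cheese.")  # contains half-width (ASCII) spaces

# Before this patch the spaces were dropped, so doc.text could not
# reproduce the input; with it, each token records its trailing space.
print([(t.text, t.whitespace_) for t in doc])
print(doc.text)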
Paul O'Leary McCann 2019-09-13 23:28:12 +09:00 committed by Ines Montani
parent 3c3658ef9f
commit 29a9e636eb
3 changed files with 29 additions and 4 deletions

spacy/lang/ja/__init__.py

@@ -37,6 +37,11 @@ def resolve_pos(token):
     in the sentence. This function adds information to the POS tag to
     resolve ambiguous mappings.
     """
+
+    # this is only used for consecutive ascii spaces
+    if token.pos == '空白':
+        return '空白'
+
     # TODO: This is a first take. The rules here are crude approximations.
     # For many of these, full dependencies are needed to properly resolve
     # PoS mappings.
@@ -54,6 +59,7 @@ def detailed_tokens(tokenizer, text):
     node = tokenizer.parseToNode(text)
     node = node.next # first node is beginning of sentence and empty, skip it
     words = []
+    spaces = []
     while node.posid != 0:
         surface = node.surface
         base = surface # a default value. Updated if available later.
@@ -64,8 +70,20 @@ def detailed_tokens(tokenizer, text):
             # dictionary
             base = parts[7]
         words.append(ShortUnitWord(surface, base, pos))
+
+        # The way MeCab stores spaces is that the rlength of the next token is
+        # the length of that token plus any preceding whitespace, **in bytes**.
+        # also note that this is only for half-width / ascii spaces. Full width
+        # spaces just become tokens.
+        scount = node.next.rlength - node.next.length
+        spaces.append(bool(scount))
+        while scount > 1:
+            words.append(ShortUnitWord(' ', ' ', '空白'))
+            spaces.append(False)
+            scount -= 1
         node = node.next
-    return words
+    return words, spaces


 class JapaneseTokenizer(DummyTokenizer):
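
A standalone sketch of the rlength trick used in detailed_tokens above (illustrative only, not part of this commit; assumes mecab-python3 with a suitable dictionary installed):

import MeCab

tagger = MeCab.Tagger()
tagger.parseToNode("")  # work around the SWIG binding issue noted above (see #2901)

node = tagger.parseToNode("I   like cheese.").next  # skip the empty BOS node
while node.posid != 0:
    # rlength covers the token plus any preceding whitespace, in bytes,
    # so the difference is the number of ASCII spaces before this token.
    print(repr(node.surface), node.rlength - node.length)
    node = node.next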
@@ -75,9 +93,8 @@ class JapaneseTokenizer(DummyTokenizer):
         self.tokenizer.parseToNode("") # see #2901

     def __call__(self, text):
-        dtokens = detailed_tokens(self.tokenizer, text)
+        dtokens, spaces = detailed_tokens(self.tokenizer, text)
         words = [x.surface for x in dtokens]
-        spaces = [False] * len(words)
         doc = Doc(self.vocab, words=words, spaces=spaces)
         mecab_tags = []
         for token, dtoken in zip(doc, dtokens):

spacy/lang/ja/tag_map.py

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals

 from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN
-from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET
+from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE


 TAG_MAP = {
@@ -21,6 +21,8 @@ TAG_MAP = {
     "感動詞,一般,*,*": {POS: INTJ},
     # this is specifically for unicode full-width space
     "空白,*,*,*": {POS: X},
+    # This is used when sequential half-width spaces are present
+    "空白": {POS: SPACE},
     "形状詞,一般,*,*": {POS: ADJ},
     "形状詞,タリ,*,*": {POS: ADJ},
     "形状詞,助動詞語幹,*,*": {POS: ADJ},

spacy/tests/lang/ja/test_tokenizer.py

@@ -47,3 +47,9 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
+
+def test_extra_spaces(ja_tokenizer):
+    # note: three spaces after "I"
+    tokens = ja_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == ' '
+    assert tokens[2].orth_ == ' '
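
A possible follow-up check (hypothetical, not part of this commit) is that the stored spaces let the document text round-trip the original input:

def test_extra_spaces_roundtrip(ja_tokenizer):
    # hypothetical extra test, not in this commit
    text = "I   like cheese."
    tokens = ja_tokenizer(text)
    assert tokens.text == text
    assert tokens[1].pos_ == "SPACE"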