From 29a9e636eb8b0f22f91eee85b4a10b8cdade4ed2 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Fri, 13 Sep 2019 23:28:12 +0900
Subject: [PATCH] Fix half-width space handling in JA (#4284) (closes #4262)

Before this patch, half-width spaces between words were simply lost in
Japanese text. This wasn't immediately noticeable because much Japanese
text never uses spaces at all.
---
 spacy/lang/ja/__init__.py             | 23 ++++++++++++++++++++---
 spacy/lang/ja/tag_map.py              |  4 +++-
 spacy/tests/lang/ja/test_tokenizer.py |  6 ++++++
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 3a6074bba..791b1ec33 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -37,6 +37,11 @@ def resolve_pos(token):
     in the sentence. This function adds information to the POS tag to
     resolve ambiguous mappings.
     """
+
+    # This is only used for consecutive ASCII spaces
+    if token.pos == '空白':
+        return '空白'
+
     # TODO: This is a first take. The rules here are crude approximations.
     # For many of these, full dependencies are needed to properly resolve
     # PoS mappings.
@@ -54,6 +59,7 @@ def detailed_tokens(tokenizer, text):
     node = tokenizer.parseToNode(text)
     node = node.next  # first node is beginning of sentence and empty, skip it
     words = []
+    spaces = []
     while node.posid != 0:
         surface = node.surface
         base = surface  # a default value. Updated if available later.
@@ -64,8 +70,20 @@
             # dictionary
             base = parts[7]
         words.append(ShortUnitWord(surface, base, pos))
+
+        # The way MeCab stores spaces is that the rlength of the next token is
+        # the length of that token plus any preceding whitespace, **in bytes**.
+        # Also note that this only applies to half-width / ASCII spaces;
+        # full-width spaces become tokens of their own.
+        scount = node.next.rlength - node.next.length
+        spaces.append(bool(scount))
+        while scount > 1:
+            words.append(ShortUnitWord(' ', ' ', '空白'))
+            spaces.append(False)
+            scount -= 1
+
         node = node.next
-    return words
+    return words, spaces
 
 
 class JapaneseTokenizer(DummyTokenizer):
@@ -75,9 +93,8 @@ class JapaneseTokenizer(DummyTokenizer):
         self.tokenizer.parseToNode("")  # see #2901
 
     def __call__(self, text):
-        dtokens = detailed_tokens(self.tokenizer, text)
+        dtokens, spaces = detailed_tokens(self.tokenizer, text)
         words = [x.surface for x in dtokens]
-        spaces = [False] * len(words)
         doc = Doc(self.vocab, words=words, spaces=spaces)
         mecab_tags = []
         for token, dtoken in zip(doc, dtokens):
diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py
index 6b114eb10..4ff0a35ee 100644
--- a/spacy/lang/ja/tag_map.py
+++ b/spacy/lang/ja/tag_map.py
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN
-from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET
+from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE
 
 
 TAG_MAP = {
@@ -21,6 +21,8 @@
     "感動詞,一般,*,*": {POS: INTJ},
     # this is specifically for unicode full-width space
     "空白,*,*,*": {POS: X},
+    # This is used when sequential half-width spaces are present
+    "空白": {POS: SPACE},
     "形状詞,一般,*,*": {POS: ADJ},
     "形状詞,タリ,*,*": {POS: ADJ},
     "形状詞,助動詞語幹,*,*": {POS: ADJ},
diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py
index c95e7bc40..38ca37bc9 100644
--- a/spacy/tests/lang/ja/test_tokenizer.py
+++ b/spacy/tests/lang/ja/test_tokenizer.py
@@ -47,3 +47,9 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
+
+def test_extra_spaces(ja_tokenizer):
+    # note: three spaces after "I"
+    tokens = ja_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == ' '
+    assert tokens[2].orth_ == ' '
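For context (not part of the patch): a minimal sketch of the behaviour this change is meant to
produce, assuming a spaCy checkout with the patch applied and the Japanese dependencies
(mecab-python3 plus a dictionary) installed so the Japanese language class can load.

    from spacy.lang.ja import Japanese

    nlp = Japanese()
    # Three half-width spaces after "I", matching the new test above.
    doc = nlp("I   like cheese.")

    # The first space is kept as trailing whitespace on "I"; each additional
    # half-width space becomes its own token tagged 空白 (mapped to SPACE),
    # so the input round-trips instead of the spaces being dropped.
    assert doc[1].orth_ == " "
    assert doc[2].orth_ == " "
    assert doc.text == "I   like cheese."

Before the patch, spaces was hard-coded to [False] * len(words), so the extra whitespace
disappeared and doc.text could not reproduce the input; with the spaces list returned by
detailed_tokens, the Doc reconstructs the original string exactly.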