From 332803eda9e9999434d4da41e56d1689f353bbd8 Mon Sep 17 00:00:00 2001
From: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com>
Date: Tue, 25 Aug 2020 21:16:24 +0900
Subject: [PATCH] fix ja leading spaces (#5969)

* change the condition for detecting a space after a token (check the next token's surface instead of the raw text character)

* add NAUGHTY_STRINGS test example
---
 spacy/lang/ja/__init__.py                     | 4 ++--
 spacy/tests/tokenizer/test_naughty_strings.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 30e73fd84..80cb7a837 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -98,7 +98,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
         return text_dtokens, text_spaces
 
     # align words and dtokens by referring text, and insert gap tokens for the space char spans
-    for word, dtoken in zip(words, dtokens):
+    for i, (word, dtoken) in enumerate(zip(words, dtokens)):
         # skip all space tokens
         if word.isspace():
             continue
@@ -119,7 +119,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
         text_spaces.append(False)
         text_pos += len(word)
         # poll a space char after the word
-        if text_pos < len(text) and text[text_pos] == " ":
+        if i + 1 < len(dtokens) and dtokens[i + 1].surface == " ":
             text_spaces[-1] = True
             text_pos += 1
 
diff --git a/spacy/tests/tokenizer/test_naughty_strings.py b/spacy/tests/tokenizer/test_naughty_strings.py
index 36c69611e..9737b15cf 100644
--- a/spacy/tests/tokenizer/test_naughty_strings.py
+++ b/spacy/tests/tokenizer/test_naughty_strings.py
@@ -32,6 +32,7 @@ NAUGHTY_STRINGS = [
     r"₀₁₂",
     r"⁰⁴⁵₀₁₂",
     r"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็",
+    r" ̄  ̄",
     # Two-Byte Characters
     r"田中さんにあげて下さい",
     r"パーティーへ行かないか",