diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 30e73fd84..80cb7a837 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -98,7 +98,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): return text_dtokens, text_spaces # align words and dtokens by referring text, and insert gap tokens for the space char spans - for word, dtoken in zip(words, dtokens): + for i, (word, dtoken) in enumerate(zip(words, dtokens)): # skip all space tokens if word.isspace(): continue @@ -119,7 +119,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): text_spaces.append(False) text_pos += len(word) # poll a space char after the word - if text_pos < len(text) and text[text_pos] == " ": + if i + 1 < len(dtokens) and dtokens[i + 1].surface == " ": text_spaces[-1] = True text_pos += 1 diff --git a/spacy/tests/tokenizer/test_naughty_strings.py b/spacy/tests/tokenizer/test_naughty_strings.py index 36c69611e..9737b15cf 100644 --- a/spacy/tests/tokenizer/test_naughty_strings.py +++ b/spacy/tests/tokenizer/test_naughty_strings.py @@ -32,6 +32,7 @@ NAUGHTY_STRINGS = [ r"₀₁₂", r"⁰⁴⁵₀₁₂", r"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็", + r" ̄ ̄", # Two-Byte Characters r"田中さんにあげて下さい", r"パーティーへ行かないか",