From 332803eda9e9999434d4da41e56d1689f353bbd8 Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com> Date: Tue, 25 Aug 2020 21:16:24 +0900 Subject: [PATCH] fix ja leading spaces (#5969) * change condition for space after * add NAUGHTY_STRINGS test example --- spacy/lang/ja/__init__.py | 4 ++-- spacy/tests/tokenizer/test_naughty_strings.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 30e73fd84..80cb7a837 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -98,7 +98,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): return text_dtokens, text_spaces # align words and dtokens by referring text, and insert gap tokens for the space char spans - for word, dtoken in zip(words, dtokens): + for i, (word, dtoken) in enumerate(zip(words, dtokens)): # skip all space tokens if word.isspace(): continue @@ -119,7 +119,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): text_spaces.append(False) text_pos += len(word) # poll a space char after the word - if text_pos < len(text) and text[text_pos] == " ": + if i + 1 < len(dtokens) and dtokens[i + 1].surface == " ": text_spaces[-1] = True text_pos += 1 diff --git a/spacy/tests/tokenizer/test_naughty_strings.py b/spacy/tests/tokenizer/test_naughty_strings.py index 36c69611e..9737b15cf 100644 --- a/spacy/tests/tokenizer/test_naughty_strings.py +++ b/spacy/tests/tokenizer/test_naughty_strings.py @@ -32,6 +32,7 @@ NAUGHTY_STRINGS = [ r"₀₁₂", r"⁰⁴⁵₀₁₂", r"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็", + r" ̄ ̄", # Two-Byte Characters r"田中さんにあげて下さい", r"パーティーへ行かないか",