fix ja leading spaces (#5969)

* change the condition for detecting a space after a word

* add NAUGHTY_STRINGS test example
This commit is contained in:
Hiroshi Matsuda 2020-08-25 21:16:24 +09:00 committed by GitHub
parent 450720aca2
commit 332803eda9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 3 additions and 2 deletions

View File

@@ -98,7 +98,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
return text_dtokens, text_spaces
# align words and dtokens by referring text, and insert gap tokens for the space char spans
for word, dtoken in zip(words, dtokens):
for i, (word, dtoken) in enumerate(zip(words, dtokens)):
# skip all space tokens
if word.isspace():
continue
@@ -119,7 +119,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
text_spaces.append(False)
text_pos += len(word)
# poll a space char after the word
if text_pos < len(text) and text[text_pos] == " ":
if i + 1 < len(dtokens) and dtokens[i + 1].surface == " ":
text_spaces[-1] = True
text_pos += 1

View File

@@ -32,6 +32,7 @@ NAUGHTY_STRINGS = [
r"₀₁₂",
r"⁰⁴⁵₀₁₂",
r"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็",
r" ̄ ̄",
# Two-Byte Characters
r"田中さんにあげて下さい",
r"パーティーへ行かないか",