mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
fix ja leading spaces (#5969)
* change condition for space after
* add NAUGHTY_STRINGS test example
This commit is contained in:
parent
450720aca2
commit
332803eda9
|
@@ -98,7 +98,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
|
|||
return text_dtokens, text_spaces
|
||||
|
||||
# align words and dtokens by referring text, and insert gap tokens for the space char spans
|
||||
for word, dtoken in zip(words, dtokens):
|
||||
for i, (word, dtoken) in enumerate(zip(words, dtokens)):
|
||||
# skip all space tokens
|
||||
if word.isspace():
|
||||
continue
|
||||
|
@@ -119,7 +119,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
|
|||
text_spaces.append(False)
|
||||
text_pos += len(word)
|
||||
# poll a space char after the word
|
||||
if text_pos < len(text) and text[text_pos] == " ":
|
||||
if i + 1 < len(dtokens) and dtokens[i + 1].surface == " ":
|
||||
text_spaces[-1] = True
|
||||
text_pos += 1
|
||||
|
||||
|
|
|
@@ -32,6 +32,7 @@ NAUGHTY_STRINGS = [
|
|||
r"₀₁₂",
|
||||
r"⁰⁴⁵₀₁₂",
|
||||
r"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็",
|
||||
r" ̄ ̄",
|
||||
# Two-Byte Characters
|
||||
r"田中さんにあげて下さい",
|
||||
r"パーティーへ行かないか",
|
||||
|
|
Loading…
Reference in New Issue
Block a user