mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Handle empty and whitespace-only docs for Japanese (#5564)
Handle empty and whitespace-only docs in the custom alignment method used by the Japanese tokenizer.
This commit is contained in:
parent
de00f967ce
commit
f162815f45
|
@ -139,6 +139,16 @@ def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
|
|||
text_tags = []
|
||||
text_spaces = []
|
||||
text_pos = 0
|
||||
# handle empty and whitespace-only texts
|
||||
if len(words) == 0:
|
||||
return text_words, text_lemmas, text_tags, text_spaces
|
||||
elif len([word for word in words if not word.isspace()]) == 0:
|
||||
assert text.isspace()
|
||||
text_words = [text]
|
||||
text_lemmas = [text]
|
||||
text_tags = [gap_tag]
|
||||
text_spaces = [False]
|
||||
return text_words, text_lemmas, text_tags, text_spaces
|
||||
# normalize words to remove all whitespace tokens
|
||||
norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()])
|
||||
# align words with text
|
||||
|
|
|
@ -93,3 +93,12 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
|
|||
assert len(nlp_a(text)) == len_a
|
||||
assert len(nlp_b(text)) == len_b
|
||||
assert len(nlp_c(text)) == len_c
|
||||
|
||||
|
||||
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
    """Check that empty and whitespace-only inputs tokenize cleanly.

    An empty string must yield a doc with no tokens, while a text made up
    solely of whitespace must yield a doc with exactly one token.
    """
    # the empty string produces an empty doc
    assert len(ja_tokenizer("")) == 0
    # whitespace-only texts, short or long, produce exactly one token
    for blank_text in (" ", "\n\n\n \t\t \n\n\n"):
        assert len(ja_tokenizer(blank_text)) == 1
|
||||
|
|
Loading…
Reference in New Issue
Block a user