From f162815f45c69dd71e194361284dbef3939fb9fc Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Mon, 8 Jun 2020 21:09:23 +0200
Subject: [PATCH] Handle empty and whitespace-only docs for Japanese (#5564)

Handle empty and whitespace-only docs in the custom alignment method
used by the Japanese tokenizer.
---
 spacy/lang/ja/__init__.py             | 10 ++++++++++
 spacy/tests/lang/ja/test_tokenizer.py |  9 +++++++++
 2 files changed, 19 insertions(+)

diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 294c6b38d..39e0445c2 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -139,6 +139,16 @@ def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
     text_tags = []
     text_spaces = []
     text_pos = 0
+    # handle empty and whitespace-only texts
+    if len(words) == 0:
+        return text_words, text_lemmas, text_tags, text_spaces
+    elif len([word for word in words if not word.isspace()]) == 0:
+        assert text.isspace()
+        text_words = [text]
+        text_lemmas = [text]
+        text_tags = [gap_tag]
+        text_spaces = [False]
+        return text_words, text_lemmas, text_tags, text_spaces
     # normalize words to remove all whitespace tokens
     norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()])
     # align words with text
diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py
index 82c43fe4c..30cba42b1 100644
--- a/spacy/tests/lang/ja/test_tokenizer.py
+++ b/spacy/tests/lang/ja/test_tokenizer.py
@@ -93,3 +93,12 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
     assert len(nlp_a(text)) == len_a
     assert len(nlp_b(text)) == len_b
     assert len(nlp_c(text)) == len_c
+
+
+def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
+    doc = ja_tokenizer("")
+    assert len(doc) == 0
+    doc = ja_tokenizer(" ")
+    assert len(doc) == 1
+    doc = ja_tokenizer("\n\n\n \t\t \n\n\n")
+    assert len(doc) == 1
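
For reference, a minimal usage sketch (not part of the patch itself) of the behavior these changes enable. It assumes spaCy with the Japanese dependencies (SudachiPy) installed, and uses a blank Japanese pipeline created with spacy.blank("ja") rather than the test fixture's ja_tokenizer:

    import spacy

    # Assumes SudachiPy is available so the Japanese tokenizer can be constructed.
    nlp = spacy.blank("ja")

    # Empty text: the alignment helper now returns early, yielding an empty Doc
    # rather than failing on the zip(*[...]) unpack over an empty token list.
    assert len(nlp("")) == 0

    # Whitespace-only texts are preserved as a single whitespace token.
    assert len(nlp(" ")) == 1
    assert len(nlp("\n\n\n \t\t \n\n\n")) == 1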