From 5af432e0f2db1d6aeba7a031a8a707fb90b6332a Mon Sep 17 00:00:00 2001 From: Yohei Tamura Date: Thu, 3 Sep 2020 17:09:03 +0900 Subject: [PATCH] fix for empty string (#5936) --- spacy/tests/doc/test_doc_api.py | 19 ++++++++++--------- spacy/tokens/doc.pyx | 6 ++++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 954181df5..b37a31e43 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -317,7 +317,8 @@ def test_doc_from_array_morph(en_vocab): def test_doc_api_from_docs(en_tokenizer, de_tokenizer): - en_texts = ["Merging the docs is fun.", "They don't think alike."] + en_texts = ["Merging the docs is fun.", "", "They don't think alike."] + en_texts_without_empty = [t for t in en_texts if len(t)] de_text = "Wie war die Frage?" en_docs = [en_tokenizer(text) for text in en_texts] docs_idx = en_texts[0].index("docs") @@ -338,14 +339,14 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): Doc.from_docs(en_docs + [de_doc]) m_doc = Doc.from_docs(en_docs) - assert len(en_docs) == len(list(m_doc.sents)) + assert len(en_texts_without_empty) == len(list(m_doc.sents)) assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) - assert str(m_doc) == " ".join(en_texts) + assert str(m_doc) == " ".join(en_texts_without_empty) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] assert len(m_doc) == len(en_docs_tokens) - think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think") + think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think") assert m_doc[9].idx == think_idx with pytest.raises(AttributeError): # not callable, because it was not set via set_extension @@ -353,14 +354,14 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there m_doc = Doc.from_docs(en_docs, ensure_whitespace=False) - assert len(en_docs) == len(list(m_doc.sents)) - assert len(str(m_doc)) == len(en_texts[0]) + len(en_texts[1]) + assert len(en_texts_without_empty) == len(list(m_doc.sents)) + assert len(str(m_doc)) == sum(len(t) for t in en_texts) assert str(m_doc) == "".join(en_texts) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and not bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] assert len(m_doc) == len(en_docs_tokens) - think_idx = len(en_texts[0]) + 0 + en_texts[1].index("think") + think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think") assert m_doc[9].idx == think_idx m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"]) @@ -369,12 +370,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert list(m_doc.sents) assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) # space delimiter considered, although spacy attribute was missing - assert str(m_doc) == " ".join(en_texts) + assert str(m_doc) == " ".join(en_texts_without_empty) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] assert len(m_doc) == len(en_docs_tokens) - think_idx = len(en_texts[0]) + 1 + en_texts[1].index("think") + think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think") assert m_doc[9].idx == think_idx diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index cd080bf35..3c7b4f8b3 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -920,7 +920,9 @@ cdef class Doc: warnings.warn(Warnings.W101.format(name=name)) else: warnings.warn(Warnings.W102.format(key=key, value=value)) - char_offset += len(doc.text) if not ensure_whitespace or doc[-1].is_space else len(doc.text) + 1 + char_offset += len(doc.text) + if ensure_whitespace and not (len(doc) > 0 and doc[-1].is_space): + char_offset += 1 arrays = [doc.to_array(attrs) for doc in docs] @@ -932,7 +934,7 @@ cdef class Doc: token_offset = -1 for doc in docs[:-1]: token_offset += len(doc) - if not doc[-1].is_space: + if not (len(doc) > 0 and doc[-1].is_space): concat_spaces[token_offset] = True concat_array = numpy.concatenate(arrays)