From 4dfd559e5569f73846d5280d86487104f8550b0d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 18 Jan 2022 17:12:42 +0100 Subject: [PATCH] Fix spaces in Doc.from_docs for empty docs (#10052) Fix spaces in `Doc.from_docs(ensure_whitespace=True)` for cases where a doc ending in whitespace is followed by an empty doc. --- spacy/tests/doc/test_doc_api.py | 5 +++-- spacy/tokens/doc.pyx | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index c6195d7e2..10700b787 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): "Merging the docs is fun.", "", "They don't think alike. ", + "", "Another doc.", ] en_texts_without_empty = [t for t in en_texts if len(t)] @@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): en_docs = [en_tokenizer(text) for text in en_texts] en_docs[0].spans["group"] = [en_docs[0][1:4]] en_docs[2].spans["group"] = [en_docs[2][1:4]] - en_docs[3].spans["group"] = [en_docs[3][0:1]] + en_docs[4].spans["group"] = [en_docs[4][0:1]] span_group_texts = sorted( - [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text] + [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text] ) de_doc = de_tokenizer(de_text) Token.set_extension("is_ambiguous", default=False) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 362a17784..2f82a0d1b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1183,7 +1183,7 @@ cdef class Doc: token_offset = -1 for doc in docs[:-1]: token_offset += len(doc) - if not (len(doc) > 0 and doc[-1].is_space): + if len(doc) > 0 and not doc[-1].is_space: concat_spaces[token_offset] = True concat_array = numpy.concatenate(arrays)