Fix spaces in Doc.from_docs for empty docs (#10052)

Fix spaces in `Doc.from_docs(ensure_whitespace=True)` for cases where an doc ending in whitespace is followed by an empty doc.
2025-07-17 11:42:30 +03:00 · 2022-01-18 17:12:42 +01:00 · 2022-01-18 17:12:42 +01:00 · 4dfd559e55
commit 4dfd559e55
parent c28e33637b
2 changed files with 4 additions and 3 deletions
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
        "Merging the docs is fun.",
        "",
        "They don't think alike. ",
        "",
        "Another doc.",
    ]
    en_texts_without_empty = [t for t in en_texts if len(t)]
@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    en_docs = [en_tokenizer(text) for text in en_texts]
    en_docs[0].spans["group"] = [en_docs[0][1:4]]
    en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    en_docs[3].spans["group"] = [en_docs[3][0:1]]
+    en_docs[4].spans["group"] = [en_docs[4][0:1]]
    span_group_texts = sorted(
-        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
    )
    de_doc = de_tokenizer(de_text)
    Token.set_extension("is_ambiguous", default=False)
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -1183,7 +1183,7 @@ cdef class Doc:
                token_offset = -1
                for doc in docs[:-1]:
                    token_offset += len(doc)
-                    if not (len(doc) > 0 and doc[-1].is_space):
+                    if len(doc) > 0 and not doc[-1].is_space:
                        concat_spaces[token_offset] = True
        concat_array = numpy.concatenate(arrays)