Fix spaces in Doc.from_docs for empty docs (#10052)

Fix spaces in `Doc.from_docs(ensure_whitespace=True)` for cases where a
doc ending in whitespace is followed by an empty doc.
This commit is contained in:
Adriane Boyd 2022-01-18 17:12:42 +01:00 committed by GitHub
parent c28e33637b
commit 4dfd559e55
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 4 additions and 3 deletions

View File

@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
"Merging the docs is fun.", "Merging the docs is fun.",
"", "",
"They don't think alike. ", "They don't think alike. ",
"",
"Another doc.", "Another doc.",
] ]
en_texts_without_empty = [t for t in en_texts if len(t)] en_texts_without_empty = [t for t in en_texts if len(t)]
@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_docs = [en_tokenizer(text) for text in en_texts] en_docs = [en_tokenizer(text) for text in en_texts]
en_docs[0].spans["group"] = [en_docs[0][1:4]] en_docs[0].spans["group"] = [en_docs[0][1:4]]
en_docs[2].spans["group"] = [en_docs[2][1:4]] en_docs[2].spans["group"] = [en_docs[2][1:4]]
en_docs[3].spans["group"] = [en_docs[3][0:1]] en_docs[4].spans["group"] = [en_docs[4][0:1]]
span_group_texts = sorted( span_group_texts = sorted(
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text] [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
) )
de_doc = de_tokenizer(de_text) de_doc = de_tokenizer(de_text)
Token.set_extension("is_ambiguous", default=False) Token.set_extension("is_ambiguous", default=False)

View File

@ -1183,7 +1183,7 @@ cdef class Doc:
token_offset = -1 token_offset = -1
for doc in docs[:-1]: for doc in docs[:-1]:
token_offset += len(doc) token_offset += len(doc)
if not (len(doc) > 0 and doc[-1].is_space): if len(doc) > 0 and not doc[-1].is_space:
concat_spaces[token_offset] = True concat_spaces[token_offset] = True
concat_array = numpy.concatenate(arrays) concat_array = numpy.concatenate(arrays)