Fix Doc.from_docs when all input docs are empty (#8009)

This commit is contained in:
Adriane Boyd 2021-05-05 18:44:14 +02:00 committed by GitHub
parent debaab7021
commit a71194362f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 9 additions and 5 deletions

View File

@@ -411,6 +411,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert "group" in m_doc.spans assert "group" in m_doc.spans
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
# can merge empty docs
doc = Doc.from_docs([en_tokenizer("")] * 10)
def test_doc_api_from_docs_ents(en_tokenizer): def test_doc_api_from_docs_ents(en_tokenizer):
texts = ["Merging the docs is fun.", "They don't think alike."] texts = ["Merging the docs is fun.", "They don't think alike."]

View File

@@ -1158,11 +1158,12 @@ cdef class Doc:
for i, array in enumerate(arrays[:-1]): for i, array in enumerate(arrays[:-1]):
if len(array) > 0 and not docs[i][-1].is_space: if len(array) > 0 and not docs[i][-1].is_space:
array[-1][spacy_index] = 1 array[-1][spacy_index] = 1
token_offset = -1 if len(concat_spaces) > 0:
for doc in docs[:-1]: token_offset = -1
token_offset += len(doc) for doc in docs[:-1]:
if not (len(doc) > 0 and doc[-1].is_space): token_offset += len(doc)
concat_spaces[token_offset] = True if not (len(doc) > 0 and doc[-1].is_space):
concat_spaces[token_offset] = True
concat_array = numpy.concatenate(arrays) concat_array = numpy.concatenate(arrays)