mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Fix Doc.from_docs for all empty docs (#8009)
This commit is contained in:
parent
debaab7021
commit
a71194362f
|
@@ -411,6 +411,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
|||
assert "group" in m_doc.spans
|
||||
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
|
||||
|
||||
# can merge empty docs
|
||||
doc = Doc.from_docs([en_tokenizer("")] * 10)
|
||||
|
||||
|
||||
def test_doc_api_from_docs_ents(en_tokenizer):
|
||||
texts = ["Merging the docs is fun.", "They don't think alike."]
|
||||
|
|
|
@@ -1158,11 +1158,12 @@ cdef class Doc:
|
|||
for i, array in enumerate(arrays[:-1]):
|
||||
if len(array) > 0 and not docs[i][-1].is_space:
|
||||
array[-1][spacy_index] = 1
|
||||
token_offset = -1
|
||||
for doc in docs[:-1]:
|
||||
token_offset += len(doc)
|
||||
if not (len(doc) > 0 and doc[-1].is_space):
|
||||
concat_spaces[token_offset] = True
|
||||
if len(concat_spaces) > 0:
|
||||
token_offset = -1
|
||||
for doc in docs[:-1]:
|
||||
token_offset += len(doc)
|
||||
if not (len(doc) > 0 and doc[-1].is_space):
|
||||
concat_spaces[token_offset] = True
|
||||
|
||||
concat_array = numpy.concatenate(arrays)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user