mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
Fix Doc.from_docs for all empty docs (#8009)
This commit is contained in:
parent
debaab7021
commit
a71194362f
|
@ -411,6 +411,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||||
assert "group" in m_doc.spans
|
assert "group" in m_doc.spans
|
||||||
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
|
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
|
||||||
|
|
||||||
|
# can merge empty docs
|
||||||
|
doc = Doc.from_docs([en_tokenizer("")] * 10)
|
||||||
|
|
||||||
|
|
||||||
def test_doc_api_from_docs_ents(en_tokenizer):
|
def test_doc_api_from_docs_ents(en_tokenizer):
|
||||||
texts = ["Merging the docs is fun.", "They don't think alike."]
|
texts = ["Merging the docs is fun.", "They don't think alike."]
|
||||||
|
|
|
@ -1158,11 +1158,12 @@ cdef class Doc:
|
||||||
for i, array in enumerate(arrays[:-1]):
|
for i, array in enumerate(arrays[:-1]):
|
||||||
if len(array) > 0 and not docs[i][-1].is_space:
|
if len(array) > 0 and not docs[i][-1].is_space:
|
||||||
array[-1][spacy_index] = 1
|
array[-1][spacy_index] = 1
|
||||||
token_offset = -1
|
if len(concat_spaces) > 0:
|
||||||
for doc in docs[:-1]:
|
token_offset = -1
|
||||||
token_offset += len(doc)
|
for doc in docs[:-1]:
|
||||||
if not (len(doc) > 0 and doc[-1].is_space):
|
token_offset += len(doc)
|
||||||
concat_spaces[token_offset] = True
|
if not (len(doc) > 0 and doc[-1].is_space):
|
||||||
|
concat_spaces[token_offset] = True
|
||||||
|
|
||||||
concat_array = numpy.concatenate(arrays)
|
concat_array = numpy.concatenate(arrays)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user