mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-23 15:54:13 +03:00
Fix spaces in Doc.from_docs for empty docs (#10052)
Fix spaces in `Doc.from_docs(ensure_whitespace=True)` for cases where a doc ending in whitespace is followed by an empty doc.
This commit is contained in:
parent
c28e33637b
commit
4dfd559e55
|
@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
|||
"Merging the docs is fun.",
|
||||
"",
|
||||
"They don't think alike. ",
|
||||
"",
|
||||
"Another doc.",
|
||||
]
|
||||
en_texts_without_empty = [t for t in en_texts if len(t)]
|
||||
|
@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
|||
en_docs = [en_tokenizer(text) for text in en_texts]
|
||||
en_docs[0].spans["group"] = [en_docs[0][1:4]]
|
||||
en_docs[2].spans["group"] = [en_docs[2][1:4]]
|
||||
en_docs[3].spans["group"] = [en_docs[3][0:1]]
|
||||
en_docs[4].spans["group"] = [en_docs[4][0:1]]
|
||||
span_group_texts = sorted(
|
||||
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
|
||||
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
|
||||
)
|
||||
de_doc = de_tokenizer(de_text)
|
||||
Token.set_extension("is_ambiguous", default=False)
|
||||
|
|
|
@ -1183,7 +1183,7 @@ cdef class Doc:
|
|||
token_offset = -1
|
||||
for doc in docs[:-1]:
|
||||
token_offset += len(doc)
|
||||
if not (len(doc) > 0 and doc[-1].is_space):
|
||||
if len(doc) > 0 and not doc[-1].is_space:
|
||||
concat_spaces[token_offset] = True
|
||||
|
||||
concat_array = numpy.concatenate(arrays)
|
||||
|
|
Loading…
Reference in New Issue
Block a user