mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-24 00:04:15 +03:00
Fix spaces in Doc.from_docs for empty docs (#10052)
Fix spaces in `Doc.from_docs(ensure_whitespace=True)` for cases where a doc ending in whitespace is followed by an empty doc.
This commit is contained in:
parent
c28e33637b
commit
4dfd559e55
|
@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||||
"Merging the docs is fun.",
|
"Merging the docs is fun.",
|
||||||
"",
|
"",
|
||||||
"They don't think alike. ",
|
"They don't think alike. ",
|
||||||
|
"",
|
||||||
"Another doc.",
|
"Another doc.",
|
||||||
]
|
]
|
||||||
en_texts_without_empty = [t for t in en_texts if len(t)]
|
en_texts_without_empty = [t for t in en_texts if len(t)]
|
||||||
|
@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||||
en_docs = [en_tokenizer(text) for text in en_texts]
|
en_docs = [en_tokenizer(text) for text in en_texts]
|
||||||
en_docs[0].spans["group"] = [en_docs[0][1:4]]
|
en_docs[0].spans["group"] = [en_docs[0][1:4]]
|
||||||
en_docs[2].spans["group"] = [en_docs[2][1:4]]
|
en_docs[2].spans["group"] = [en_docs[2][1:4]]
|
||||||
en_docs[3].spans["group"] = [en_docs[3][0:1]]
|
en_docs[4].spans["group"] = [en_docs[4][0:1]]
|
||||||
span_group_texts = sorted(
|
span_group_texts = sorted(
|
||||||
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
|
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
|
||||||
)
|
)
|
||||||
de_doc = de_tokenizer(de_text)
|
de_doc = de_tokenizer(de_text)
|
||||||
Token.set_extension("is_ambiguous", default=False)
|
Token.set_extension("is_ambiguous", default=False)
|
||||||
|
|
|
@ -1183,7 +1183,7 @@ cdef class Doc:
|
||||||
token_offset = -1
|
token_offset = -1
|
||||||
for doc in docs[:-1]:
|
for doc in docs[:-1]:
|
||||||
token_offset += len(doc)
|
token_offset += len(doc)
|
||||||
if not (len(doc) > 0 and doc[-1].is_space):
|
if len(doc) > 0 and not doc[-1].is_space:
|
||||||
concat_spaces[token_offset] = True
|
concat_spaces[token_offset] = True
|
||||||
|
|
||||||
concat_array = numpy.concatenate(arrays)
|
concat_array = numpy.concatenate(arrays)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user