Various fixes for spans in Docs.from_docs (#8487)

* Fix spans offsets if a doc ends in a single space and no space is
  inserted
* Also include spans key in merged doc for empty spans lists
This commit is contained in:
Adriane Boyd 2021-06-23 15:51:35 +02:00 committed by GitHub
parent ed1ba13439
commit 393c3c70d7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 35 additions and 11 deletions

View File

@ -346,17 +346,25 @@ def test_doc_from_array_morph(en_vocab):
@pytest.mark.usefixtures("clean_underscore") @pytest.mark.usefixtures("clean_underscore")
def test_doc_api_from_docs(en_tokenizer, de_tokenizer): def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_texts = ["Merging the docs is fun.", "", "They don't think alike."] en_texts = [
"Merging the docs is fun.",
"",
"They don't think alike. ",
"Another doc.",
]
en_texts_without_empty = [t for t in en_texts if len(t)] en_texts_without_empty = [t for t in en_texts if len(t)]
de_text = "Wie war die Frage?" de_text = "Wie war die Frage?"
en_docs = [en_tokenizer(text) for text in en_texts] en_docs = [en_tokenizer(text) for text in en_texts]
en_docs[0].spans["group"] = [en_docs[0][1:4]] en_docs[0].spans["group"] = [en_docs[0][1:4]]
en_docs[2].spans["group"] = [en_docs[2][1:4]] en_docs[2].spans["group"] = [en_docs[2][1:4]]
span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text]) en_docs[3].spans["group"] = [en_docs[3][0:1]]
span_group_texts = sorted(
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
)
de_doc = de_tokenizer(de_text) de_doc = de_tokenizer(de_text)
Token.set_extension("is_ambiguous", default=False) Token.set_extension("is_ambiguous", default=False)
en_docs[0][2]._.is_ambiguous = True # docs en_docs[0][2]._.is_ambiguous = True # docs
en_docs[2][3]._.is_ambiguous = True # think en_docs[2][3]._.is_ambiguous = True # think
assert Doc.from_docs([]) is None assert Doc.from_docs([]) is None
assert de_doc is not Doc.from_docs([de_doc]) assert de_doc is not Doc.from_docs([de_doc])
assert str(de_doc) == str(Doc.from_docs([de_doc])) assert str(de_doc) == str(Doc.from_docs([de_doc]))
@ -366,8 +374,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
m_doc = Doc.from_docs(en_docs) m_doc = Doc.from_docs(en_docs)
assert len(en_texts_without_empty) == len(list(m_doc.sents)) assert len(en_texts_without_empty) == len(list(m_doc.sents))
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == " ".join(en_texts_without_empty) assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
p_token = m_doc[len(en_docs[0]) - 1] p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_) assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc] en_docs_tokens = [t for doc in en_docs for t in doc]
@ -379,11 +387,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert not any([t._.is_ambiguous for t in m_doc[3:8]]) assert not any([t._.is_ambiguous for t in m_doc[3:8]])
assert "group" in m_doc.spans assert "group" in m_doc.spans
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
assert bool(m_doc[11].whitespace_)
m_doc = Doc.from_docs(en_docs, ensure_whitespace=False) m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
assert len(en_texts_without_empty) == len(list(m_doc.sents)) assert len(en_texts_without_empty) == len(list(m_doc.sents))
assert len(str(m_doc)) == sum(len(t) for t in en_texts) assert len(m_doc.text) == sum(len(t) for t in en_texts)
assert str(m_doc) == "".join(en_texts) assert m_doc.text == "".join(en_texts_without_empty)
p_token = m_doc[len(en_docs[0]) - 1] p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and not bool(p_token.whitespace_) assert p_token.text == "." and not bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc] en_docs_tokens = [t for doc in en_docs for t in doc]
@ -392,11 +401,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert m_doc[9].idx == think_idx assert m_doc[9].idx == think_idx
assert "group" in m_doc.spans assert "group" in m_doc.spans
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
assert bool(m_doc[11].whitespace_)
m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"]) m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
# space delimiter considered, although spacy attribute was missing # space delimiter considered, although spacy attribute was missing
assert str(m_doc) == " ".join(en_texts_without_empty) assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
p_token = m_doc[len(en_docs[0]) - 1] p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_) assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc] en_docs_tokens = [t for doc in en_docs for t in doc]
@ -409,6 +419,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
# can merge empty docs # can merge empty docs
doc = Doc.from_docs([en_tokenizer("")] * 10) doc = Doc.from_docs([en_tokenizer("")] * 10)
# empty but set spans keys are preserved
en_docs = [en_tokenizer(text) for text in en_texts]
m_doc = Doc.from_docs(en_docs)
assert "group" not in m_doc.spans
for doc in en_docs:
doc.spans["group"] = []
m_doc = Doc.from_docs(en_docs)
assert "group" in m_doc.spans
assert len(m_doc.spans["group"]) == 0
def test_doc_api_from_docs_ents(en_tokenizer): def test_doc_api_from_docs_ents(en_tokenizer):
texts = ["Merging the docs is fun.", "They don't think alike."] texts = ["Merging the docs is fun.", "They don't think alike."]

View File

@ -1141,6 +1141,10 @@ cdef class Doc:
else: else:
warnings.warn(Warnings.W102.format(key=key, value=value)) warnings.warn(Warnings.W102.format(key=key, value=value))
for key in doc.spans: for key in doc.spans:
# if a spans key is in any doc, include it in the merged doc
# even if it is empty
if key not in concat_spans:
concat_spans[key] = []
for span in doc.spans[key]: for span in doc.spans[key]:
concat_spans[key].append(( concat_spans[key].append((
span.start_char + char_offset, span.start_char + char_offset,
@ -1150,7 +1154,7 @@ cdef class Doc:
span.text, # included as a check span.text, # included as a check
)) ))
char_offset += len(doc.text) char_offset += len(doc.text)
if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space: if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
char_offset += 1 char_offset += 1
arrays = [doc.to_array(attrs) for doc in docs] arrays = [doc.to_array(attrs) for doc in docs]