Handle partial entities in Span.as_doc (#8055)

* Handle partial entities in Span.as_doc

In `Span.as_doc`, replace partial entities at the beginning or end of the
span with missing entity annotation.

Fixes a bug where an invalid entity annotation (no initial `B`) was
returned when the span started with a partial entity.

* Check for empty span in ents conversion

Note: `Span.as_doc()` will still fail on an empty span due to failures
in `Span.vector`.
This commit is contained in:
Adriane Boyd 2021-05-11 17:10:16 +02:00 committed by GitHub
parent 3883d49446
commit d5bbd1f94f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 32 additions and 1 deletion

View File

@ -14,9 +14,11 @@ def doc(en_tokenizer):
heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12] heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12]
deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det", deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
"attr", "punct", "ROOT", "det", "npadvmod", "punct"] "attr", "punct", "ROOT", "det", "npadvmod", "punct"]
ents = ["O", "O", "B-ENT", "I-ENT", "I-ENT", "I-ENT", "I-ENT", "O", "O",
"O", "O", "O", "O", "O"]
# fmt: on # fmt: on
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, ents=ents)
@pytest.fixture @pytest.fixture
@ -220,6 +222,17 @@ def test_span_as_doc(doc):
assert span_doc is not doc assert span_doc is not doc
assert span_doc[0].idx == 0 assert span_doc[0].idx == 0
# partial initial entity is removed
assert len(span_doc.ents) == 0
# full entity is preserved
span_doc = doc[2:10].as_doc()
assert len(span_doc.ents) == 1
# partial final entity is removed
span_doc = doc[0:5].as_doc()
assert len(span_doc.ents) == 0
@pytest.mark.usefixtures("clean_underscore") @pytest.mark.usefixtures("clean_underscore")
def test_span_as_doc_user_data(doc): def test_span_as_doc_user_data(doc):

View File

@ -228,7 +228,25 @@ cdef class Span:
array = self.doc.to_array(array_head) array = self.doc.to_array(array_head)
array = array[self.start : self.end] array = array[self.start : self.end]
self._fix_dep_copy(array_head, array) self._fix_dep_copy(array_head, array)
# Fix initial IOB so the entities are valid for doc.ents below.
if len(array) > 0 and ENT_IOB in array_head:
ent_iob_col = array_head.index(ENT_IOB)
if array[0][ent_iob_col] == 1:
array[0][ent_iob_col] = 3
doc.from_array(array_head, array) doc.from_array(array_head, array)
# Set partial entities at the beginning or end of the span to have
# missing entity annotation. Note: the initial partial entity could be
# detected from the IOB annotation but the final partial entity can't,
# so detect and remove both in the same way by checking self.ents.
span_ents = {(ent.start, ent.end) for ent in self.ents}
doc_ents = doc.ents
if len(doc_ents) > 0:
# Remove initial partial ent
if (doc_ents[0].start + self.start, doc_ents[0].end + self.start) not in span_ents:
doc.set_ents([], missing=[doc_ents[0]], default="unmodified")
# Remove final partial ent
if (doc_ents[-1].start + self.start, doc_ents[-1].end + self.start) not in span_ents:
doc.set_ents([], missing=[doc_ents[-1]], default="unmodified")
doc.noun_chunks_iterator = self.doc.noun_chunks_iterator doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
doc.user_hooks = self.doc.user_hooks doc.user_hooks = self.doc.user_hooks
doc.user_span_hooks = self.doc.user_span_hooks doc.user_span_hooks = self.doc.user_span_hooks