mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Handle partial entities in Span.as_doc (#8055)
* Handle partial entities in Span.as_doc

  In `Span.as_doc`, replace partial entities at the beginning or end of the span with missing entity annotation. Fixes a bug where invalid entity annotation (no initial `B`) was returned for an initial partial entity.

* Check for empty span in ents conversion

  Note: `Span.as_doc()` will still fail on an empty span due to failures in `Span.vector`.
This commit is contained in:
parent
3883d49446
commit
d5bbd1f94f
|
@ -14,9 +14,11 @@ def doc(en_tokenizer):
|
|||
heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12]
|
||||
deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
|
||||
"attr", "punct", "ROOT", "det", "npadvmod", "punct"]
|
||||
ents = ["O", "O", "B-ENT", "I-ENT", "I-ENT", "I-ENT", "I-ENT", "O", "O",
|
||||
"O", "O", "O", "O", "O"]
|
||||
# fmt: on
|
||||
tokens = en_tokenizer(text)
|
||||
return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, ents=ents)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -220,6 +222,17 @@ def test_span_as_doc(doc):
|
|||
assert span_doc is not doc
|
||||
assert span_doc[0].idx == 0
|
||||
|
||||
# partial initial entity is removed
|
||||
assert len(span_doc.ents) == 0
|
||||
|
||||
# full entity is preserved
|
||||
span_doc = doc[2:10].as_doc()
|
||||
assert len(span_doc.ents) == 1
|
||||
|
||||
# partial final entity is removed
|
||||
span_doc = doc[0:5].as_doc()
|
||||
assert len(span_doc.ents) == 0
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("clean_underscore")
|
||||
def test_span_as_doc_user_data(doc):
|
||||
|
|
|
@ -228,7 +228,25 @@ cdef class Span:
|
|||
array = self.doc.to_array(array_head)
|
||||
array = array[self.start : self.end]
|
||||
self._fix_dep_copy(array_head, array)
|
||||
# Fix initial IOB so the entities are valid for doc.ents below.
|
||||
if len(array) > 0 and ENT_IOB in array_head:
|
||||
ent_iob_col = array_head.index(ENT_IOB)
|
||||
if array[0][ent_iob_col] == 1:
|
||||
array[0][ent_iob_col] = 3
|
||||
doc.from_array(array_head, array)
|
||||
# Set partial entities at the beginning or end of the span to have
|
||||
# missing entity annotation. Note: the initial partial entity could be
|
||||
# detected from the IOB annotation but the final partial entity can't,
|
||||
# so detect and remove both in the same way by checking self.ents.
|
||||
span_ents = {(ent.start, ent.end) for ent in self.ents}
|
||||
doc_ents = doc.ents
|
||||
if len(doc_ents) > 0:
|
||||
# Remove initial partial ent
|
||||
if (doc_ents[0].start + self.start, doc_ents[0].end + self.start) not in span_ents:
|
||||
doc.set_ents([], missing=[doc_ents[0]], default="unmodified")
|
||||
# Remove final partial ent
|
||||
if (doc_ents[-1].start + self.start, doc_ents[-1].end + self.start) not in span_ents:
|
||||
doc.set_ents([], missing=[doc_ents[-1]], default="unmodified")
|
||||
doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
|
||||
doc.user_hooks = self.doc.user_hooks
|
||||
doc.user_span_hooks = self.doc.user_span_hooks
|
||||
|
|
Loading…
Reference in New Issue
Block a user