Fix sentence indexing bug in Span.sents (#12405)

* Add test for partial sentences in ent.sents.

* Remove unneeded import.

* Format. Simplify code.
This commit is contained in:
Raphael Mitsch 2023-03-14 10:21:53 +01:00 committed by GitHub
parent ea6de64596
commit e8cab4625c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 18 additions and 3 deletions

View File

@ -700,3 +700,19 @@ def test_span_group_copy(doc):
assert len(doc.spans["test"]) == 3
# check that the copy spans were not modified and this is an isolated doc
assert len(doc_copy.spans["test"]) == 2
def test_for_partial_ent_sents():
    """Check that an entity's .sents are complete sentences, never partial ones.

    Spans may be associated with multiple sentences. The entity built here
    (tokens 1 up to, but not including, 4) overlaps both sentences of the doc,
    so ent.sents is expected to yield the same sentences as doc.sents.
    """
    doc = Doc(
        English().vocab,
        words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
        sent_starts=[1, 0, 0, 1, 0, 0],
    )
    doc.set_ents([Span(doc, 1, 4, "WORK")])
    # Materialize both generators so they can be measured as well as compared.
    doc_sents = list(doc.sents)
    ent_sents = list(doc.ents[0].sents)
    # zip() stops at the shorter iterable, so a pairwise loop on its own would
    # pass even if ent.sents yielded too few (or no) sentences. Pin the count
    # first.
    assert len(ent_sents) == len(doc_sents)
    # The specified entity is associated with both sentences in this doc, so we
    # expect all sentences in the doc to be equal to the sentences referenced
    # in ent.sents.
    for doc_sent, ent_sent in zip(doc_sents, ent_sents):
        assert doc_sent == ent_sent

View File

@ -460,9 +460,8 @@ cdef class Span:
start = i
if start >= self.end:
break
if start < self.end:
yield Span(self.doc, start, self.end)
elif i == self.doc.length - 1:
yield Span(self.doc, start, self.doc.length)
@property
def ents(self):