From e8cab4625c12666ef599f19eb60403500af2a385 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Tue, 14 Mar 2023 10:21:53 +0100
Subject: [PATCH] Fix sentence indexing bug in `Span.sents` (#12405)

* Add test for partial sentences in ent.sents.

* Removed unneeded import.

* Format. Simplify code.
---
 spacy/tests/doc/test_span.py | 16 ++++++++++++++++
 spacy/tokens/span.pyx        |  5 ++---
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index b4631037a..adef5922f 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -700,3 +700,19 @@ def test_span_group_copy(doc):
     assert len(doc.spans["test"]) == 3
     # check that the copy spans were not modified and this is an isolated doc
     assert len(doc_copy.spans["test"]) == 2
+
+
+def test_for_partial_ent_sents():
+    """Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences,
+    which this tests for.
+    """
+    doc = Doc(
+        English().vocab,
+        words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
+        sent_starts=[1, 0, 0, 1, 0, 0],
+    )
+    doc.set_ents([Span(doc, 1, 4, "WORK")])
+    # The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be
+    # equal to the sentences referenced in ent.sents.
+    for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents):
+        assert doc_sent == ent_sent
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index cfe1236df..7750b16ed 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -460,9 +460,8 @@ cdef class Span:
                     start = i
                     if start >= self.end:
                         break
-            if start < self.end:
-                yield Span(self.doc, start, self.end)
-
+                elif i == self.doc.length - 1:
+                    yield Span(self.doc, start, self.doc.length)
 
     @property
     def ents(self):