From 8d064872ff25c23ed6bfe0a7758456ce31a2ddf7 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Wed, 29 Mar 2023 18:54:47 +0200
Subject: [PATCH] Fix Span.sents for edge case of Span being the only Span in the last sentence of a Doc. (#12484)

---
 spacy/tests/doc/test_span.py | 15 +++++++++++++++
 spacy/tokens/span.pyx | 4 ++++
 2 files changed, 19 insertions(+)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index adef5922f..a5c512dc0 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -716,3 +716,18 @@ def test_for_partial_ent_sents():
     # equal to the sentences referenced in ent.sents.
     for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents):
         assert doc_sent == ent_sent
+
+
+def test_for_no_ent_sents():
+    """Span.sents should yield exactly one sentence, matching Span.sent, when the Span is the single-token
+    final sentence of the Doc.
+    """
+    doc = Doc(
+        English().vocab,
+        words=["This", "is", "a", "test.", "ENTITY"],
+        sent_starts=[1, 0, 0, 0, 1],  # the final token "ENTITY" starts its own sentence
+    )
+    doc.set_ents([Span(doc, 4, 5, "WORK")])  # the entity covers only the last token
+    sents = list(doc.ents[0].sents)
+    assert len(sents) == 1
+    assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 7750b16ed..29b8ce703 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -463,6 +463,10 @@ cdef class Span:
                 elif i == self.doc.length - 1:
                     yield Span(self.doc, start, self.doc.length)
 
+            # Ensure that trailing parts of the Span instance are included in last element of .sents.
+            if start == self.doc.length - 1:
+                yield Span(self.doc, start, self.doc.length)
+
     @property
     def ents(self):
         """The named entities that fall completely within the span. Returns