From 75a9d9b9ade3d97dfb50ea0f989b9658866900a2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 28 May 2025 17:04:23 +0200 Subject: [PATCH] Test and fix issue13769 --- spacy/tests/doc/test_span.py | 14 +++++++++++++- spacy/tokens/span.pyx | 9 +++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 98a74bc21..38e77de29 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -49,7 +49,7 @@ def doc_not_parsed(en_tokenizer): def test_issue1537(): """Test that Span.as_doc() doesn't segfault.""" string = "The sky is blue . The man is pink . The dog is purple ." - doc = Doc(Vocab(), words=string.split()) + doc = Doc(Vocab(), words=list(string.split())) doc[0].sent_start = True for word in doc[1:]: if word.nbor(-1).text == ".": @@ -225,6 +225,18 @@ def test_spans_span_sent(doc, doc_not_parsed): assert doc_not_parsed[10:14].sent == doc_not_parsed[5:] +def test_issue13769(): + # Test issue 13769: span.sents yielded an extra sentence when the doc's final token is a one-token sentence lying outside the span. + doc = Doc(Vocab(), words=list("This is a sentence . This is another sentence . Third".split())) + doc[0].is_sent_start = True + doc[5].is_sent_start = True + doc[10].is_sent_start = True + doc.ents = [('ENTITY', 7, 9)] # "another sentence" phrase in the second sentence + entity = doc.ents[0] + ent_sents = list(entity.sents) + assert len(ent_sents) == 1 + + @pytest.mark.parametrize( "start,end,expected_sentence", [ diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 64b8d7c6c..a7faf0d62 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -479,10 +479,11 @@ cdef class Span: break elif i == self.doc.length - 1: yield Span(self.doc, start, self.doc.length) - - # Ensure that trailing parts of the Span instance are included in last element of .sents. 
- if start == self.doc.length - 1: - yield Span(self.doc, start, self.doc.length) + else: + # Ensure that trailing parts of the Span instance are included in the last element of .sents. + # We only want to do this if the loop above finished without hitting `break`. + if start == self.doc.length - 1: + yield Span(self.doc, start, self.doc.length) @property def ents(self):