Test and fix issue13769

This commit is contained in:
Matthew Honnibal 2025-05-28 17:04:23 +02:00
parent bec546cec0
commit 75a9d9b9ad
2 changed files with 18 additions and 5 deletions

View File

@ -49,7 +49,7 @@ def doc_not_parsed(en_tokenizer):
def test_issue1537(): def test_issue1537():
"""Test that Span.as_doc() doesn't segfault.""" """Test that Span.as_doc() doesn't segfault."""
string = "The sky is blue . The man is pink . The dog is purple ." string = "The sky is blue . The man is pink . The dog is purple ."
doc = Doc(Vocab(), words=string.split()) doc = Doc(Vocab(), words=list(string.split()))
doc[0].sent_start = True doc[0].sent_start = True
for word in doc[1:]: for word in doc[1:]:
if word.nbor(-1).text == ".": if word.nbor(-1).text == ".":
@ -225,6 +225,18 @@ def test_spans_span_sent(doc, doc_not_parsed):
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:] assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
def test_issue13769():
    """Regression test for issue 13769: wrong Span.sents output at doc end.

    When the final token of the doc forms a sentence of its own that lies
    outside the span, Span.sents must yield only the sentence(s) overlapping
    the span and not the trailing one-token sentence.
    """
    words = "This is a sentence . This is another sentence . Third".split()
    doc = Doc(Vocab(), words=words)
    # Three sentences: tokens 0-4, tokens 5-9, and the lone final token 10.
    for sent_start in (0, 5, 10):
        doc[sent_start].is_sent_start = True
    # Tokens 7-8 ("another sentence") lie entirely inside the second sentence.
    doc.ents = [("ENTITY", 7, 9)]
    entity = doc.ents[0]
    ent_sents = list(entity.sents)
    assert len(ent_sents) == 1
    # The single result must be the enclosing second sentence (tokens 5-9),
    # not merely "some" sentence.
    assert (ent_sents[0].start, ent_sents[0].end) == (5, 10)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"start,end,expected_sentence", "start,end,expected_sentence",
[ [

View File

@ -479,8 +479,9 @@ cdef class Span:
break break
elif i == self.doc.length - 1: elif i == self.doc.length - 1:
yield Span(self.doc, start, self.doc.length) yield Span(self.doc, start, self.doc.length)
else:
# Ensure that trailing parts of the Span instance are included in last element of .sents. # Ensure that trailing parts of the Span instance are included in last element of .sents.
# We only want to do this if we didn't break above
if start == self.doc.length - 1: if start == self.doc.length - 1:
yield Span(self.doc, start, self.doc.length) yield Span(self.doc, start, self.doc.length)