Add test for partial sentences in ent.sents.

2025-09-20 19:12:36 +03:00 · 2023-03-13 11:24:24 +01:00 · 2023-03-13 11:24:24 +01:00 · 8cfbe5a4d1
commit 8cfbe5a4d1
parent f27bce67fd
2 changed files with 18 additions and 3 deletions
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -7,6 +7,7 @@ from spacy.lang.en import English
 from spacy.tokens import Doc, Span, Token
 from spacy.vocab import Vocab
 from spacy.util import filter_spans
+from spacy.training import Example
 from thinc.api import get_current_ops

 from ..util import add_vecs_to_vocab
@ -700,3 +701,18 @@ def test_span_group_copy(doc):
    assert len(doc.spans["test"]) == 3
    # check that the copy spans were not modified and this is an isolated doc
    assert len(doc_copy.spans["test"]) == 2
+
+
+def test_for_partial_ent_sents():
+    """Check that Span.sents yields only complete sentences, even when the span straddles a sentence boundary."""
+    nlp = English()
+    text = ["Mahler's", "Symphony", "No.", "8", "was", "beautiful."]
+    doc = Doc(nlp.vocab, words=text, sent_starts=[1, 0, 0, 1, 0, 0])
+    doc.set_ents([Span(doc, 1, 4, "WORK")])
+    # The entity covers tokens 1-3 and so touches both sentences (tokens 0-2 and 3-5). Its .sents must therefore
+    # equal doc.sents exactly. Lists are compared instead of zip()-ing the generators, because zip() stops at the
+    # shorter input and would pass silently if ent.sents yielded too few sentences.
+    doc_sents = list(doc.sents)
+    ent_sents = list(doc.ents[0].sents)
+    assert doc_sents == ent_sents
+
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -460,9 +460,8 @@ cdef class Span:
                    start = i
                    if start >= self.end:
                        break
-            if start < self.end:
-                yield Span(self.doc, start, self.end)
-
+                elif i == self.doc.length - 1:
+                    yield Span(self.doc, start, i + 1)

    @property
    def ents(self):