mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Added sents property to Span for Spans spanning over several sentences (#9699)
* Added sents property to Span class that returns a generator of sentences the Span belongs to * Added description to Span.sents property * Update test_span to clarify the difference between span.sent and span.sents Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/tests/doc/test_span.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix documentation typos in spacy/tokens/span.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update Span.sents doc string in spacy/tokens/span.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Parametrized test_span_spans * Corrected Span.sents to check for span-level hook first. Also, made Span.sent respect doc-level sents hook if no span-level hook is provided * Corrected Span ocumentation copy/paste issue * Put back accidentally deleted lines * Fixed formatting in span.pyx * Moved check for SENT_START annotation after user hooks in Span.sents * add version where the property was introduced Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
7d50804644
commit
472740d613
|
@ -200,6 +200,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
|
|||
assert doc[:2].sent.root.text == "is"
|
||||
assert doc[:2].sent.text == "This is a sentence."
|
||||
assert doc[6:7].sent.root.left_edge.text == "This"
|
||||
assert doc[0 : len(doc)].sent == list(doc.sents)[0]
|
||||
assert list(doc[0 : len(doc)].sents) == list(doc.sents)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
doc_not_parsed[:2].sent
|
||||
|
||||
# test on manual sbd
|
||||
doc_not_parsed[0].is_sent_start = True
|
||||
doc_not_parsed[5].is_sent_start = True
|
||||
|
@ -207,6 +213,35 @@ def test_spans_span_sent(doc, doc_not_parsed):
|
|||
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "start,end,expected_sentence",
    [
        (0, 14, "This is"),  # entire doc
        (1, 4, "This is"),  # overlapping with 2 sentences
        (0, 2, "This is"),  # beginning of the doc, full sentence
        (0, 1, "This is"),  # beginning of the doc, part of a sentence
        (10, 14, "And a"),  # end of the doc, overlapping with 2 sentences
        (12, 14, "third."),  # end of the doc, full sentence
        (1, 1, "This is"),  # empty span
    ],
)
def test_spans_span_sent_user_hooks(doc, start, end, expected_sentence):
    """Span.sent consults a doc-level "sents" hook when present, and a
    span-level "sent" hook takes precedence over it."""
    span = doc[start:end]

    # Install a doc-level sents hook that chops the doc into 2-token pieces.
    doc.user_hooks["sents"] = lambda d: [d[i : i + 2] for i in range(0, len(d), 2)]

    # The doc-level hook now determines which sentence the span belongs to.
    assert span.sent.text == expected_sentence

    # A span-level sent hook overrides the doc-level sents hook.
    doc.user_span_hooks["sent"] = lambda s: s
    assert span.sent == span
|
||||
|
||||
|
||||
def test_spans_lca_matrix(en_tokenizer):
|
||||
"""Test span's lca matrix generation"""
|
||||
tokens = en_tokenizer("the lazy dog slept")
|
||||
|
@ -536,3 +571,38 @@ def test_span_with_vectors(doc):
|
|||
# single-token span with vector
|
||||
assert_array_equal(ops.to_numpy(doc[10:11].vector), [-1, -1, -1])
|
||||
doc.vocab.vectors = prev_vectors
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "start,end,expected_sentences,expected_sentences_with_hook",
    [
        (0, 14, 3, 7),  # entire doc
        (3, 6, 2, 2),  # overlapping with 2 sentences
        (0, 4, 1, 2),  # beginning of the doc, full sentence
        (0, 3, 1, 2),  # beginning of the doc, part of a sentence
        (9, 14, 2, 3),  # end of the doc, overlapping with 2 sentences
        (10, 14, 1, 2),  # end of the doc, full sentence
        (11, 14, 1, 2),  # end of the doc, partial sentence
        (0, 0, 1, 1),  # empty span
    ],
)
def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with_hook):
    """Span.sents yields the parsed sentences by default, the doc-level
    "sents" hook result when one is installed, and the span-level "sents"
    hook result above all."""
    span = doc[start:end]

    # Sentence boundaries coming from the parse.
    assert len(list(span.sents)) == expected_sentences

    # Doc-level hook: chop the doc into 2-token pieces.
    doc.user_hooks["sents"] = lambda d: [d[i : i + 2] for i in range(0, len(d), 2)]

    assert len(list(span.sents)) == expected_sentences_with_hook

    # Span-level hook wins over the doc-level one.
    doc.user_span_hooks["sents"] = lambda s: [s]

    sentences = list(span.sents)
    assert sentences[0] == span
    assert len(sentences) == 1
|
||||
|
||||
|
||||
def test_span_sents_not_parsed(doc_not_parsed):
    """Span.sents raises when no sentence boundaries are available."""
    unparsed_span = Span(doc_not_parsed, 0, 3)
    with pytest.raises(ValueError):
        list(unparsed_span.sents)
|
||||
|
|
|
@ -404,6 +404,10 @@ cdef class Span:
|
|||
"""
|
||||
if "sent" in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks["sent"](self)
|
||||
elif "sents" in self.doc.user_hooks:
|
||||
for sentence in self.doc.user_hooks["sents"](self.doc):
|
||||
if sentence.start <= self.start < sentence.end:
|
||||
return sentence
|
||||
# Use `sent_start` token attribute to find sentence boundaries
|
||||
cdef int n = 0
|
||||
if self.doc.has_annotation("SENT_START"):
|
||||
|
@ -422,6 +426,47 @@ cdef class Span:
|
|||
else:
|
||||
raise ValueError(Errors.E030)
|
||||
|
||||
@property
def sents(self):
    """Obtain the sentences that contain this span. If the given span
    crosses sentence boundaries, return all sentences it is a part of.

    RETURNS (Iterable[Span]): All sentences that the span is a part of.

    DOCS: https://spacy.io/api/span#sents
    """
    cdef int start
    cdef int i

    if "sents" in self.doc.user_span_hooks:
        # A span-level hook takes precedence over any doc-level hook.
        yield from self.doc.user_span_hooks["sents"](self)
    elif "sents" in self.doc.user_hooks:
        # Doc-level hook: keep only the hook's sentences that overlap this
        # span (or that coincide with an empty span); hook output is
        # assumed ordered, so stop at the first sentence past the span.
        for sentence in self.doc.user_hooks["sents"](self.doc):
            if sentence.end > self.start:
                if sentence.start < self.end or sentence.start == self.start == self.end:
                    yield sentence
                else:
                    break
    else:
        if not self.doc.has_annotation("SENT_START"):
            raise ValueError(Errors.E030)
        # Use `sent_start` token attribute to find sentence boundaries
        # Find start of the 1st sentence of the Span
        start = self.start
        while self.doc.c[start].sent_start != 1 and start > 0:
            start -= 1

        # Now, find all the sentences in the span: each token flagged as a
        # sentence start closes off the previous sentence.
        for i in range(start + 1, self.doc.length):
            if self.doc.c[i].sent_start == 1:
                yield Span(self.doc, start, i)
                start = i
                if start >= self.end:
                    break
        # Trailing sentence that runs to the end of the span/doc.
        if start < self.end:
            yield Span(self.doc, start, self.end)
|
||||
|
||||
|
||||
@property
|
||||
def ents(self):
|
||||
"""The named entities in the span. Returns a tuple of named entity
|
||||
|
|
|
@ -518,6 +518,27 @@ sent = doc[sent.start : max(sent.end, span.end)]
|
|||
| ----------- | ------------------------------------------------------- |
|
||||
| **RETURNS** | The sentence span that this span is a part of. ~~Span~~ |
|
||||
|
||||
## Span.sents {#sents tag="property" model="sentences" new="3.2.1"}
|
||||
|
||||
Returns a generator over the sentences the span belongs to. This property is only available
|
||||
when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
|
||||
document by the `parser`, `senter`, `sentencizer` or some custom function. It
|
||||
will raise an error otherwise.
|
||||
|
||||
If the span happens to cross sentence boundaries, all sentences the span overlaps with will be returned.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> doc = nlp("Give it back! He pleaded.")
|
||||
> span = doc[2:4]
|
||||
> assert len(list(span.sents)) == 2
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------- |
|
||||
| **RETURNS** | A generator yielding sentences this `Span` is a part of. ~~Iterable[Span]~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Description |
|
||||
|
|
Loading…
Reference in New Issue
Block a user