Added sents property to Span for Spans spanning over several sentences (#9699)

* Added sents property to Span class that returns a generator of sentences the Span belongs to * Added description to Span.sents property * Update test_span to clarify the difference between span.sent and span.sents Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/tests/doc/test_span.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix documentation typos in spacy/tokens/span.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update Span.sents doc string in spacy/tokens/span.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Parametrized test_span_spans * Corrected Span.sents to check for span-level hook first. Also, made Span.sent respect doc-level sents hook if no span-level hook is provided * Corrected Span documentation copy/paste issue * Put back accidentally deleted lines * Fixed formatting in span.pyx * Moved check for SENT_START annotation after user hooks in Span.sents * add version where the property was introduced Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2025-11-03 09:27:56 +03:00 · 2021-12-06 01:58:01 -07:00 · 2021-12-06 01:58:01 -07:00 · 472740d613
commit 472740d613
parent 7d50804644
3 changed files with 136 additions and 0 deletions
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -200,6 +200,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
    assert doc[:2].sent.root.text == "is"
    assert doc[:2].sent.text == "This is a sentence."
    assert doc[6:7].sent.root.left_edge.text == "This"
    assert doc[0 : len(doc)].sent == list(doc.sents)[0]
    assert list(doc[0 : len(doc)].sents) == list(doc.sents)
    with pytest.raises(ValueError):
        doc_not_parsed[:2].sent
    # test on manual sbd
    doc_not_parsed[0].is_sent_start = True
    doc_not_parsed[5].is_sent_start = True
@ -207,6 +213,35 @@ def test_spans_span_sent(doc, doc_not_parsed):
    assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
@pytest.mark.parametrize(
    "start,end,expected_sentence",
    [
        (0, 14, "This is"),  # Entire doc
        (1, 4, "This is"),  # Overlapping with 2 sentences
        (0, 2, "This is"),  # Beginning of the Doc. Full sentence
        (0, 1, "This is"),  # Beginning of the Doc. Part of a sentence
        (10, 14, "And a"),  # End of the Doc. Overlapping with 2 sentences
        (12, 14, "third."),  # End of the Doc. Full sentence
        (1, 1, "This is"),  # Empty Span
    ],
)
def test_spans_span_sent_user_hooks(doc, start, end, expected_sentence):
    """Check the hook precedence of `Span.sent`.

    With only a doc-level "sents" hook installed, `Span.sent` must return the
    hook-produced sentence for the span. Once a span-level "sent" hook is
    installed as well, that hook takes over.
    """

    def two_token_sentences(target_doc):
        # Doc-level hook: split the doc into consecutive two-token chunks
        return [
            target_doc[offset : offset + 2]
            for offset in range(0, len(target_doc), 2)
        ]

    doc.user_hooks["sents"] = two_token_sentences
    span = doc[start:end]

    # The doc-level sents hook is honoured by Span.sent
    assert span.sent.text == expected_sentence

    # Identity hook at the span level
    doc.user_span_hooks["sent"] = lambda x: x

    # Now the span-level sent hook overrides the doc-level sents hook
    assert span.sent == span
 def test_spans_lca_matrix(en_tokenizer):
    """Test span's lca matrix generation"""
    tokens = en_tokenizer("the lazy dog slept")
@ -536,3 +571,38 @@ def test_span_with_vectors(doc):
    # single-token span with vector
    assert_array_equal(ops.to_numpy(doc[10:11].vector), [-1, -1, -1])
    doc.vocab.vectors = prev_vectors
@pytest.mark.parametrize(
    "start,end,expected_sentences,expected_sentences_with_hook",
    [
        (0, 14, 3, 7),  # Entire doc
        (3, 6, 2, 2),  # Overlapping with 2 sentences
        (0, 4, 1, 2),  # Beginning of the Doc. Full sentence
        (0, 3, 1, 2),  # Beginning of the Doc. Part of a sentence
        (9, 14, 2, 3),  # End of the Doc. Overlapping with 2 sentences
        (10, 14, 1, 2),  # End of the Doc. Full sentence
        (11, 14, 1, 2),  # End of the Doc. Partial sentence
        (0, 0, 1, 1),  # Empty Span
    ],
)
def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with_hook):
    """Check how many sentences `Span.sents` yields in three configurations:
    no hooks, a doc-level "sents" hook, and a span-level "sents" hook.
    """
    span = doc[start:end]

    # 1) No hooks installed
    assert len(list(span.sents)) == expected_sentences

    def two_token_sentences(target_doc):
        # Doc-level hook: split the doc into consecutive two-token chunks
        return [
            target_doc[offset : offset + 2]
            for offset in range(0, len(target_doc), 2)
        ]

    # 2) Doc-level hook only
    doc.user_hooks["sents"] = two_token_sentences
    assert len(list(span.sents)) == expected_sentences_with_hook

    # 3) Span-level hook wins over the doc-level one; it returns the span
    #    itself as the single sentence
    doc.user_span_hooks["sents"] = lambda x: [x]
    hooked_sents = list(span.sents)
    assert hooked_sents[0] == span
    assert len(hooked_sents) == 1
 def test_span_sents_not_parsed(doc_not_parsed):
    """Consuming `Span.sents` on the `doc_not_parsed` fixture must raise
    ValueError.
    """
    with pytest.raises(ValueError):
        unparsed_span = Span(doc_not_parsed, 0, 3)
        # `sents` is a generator, so it has to be consumed for the error
        # to surface
        list(unparsed_span.sents)
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -404,6 +404,10 @@ cdef class Span:
        """
        if "sent" in self.doc.user_span_hooks:
            return self.doc.user_span_hooks["sent"](self)
        elif "sents" in self.doc.user_hooks:
            for sentence in self.doc.user_hooks["sents"](self.doc):
                if sentence.start <= self.start < sentence.end:
                    return sentence
        # Use `sent_start` token attribute to find sentence boundaries
        cdef int n = 0
        if self.doc.has_annotation("SENT_START"):
@ -422,6 +426,47 @@ cdef class Span:
        else:
            raise ValueError(Errors.E030)
    @property
    def sents(self):
        """Obtain the sentences that contain this span. If the given span
        crosses sentence boundaries, return all sentences it is a part of.

        Sentences are resolved in this order: a span-level "sents" hook in
        `doc.user_span_hooks`, then a doc-level "sents" hook in
        `doc.user_hooks`, then the `SENT_START` token annotation.

        This property is a generator, so the ValueError below is raised when
        the result is first iterated, not when the attribute is accessed.

        RETURNS (Iterable[Span]): All sentences that the span is a part of.
        RAISES (ValueError): Errors.E030 if no hook is installed and the doc
            has no `SENT_START` annotation.

        DOCS: https://spacy.io/api/span#sents
        """
        cdef int start
        cdef int i

        if "sents" in self.doc.user_span_hooks:
            yield from self.doc.user_span_hooks["sents"](self)
        elif "sents" in self.doc.user_hooks:
            for sentence in self.doc.user_hooks["sents"](self.doc):
                # Skip sentences that end before the span starts
                if sentence.end > self.start:
                    # The second condition keeps an empty span that sits
                    # exactly on a sentence start inside that sentence
                    if sentence.start < self.end or sentence.start == self.start == self.end:
                        yield sentence
                    else:
                        # First sentence past the span: nothing more to yield
                        break
        else:
            if not self.doc.has_annotation("SENT_START"):
                raise ValueError(Errors.E030)
            # Use `sent_start` token attribute to find sentence boundaries
            # Find start of the 1st sentence of the Span
            start = self.start
            while self.doc.c[start].sent_start != 1 and start > 0:
                start -= 1

            # Now, find all the sentences in the span
            for i in range(start + 1, self.doc.length):
                if self.doc.c[i].sent_start == 1:
                    yield Span(self.doc, start, i)
                    start = i
                    if start >= self.end:
                        break

            # Reaching this point with `start` still inside the span means no
            # further sentence boundary follows `start`, so the last sentence
            # runs to the end of the doc. Yield it in full instead of cutting
            # it off at the end of the span. The second condition mirrors the
            # doc-level hook branch: an empty span located exactly at the
            # start of the last sentence still belongs to that sentence.
            if start < self.end or start == self.start == self.end:
                yield Span(self.doc, start, self.doc.length)
    @property
    def ents(self):
        """The named entities in the span. Returns a tuple of named entity
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@ -518,6 +518,27 @@ sent = doc[sent.start : max(sent.end, span.end)]
 | ----------- | ------------------------------------------------------- |
 | **RETURNS** | The sentence span that this span is a part of. ~~Span~~ |
 ## Span.sents {#sents tag="property" model="sentences" new="3.2.1"}
 Returns a generator over the sentences the span belongs to. This property is only available
 when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
 document by the `parser`, `senter`, `sentencizer` or some custom function. It
 will raise an error otherwise.
 If the span happens to cross sentence boundaries, all sentences the span overlaps with will be returned.
 > #### Example
 >
 > ```python
 > doc = nlp("Give it back! He pleaded.")
 > span = doc[2:5]
 > assert len(list(span.sents)) == 2
 > ```
 | Name        | Description                                                                |
 | ----------- | -------------------------------------------------------------------------- |
 | **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ |
 ## Attributes {#attributes}
 | Name                                    | Description                                                                                                                   |