Return Tuple[Span] for all Doc/Span attrs that provide spans (#12288)

* Return Tuple[Span] for all Doc/Span attrs that provide spans

* Update Span types
Adriane Boyd 2023-03-01 16:00:02 +01:00 committed by GitHub
parent df4c069a13
commit da75896ef5
6 changed files with 58 additions and 55 deletions
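
In behavioral terms, the commit makes `Doc.noun_chunks`, `Doc.sents`, `Span.ents`, `Span.sents` and `Span.noun_chunks` return tuples of `Span` objects instead of generators or lists, so the results can be indexed, sliced and measured with `len()` directly. A minimal before/after sketch (not part of the diff; the `en_core_web_sm` pipeline name is only illustrative):

import spacy

nlp = spacy.load("en_core_web_sm")  # any pipeline with a parser will do
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")

# After this change the span-providing attributes are plain tuples ...
chunks = doc.noun_chunks
print(type(chunks))    # <class 'tuple'>
print(len(chunks))     # works directly, no list(...) wrapper needed
print(chunks[0].text)  # indexable: "Autonomous cars"

# ... whereas previously Doc.noun_chunks and Doc.sents were generators and
# had to be materialized first, e.g. chunks = list(doc.noun_chunks).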

spacy/tokens/doc.pyi

@@ -131,9 +131,9 @@ class Doc:
         default: str = ...,
     ) -> None: ...
     @property
-    def noun_chunks(self) -> Iterator[Span]: ...
+    def noun_chunks(self) -> Tuple[Span]: ...
     @property
-    def sents(self) -> Iterator[Span]: ...
+    def sents(self) -> Tuple[Span]: ...
     @property
     def lang(self) -> int: ...
     @property

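With the updated stub, a static type checker sees these attributes as tuples rather than iterators. A hypothetical downstream snippet that now type-checks without an intermediate `list()` call (assuming the installed spaCy exposes these stubs to the checker, e.g. via mypy):

from typing import Tuple

from spacy.tokens import Doc, Span

def first_sentence(doc: Doc) -> Span:
    # Matches the new stub signature, which declares Tuple[Span].
    sents: Tuple[Span] = doc.sents
    # Indexing a tuple is fine for the type checker; an Iterator[Span]
    # would have required list(doc.sents)[0] or next(iter(doc.sents)).
    return sents[0]
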
spacy/tokens/doc.pyx

@@ -703,10 +703,10 @@ cdef class Doc:
         return self.text
     property ents:
-        """The named entities in the document. Returns a list of named entity
+        """The named entities in the document. Returns a tuple of named entity
         `Span` objects, if the entity recognizer has been applied.
-        RETURNS (tuple): Entities in the document, one `Span` per entity.
+        RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity.
         DOCS: https://spacy.io/api/doc#ents
         """
@@ -864,7 +864,7 @@ cdef class Doc:
         NP-level coordination, no prepositional phrases, and no relative
         clauses.
-        YIELDS (Span): Noun chunks in the document.
+        RETURNS (Tuple[Span]): Noun chunks in the document.
         DOCS: https://spacy.io/api/doc#noun_chunks
         """
@@ -873,36 +873,35 @@ cdef class Doc:
         # Accumulate the result before beginning to iterate over it. This
         # prevents the tokenization from being changed out from under us
-        # during the iteration. The tricky thing here is that Span accepts
-        # its tokenization changing, so it's okay once we have the Span
-        # objects. See Issue #375.
+        # during the iteration.
         spans = []
         for start, end, label in self.noun_chunks_iterator(self):
             spans.append(Span(self, start, end, label=label))
-        for span in spans:
-            yield span
+        return tuple(spans)
     @property
     def sents(self):
         """Iterate over the sentences in the document. Yields sentence `Span`
         objects. Sentence spans have no label.
-        YIELDS (Span): Sentences in the document.
+        RETURNS (Tuple[Span]): Sentences in the document.
         DOCS: https://spacy.io/api/doc#sents
         """
         if not self.has_annotation("SENT_START"):
             raise ValueError(Errors.E030)
         if "sents" in self.user_hooks:
-            yield from self.user_hooks["sents"](self)
+            return tuple(self.user_hooks["sents"](self))
         else:
             start = 0
+            spans = []
             for i in range(1, self.length):
                 if self.c[i].sent_start == 1:
-                    yield Span(self, start, i)
+                    spans.append(Span(self, start, i))
                     start = i
             if start != self.length:
-                yield Span(self, start, self.length)
+                spans.append(Span(self, start, self.length))
+            return tuple(spans)
     @property
     def lang(self):

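The runtime effect on `Doc.sents` (including the user-hook branch above) can be sketched with a blank English pipeline plus a `sentencizer`; the example text and the toy hook are illustrative, not from the commit:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # sets SENT_START so Doc.sents is available
doc = nlp("This is a sentence. This is another one.")

sents = doc.sents  # now built eagerly and returned as a tuple
assert isinstance(sents, tuple)
assert len(sents) == 2
assert sents[1].text == "This is another one."

# A "sents" user hook still takes precedence, and its result is wrapped
# in a tuple as well (toy hook: treat the whole doc as one sentence).
doc.user_hooks["sents"] = lambda d: [d[:]]
assert isinstance(doc.sents, tuple) and doc.sents[0].text == doc.text
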
spacy/tokens/span.pyi

@@ -74,6 +74,8 @@ class Span:
     @property
     def ents(self) -> Tuple[Span]: ...
     @property
+    def sents(self) -> Tuple[Span]: ...
+    @property
     def has_vector(self) -> bool: ...
     @property
     def vector(self) -> Floats1d: ...
@@ -86,7 +88,7 @@ class Span:
     @property
     def text_with_ws(self) -> str: ...
     @property
-    def noun_chunks(self) -> Iterator[Span]: ...
+    def noun_chunks(self) -> Tuple[Span]: ...
     @property
     def root(self) -> Token: ...
     def char_span(

spacy/tokens/span.pyx

@@ -461,7 +461,7 @@ cdef class Span:
         """Obtain the sentences that contain this span. If the given span
         crosses sentence boundaries, return all sentences it is a part of.
-        RETURNS (Iterable[Span]): All sentences that the span is a part of.
+        RETURNS (Tuple[Span]): All sentences that the span is a part of.
         DOCS: https://spacy.io/api/span#sents
         """
@@ -469,12 +469,13 @@ cdef class Span:
         cdef int i
         if "sents" in self.doc.user_span_hooks:
-            yield from self.doc.user_span_hooks["sents"](self)
-        elif "sents" in self.doc.user_hooks:
+            return tuple(self.doc.user_span_hooks["sents"](self))
+        spans = []
+        if "sents" in self.doc.user_hooks:
             for sentence in self.doc.user_hooks["sents"](self.doc):
                 if sentence.end > self.start:
                     if sentence.start < self.end or sentence.start == self.start == self.end:
-                        yield sentence
+                        spans.append(sentence)
                     else:
                         break
         else:
@@ -489,12 +490,13 @@ cdef class Span:
             # Now, find all the sentences in the span
             for i in range(start + 1, self.doc.length):
                 if self.doc.c[i].sent_start == 1:
-                    yield Span(self.doc, start, i)
+                    spans.append(Span(self.doc, start, i))
                     start = i
                     if start >= self.end:
                         break
             if start < self.end:
-                yield Span(self.doc, start, self.end)
+                spans.append(Span(self.doc, start, self.end))
+        return tuple(spans)
     @property
@@ -502,7 +504,7 @@ cdef class Span:
         """The named entities that fall completely within the span. Returns
         a tuple of `Span` objects.
-        RETURNS (tuple): Entities in the span, one `Span` per entity.
+        RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity.
         DOCS: https://spacy.io/api/span#ents
         """
@@ -517,7 +519,7 @@ cdef class Span:
                     ents.append(ent)
                 else:
                     break
-        return ents
+        return tuple(ents)
     @property
     def has_vector(self):
@@ -613,13 +615,15 @@ cdef class Span:
         NP-level coordination, no prepositional phrases, and no relative
         clauses.
-        YIELDS (Span): Noun chunks in the span.
+        RETURNS (Tuple[Span]): Noun chunks in the span.
         DOCS: https://spacy.io/api/span#noun_chunks
         """
+        spans = []
         for span in self.doc.noun_chunks:
             if span.start >= self.start and span.end <= self.end:
-                yield span
+                spans.append(span)
+        return tuple(spans)
     @property
     def root(self):

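The same pattern applies to the `Span` attributes changed above: `Span.ents`, `Span.sents` and `Span.noun_chunks` all hand back tuples that can be reused and indexed. A sketch, again with the illustrative `en_core_web_sm` pipeline:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion. It is not.")
span = doc[0:8]  # a span inside the first sentence

assert isinstance(span.ents, tuple)         # entities fully inside the span
assert isinstance(span.sents, tuple)        # sentences the span overlaps
assert isinstance(span.noun_chunks, tuple)  # noun chunks fully inside the span

# A tuple can be consumed more than once, unlike the previous generators:
ents = span.ents
print([e.text for e in ents])
print([e.text for e in ents])  # same output again
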
website/docs/api/doc.mdx

@@ -654,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer).
 ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
-Iterate over the base noun phrases in the document. Yields base noun-phrase
-`Span` objects, if the document has been syntactically parsed. A base noun
-phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
-nested within it so no NP-level coordination, no prepositional phrases, and no
-relative clauses.
+Returns a tuple of the base noun phrases in the doc, if the document has been
+syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
+does not permit other NPs to be nested within it so no NP-level coordination,
+no prepositional phrases, and no relative clauses.
 To customize the noun chunk iterator in a loaded pipeline, modify
 [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
@@ -676,12 +675,12 @@ implemented for the given language, a `NotImplementedError` is raised.
 > ```
 | Name | Description |
-| ---------- | ------------------------------------- |
-| **YIELDS** | Noun chunks in the document. ~~Span~~ |
+| ----------- | -------------------------------------------- |
+| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |
 ## Doc.sents {id="sents",tag="property",model="sentences"}
-Iterate over the sentences in the document. Sentence spans have no label.
+Returns a tuple of the sentences in the document. Sentence spans have no label.
 This property is only available when
 [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
@@ -698,8 +697,8 @@ will raise an error otherwise.
 > ```
 | Name | Description |
-| ---------- | ----------------------------------- |
-| **YIELDS** | Sentences in the document. ~~Span~~ |
+| ----------- | ------------------------------------------ |
+| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |
 ## Doc.has_vector {id="has_vector",tag="property",model="vectors"}

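As the updated `Doc.sents` entry notes, the property still requires sentence boundaries and raises an error otherwise; only the return type changes. A short sketch of that contract with a blank pipeline (illustrative example text):

import spacy

nlp = spacy.blank("en")
doc = nlp("No sentence boundaries have been set on this doc.")

try:
    doc.sents  # no parser/senter/sentencizer has run
except ValueError as err:
    # With the tuple-returning implementation the error surfaces on access;
    # the old generator only raised it on the first iteration step.
    print("raised as documented:", err)

nlp.add_pipe("sentencizer")
doc = nlp("Now boundaries are set. So sents is available.")
print(len(doc.sents), "sentences, returned as", type(doc.sents).__name__)
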
website/docs/api/span.mdx

@@ -276,16 +276,15 @@ The named entities that fall completely within the span. Returns a tuple of
 > ```
 | Name | Description |
-| ----------- | ----------------------------------------------------------------- |
-| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
+| ----------- | ------------------------------------------------------------ |
+| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ |
 ## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"}
-Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
-objects, if the document has been syntactically parsed. A base noun phrase, or
-"NP chunk", is a noun phrase that does not permit other NPs to be nested within
-it so no NP-level coordination, no prepositional phrases, and no relative
-clauses.
+Returns a tuple of the base noun phrases in the span if the document has been
+syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
+does not permit other NPs to be nested within it so no NP-level coordination,
+no prepositional phrases, and no relative clauses.
 If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
 has not been implemented for the given language, a `NotImplementedError` is
@@ -302,8 +301,8 @@ raised.
 > ```
 | Name | Description |
-| ---------- | --------------------------------- |
-| **YIELDS** | Noun chunks in the span. ~~Span~~ |
+| ----------- | ---------------------------------------- |
+| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ |
 ## Span.as_doc {id="as_doc",tag="method"}
@@ -525,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)]
 ## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"}
-Returns a generator over the sentences the span belongs to. This property is
-only available when [sentence boundaries](/usage/linguistic-features#sbd) have
-been set on the document by the `parser`, `senter`, `sentencizer` or some custom
+Returns a tuple of the sentences the span belongs to. This property is only
+available when [sentence boundaries](/usage/linguistic-features#sbd) have been
+set on the document by the `parser`, `senter`, `sentencizer` or some custom
 function. It will raise an error otherwise.
 If the span happens to cross sentence boundaries, all sentences the span
@@ -542,8 +541,8 @@ overlaps with will be returned.
 > ```
 | Name | Description |
-| ----------- | -------------------------------------------------------------------------- |
-| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ |
+| ----------- | ------------------------------------------------------------- |
+| **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ |
 ## Attributes {id="attributes"}
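
The cross-boundary behavior described for `Span.sents`, returning every sentence the span overlaps as a tuple, can be sketched as follows (blank pipeline plus `sentencizer`; the example span is illustrative):

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("First sentence here. Second sentence here. Third sentence here.")

# A span that starts in the first sentence and ends inside the second.
span = doc[2:7]
sents = span.sents  # tuple of every sentence the span overlaps
assert isinstance(sents, tuple)
assert len(sents) == 2
print([s.text for s in sents])
# ['First sentence here.', 'Second sentence here.']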