Add .ents_spangroup to Doc.

2025-08-04 20:30:24 +03:00 · 2023-03-07 09:55:20 +01:00 · 2023-03-07 09:55:20 +01:00 · d73f952d5b
commit d73f952d5b
parent 8ca71f9591
4 changed files with 50 additions and 16 deletions
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@ -81,3 +81,24 @@ def test_add_overlapping_entities(en_vocab):
    new_entity = Span(doc, 0, 1, label=392)
    with pytest.raises(ValueError):
        doc.ents = list(doc.ents) + [new_entity]
+
+
+def test_ents_spangroup(en_vocab):
+    text = [
+        "Louisiana",
+        "Office",
+        "of",
+        "Conservation",
+        "in",
+        "the",
+        "United",
+        "States",
+    ]
+    doc = Doc(en_vocab, words=text)
+    doc.ents = [Span(doc, 0, 4, label=391), Span(doc, 6, 8, label=391)]
+
+    assert doc.ents_spangroup.doc == doc
+    assert len(doc.ents_spangroup) == 2
+    assert doc.ents_spangroup.name == "ents"
+    assert str(doc.ents_spangroup[0]) == " ".join(text[:4])
+    assert str(doc.ents_spangroup[1]) == " ".join(text[6:])
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@ -4,6 +4,7 @@ from cymem.cymem import Pool
 from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
 from .span import Span
 from .token import Token
+from .span_group import SpanGroup
 from .span_groups import SpanGroups
 from .retokenizer import Retokenizer
 from ..lexeme import Lexeme
@ -131,6 +132,8 @@ class Doc:
        default: str = ...,
    ) -> None: ...
    @property
+    def ents_spangroup(self) -> SpanGroup: ...
+    @property
    def noun_chunks(self) -> Tuple[Span]: ...
    @property
    def sents(self) -> Tuple[Span]: ...
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -18,6 +18,7 @@ from thinc.util import copy_array
 import warnings

 from .span cimport Span
+from .span_group import SpanGroup
 from .token cimport MISSING_DEP
 from .span_groups import SpanGroups
 from .token cimport Token
@ -702,6 +703,14 @@ cdef class Doc:
        """
        return self.text

+    @property
+    def ents_spangroup(self) -> SpanGroup:
+        """
+        Returns entities (in `.ents`) as `SpanGroup`.
+        RETURNS (SpanGroup): All entities (in `.ents`) as `SpanGroup`.
+        """
+        return SpanGroup(self, spans=self.ents, name="ents")
+
    property ents:
        """The named entities in the document. Returns a list of named entity
        `Span` objects, if the entity recognizer has been applied.
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@ -752,22 +752,23 @@ The L2 norm of the document's vector representation.

 ## Attributes {id="attributes"}

-| Name                                       | Description                                                                                                                                    |
-| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `text`                                     | A string representation of the document text. ~~str~~                                                                                          |
-| `text_with_ws`                             | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~                                                  |
-| `mem`                                      | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~                                                                       |
-| `vocab`                                    | The store of lexical types. ~~Vocab~~                                                                                                          |
-| `tensor`                                   | Container for dense vector representations. ~~numpy.ndarray~~                                                                                  |
-| `user_data`                                | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                               |
-| `lang`                                     | Language of the document's vocabulary. ~~int~~                                                                                                 |
-| `lang_`                                    | Language of the document's vocabulary. ~~str~~                                                                                                 |
-| `user_hooks`                               | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                      |
-| `user_token_hooks`                         | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                              |
-| `user_span_hooks`                          | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                               |
-| `has_unknown_spaces`                       | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~            |
-| `_`                                        | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                  |
-| `activations` <Tag variant="new">4.0</Tag> | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Option[Any]]~~ |
+| Name                                          | Description                                                                                                                                    |
+| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `text`                                        | A string representation of the document text. ~~str~~                                                                                          |
+| `text_with_ws`                                | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~                                                  |
+| `mem`                                         | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~                                                                       |
+| `vocab`                                       | The store of lexical types. ~~Vocab~~                                                                                                          |
+| `tensor`                                      | Container for dense vector representations. ~~numpy.ndarray~~                                                                                  |
+| `user_data`                                   | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                               |
+| `lang`                                        | Language of the document's vocabulary. ~~int~~                                                                                                 |
+| `lang_`                                       | Language of the document's vocabulary. ~~str~~                                                                                                 |
+| `user_hooks`                                  | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                      |
+| `user_token_hooks`                            | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                              |
+| `user_span_hooks`                             | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                               |
+| `has_unknown_spaces`                          | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~            |
+| `_`                                           | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                  |
+| `activations` <Tag variant="new">4.0</Tag>    | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Option[Any]]~~ |
+| `ents_spangroup` <Tag variant="new">4.0</Tag> | All entity `Span` instances (as stored in `.ents`) as [`SpanGroup`](/api/spangroup). ~~SpanGroup~~                                            |

 ## Serialization fields {id="serialization-fields"}