From d73f952d5bc21a44a8d18823919e6e5f9a2cbf7e Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 7 Mar 2023 09:55:20 +0100 Subject: [PATCH] Add .ents_spangroup to Doc. --- spacy/tests/doc/test_add_entities.py | 21 ++++++++++++++++++ spacy/tokens/doc.pyi | 3 +++ spacy/tokens/doc.pyx | 9 ++++++++ website/docs/api/doc.mdx | 33 ++++++++++++++-------------- 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 30d66115f..5788e9b86 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -81,3 +81,24 @@ def test_add_overlapping_entities(en_vocab): new_entity = Span(doc, 0, 1, label=392) with pytest.raises(ValueError): doc.ents = list(doc.ents) + [new_entity] + + +def test_ents_spangroup(en_vocab): + text = [ + "Louisiana", + "Office", + "of", + "Conservation", + "in", + "the", + "United", + "States", + ] + doc = Doc(en_vocab, words=text) + doc.ents = [Span(doc, 0, 4, label=391), Span(doc, 6, 8, label=391)] + + assert doc.ents_spangroup.doc == doc + assert len(doc.ents_spangroup) == 2 + assert doc.ents_spangroup.name == "ents" + assert str(doc.ents_spangroup[0]) == " ".join(text[:4]) + assert str(doc.ents_spangroup[1]) == " ".join(text[6:]) diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 48bc21c27..feb29f16e 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -4,6 +4,7 @@ from cymem.cymem import Pool from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged from .span import Span from .token import Token +from .span_group import SpanGroup from .span_groups import SpanGroups from .retokenizer import Retokenizer from ..lexeme import Lexeme @@ -131,6 +132,8 @@ class Doc: default: str = ..., ) -> None: ... @property + def ents_spangroup(self) -> SpanGroup: ... + @property def noun_chunks(self) -> Tuple[Span]: ... @property def sents(self) -> Tuple[Span]: ... 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0ea2c39ab..8869d20e2 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -18,6 +18,7 @@ from thinc.util import copy_array import warnings from .span cimport Span +from .span_group import SpanGroup from .token cimport MISSING_DEP from .span_groups import SpanGroups from .token cimport Token @@ -702,6 +703,14 @@ cdef class Doc: """ return self.text + @property + def ents_spangroup(self) -> SpanGroup: + """ + Returns entities (in `.ents`) as `SpanGroup`. + RETURNS (SpanGroup): All entities (in `.ents`) as `SpanGroup`. + """ + return SpanGroup(self, spans=self.ents, name="ents") + property ents: """The named entities in the document. Returns a list of named entity `Span` objects, if the entity recognizer has been applied. diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx index fca056ed0..ba54953a4 100644 --- a/website/docs/api/doc.mdx +++ b/website/docs/api/doc.mdx @@ -752,22 +752,23 @@ The L2 norm of the document's vector representation. ## Attributes {id="attributes"} -| Name | Description | -| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| `text` | A string representation of the document text. ~~str~~ | -| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | -| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | -| `vocab` | The store of lexical types. ~~Vocab~~ | -| `tensor` | Container for dense vector representations. ~~numpy.ndarray~~ | -| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | -| `lang` | Language of the document's vocabulary. ~~int~~ | -| `lang_` | Language of the document's vocabulary. ~~str~~ | -| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. 
~~Dict[str, Callable]~~ | -| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | -| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | -| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ | -| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | -| `activations` 4.0 | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Option[Any]]~~ | +| Name | Description | +| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `text` | A string representation of the document text. ~~str~~ | +| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | +| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | +| `vocab` | The store of lexical types. ~~Vocab~~ | +| `tensor` | Container for dense vector representations. ~~numpy.ndarray~~ | +| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | +| `lang` | Language of the document's vocabulary. ~~int~~ | +| `lang_` | Language of the document's vocabulary. ~~str~~ | +| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | +| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | +| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. 
~~Dict[str, Callable]~~ | +| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ | +| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | +| `activations` 4.0 | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Option[Any]]~~ | +| `ents_spangroup` 4.0 | All entity `Span` instances (as stored in `.ents`) as [`SpanGroup`](/api/spangroup). ~~SpanGroup~~ | ## Serialization fields {id="serialization-fields"}