Add .ents_spangroup to Doc.

This commit is contained in:
Raphael Mitsch 2023-03-07 09:55:20 +01:00
parent 8ca71f9591
commit d73f952d5b
4 changed files with 50 additions and 16 deletions

View File

@ -81,3 +81,24 @@ def test_add_overlapping_entities(en_vocab):
new_entity = Span(doc, 0, 1, label=392) new_entity = Span(doc, 0, 1, label=392)
with pytest.raises(ValueError): with pytest.raises(ValueError):
doc.ents = list(doc.ents) + [new_entity] doc.ents = list(doc.ents) + [new_entity]
def test_ents_spangroup(en_vocab):
    """`Doc.ents_spangroup` exposes the entities in `doc.ents` as a SpanGroup."""
    words = "Louisiana Office of Conservation in the United States".split()
    doc = Doc(en_vocab, words=words)
    # Two entities: tokens 0-3 and tokens 6-7 (the last two tokens).
    first_ent = Span(doc, 0, 4, label=391)
    second_ent = Span(doc, 6, 8, label=391)
    doc.ents = [first_ent, second_ent]

    group = doc.ents_spangroup
    assert group.doc == doc
    assert len(group) == 2
    assert group.name == "ents"

    # Each span in the group must cover exactly the tokens of its entity.
    expected_first = " ".join(words[:4])
    expected_second = " ".join(words[6:])
    assert str(group[0]) == expected_first
    assert str(group[1]) == expected_second

View File

@ -4,6 +4,7 @@ from cymem.cymem import Pool
from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
from .span import Span from .span import Span
from .token import Token from .token import Token
from .span_group import SpanGroup
from .span_groups import SpanGroups from .span_groups import SpanGroups
from .retokenizer import Retokenizer from .retokenizer import Retokenizer
from ..lexeme import Lexeme from ..lexeme import Lexeme
@ -131,6 +132,8 @@ class Doc:
default: str = ..., default: str = ...,
) -> None: ... ) -> None: ...
@property @property
# Entities from `.ents` wrapped in a `SpanGroup` named "ents".
def ents_spangroup(self) -> SpanGroup: ...
@property
def noun_chunks(self) -> Tuple[Span]: ... def noun_chunks(self) -> Tuple[Span]: ...
@property @property
def sents(self) -> Tuple[Span]: ... def sents(self) -> Tuple[Span]: ...

View File

@ -18,6 +18,7 @@ from thinc.util import copy_array
import warnings import warnings
from .span cimport Span from .span cimport Span
from .span_group import SpanGroup
from .token cimport MISSING_DEP from .token cimport MISSING_DEP
from .span_groups import SpanGroups from .span_groups import SpanGroups
from .token cimport Token from .token cimport Token
@ -702,6 +703,14 @@ cdef class Doc:
""" """
return self.text return self.text
@property
def ents_spangroup(self) -> SpanGroup:
    """The document's named entities wrapped in a `SpanGroup`.

    A new group named "ents" is built from `.ents` on every access.

    RETURNS (SpanGroup): All entities (in `.ents`) as `SpanGroup`.
    """
    entities = self.ents
    return SpanGroup(self, spans=entities, name="ents")
property ents: property ents:
"""The named entities in the document. Returns a list of named entity """The named entities in the document. Returns a list of named entity
`Span` objects, if the entity recognizer has been applied. `Span` objects, if the entity recognizer has been applied.

View File

@ -752,22 +752,23 @@ The L2 norm of the document's vector representation.
## Attributes {id="attributes"} ## Attributes {id="attributes"}
| Name | Description | | Name | Description |
| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- | | --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `text` | A string representation of the document text. ~~str~~ | | `text` | A string representation of the document text. ~~str~~ |
| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | | `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | | `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
| `vocab` | The store of lexical types. ~~Vocab~~ | | `vocab` | The store of lexical types. ~~Vocab~~ |
| `tensor` | Container for dense vector representations. ~~numpy.ndarray~~ | | `tensor` | Container for dense vector representations. ~~numpy.ndarray~~ |
| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | | `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
| `lang` | Language of the document's vocabulary. ~~int~~ | | `lang` | Language of the document's vocabulary. ~~int~~ |
| `lang_` | Language of the document's vocabulary. ~~str~~ | | `lang_` | Language of the document's vocabulary. ~~str~~ |
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ | | `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
| `activations` <Tag variant="new">4.0</Tag> | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Option[Any]]~~ | | `activations` <Tag variant="new">4.0</Tag> | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Option[Any]]~~ |
| `ents_spangroup` <Tag variant="new">4.0</Tag> | All entity `Span` instances (as stored in `.ents`) as a [`SpanGroup`](/api/spangroup). ~~SpanGroup~~ |
## Serialization fields {id="serialization-fields"} ## Serialization fields {id="serialization-fields"}