mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-22 18:12:00 +03:00
Add .ents_spangroup to Doc.
This commit is contained in:
parent
8ca71f9591
commit
d73f952d5b
|
@ -81,3 +81,24 @@ def test_add_overlapping_entities(en_vocab):
|
|||
new_entity = Span(doc, 0, 1, label=392)
|
||||
with pytest.raises(ValueError):
|
||||
doc.ents = list(doc.ents) + [new_entity]
|
||||
|
||||
|
||||
def test_ents_spangroup(en_vocab):
|
||||
text = [
|
||||
"Louisiana",
|
||||
"Office",
|
||||
"of",
|
||||
"Conservation",
|
||||
"in",
|
||||
"the",
|
||||
"United",
|
||||
"States",
|
||||
]
|
||||
doc = Doc(en_vocab, words=text)
|
||||
doc.ents = [Span(doc, 0, 4, label=391), Span(doc, 6, 8, label=391)]
|
||||
|
||||
assert doc.ents_spangroup.doc == doc
|
||||
assert len(doc.ents_spangroup) == 2
|
||||
assert doc.ents_spangroup.name == "ents"
|
||||
assert str(doc.ents_spangroup[0]) == " ".join(text[:4])
|
||||
assert str(doc.ents_spangroup[1]) == " ".join(text[6:])
|
||||
|
|
|
@ -4,6 +4,7 @@ from cymem.cymem import Pool
|
|||
from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
|
||||
from .span import Span
|
||||
from .token import Token
|
||||
from .span_group import SpanGroup
|
||||
from .span_groups import SpanGroups
|
||||
from .retokenizer import Retokenizer
|
||||
from ..lexeme import Lexeme
|
||||
|
@ -131,6 +132,8 @@ class Doc:
|
|||
default: str = ...,
|
||||
) -> None: ...
|
||||
@property
|
||||
def ents_spangroup(self) -> SpanGroup: ...
|
||||
@property
|
||||
def noun_chunks(self) -> Tuple[Span]: ...
|
||||
@property
|
||||
def sents(self) -> Tuple[Span]: ...
|
||||
|
|
|
@ -18,6 +18,7 @@ from thinc.util import copy_array
|
|||
import warnings
|
||||
|
||||
from .span cimport Span
|
||||
from .span_group import SpanGroup
|
||||
from .token cimport MISSING_DEP
|
||||
from .span_groups import SpanGroups
|
||||
from .token cimport Token
|
||||
|
@ -702,6 +703,14 @@ cdef class Doc:
|
|||
"""
|
||||
return self.text
|
||||
|
||||
@property
|
||||
def ents_spangroup(self) -> SpanGroup:
|
||||
"""
|
||||
Returns entities (in `.ents`) as `SpanGroup`.
|
||||
RETURNS (SpanGroup): All entities (in `.ents`) as `SpanGroup`.
|
||||
"""
|
||||
return SpanGroup(self, spans=self.ents, name="ents")
|
||||
|
||||
property ents:
|
||||
"""The named entities in the document. Returns a list of named entity
|
||||
`Span` objects, if the entity recognizer has been applied.
|
||||
|
|
|
@ -752,22 +752,23 @@ The L2 norm of the document's vector representation.
|
|||
|
||||
## Attributes {id="attributes"}
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `text` | A string representation of the document text. ~~str~~ |
|
||||
| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
|
||||
| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
|
||||
| `vocab` | The store of lexical types. ~~Vocab~~ |
|
||||
| `tensor` | Container for dense vector representations. ~~numpy.ndarray~~ |
|
||||
| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
|
||||
| `lang` | Language of the document's vocabulary. ~~int~~ |
|
||||
| `lang_` | Language of the document's vocabulary. ~~str~~ |
|
||||
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
|
||||
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
|
||||
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
|
||||
| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
|
||||
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
|
||||
| `activations` <Tag variant="new">4.0</Tag> | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Option[Any]]~~ |
|
||||
| Name | Description |
|
||||
| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `text` | A string representation of the document text. ~~str~~ |
|
||||
| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
|
||||
| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
|
||||
| `vocab` | The store of lexical types. ~~Vocab~~ |
|
||||
| `tensor` | Container for dense vector representations. ~~numpy.ndarray~~ |
|
||||
| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
|
||||
| `lang` | Language of the document's vocabulary. ~~int~~ |
|
||||
| `lang_` | Language of the document's vocabulary. ~~str~~ |
|
||||
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
|
||||
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
|
||||
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
|
||||
| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
|
||||
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
|
||||
| `activations` <Tag variant="new">4.0</Tag> | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Option[Any]]~~ |
|
||||
| `ents_spangroup` <Tag variant="new">4.0</Tag> | All entity `Span` instances (as stored in `.ents`) as [`SpanGroup`](/api/spangroup). ~~SpanGroup~~ |
|
||||
|
||||
## Serialization fields {id="serialization-fields"}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user