SpanGroup(s)-related optimizations (#11380)

* `SpanGroup`: Add support for binding copies to a new reference document

* `SpanGroups`: Replace superfluous serialize-deserialize roundtrip in `copy`

Instead, directly copy the in-memory representations of the constituent `SpanGroup`s.

* Update `SpanGroup.copy()` signature

* Rename `new_doc` param to `doc`

* Fix kwarg

* Update `.pyi` file and docstrings

* `mypy` fix

* Update spacy/tokens/span_group.pyx

* Update docs

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
Madeesh Kannan 2022-08-31 09:03:20 +02:00 committed by GitHub
parent aafee5e1b7
commit 604a7c3c26
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 13 additions and 8 deletions

View File

@ -42,7 +42,8 @@ class SpanGroups(UserDict):
def copy(self, doc: Optional["Doc"] = None) -> "SpanGroups": def copy(self, doc: Optional["Doc"] = None) -> "SpanGroups":
if doc is None: if doc is None:
doc = self._ensure_doc() doc = self._ensure_doc()
return SpanGroups(doc).from_bytes(self.to_bytes()) data_copy = ((k, v.copy(doc=doc)) for k, v in self.items())
return SpanGroups(doc, items=data_copy)
def setdefault(self, key, default=None): def setdefault(self, key, default=None):
if not isinstance(default, SpanGroup): if not isinstance(default, SpanGroup):

View File

@ -1,4 +1,4 @@
from typing import Any, Dict, Iterable from typing import Any, Dict, Iterable, Optional
from .doc import Doc from .doc import Doc
from .span import Span from .span import Span
@ -24,4 +24,4 @@ class SpanGroup:
def __getitem__(self, i: int) -> Span: ... def __getitem__(self, i: int) -> Span: ...
def to_bytes(self) -> bytes: ... def to_bytes(self) -> bytes: ...
def from_bytes(self, bytes_data: bytes) -> SpanGroup: ... def from_bytes(self, bytes_data: bytes) -> SpanGroup: ...
def copy(self) -> SpanGroup: ... def copy(self, doc: Optional[Doc] = ...) -> SpanGroup: ...

View File

@ -241,15 +241,18 @@ cdef class SpanGroup:
cdef void push_back(self, SpanC span) nogil: cdef void push_back(self, SpanC span) nogil:
self.c.push_back(span) self.c.push_back(span)
def copy(self) -> SpanGroup: def copy(self, doc: Optional["Doc"] = None) -> SpanGroup:
"""Clones the span group. """Clones the span group.
doc (Doc): New reference document to which the copy is bound.
RETURNS (SpanGroup): A copy of the span group. RETURNS (SpanGroup): A copy of the span group.
DOCS: https://spacy.io/api/spangroup#copy DOCS: https://spacy.io/api/spangroup#copy
""" """
if doc is None:
doc = self.doc
return SpanGroup( return SpanGroup(
self.doc, doc,
name=self.name, name=self.name,
attrs=deepcopy(self.attrs), attrs=deepcopy(self.attrs),
spans=list(self), spans=list(self),

View File

@ -255,9 +255,10 @@ Return a copy of the span group.
> new_group = doc.spans["errors"].copy() > new_group = doc.spans["errors"].copy()
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ----------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------------------- |
| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ | | `doc` | The document to which the copy is bound. Defaults to `None` for the current doc. ~~Optional[Doc]~~ |
| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ |
## SpanGroup.to_bytes {#to_bytes tag="method"} ## SpanGroup.to_bytes {#to_bytes tag="method"}