From 604a7c3c26bcc6737a9676c3ba1b16c9ac705be3 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Wed, 31 Aug 2022 09:03:20 +0200 Subject: [PATCH] `SpanGroup(s)`-related optimizations (#11380) * `SpanGroup`: Add support for binding copies to a new reference document * `SpanGroups`: Replace superfluous serialize-deserialize roundtrip in `copy` Instead, directly copy the in-memory representations of the constituent `SpanGroup`s. * Update `SpanGroup.copy()` signature * Rename `new_doc` param to `doc` * Fix kwdarg * Update `.pyi` file and docstrings * `mypy` fix * Update spacy/tokens/span_group.pyx * Update docs Co-authored-by: Adriane Boyd --- spacy/tokens/_dict_proxies.py | 3 ++- spacy/tokens/span_group.pyi | 4 ++-- spacy/tokens/span_group.pyx | 7 +++++-- website/docs/api/spangroup.md | 7 ++++--- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/_dict_proxies.py index 9630da261..6edcce13d 100644 --- a/spacy/tokens/_dict_proxies.py +++ b/spacy/tokens/_dict_proxies.py @@ -42,7 +42,8 @@ class SpanGroups(UserDict): def copy(self, doc: Optional["Doc"] = None) -> "SpanGroups": if doc is None: doc = self._ensure_doc() - return SpanGroups(doc).from_bytes(self.to_bytes()) + data_copy = ((k, v.copy(doc=doc)) for k, v in self.items()) + return SpanGroups(doc, items=data_copy) def setdefault(self, key, default=None): if not isinstance(default, SpanGroup): diff --git a/spacy/tokens/span_group.pyi b/spacy/tokens/span_group.pyi index 245eb4dbe..21cd124ab 100644 --- a/spacy/tokens/span_group.pyi +++ b/spacy/tokens/span_group.pyi @@ -1,4 +1,4 @@ -from typing import Any, Dict, Iterable +from typing import Any, Dict, Iterable, Optional from .doc import Doc from .span import Span @@ -24,4 +24,4 @@ class SpanGroup: def __getitem__(self, i: int) -> Span: ... def to_bytes(self) -> bytes: ... def from_bytes(self, bytes_data: bytes) -> SpanGroup: ... - def copy(self) -> SpanGroup: ... + def copy(self, doc: Optional[Doc] = ...) -> SpanGroup: ... diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx index bb0fab24f..1aa3c0bc8 100644 --- a/spacy/tokens/span_group.pyx +++ b/spacy/tokens/span_group.pyx @@ -241,15 +241,18 @@ cdef class SpanGroup: cdef void push_back(self, SpanC span) nogil: self.c.push_back(span) - def copy(self) -> SpanGroup: + def copy(self, doc: Optional["Doc"] = None) -> SpanGroup: """Clones the span group. + doc (Doc): New reference document to which the copy is bound. RETURNS (SpanGroup): A copy of the span group. DOCS: https://spacy.io/api/spangroup#copy """ + if doc is None: + doc = self.doc return SpanGroup( - self.doc, + doc, name=self.name, attrs=deepcopy(self.attrs), spans=list(self), diff --git a/website/docs/api/spangroup.md b/website/docs/api/spangroup.md index 8dbdefc01..2d1cf73c4 100644 --- a/website/docs/api/spangroup.md +++ b/website/docs/api/spangroup.md @@ -255,9 +255,10 @@ Return a copy of the span group. > new_group = doc.spans["errors"].copy() > ``` -| Name | Description | -| ----------- | ----------------------------------------------- | -| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------- | +| `doc` | The document to which the copy is bound. Defaults to `None` for the current doc. ~~Optional[Doc]~~ | +| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ | ## SpanGroup.to_bytes {#to_bytes tag="method"}