mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-29 18:54:07 +03:00
75f7c15187
* Span/SpanGroup: wrap SpanC in shared_ptr. When a Span that was retrieved from a SpanGroup was modified, these changes were not reflected in the SpanGroup because the underlying SpanC struct was copied. This change applies the solution proposed by @nrodnova, to wrap SpanC in a shared_ptr. This makes a SpanGroup and Spans derived from it share the same SpanC. So, changes made through a Span are visible in the SpanGroup as well. Fixes #9556. * Test that a SpanGroup is modified through its Spans. * SpanGroup.push_back: remove nogil. Modifying std::vector is not thread-safe. * C++ >= 11 does not allow const T in vector<T>. * Add Span.span_c as a shorthand for Span.c.get. Since this method is cdef'ed, it is only visible from Cython, so we avoid using raw pointers in Python. Replace existing uses of span.c.get() to use this new method. * Fix formatting. * Style fix: pointer types. * SpanGroup.to_bytes: reduce number of shared_ptr::get calls. * Mark SpanGroup modification test with issue. Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
193 lines
6.1 KiB
Cython
193 lines
6.1 KiB
Cython
import weakref
|
|
import struct
|
|
import srsly
|
|
|
|
from spacy.errors import Errors
|
|
from .span cimport Span
|
|
from libc.stdint cimport uint64_t, uint32_t, int32_t
|
|
from libcpp.memory cimport make_shared
|
|
|
|
|
|
cdef class SpanGroup:
    """A group of spans that all belong to the same Doc object. The group
    can be named, and you can attach additional attributes to it. Span groups
    are generally accessed via the `doc.spans` attribute. The `doc.spans`
    attribute will convert lists of spans into a `SpanGroup` object for you
    automatically on assignment.

    Example:
        Construction 1
        >>> doc = nlp("Their goi ng home")
        >>> doc.spans["errors"] = SpanGroup(
            doc,
            name="errors",
            spans=[doc[0:1], doc[2:4]],
            attrs={"annotator": "matt"}
        )

        Construction 2
        >>> doc = nlp("Their goi ng home")
        >>> doc.spans["errors"] = [doc[0:1], doc[2:4]]
        >>> assert isinstance(doc.spans["errors"], SpanGroup)

    DOCS: https://spacy.io/api/spangroup
    """
    # NOTE: the mutable defaults below are kept for interface stability. They
    # are never mutated: `attrs` is copied into a fresh dict and `spans` is
    # only iterated.
    def __init__(self, doc, *, name="", attrs={}, spans=[]):
        """Create a SpanGroup.

        doc (Doc): The reference Doc object.
        name (str): The group name.
        attrs (Dict[str, Any]): Optional JSON-serializable attributes to attach.
        spans (Iterable[Span]): The spans to add to the group.

        DOCS: https://spacy.io/api/spangroup#init
        """
        # We need to make this a weak reference, so that the Doc object can
        # own the SpanGroup without circular references. We do want to get
        # the Doc though, because otherwise the API gets annoying.
        self._doc_ref = weakref.ref(doc)
        self.name = name
        # Copy so that later changes to the caller's dict do not leak into the
        # group (and vice versa).
        self.attrs = dict(attrs) if attrs is not None else {}
        cdef Span span
        for span in spans:
            # Store the span's shared_ptr itself, so the group and the Span
            # refer to the same underlying SpanC.
            self.push_back(span.c)

    def __repr__(self):
        return str(list(self))

    @property
    def doc(self):
        """RETURNS (Doc): The reference document.
        RAISES (RuntimeError): If the weakly referenced Doc no longer exists.

        DOCS: https://spacy.io/api/spangroup#doc
        """
        doc = self._doc_ref()
        if doc is None:
            # referent has been garbage collected
            raise RuntimeError(Errors.E865)
        return doc

    @property
    def has_overlap(self):
        """RETURNS (bool): Whether the group contains overlapping spans.

        DOCS: https://spacy.io/api/spangroup#has_overlap
        """
        if not len(self):
            return False
        # sorted() already returns a new list, no extra list() copy needed.
        sorted_spans = sorted(self)
        last_end = sorted_spans[0].end
        for span in sorted_spans[1:]:
            # In sorted order, a span that starts before the previous span
            # ended overlaps with it.
            if span.start < last_end:
                return True
            last_end = span.end
        return False

    def __len__(self):
        """RETURNS (int): The number of spans in the group.

        DOCS: https://spacy.io/api/spangroup#len
        """
        return self.c.size()

    def append(self, Span span):
        """Add a span to the group. The span must refer to the same Doc
        object as the span group.

        span (Span): The span to append.
        RAISES (ValueError): If the span refers to a different Doc.

        DOCS: https://spacy.io/api/spangroup#append
        """
        if span.doc is not self.doc:
            raise ValueError("Cannot add span to group: refers to different Doc.")
        self.push_back(span.c)

    def extend(self, spans):
        """Add multiple spans to the group. All spans must refer to the same
        Doc object as the span group.

        spans (Iterable[Span]): The spans to add.

        DOCS: https://spacy.io/api/spangroup#extend
        """
        cdef Span span
        for span in spans:
            # Go through append() so that every span gets the same-Doc check.
            self.append(span)

    def __getitem__(self, int i):
        """Get a span from the group.

        i (int): The item index. Negative indices count from the end.
        RETURNS (Span): The span at the given index.
        RAISES (IndexError): If the index is out of range.

        DOCS: https://spacy.io/api/spangroup#getitem
        """
        cdef int size = self.c.size()
        if i < -size or i >= size:
            raise IndexError(f"list index {i} out of range")
        if i < 0:
            i += size
        return Span.cinit(self.doc, self.c[i])

    def to_bytes(self):
        """Serialize the SpanGroup's contents to a byte string.

        RETURNS (bytes): The serialized span group.

        DOCS: https://spacy.io/api/spangroup#to_bytes
        """
        cdef SpanC* span_c
        cdef size_t i
        output = {"name": self.name, "attrs": self.attrs, "spans": []}
        for i in range(self.c.size()):
            # Fetch the raw pointer once per span rather than once per field.
            span_c = self.c[i].get()
            # The struct.pack here is probably overkill, but it might help if
            # you're saving tonnes of spans, and it doesn't really add any
            # complexity. We do take care to specify an explicit byte order
            # (">" means big-endian with standard sizes and no padding), to
            # ensure the message can be loaded back on a different arch.
            # from_bytes() must use exactly the same format string.
            # Q: uint64_t
            # q: int64_t
            # L: uint32_t
            # l: int32_t
            output["spans"].append(struct.pack(
                ">QQQllll",
                span_c.id,
                span_c.kb_id,
                span_c.label,
                span_c.start,
                span_c.end,
                span_c.start_char,
                span_c.end_char
            ))
        return srsly.msgpack_dumps(output)

    def from_bytes(self, bytes_data):
        """Deserialize the SpanGroup's contents from a byte string.

        Replaces the group's current name, attrs and spans with the loaded
        ones.

        bytes_data (bytes): The span group to load.
        RETURNS (SpanGroup): The deserialized span group.

        DOCS: https://spacy.io/api/spangroup#from_bytes
        """
        cdef SpanC span
        msg = srsly.msgpack_loads(bytes_data)
        self.name = msg["name"]
        self.attrs = dict(msg["attrs"])
        self.c.clear()
        self.c.reserve(len(msg["spans"]))
        for span_data in msg["spans"]:
            # Same format string as in to_bytes(): big-endian, three uint64
            # fields followed by four int32 fields.
            items = struct.unpack(">QQQllll", span_data)
            span.id = items[0]
            span.kb_id = items[1]
            span.label = items[2]
            span.start = items[3]
            span.end = items[4]
            span.start_char = items[5]
            span.end_char = items[6]
            # make_shared copies the struct, so reusing `span` for the next
            # iteration does not affect spans already stored.
            self.c.push_back(make_shared[SpanC](span))
        return self

    cdef void push_back(self, const shared_ptr[SpanC] &span):
        # Append the shared pointer to the group's underlying vector.
        self.c.push_back(span)
|