mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Add SpanGroup and Graph container types to represent arbitrary annotations (#6696)
* Draft out initial Spans data structure * Initial span group commit * Basic span group support on Doc * Basic test for span group * Compile span_group.pyx * Draft addition of SpanGroup to DocBin * Add deserialization for SpanGroup * Add tests for serializing SpanGroup * Fix serialization of SpanGroup * Add EdgeC and GraphC structs * Add draft Graph data structure * Compile graph * More work on Graph * Update GraphC * Upd graph * Fix walk functions * Let Graph take nodes and edges on construction * Fix walking and getting * Add graph tests * Fix import * Add module with the SpanGroups dict thingy * Update test * Rename 'span_groups' attribute * Try to fix c++11 compilation * Fix test * Update DocBin * Try to fix compilation * Try to fix graph * Improve SpanGroup docstrings * Add doc.spans to documentation * Fix serialization * Tidy up and add docs * Update docs [ci skip] * Add SpanGroup.has_overlap * WIP updated Graph API * Start testing new Graph API * Update Graph tests * Update Graph * Add docstring Co-authored-by: Ines Montani <ines@ines.io>
This commit is contained in:
parent
54e8e3c208
commit
f277bfdf0f
6
setup.py
6
setup.py
|
@ -55,6 +55,8 @@ MOD_NAMES = [
|
||||||
"spacy.tokens.doc",
|
"spacy.tokens.doc",
|
||||||
"spacy.tokens.span",
|
"spacy.tokens.span",
|
||||||
"spacy.tokens.token",
|
"spacy.tokens.token",
|
||||||
|
"spacy.tokens.span_group",
|
||||||
|
"spacy.tokens.graph",
|
||||||
"spacy.tokens.morphanalysis",
|
"spacy.tokens.morphanalysis",
|
||||||
"spacy.tokens._retokenize",
|
"spacy.tokens._retokenize",
|
||||||
"spacy.matcher.matcher",
|
"spacy.matcher.matcher",
|
||||||
|
@ -68,7 +70,7 @@ COMPILE_OPTIONS = {
|
||||||
"mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
"mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
||||||
"other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
"other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
||||||
}
|
}
|
||||||
LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}
|
LINK_OPTIONS = {"msvc": ["-std=c++11"], "mingw32": ["-std=c++11"], "other": []}
|
||||||
COMPILER_DIRECTIVES = {
|
COMPILER_DIRECTIVES = {
|
||||||
"language_level": -3,
|
"language_level": -3,
|
||||||
"embedsignature": True,
|
"embedsignature": True,
|
||||||
|
@ -201,7 +203,7 @@ def setup_package():
|
||||||
ext_modules = []
|
ext_modules = []
|
||||||
for name in MOD_NAMES:
|
for name in MOD_NAMES:
|
||||||
mod_path = name.replace(".", "/") + ".pyx"
|
mod_path = name.replace(".", "/") + ".pyx"
|
||||||
ext = Extension(name, [mod_path], language="c++")
|
ext = Extension(name, [mod_path], language="c++", extra_compile_args=["-std=c++11"])
|
||||||
ext_modules.append(ext)
|
ext_modules.append(ext)
|
||||||
print("Cythonizing sources")
|
print("Cythonizing sources")
|
||||||
ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)
|
ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
|
from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
from libcpp.unordered_set cimport unordered_set
|
||||||
|
from libcpp.unordered_map cimport unordered_map
|
||||||
from libc.stdint cimport int32_t, int64_t
|
from libc.stdint cimport int32_t, int64_t
|
||||||
|
|
||||||
from .typedefs cimport flags_t, attr_t, hash_t
|
from .typedefs cimport flags_t, attr_t, hash_t
|
||||||
|
@ -91,3 +93,22 @@ cdef struct AliasC:
|
||||||
|
|
||||||
# Prior probability P(entity|alias) - should sum up to (at most) 1.
|
# Prior probability P(entity|alias) - should sum up to (at most) 1.
|
||||||
vector[float] probs
|
vector[float] probs
|
||||||
|
|
||||||
|
|
||||||
|
# A single labelled, directed edge of an annotation graph.
cdef struct EdgeC:
    # Hash of the edge label; Edge.label_ resolves it via doc.vocab.strings.
    hash_t label
    # Index of the head node within GraphC.nodes.
    int32_t head
    # Index of the tail node within GraphC.nodes.
    int32_t tail
|
||||||
|
|
||||||
|
|
||||||
|
# C-level storage for an annotation graph over the tokens of a Doc.
cdef struct GraphC:
    # One entry per node; each node is a sequence of token indices.
    vector[vector[int32_t]] nodes
    # One entry per edge (label hash plus head/tail node indices).
    vector[EdgeC] edges
    # Edge weights, read by edge index (Edge.weight uses weights[edge.i]).
    vector[float] weights
    # NOTE(review): presumably the number of heads / tails of each node,
    # indexed by node index -- confirm against Graph.add_node/add_edge.
    vector[int] n_heads
    vector[int] n_tails
    # NOTE(review): presumably the index of the first edge in which each node
    # takes part as tail / head -- confirm against the walk helpers.
    vector[int] first_head
    vector[int] first_tail
    # Held by pointer, so they have to be allocated and released explicitly.
    # NOTE(review): ownership and lifetime are not visible here -- confirm
    # that Graph allocates them on construction and frees them on dealloc.
    unordered_set[int]* roots
    unordered_map[hash_t, int]* node_map
    unordered_map[hash_t, int]* edge_map
|
||||||
|
|
|
@ -631,3 +631,24 @@ def test_doc_set_ents_invalid_spans(en_tokenizer):
|
||||||
retokenizer.merge(span)
|
retokenizer.merge(span)
|
||||||
with pytest.raises(IndexError):
|
with pytest.raises(IndexError):
|
||||||
doc.ents = spans
|
doc.ents = spans
|
||||||
|
|
||||||
|
|
||||||
|
def test_span_groups(en_tokenizer):
    """Check that doc.spans behaves like a dict of mutable span groups."""
    key = "hi"
    doc = en_tokenizer("Some text about Colombia and the Czech Republic")

    # Assigning a plain list of spans creates a group stored under the key;
    # the label of the span must not leak into the container's keys.
    doc.spans[key] = [Span(doc, 3, 4, label="bye")]
    assert key in doc.spans
    assert "bye" not in doc.spans
    assert len(doc.spans[key]) == 1
    assert doc.spans[key][0].label_ == "bye"

    # Appending keeps insertion order and accepts unlabelled slices.
    doc.spans[key].append(doc[0:3])
    assert len(doc.spans[key]) == 2
    assert doc.spans[key][1].text == "Some text about"
    texts = [span.text for span in doc.spans[key]]
    assert texts == ["Colombia", "Some text about"]
    assert not doc.spans[key].has_overlap

    # Extending with the entities adds a second span over token 3, so the
    # group now overlaps.
    doc.ents = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
    doc.spans[key].extend(doc.ents)
    assert len(doc.spans[key]) == 4
    labels = [span.label_ for span in doc.spans[key]]
    assert labels == ["bye", "", "GPE", "GPE"]
    assert doc.spans[key].has_overlap

    # Groups can be removed again like ordinary dict entries.
    del doc.spans[key]
    assert key not in doc.spans
|
||||||
|
|
57
spacy/tests/doc/test_graph.py
Normal file
57
spacy/tests/doc/test_graph.py
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.tokens.doc import Doc
|
||||||
|
from spacy.tokens.graph import Graph
|
||||||
|
|
||||||
|
|
||||||
|
def test_graph_init():
    """A freshly built Graph exposes its name and the Doc it was built on."""
    words = ["a", "b", "c", "d"]
    doc = Doc(Vocab(), words=words)
    graph = Graph(doc, name="hello")
    assert graph.name == "hello"
    assert graph.doc is doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_graph_edges_and_nodes():
    """Add two nodes and one edge, then query them from both ends."""

    def as_tuples(nodes):
        # Nodes iterate over their token indices.
        return [tuple(node) for node in nodes]

    doc = Doc(Vocab(), words=["a", "b", "c", "d"])
    graph = Graph(doc, name="hello")

    node1 = graph.add_node((0,))
    assert graph.get_node((0,)) == node1
    node2 = graph.add_node((1, 3))
    assert list(node2) == [1, 3]

    graph.add_edge(node1, node2, label="one", weight=-10.5)
    assert graph.has_edge(node1, node2, label="one")

    # node1 is the head of the only edge, node2 its tail.
    assert node1.heads() == []
    assert as_tuples(node2.heads()) == [(0,)]
    assert as_tuples(node1.tails()) == [(1, 3)]
    assert as_tuples(node2.tails()) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_graph_walk():
    """Walk heads and tails through a small graph that contains a cycle."""

    def as_tuples(nodes):
        # Nodes iterate over their token indices.
        return [tuple(node) for node in nodes]

    doc = Doc(Vocab(), words=["a", "b", "c", "d"])
    graph = Graph(
        doc,
        name="hello",
        nodes=[(0,), (1,), (2,), (3,)],
        # 0 -> 1, 0 -> 2, 0 -> 3 and back again 3 -> 0.
        edges=[(0, 1), (0, 2), (0, 3), (3, 0)],
        labels=None,
        weights=None,
    )
    node0, node1, node2, node3 = tuple(graph.nodes)

    assert as_tuples(node0.heads()) == [(3,)]
    assert as_tuples(node1.heads()) == [(0,)]

    assert as_tuples(node0.walk_heads()) == [(3,), (0,)]
    assert as_tuples(node1.walk_heads()) == [(0,), (3,), (0,)]
    assert as_tuples(node2.walk_heads()) == [(0,), (3,), (0,)]
    assert as_tuples(node3.walk_heads()) == [(0,), (3,)]

    assert as_tuples(node0.walk_tails()) == [(1,), (2,), (3,), (0,)]
    assert as_tuples(node1.walk_tails()) == []
    assert as_tuples(node2.walk_tails()) == []
    assert as_tuples(node3.walk_tails()) == [(0,), (1,), (2,), (3,)]
|
|
@ -56,6 +56,13 @@ def test_serialize_doc_exclude(en_vocab):
|
||||||
assert not new_doc.user_data
|
assert not new_doc.user_data
|
||||||
|
|
||||||
|
|
||||||
|
def test_serialize_doc_span_groups(en_vocab):
    """Span groups survive a Doc.to_bytes / Doc.from_bytes round trip."""
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    doc.spans["content"] = [doc[0:2]]
    doc_bytes = doc.to_bytes()
    new_doc = Doc(en_vocab).from_bytes(doc_bytes)
    assert len(new_doc.spans["content"]) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_doc_bin():
|
def test_serialize_doc_bin():
|
||||||
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
|
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
|
||||||
texts = ["Some text", "Lots of texts...", "..."]
|
texts = ["Some text", "Lots of texts...", "..."]
|
||||||
|
@ -63,6 +70,7 @@ def test_serialize_doc_bin():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
for doc in nlp.pipe(texts):
|
for doc in nlp.pipe(texts):
|
||||||
doc.cats = cats
|
doc.cats = cats
|
||||||
|
doc.spans["start"] = [doc[0:2]]
|
||||||
doc_bin.add(doc)
|
doc_bin.add(doc)
|
||||||
bytes_data = doc_bin.to_bytes()
|
bytes_data = doc_bin.to_bytes()
|
||||||
|
|
||||||
|
@ -73,6 +81,7 @@ def test_serialize_doc_bin():
|
||||||
for i, doc in enumerate(reloaded_docs):
|
for i, doc in enumerate(reloaded_docs):
|
||||||
assert doc.text == texts[i]
|
assert doc.text == texts[i]
|
||||||
assert doc.cats == cats
|
assert doc.cats == cats
|
||||||
|
assert len(doc.spans) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_doc_bin_unknown_spaces(en_vocab):
|
def test_serialize_doc_bin_unknown_spaces(en_vocab):
|
||||||
|
|
49
spacy/tokens/_dict_proxies.py
Normal file
49
spacy/tokens/_dict_proxies.py
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
from typing import Iterable, Tuple, Union, TYPE_CHECKING
|
||||||
|
import weakref
|
||||||
|
from collections import UserDict
|
||||||
|
import srsly
|
||||||
|
|
||||||
|
from .span_group import SpanGroup
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
# This lets us add type hints for mypy etc. without causing circular imports
|
||||||
|
from .doc import Doc # noqa: F401
|
||||||
|
from .span import Span # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
|
# Why inherit from UserDict instead of dict here?
|
||||||
|
# Well, the 'dict' class doesn't necessarily delegate everything nicely,
|
||||||
|
# for performance reasons. The UserDict is slower but better behaved.
|
||||||
|
# See https://treyhunner.com/2019/04/why-you-shouldnt-inherit-from-list-and-dict-in-python/
|
||||||
|
class SpanGroups(UserDict):
    """A dict-like proxy held by the Doc, to control access to span groups.

    Values are always SpanGroup objects: plain iterables of spans assigned to
    a key are wrapped into a group named after that key.
    """

    def __init__(
        self, doc: "Doc", items: Iterable[Tuple[str, SpanGroup]] = tuple()
    ) -> None:
        # Only a weak reference is kept, because the Doc itself holds this
        # container. It has to be set before the base initializer runs, since
        # that one routes the initial items through __setitem__.
        self.doc_ref = weakref.ref(doc)
        super().__init__(items)

    def __setitem__(self, key: str, value: Union[SpanGroup, Iterable["Span"]]) -> None:
        """Store a span group under `key`, wrapping bare span iterables."""
        if isinstance(value, SpanGroup):
            group = value
        else:
            group = self._make_span_group(key, value)
        # A group may only be stored on the Doc it was created for.
        assert group.doc is self.doc_ref()
        super().__setitem__(key, group)

    def _make_span_group(self, name: str, spans: Iterable["Span"]) -> SpanGroup:
        """Build a SpanGroup called `name` on the referenced Doc."""
        owner = self.doc_ref()
        return SpanGroup(owner, name=name, spans=spans)

    def to_bytes(self) -> bytes:
        """Serialize all groups to a msgpack-encoded list of byte strings."""
        # We don't need to serialize this as a dict, because the groups
        # know their names.
        serialized = []
        for group in self.values():
            serialized.append(group.to_bytes())
        return srsly.msgpack_dumps(serialized)

    def from_bytes(self, bytes_data: bytes) -> "SpanGroups":
        """Replace the current contents with the groups stored in `bytes_data`.

        RETURNS (SpanGroups): The container itself.
        """
        payload = srsly.msgpack_loads(bytes_data)
        self.clear()
        owner = self.doc_ref()
        for group_bytes in payload:
            restored = SpanGroup(owner).from_bytes(group_bytes)
            # Each group carries its own name, which becomes the key again.
            self[restored.name] = restored
        return self
|
|
@ -33,6 +33,7 @@ class DocBin:
|
||||||
{
|
{
|
||||||
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
|
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
|
||||||
"tokens": bytes, # Serialized numpy uint64 array with the token data
|
"tokens": bytes, # Serialized numpy uint64 array with the token data
|
||||||
|
"spans": List[Dict[str, bytes]], # SpanGroups data for each doc
|
||||||
"spaces": bytes, # Serialized numpy boolean array with spaces data
|
"spaces": bytes, # Serialized numpy boolean array with spaces data
|
||||||
"lengths": bytes, # Serialized numpy int32 array with the doc lengths
|
"lengths": bytes, # Serialized numpy int32 array with the doc lengths
|
||||||
"strings": List[unicode] # List of unique strings in the token data
|
"strings": List[unicode] # List of unique strings in the token data
|
||||||
|
@ -70,6 +71,7 @@ class DocBin:
|
||||||
self.tokens = []
|
self.tokens = []
|
||||||
self.spaces = []
|
self.spaces = []
|
||||||
self.cats = []
|
self.cats = []
|
||||||
|
self.span_groups = []
|
||||||
self.user_data = []
|
self.user_data = []
|
||||||
self.flags = []
|
self.flags = []
|
||||||
self.strings = set()
|
self.strings = set()
|
||||||
|
@ -107,6 +109,10 @@ class DocBin:
|
||||||
self.strings.add(token.ent_kb_id_)
|
self.strings.add(token.ent_kb_id_)
|
||||||
self.cats.append(doc.cats)
|
self.cats.append(doc.cats)
|
||||||
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
|
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
|
||||||
|
self.span_groups.append(doc.spans.to_bytes())
|
||||||
|
for key, group in doc.spans.items():
|
||||||
|
for span in group:
|
||||||
|
self.strings.add(span.label_)
|
||||||
|
|
||||||
def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
|
def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
|
||||||
"""Recover Doc objects from the annotations, using the given vocab.
|
"""Recover Doc objects from the annotations, using the given vocab.
|
||||||
|
@ -130,6 +136,10 @@ class DocBin:
|
||||||
doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
|
doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
|
||||||
doc = doc.from_array(self.attrs, tokens)
|
doc = doc.from_array(self.attrs, tokens)
|
||||||
doc.cats = self.cats[i]
|
doc.cats = self.cats[i]
|
||||||
|
if self.span_groups[i]:
|
||||||
|
doc.spans.from_bytes(self.span_groups[i])
|
||||||
|
else:
|
||||||
|
doc.spans.clear()
|
||||||
if i < len(self.user_data) and self.user_data[i] is not None:
|
if i < len(self.user_data) and self.user_data[i] is not None:
|
||||||
user_data = srsly.msgpack_loads(self.user_data[i], use_list=False)
|
user_data = srsly.msgpack_loads(self.user_data[i], use_list=False)
|
||||||
doc.user_data.update(user_data)
|
doc.user_data.update(user_data)
|
||||||
|
@ -161,6 +171,7 @@ class DocBin:
|
||||||
self.spaces.extend(other.spaces)
|
self.spaces.extend(other.spaces)
|
||||||
self.strings.update(other.strings)
|
self.strings.update(other.strings)
|
||||||
self.cats.extend(other.cats)
|
self.cats.extend(other.cats)
|
||||||
|
self.span_groups.extend(other.span_groups)
|
||||||
self.flags.extend(other.flags)
|
self.flags.extend(other.flags)
|
||||||
self.user_data.extend(other.user_data)
|
self.user_data.extend(other.user_data)
|
||||||
|
|
||||||
|
@ -185,6 +196,7 @@ class DocBin:
|
||||||
"strings": list(sorted(self.strings)),
|
"strings": list(sorted(self.strings)),
|
||||||
"cats": self.cats,
|
"cats": self.cats,
|
||||||
"flags": self.flags,
|
"flags": self.flags,
|
||||||
|
"span_groups": self.span_groups,
|
||||||
}
|
}
|
||||||
if self.store_user_data:
|
if self.store_user_data:
|
||||||
msg["user_data"] = self.user_data
|
msg["user_data"] = self.user_data
|
||||||
|
@ -213,6 +225,7 @@ class DocBin:
|
||||||
self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
|
self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
|
||||||
self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
|
self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
|
||||||
self.cats = msg["cats"]
|
self.cats = msg["cats"]
|
||||||
|
self.span_groups = msg.get("span_groups", [b"" for _ in lengths])
|
||||||
self.flags = msg.get("flags", [{} for _ in lengths])
|
self.flags = msg.get("flags", [{} for _ in lengths])
|
||||||
if "user_data" in msg:
|
if "user_data" in msg:
|
||||||
self.user_data = list(msg["user_data"])
|
self.user_data = list(msg["user_data"])
|
||||||
|
|
|
@ -2,7 +2,7 @@ from cymem.cymem cimport Pool
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
|
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from ..structs cimport TokenC, LexemeC
|
from ..structs cimport TokenC, LexemeC, SpanC
|
||||||
from ..typedefs cimport attr_t
|
from ..typedefs cimport attr_t
|
||||||
from ..attrs cimport attr_id_t
|
from ..attrs cimport attr_id_t
|
||||||
|
|
||||||
|
@ -33,6 +33,7 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
|
||||||
|
|
||||||
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
|
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
|
||||||
|
|
||||||
|
|
||||||
cdef class Doc:
|
cdef class Doc:
|
||||||
cdef readonly Pool mem
|
cdef readonly Pool mem
|
||||||
cdef readonly Vocab vocab
|
cdef readonly Vocab vocab
|
||||||
|
@ -43,6 +44,7 @@ cdef class Doc:
|
||||||
cdef public object tensor
|
cdef public object tensor
|
||||||
cdef public object cats
|
cdef public object cats
|
||||||
cdef public object user_data
|
cdef public object user_data
|
||||||
|
cdef readonly object spans
|
||||||
|
|
||||||
cdef TokenC* c
|
cdef TokenC* c
|
||||||
|
|
||||||
|
|
|
@ -16,6 +16,7 @@ from thinc.util import copy_array
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .span cimport Span
|
from .span cimport Span
|
||||||
|
from ._dict_proxies import SpanGroups
|
||||||
from .token cimport Token
|
from .token cimport Token
|
||||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||||
from ..typedefs cimport attr_t, flags_t
|
from ..typedefs cimport attr_t, flags_t
|
||||||
|
@ -222,6 +223,7 @@ cdef class Doc:
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
size = max(20, (len(words) if words is not None else 0))
|
size = max(20, (len(words) if words is not None else 0))
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
self.spans = SpanGroups(self)
|
||||||
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
|
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
|
||||||
# However, we need to remember the true starting places, so that we can
|
# However, we need to remember the true starting places, so that we can
|
||||||
# realloc.
|
# realloc.
|
||||||
|
@ -1255,6 +1257,9 @@ cdef class Doc:
|
||||||
strings.add(token.ent_kb_id_)
|
strings.add(token.ent_kb_id_)
|
||||||
strings.add(token.ent_id_)
|
strings.add(token.ent_id_)
|
||||||
strings.add(token.norm_)
|
strings.add(token.norm_)
|
||||||
|
for group in self.spans.values():
|
||||||
|
for span in group:
|
||||||
|
strings.add(span.label_)
|
||||||
# Msgpack doesn't distinguish between lists and tuples, which is
|
# Msgpack doesn't distinguish between lists and tuples, which is
|
||||||
# vexing for user data. As a best guess, we *know* that within
|
# vexing for user data. As a best guess, we *know* that within
|
||||||
# keys, we must have tuples. In values we just have to hope
|
# keys, we must have tuples. In values we just have to hope
|
||||||
|
@ -1266,6 +1271,7 @@ cdef class Doc:
|
||||||
"sentiment": lambda: self.sentiment,
|
"sentiment": lambda: self.sentiment,
|
||||||
"tensor": lambda: self.tensor,
|
"tensor": lambda: self.tensor,
|
||||||
"cats": lambda: self.cats,
|
"cats": lambda: self.cats,
|
||||||
|
"spans": lambda: self.spans.to_bytes(),
|
||||||
"strings": lambda: list(strings),
|
"strings": lambda: list(strings),
|
||||||
"has_unknown_spaces": lambda: self.has_unknown_spaces
|
"has_unknown_spaces": lambda: self.has_unknown_spaces
|
||||||
}
|
}
|
||||||
|
@ -1290,18 +1296,6 @@ cdef class Doc:
|
||||||
"""
|
"""
|
||||||
if self.length != 0:
|
if self.length != 0:
|
||||||
raise ValueError(Errors.E033.format(length=self.length))
|
raise ValueError(Errors.E033.format(length=self.length))
|
||||||
deserializers = {
|
|
||||||
"text": lambda b: None,
|
|
||||||
"array_head": lambda b: None,
|
|
||||||
"array_body": lambda b: None,
|
|
||||||
"sentiment": lambda b: None,
|
|
||||||
"tensor": lambda b: None,
|
|
||||||
"cats": lambda b: None,
|
|
||||||
"strings": lambda b: None,
|
|
||||||
"user_data_keys": lambda b: None,
|
|
||||||
"user_data_values": lambda b: None,
|
|
||||||
"has_unknown_spaces": lambda b: None
|
|
||||||
}
|
|
||||||
# Msgpack doesn't distinguish between lists and tuples, which is
|
# Msgpack doesn't distinguish between lists and tuples, which is
|
||||||
# vexing for user data. As a best guess, we *know* that within
|
# vexing for user data. As a best guess, we *know* that within
|
||||||
# keys, we must have tuples. In values we just have to hope
|
# keys, we must have tuples. In values we just have to hope
|
||||||
|
@ -1336,9 +1330,12 @@ cdef class Doc:
|
||||||
self.push_back(lex, has_space)
|
self.push_back(lex, has_space)
|
||||||
start = end + has_space
|
start = end + has_space
|
||||||
self.from_array(msg["array_head"][2:], attrs[:, 2:])
|
self.from_array(msg["array_head"][2:], attrs[:, 2:])
|
||||||
|
if "spans" in msg:
|
||||||
|
self.spans.from_bytes(msg["spans"])
|
||||||
|
else:
|
||||||
|
self.spans.clear()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
def extend_tensor(self, tensor):
|
def extend_tensor(self, tensor):
|
||||||
"""Concatenate a new tensor onto the doc.tensor object.
|
"""Concatenate a new tensor onto the doc.tensor object.
|
||||||
|
|
||||||
|
|
13
spacy/tokens/graph.pxd
Normal file
13
spacy/tokens/graph.pxd
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
from cymem.cymem cimport Pool
|
||||||
|
from preshed.maps cimport PreshMap
|
||||||
|
from ..structs cimport GraphC, EdgeC
|
||||||
|
|
||||||
|
|
||||||
|
# Declaration of the Graph extension type, so other Cython modules can
# cimport it and reach its C-level attributes.
cdef class Graph:
    # C-level storage of the graph; Node and Edge read nodes, edges and
    # weights through it.
    cdef GraphC c
    # NOTE(review): presumably the memory pool owning the graph's
    # allocations -- confirm in graph.pyx.
    cdef Pool mem
    # NOTE(review): presumably lookup tables from node / edge hashes to their
    # indices -- confirm in graph.pyx (GraphC also carries node_map/edge_map).
    cdef PreshMap node_map
    cdef PreshMap edge_map
    # NOTE(review): presumably a weak reference to the owning Doc, resolved
    # by the `doc` property -- confirm in graph.pyx.
    cdef object doc_ref
    # Name of the graph; readable and writable from Python.
    cdef public str name
|
709
spacy/tokens/graph.pyx
Normal file
709
spacy/tokens/graph.pyx
Normal file
|
@ -0,0 +1,709 @@
|
||||||
|
# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
|
||||||
|
from typing import List, Tuple, Generator
|
||||||
|
from libc.stdint cimport int32_t, int64_t
|
||||||
|
from libcpp.pair cimport pair
|
||||||
|
from libcpp.unordered_map cimport unordered_map
|
||||||
|
from libcpp.unordered_set cimport unordered_set
|
||||||
|
from cython.operator cimport dereference
|
||||||
|
cimport cython
|
||||||
|
import weakref
|
||||||
|
from preshed.maps cimport map_get_unless_missing
|
||||||
|
from murmurhash.mrmr cimport hash64
|
||||||
|
from ..typedefs cimport hash_t
|
||||||
|
from ..strings import get_string_id
|
||||||
|
from ..structs cimport EdgeC, GraphC
|
||||||
|
from .token import Token
|
||||||
|
|
||||||
|
|
||||||
|
@cython.freelist(8)
cdef class Edge:
    """A reference to an edge of an annotation graph.

    An edge is identified by its index into the graph's edge storage and
    connects a head node to a tail node. It carries a label and a weight.
    """
    cdef readonly Graph graph
    cdef readonly int i

    def __init__(self, Graph graph, int i):
        """Create a reference to the edge with index `i` of `graph`.

        graph (Graph): The graph the edge belongs to.
        i (int): Index of the edge within the graph.
        """
        self.graph = graph
        self.i = i

    @property
    def is_none(self) -> bool:
        """Whether the edge is the special 'none' value. Always False for a
        regular Edge.
        """
        return False

    @property
    def doc(self) -> "Doc":
        """The Doc object that the graph refers to."""
        return self.graph.doc

    @property
    def head(self) -> "Node":
        """The node at the head of the edge."""
        return Node(self.graph, self.graph.c.edges[self.i].head)

    @property
    def tail(self) -> "Node":
        """The node at the tail of the edge."""
        return Node(self.graph, self.graph.c.edges[self.i].tail)

    @property
    def label(self) -> int:
        """The hash value of the edge label."""
        return self.graph.c.edges[self.i].label

    @property
    def weight(self) -> float:
        """The weight of the edge."""
        return self.graph.c.weights[self.i]

    @property
    def label_(self) -> str:
        """The edge label as text, looked up in the vocab's string store."""
        return self.doc.vocab.strings[self.label]
|
||||||
|
|
||||||
|
|
||||||
|
@cython.freelist(8)
|
||||||
|
cdef class Node:
|
||||||
|
cdef readonly Graph graph
|
||||||
|
cdef readonly int i
|
||||||
|
|
||||||
|
def __init__(self, Graph graph, int i):
|
||||||
|
"""A reference to a node of an annotation graph. Each node is made up of
|
||||||
|
an ordered set of zero or more token indices.
|
||||||
|
|
||||||
|
Node references are usually created by the Graph object itself, or from
|
||||||
|
the Node or Edge objects. You usually won't need to instantiate this
|
||||||
|
class yourself.
|
||||||
|
"""
|
||||||
|
cdef int length = graph.c.nodes.size()
|
||||||
|
if i >= length or -i >= length:
|
||||||
|
raise IndexError(f"Node index {i} out of bounds ({length})")
|
||||||
|
if i < 0:
|
||||||
|
i += length
|
||||||
|
self.graph = graph
|
||||||
|
self.i = i
|
||||||
|
|
||||||
|
def __eq__(self, other):
|
||||||
|
if self.graph is not other.graph:
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return self.i == other.i
|
||||||
|
|
||||||
|
def __iter__(self) -> Generator[int]:
|
||||||
|
for i in self.graph.c.nodes[self.i]:
|
||||||
|
yield i
|
||||||
|
|
||||||
|
def __getitem__(self, int i) -> int:
|
||||||
|
"""Get a token index from the node's set of tokens."""
|
||||||
|
length = self.graph.c.nodes[self.i].size()
|
||||||
|
if i >= length or -i >= length:
|
||||||
|
raise IndexError(f"Token index {i} out of bounds ({length})")
|
||||||
|
if i < 0:
|
||||||
|
i += length
|
||||||
|
return self.graph.c.nodes[self.i][i]
|
||||||
|
|
||||||
|
def __len__(self) -> int:
|
||||||
|
"""The number of tokens that make up the node."""
|
||||||
|
return self.graph.c.nodes[self.i].size()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_none(self) -> bool:
|
||||||
|
"""Whether the node is a special value, indicating 'none'.
|
||||||
|
|
||||||
|
The NoneNode type is returned by the Graph, Edge and Node objects when
|
||||||
|
there is no match to a query. It has the same API as Node, but it always
|
||||||
|
returns NoneNode, NoneEdge or empty lists for its queries.
|
||||||
|
"""
|
||||||
|
return False
|
||||||
|
|
||||||
|
@property
|
||||||
|
def doc(self) -> "Doc":
|
||||||
|
"""The Doc object that the graph refers to."""
|
||||||
|
return self.graph.doc
|
||||||
|
|
||||||
|
@property
|
||||||
|
def tokens(self) -> Tuple[Token]:
|
||||||
|
"""A tuple of Token objects that make up the node."""
|
||||||
|
doc = self.doc
|
||||||
|
return tuple([doc[i] for i in self])
|
||||||
|
|
||||||
|
def head(self, i=None, label=None) -> "Node":
|
||||||
|
"""Get the head of the first matching edge, searching by index, label,
|
||||||
|
both or neither.
|
||||||
|
|
||||||
|
For instance, `node.head(i=1)` will get the head of the second edge that
|
||||||
|
this node is a tail of. `node.head(i=1, label="ARG0")` will further
|
||||||
|
check that the second edge has the label `"ARG0"`.
|
||||||
|
|
||||||
|
If no matching node can be found, the graph's NoneNode is returned.
|
||||||
|
"""
|
||||||
|
return self.headed(i=i, label=label)
|
||||||
|
|
||||||
|
def tail(self, i=None, label=None) -> "Node":
|
||||||
|
"""Get the tail of the first matching edge, searching by index, label,
|
||||||
|
both or neither.
|
||||||
|
|
||||||
|
If no matching node can be found, the graph's NoneNode is returned.
|
||||||
|
"""
|
||||||
|
return self.tailed(i=i, label=label).tail
|
||||||
|
|
||||||
|
def sibling(self, i=None, label=None):
|
||||||
|
"""Get the first matching sibling node. Two nodes are siblings if they
|
||||||
|
are both tails of the same head.
|
||||||
|
If no matching node can be found, the graph's NoneNode is returned.
|
||||||
|
"""
|
||||||
|
if i is None:
|
||||||
|
siblings = self.siblings(label=label)
|
||||||
|
return siblings[0] if siblings else NoneNode(self)
|
||||||
|
else:
|
||||||
|
edges = []
|
||||||
|
for h in self.headed():
|
||||||
|
edges.extend([e for e in h.tailed() if e.tail.i != self.i])
|
||||||
|
if i >= len(edges):
|
||||||
|
return NoneNode(self)
|
||||||
|
elif label is not None and edges[i].label != label:
|
||||||
|
return NoneNode(self)
|
||||||
|
else:
|
||||||
|
return edges[i].tail
|
||||||
|
|
||||||
|
def heads(self, label=None) -> List["Node"]:
|
||||||
|
"""Find all matching heads of this node."""
|
||||||
|
cdef vector[int] edge_indices
|
||||||
|
self._find_edges(edge_indices, "head", label)
|
||||||
|
return [Node(self.graph, self.graph.c.edges[i].head) for i in edge_indices]
|
||||||
|
|
||||||
|
def tails(self, label=None) -> List["Node"]:
|
||||||
|
"""Find all matching tails of this node."""
|
||||||
|
cdef vector[int] edge_indices
|
||||||
|
self._find_edges(edge_indices, "tail", label)
|
||||||
|
return [Node(self.graph, self.graph.c.edges[i].tail) for i in edge_indices]
|
||||||
|
|
||||||
|
def siblings(self, label=None) -> List["Node"]:
|
||||||
|
"""Find all maching siblings of this node. Two nodes are siblings if they
|
||||||
|
are tails of the same head.
|
||||||
|
"""
|
||||||
|
edges = []
|
||||||
|
for h in self.headed():
|
||||||
|
edges.extend([e for e in h.tailed() if e.tail.i != self.i])
|
||||||
|
if label is None:
|
||||||
|
return [e.tail for e in edges]
|
||||||
|
else:
|
||||||
|
return [e.tail for e in edges if e.label == label]
|
||||||
|
|
||||||
|
def headed(self, i=None, label=None) -> Edge:
|
||||||
|
"""Find the first matching edge headed by this node.
|
||||||
|
If no matching edge can be found, the graph's NoneEdge is returned.
|
||||||
|
"""
|
||||||
|
start, end = self._find_range(i, self.c.n_head[self.i])
|
||||||
|
idx = self._find_edge("head", start, end, label)
|
||||||
|
if idx == -1:
|
||||||
|
return NoneEdge(self.graph)
|
||||||
|
else:
|
||||||
|
return Edge(self.graph, idx)
|
||||||
|
|
||||||
|
def tailed(self, i=None, label=None) -> Edge:
|
||||||
|
"""Find the first matching edge tailed by this node.
|
||||||
|
If no matching edge can be found, the graph's NoneEdge is returned.
|
||||||
|
"""
|
||||||
|
start, end = self._find_range(i, self.c.n_tail[self.i])
|
||||||
|
idx = self._find_edge("tail", start, end, label)
|
||||||
|
if idx == -1:
|
||||||
|
return NoneEdge(self.graph)
|
||||||
|
else:
|
||||||
|
return Edge(self.graph, idx)
|
||||||
|
|
||||||
|
def headeds(self, label=None) -> List[Edge]:
|
||||||
|
"""Find all matching edges headed by this node."""
|
||||||
|
cdef vector[int] edge_indices
|
||||||
|
self._find_edges(edge_indices, "head", label)
|
||||||
|
return [Edge(self.graph, i) for i in edge_indices]
|
||||||
|
|
||||||
|
def taileds(self, label=None) -> List["Edge"]:
|
||||||
|
"""Find all matching edges headed by this node."""
|
||||||
|
cdef vector[int] edge_indices
|
||||||
|
self._find_edges(edge_indices, "tail", label)
|
||||||
|
return [Edge(self.graph, i) for i in edge_indices]
|
||||||
|
|
||||||
|
def walk_heads(self):
|
||||||
|
cdef vector[int] node_indices
|
||||||
|
walk_head_nodes(node_indices, &self.graph.c, self.i)
|
||||||
|
for i in node_indices:
|
||||||
|
yield Node(self.graph, i)
|
||||||
|
|
||||||
|
def walk_tails(self):
    """Yield a Node for every index collected by `walk_tail_nodes`, starting
    the walk from this node.
    """
    cdef vector[int] visited
    walk_tail_nodes(visited, &self.graph.c, self.i)
    for node_index in visited:
        yield Node(self.graph, node_index)
|
||||||
|
|
||||||
|
cdef (int, int) _get_range(self, i, n):
    """Translate an optional position into a half-open (start, end) range.

    i (Optional[int]): Requested position, or None for "all positions".
    n (int): Number of available positions.
    RETURNS (Tuple[int, int]): (0, n) if i is None, (i, i + 1) if i is a
        valid position, and the empty range (0, 0) otherwise.
    """
    if i is None:
        return (0, n)
    # Negative positions are rejected as well: they would otherwise produce
    # an inverted range such as (-1, 0).
    if 0 <= i < n:
        return (i, i + 1)
    return (0, 0)
|
||||||
|
|
||||||
|
cdef int _find_edge(self, str direction, int start, int end, label) except -2:
    """Return the index of the first matching edge, or -1 if there is none.

    direction (str): "head" to search this node's head edges; any other
        value searches its tail edges.
    start (int): First position (inclusive) in the node's edge list.
    end (int): Last position (exclusive) in the node's edge list.
    label: Label to match against the stored edge label. If None, the edge
        at position `start` is returned without checking its label.
    RETURNS (int): An index into the graph's edge array, or -1.
    """
    cdef vector[int] edge_indices
    cdef int n_edges
    cdef int pos
    if direction == "head":
        get_head_edges(edge_indices, &self.graph.c, self.i)
    else:
        get_tail_edges(edge_indices, &self.graph.c, self.i)
    # Clamp the requested window to the edges that actually exist, so an
    # empty or out-of-range window never indexes past the vector.
    n_edges = edge_indices.size()
    if start < 0:
        start = 0
    if end > n_edges:
        end = n_edges
    if start >= end:
        return -1
    if label is None:
        return edge_indices[start]
    for pos in range(start, end):
        if self.graph.c.edges[edge_indices[pos]].label == label:
            return edge_indices[pos]
    return -1
|
||||||
|
|
||||||
|
cdef int _find_edges(self, vector[int]& edge_indices, str direction, label):
    """Append the indices of this node's matching edges to `edge_indices`.

    direction (str): "head" to use the head edges; any other value uses the
        tail edges.
    label: Label to filter on. If None, every edge in that direction is kept.
    RETURNS (int): The size of `edge_indices` after appending.
    """
    cdef vector[int] candidates
    if direction == "head":
        collect = get_head_edges
    else:
        collect = get_tail_edges
    if label is None:
        # No filtering needed: write straight into the caller's vector.
        collect(edge_indices, &self.graph.c, self.i)
        return edge_indices.size()
    collect(candidates, &self.graph.c, self.i)
    for candidate in candidates:
        if self.graph.c.edges[candidate].label == label:
            edge_indices.push_back(candidate)
    return edge_indices.size()
|
||||||
|
|
||||||
|
|
||||||
|
cdef class NoneEdge(Edge):
    """Placeholder Edge standing in for "no result".

    It exposes the same attributes as a regular Edge, but its endpoints are
    NoneNode objects, its label is 0 / "", its weight is 0.0 and its index
    is -1.
    """
    def __init__(self, graph):
        self.i = -1
        self.graph = graph

    @property
    def doc(self) -> "Doc":
        """The Doc of the graph this placeholder belongs to."""
        return self.graph.doc

    @property
    def head(self) -> "NoneNode":
        """A fresh NoneNode for the same graph."""
        return NoneNode(self.graph)

    @property
    def tail(self) -> "NoneNode":
        """A fresh NoneNode for the same graph."""
        return NoneNode(self.graph)

    @property
    def label(self) -> int:
        """Always 0."""
        return 0

    @property
    def weight(self) -> float:
        """Always 0.0."""
        return 0.0

    @property
    def label_(self) -> str:
        """Always the empty string."""
        return ""
|
||||||
|
|
||||||
|
|
||||||
|
cdef class NoneNode(Node):
    """Placeholder Node standing in for "no result".

    It has index -1, no tokens and length 0; navigating from it via `head`
    or `tail` returns the same NoneNode, and walking yields nothing.
    """
    def __init__(self, graph):
        self.graph = graph
        self.i = -1

    def __getitem__(self, int i):
        raise IndexError("Cannot index into NoneNode.")

    def __len__(self):
        return 0

    @property
    def is_none(self):
        """Always True. (Previously returned -1, which was truthy but not a
        boolean.)
        """
        return True

    @property
    def doc(self):
        """The Doc of the graph this placeholder belongs to."""
        return self.graph.doc

    @property
    def tokens(self):
        """Always an empty tuple."""
        return tuple()

    def head(self, i=None, label=None):
        """Return this NoneNode; the arguments are ignored."""
        return self

    def tail(self, i=None, label=None):
        """Return this NoneNode; the arguments are ignored."""
        return self

    def walk_heads(self):
        """Yield nothing."""
        yield from []

    def walk_tails(self):
        """Yield nothing."""
        yield from []
|
||||||
|
|
||||||
|
|
||||||
|
cdef class Graph:
    """A set of directed labelled relationships between sets of tokens.

    EXAMPLE:
        Construction 1
        >>> graph = Graph(doc, name="srl")

        Construction 2
        >>> graph = Graph(
            doc,
            name="srl",
            nodes=[(0,), (1, 3), ()],
            edges=[(0, 2), (2, 1)]
        )

        Construction 3
        >>> graph = Graph(
            doc,
            name="srl",
            nodes=[(0,), (1, 3), ()],
            edges=[(2, 0), (0, 1)],
            labels=["word sense ID 1675", "agent"],
            weights=[-42.6, -1.7]
        )
        >>> assert graph.has_node((0,))
        >>> assert graph.has_edge((0,), (1,3), label="agent")
    """
    def __init__(self, doc, *, name="", nodes=(), edges=(), labels=None, weights=None):
        """Create a Graph object.

        doc (Doc): The Doc object the graph will refer to.
        name (str): A string name to help identify the graph. Defaults to "".
        nodes (List[Tuple[int]]): A list of token-index tuples to add to the graph
            as nodes. Defaults to no nodes.
        edges (List[Tuple[int, int]]): A list of edges between the provided nodes.
            Each edge should be a (head, tail) tuple, where `head` and `tail`
            are integers pointing into the `nodes` list. Defaults to no edges.
        labels (Optional[List[str]]): A list of labels for the provided edges.
            If None, all of the edges specified by the edges argument will
            be labelled with the empty string (""). If `labels` is not `None`,
            it must have the same length as the `edges` argument.
        weights (Optional[List[float]]): A list of weights for the provided edges.
            If None, all of the edges specified by the edges argument will
            have the weight 0.0. If `weights` is not `None`, it must have the
            same length as the `edges` argument.
        RAISES (ValueError): If `labels` or `weights` is given with a length
            different from `edges`.
        """
        # Validate explicitly rather than with `assert`, which is stripped
        # when Python runs with -O and would let zip() silently truncate.
        if weights is None:
            weights = [0.0] * len(edges)
        elif len(weights) != len(edges):
            raise ValueError(
                "Expected {} weights (one per edge), got {}.".format(len(edges), len(weights))
            )
        if labels is None:
            labels = [""] * len(edges)
        elif len(labels) != len(edges):
            raise ValueError(
                "Expected {} labels (one per edge), got {}.".format(len(edges), len(labels))
            )
        self.c.node_map = new unordered_map[hash_t, int]()
        self.c.edge_map = new unordered_map[hash_t, int]()
        self.c.roots = new unordered_set[int]()
        self.name = name
        # Hold the Doc weakly; `doc` dereferences it on access.
        self.doc_ref = weakref.ref(doc)
        for node in nodes:
            self.add_node(node)
        for (head, tail), label, weight in zip(edges, labels, weights):
            # Edge endpoints are positions in `nodes`, so wrap them as Node
            # objects; add_edge() passes Node instances through unchanged.
            self.add_edge(
                Node(self, head),
                Node(self, tail),
                label=label,
                weight=weight
            )

    def __dealloc__(self):
        del self.c.node_map
        del self.c.edge_map
        del self.c.roots

    @property
    def doc(self) -> "Doc":
        """The Doc object the graph refers to."""
        return self.doc_ref()

    @property
    def edges(self) -> Generator[Edge]:
        """Iterate over the edges in the graph."""
        for i in range(self.c.edges.size()):
            yield Edge(self, i)

    @property
    def nodes(self) -> Generator[Node]:
        """Iterate over the nodes in the graph."""
        for i in range(self.c.nodes.size()):
            yield Node(self, i)

    def add_edge(self, head, tail, *, label="", weight=None) -> Edge:
        """Add an edge to the graph, connecting two groups of tokens.

        If there is already an edge for the (head, tail, label) triple, it will
        be returned, and no new edge will be created. The weight of the edge
        will be updated if a weight is specified.

        head (Union[Node, Tuple[int]]): The head node, or its token indices.
        tail (Union[Node, Tuple[int]]): The tail node, or its token indices.
        label (str): The edge label. Defaults to "".
        weight (Optional[float]): The edge weight. A new edge created without
            a weight gets 0.0; an existing edge keeps its current weight.
        RETURNS (Edge): The new or existing edge.
        """
        cdef int edge_index
        cdef EdgeC edge = EdgeC(
            head=self.add_node(head).i,
            tail=self.add_node(tail).i,
            label=self.doc.vocab.strings.as_int(label),
        )
        if weight is None:
            # No weight given: leave an existing edge untouched, as the
            # docstring promises, instead of resetting its weight to 0.0.
            edge_index = get_edge(&self.c, edge)
            if edge_index >= 0:
                return Edge(self, edge_index)
            weight = 0.0
        edge_index = add_edge(&self.c, edge, weight=weight)
        return Edge(self, edge_index)

    def get_edge(self, head, tail, *, label="") -> Edge:
        """Look up an edge in the graph. If the graph has no matching edge,
        the NoneEdge object is returned.
        """
        head_node = self.get_node(head)
        if head_node.is_none:
            return NoneEdge(self)
        tail_node = self.get_node(tail)
        if tail_node.is_none:
            return NoneEdge(self)
        edge_index = get_edge(
            &self.c,
            EdgeC(head=head_node.i, tail=tail_node.i, label=get_string_id(label))
        )
        if edge_index < 0:
            return NoneEdge(self)
        else:
            return Edge(self, edge_index)

    def has_edge(self, head, tail, label) -> bool:
        """Check whether a (head, tail, label) triple is an edge in the graph."""
        return not self.get_edge(head, tail, label=label).is_none

    def add_node(self, indices) -> Node:
        """Add a node to the graph and return it. Nodes refer to ordered sets
        of token indices.

        This method is idempotent: if there is already a node for the given
        indices, it is returned without a new node being created. A Node
        instance passed as `indices` is returned as-is.
        """
        if isinstance(indices, Node):
            return indices
        cdef vector[int32_t] node
        node.reserve(len(indices))
        for idx in indices:
            node.push_back(idx)
        i = add_node(&self.c, node)
        return Node(self, i)

    def get_node(self, indices) -> Node:
        """Get a node from the graph, or the NoneNode if there is no node for
        the given indices. A Node instance passed as `indices` is returned
        as-is.
        """
        if isinstance(indices, Node):
            return indices
        cdef vector[int32_t] node
        node.reserve(len(indices))
        for idx in indices:
            node.push_back(idx)
        node_index = get_node(&self.c, node)
        if node_index < 0:
            return NoneNode(self)
        else:
            return Node(self, node_index)

    def has_node(self, tuple indices) -> bool:
        """Check whether the graph has a node for the given indices."""
        return not self.get_node(indices).is_none
|
||||||
|
|
||||||
|
|
||||||
|
cdef int add_edge(GraphC* graph, EdgeC edge, float weight) nogil:
    # Insert `edge` into the graph, or overwrite the weight of the edge that
    # already hashes to the same key. Returns the edge's index either way.
    key = hash64(&edge, sizeof(edge), 0)
    found = graph.edge_map.find(key)
    if found != graph.edge_map.end():
        # Known edge: only the stored weight changes.
        existing = dereference(found).second
        graph.weights[existing] = weight
        return existing
    new_index = graph.edges.size()
    graph.edge_map.insert(pair[hash_t, int](key, new_index))
    graph.edges.push_back(edge)
    graph.weights.push_back(weight)
    # Record where each endpoint's first edge lives, then bump the counters.
    if graph.n_tails[edge.head] == 0:
        graph.first_tail[edge.head] = new_index
    if graph.n_heads[edge.tail] == 0:
        graph.first_head[edge.tail] = new_index
    graph.n_tails[edge.head] += 1
    graph.n_heads[edge.tail] += 1
    # If we had the tail marked as a root, remove it.
    root_pos = graph.roots.find(edge.tail)
    if root_pos != graph.roots.end():
        graph.roots.erase(root_pos)
    return new_index
|
||||||
|
|
||||||
|
|
||||||
|
cdef int get_edge(const GraphC* graph, EdgeC edge) nogil:
    # Look the edge up by the hash of its struct bytes. Returns the stored
    # edge index, or -1 when the key is not in the edge map.
    key = hash64(&edge, sizeof(edge), 0)
    found = graph.edge_map.find(key)
    if found != graph.edge_map.end():
        return dereference(found).second
    return -1
|
||||||
|
|
||||||
|
|
||||||
|
cdef int has_edge(const GraphC* graph, EdgeC edge) nogil:
    # Non-zero exactly when get_edge() finds an index for the edge.
    edge_index = get_edge(graph, edge)
    return edge_index >= 0
|
||||||
|
|
||||||
|
|
||||||
|
cdef int add_node(GraphC* graph, vector[int32_t]& node) nogil:
    # Return the index of the node with these token indices, creating it
    # (with zeroed edge counters, and marked as a root) if it is new.
    #
    # Hash via data() rather than &node[0]: indexing element 0 of an empty
    # vector is undefined behaviour, and empty nodes are a documented input.
    key = hash64(node.data(), node.size() * sizeof(int32_t), 0)
    it = graph.node_map.find(key)
    if it != graph.node_map.end():
        # Item found. Convert the iterator to an index value.
        return dereference(it).second
    else:
        index = graph.nodes.size()
        graph.nodes.push_back(node)
        graph.n_heads.push_back(0)
        graph.n_tails.push_back(0)
        graph.first_head.push_back(0)
        graph.first_tail.push_back(0)
        graph.roots.insert(index)
        graph.node_map.insert(pair[hash_t, int](key, index))
        return index
|
||||||
|
|
||||||
|
|
||||||
|
cdef int get_node(const GraphC* graph, vector[int32_t] node) nogil:
    # Return the index of the node with these token indices, or -1 if the
    # graph has no such node.
    #
    # Hash via data() rather than &node[0]: indexing element 0 of an empty
    # vector is undefined behaviour, and empty nodes are a documented input.
    key = hash64(node.data(), node.size() * sizeof(int32_t), 0)
    it = graph.node_map.find(key)
    if it == graph.node_map.end():
        return -1
    else:
        return dereference(it).second
|
||||||
|
|
||||||
|
|
||||||
|
cdef int has_node(const GraphC* graph, vector[int32_t] node) nogil:
    # Non-zero exactly when get_node() finds an index for the node.
    node_index = get_node(graph, node)
    return node_index >= 0
|
||||||
|
|
||||||
|
|
||||||
|
cdef int get_head_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
    # Append to `output` the head of every edge whose tail is `node`.
    # Scanning starts at the node's first recorded head edge and stops once
    # n_heads[node] matches were found. Returns the number still unfound.
    cdef int remaining = graph.n_heads[node]
    cdef int edge_index
    cdef int n_edges
    if remaining == 0:
        return 0
    output.reserve(output.size() + remaining)
    edge_index = graph.first_head[node]
    n_edges = graph.edges.size()
    while edge_index < n_edges and remaining > 0:
        if graph.edges[edge_index].tail == node:
            output.push_back(graph.edges[edge_index].head)
            remaining -= 1
        edge_index += 1
    return remaining
|
||||||
|
|
||||||
|
|
||||||
|
cdef int get_tail_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
    # Append to `output` the tail of every edge whose head is `node`.
    # Scanning starts at the node's first recorded tail edge and stops once
    # n_tails[node] matches were found. Returns the number still unfound.
    cdef int remaining = graph.n_tails[node]
    cdef int edge_index
    cdef int n_edges
    if remaining == 0:
        return 0
    output.reserve(output.size() + remaining)
    edge_index = graph.first_tail[node]
    n_edges = graph.edges.size()
    while edge_index < n_edges and remaining > 0:
        if graph.edges[edge_index].head == node:
            output.push_back(graph.edges[edge_index].tail)
            remaining -= 1
        edge_index += 1
    return remaining
|
||||||
|
|
||||||
|
|
||||||
|
cdef int get_sibling_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
    # For each head of `node`, append that head's tails to `output`,
    # leaving out `node` itself. Returns the resulting size of `output`.
    cdef vector[int] parents
    cdef vector[int] children
    get_head_nodes(parents, graph, node)
    for p in range(parents.size()):
        get_tail_nodes(children, graph, parents[p])
        for c in range(children.size()):
            if children[c] == node:
                continue
            output.push_back(children[c])
        # Reuse the scratch vector for the next head.
        children.clear()
    return output.size()
|
||||||
|
|
||||||
|
|
||||||
|
cdef int get_head_edges(vector[int]& output, const GraphC* graph, int node) nogil:
    # Append to `output` the index of every edge whose tail is `node`.
    # Scanning starts at the node's first recorded head edge and stops once
    # n_heads[node] matches were found. Returns the number still unfound.
    cdef int remaining = graph.n_heads[node]
    cdef int edge_index
    cdef int n_edges
    if remaining == 0:
        return 0
    output.reserve(output.size() + remaining)
    edge_index = graph.first_head[node]
    n_edges = graph.edges.size()
    while edge_index < n_edges and remaining > 0:
        if graph.edges[edge_index].tail == node:
            output.push_back(edge_index)
            remaining -= 1
        edge_index += 1
    return remaining
|
||||||
|
|
||||||
|
|
||||||
|
cdef int get_tail_edges(vector[int]& output, const GraphC* graph, int node) nogil:
    # Append to `output` the index of every edge whose head is `node`.
    # Scanning starts at the node's first recorded tail edge and stops once
    # n_tails[node] matches were found. Returns the number still unfound.
    cdef int remaining = graph.n_tails[node]
    cdef int edge_index
    cdef int n_edges
    if remaining == 0:
        return 0
    output.reserve(output.size() + remaining)
    edge_index = graph.first_tail[node]
    n_edges = graph.edges.size()
    while edge_index < n_edges and remaining > 0:
        if graph.edges[edge_index].head == node:
            output.push_back(edge_index)
            remaining -= 1
        edge_index += 1
    return remaining
|
||||||
|
|
||||||
|
|
||||||
|
cdef int walk_head_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
    # Collect into `output` the heads of `node`, then the heads of each
    # collected node in turn. `seen` ensures every node is expanded at most
    # once. Returns the number of entries processed.
    #
    # The debug print that used to run here re-acquired the GIL on every
    # step of a nogil traversal and has been removed.
    cdef unordered_set[int] seen = unordered_set[int]()
    get_head_nodes(output, graph, node)
    seen.insert(node)
    i = 0
    while i < output.size():
        if seen.find(output[i]) == seen.end():
            seen.insert(output[i])
            get_head_nodes(output, graph, output[i])
        i += 1
    return i
|
||||||
|
|
||||||
|
|
||||||
|
cdef int walk_tail_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
    # Collect into `output` the tails of `node`, then the tails of each
    # collected node in turn. `visited` ensures every node is expanded at
    # most once. Returns the number of entries processed.
    cdef unordered_set[int] visited = unordered_set[int]()
    get_tail_nodes(output, graph, node)
    visited.insert(node)
    cursor = 0
    while cursor < output.size():
        current = output[cursor]
        cursor += 1
        if visited.find(current) != visited.end():
            continue
        visited.insert(current)
        get_tail_nodes(output, graph, current)
    return cursor
|
||||||
|
|
||||||
|
|
||||||
|
cdef int walk_head_edges(vector[int]& output, const GraphC* graph, int node) nogil:
    # Collect into `output` the indices of the edges leading to `node`'s
    # heads, then continue upwards from each of those heads. Returns the
    # number of edge entries processed.
    #
    # `output` holds EDGE indices while `seen` tracks NODE ids, so each
    # collected edge is resolved to its head node before the visited check
    # and the next lookup. (Previously the edge index itself was used as a
    # node id.)
    cdef unordered_set[int] seen = unordered_set[int]()
    cdef int head
    get_head_edges(output, graph, node)
    seen.insert(node)
    i = 0
    while i < output.size():
        head = graph.edges[output[i]].head
        if seen.find(head) == seen.end():
            seen.insert(head)
            get_head_edges(output, graph, head)
        i += 1
    return i
|
||||||
|
|
||||||
|
|
||||||
|
cdef int walk_tail_edges(vector[int]& output, const GraphC* graph, int node) nogil:
    # Collect into `output` the indices of the edges leading to `node`'s
    # tails, then continue downwards from each of those tails. Returns the
    # number of edge entries processed.
    #
    # `output` holds EDGE indices while `seen` tracks NODE ids, so each
    # collected edge is resolved to its tail node before the visited check
    # and the next lookup. (Previously the edge index itself was used as a
    # node id.)
    cdef unordered_set[int] seen = unordered_set[int]()
    cdef int tail
    get_tail_edges(output, graph, node)
    seen.insert(node)
    i = 0
    while i < output.size():
        tail = graph.edges[output[i]].tail
        if seen.find(tail) == seen.end():
            seen.insert(tail)
            get_tail_edges(output, graph, tail)
        i += 1
    return i
|
|
@ -2,18 +2,24 @@ cimport numpy as np
|
||||||
|
|
||||||
from .doc cimport Doc
|
from .doc cimport Doc
|
||||||
from ..typedefs cimport attr_t
|
from ..typedefs cimport attr_t
|
||||||
|
from ..structs cimport SpanC
|
||||||
|
|
||||||
|
|
||||||
cdef class Span:
|
cdef class Span:
|
||||||
cdef readonly Doc doc
|
cdef readonly Doc doc
|
||||||
cdef readonly int start
|
cdef SpanC c
|
||||||
cdef readonly int end
|
|
||||||
cdef readonly int start_char
|
|
||||||
cdef readonly int end_char
|
|
||||||
cdef readonly attr_t label
|
|
||||||
cdef readonly attr_t kb_id
|
|
||||||
|
|
||||||
cdef public _vector
|
cdef public _vector
|
||||||
cdef public _vector_norm
|
cdef public _vector_norm
|
||||||
|
|
||||||
|
# Build a Span for `doc` directly from a SpanC struct.
@staticmethod
|
||||||
|
cdef inline Span cinit(Doc doc, SpanC span):
|
||||||
|
# Construct the object from the struct's token boundaries only.
cdef Span self = Span.__new__(
|
||||||
|
Span,
|
||||||
|
doc,
|
||||||
|
start=span.start,
|
||||||
|
end=span.end
|
||||||
|
)
|
||||||
|
# Then store the whole struct on the new Span.
self.c = span
|
||||||
|
return self
|
||||||
|
|
||||||
cpdef np.ndarray to_array(self, object features)
|
cpdef np.ndarray to_array(self, object features)
|
||||||
|
|
|
@ -97,23 +97,23 @@ cdef class Span:
|
||||||
if not (0 <= start <= end <= len(doc)):
|
if not (0 <= start <= end <= len(doc)):
|
||||||
raise IndexError(Errors.E035.format(start=start, end=end, length=len(doc)))
|
raise IndexError(Errors.E035.format(start=start, end=end, length=len(doc)))
|
||||||
self.doc = doc
|
self.doc = doc
|
||||||
self.start = start
|
|
||||||
self.start_char = self.doc[start].idx if start < self.doc.length else 0
|
|
||||||
self.end = end
|
|
||||||
if end >= 1:
|
|
||||||
self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1])
|
|
||||||
else:
|
|
||||||
self.end_char = 0
|
|
||||||
if isinstance(label, str):
|
if isinstance(label, str):
|
||||||
label = doc.vocab.strings.add(label)
|
label = doc.vocab.strings.add(label)
|
||||||
if isinstance(kb_id, str):
|
if isinstance(kb_id, str):
|
||||||
kb_id = doc.vocab.strings.add(kb_id)
|
kb_id = doc.vocab.strings.add(kb_id)
|
||||||
if label not in doc.vocab.strings:
|
if label not in doc.vocab.strings:
|
||||||
raise ValueError(Errors.E084.format(label=label))
|
raise ValueError(Errors.E084.format(label=label))
|
||||||
self.label = label
|
|
||||||
|
self.c = SpanC(
|
||||||
|
label=label,
|
||||||
|
kb_id=kb_id,
|
||||||
|
start=start,
|
||||||
|
end=end,
|
||||||
|
start_char=doc[start].idx if start < doc.length else 0,
|
||||||
|
end_char=doc[end - 1].idx + len(doc[end - 1]) if end >= 1 else 0,
|
||||||
|
)
|
||||||
self._vector = vector
|
self._vector = vector
|
||||||
self._vector_norm = vector_norm
|
self._vector_norm = vector_norm
|
||||||
self.kb_id = kb_id
|
|
||||||
|
|
||||||
def __richcmp__(self, Span other, int op):
|
def __richcmp__(self, Span other, int op):
|
||||||
if other is None:
|
if other is None:
|
||||||
|
@ -123,25 +123,39 @@ cdef class Span:
|
||||||
return True
|
return True
|
||||||
# <
|
# <
|
||||||
if op == 0:
|
if op == 0:
|
||||||
return self.start_char < other.start_char
|
return self.c.start_char < other.c.start_char
|
||||||
# <=
|
# <=
|
||||||
elif op == 1:
|
elif op == 1:
|
||||||
return self.start_char <= other.start_char
|
return self.c.start_char <= other.c.start_char
|
||||||
# ==
|
# ==
|
||||||
elif op == 2:
|
elif op == 2:
|
||||||
return (self.doc, self.start_char, self.end_char, self.label, self.kb_id) == (other.doc, other.start_char, other.end_char, other.label, other.kb_id)
|
# Do the cheap comparisons first
|
||||||
|
return (
|
||||||
|
(self.c.start_char == other.c.start_char) and \
|
||||||
|
(self.c.end_char == other.c.end_char) and \
|
||||||
|
(self.c.label == other.c.label) and \
|
||||||
|
(self.c.kb_id == other.c.kb_id) and \
|
||||||
|
(self.doc == other.doc)
|
||||||
|
)
|
||||||
# !=
|
# !=
|
||||||
elif op == 3:
|
elif op == 3:
|
||||||
return (self.doc, self.start_char, self.end_char, self.label, self.kb_id) != (other.doc, other.start_char, other.end_char, other.label, other.kb_id)
|
# Do the cheap comparisons first
|
||||||
|
return not (
|
||||||
|
(self.c.start_char == other.c.start_char) and \
|
||||||
|
(self.c.end_char == other.c.end_char) and \
|
||||||
|
(self.c.label == other.c.label) and \
|
||||||
|
(self.c.kb_id == other.c.kb_id) and \
|
||||||
|
(self.doc == other.doc)
|
||||||
|
)
|
||||||
# >
|
# >
|
||||||
elif op == 4:
|
elif op == 4:
|
||||||
return self.start_char > other.start_char
|
return self.c.start_char > other.c.start_char
|
||||||
# >=
|
# >=
|
||||||
elif op == 5:
|
elif op == 5:
|
||||||
return self.start_char >= other.start_char
|
return self.c.start_char >= other.c.start_char
|
||||||
|
|
||||||
def __hash__(self):
|
def __hash__(self):
|
||||||
return hash((self.doc, self.start_char, self.end_char, self.label, self.kb_id))
|
return hash((self.doc, self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id))
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""Get the number of tokens in the span.
|
"""Get the number of tokens in the span.
|
||||||
|
@ -150,9 +164,9 @@ cdef class Span:
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/span#len
|
DOCS: https://nightly.spacy.io/api/span#len
|
||||||
"""
|
"""
|
||||||
if self.end < self.start:
|
if self.c.end < self.c.start:
|
||||||
return 0
|
return 0
|
||||||
return self.end - self.start
|
return self.c.end - self.c.start
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return self.text
|
return self.text
|
||||||
|
@ -171,10 +185,10 @@ cdef class Span:
|
||||||
return Span(self.doc, start + self.start, end + self.start)
|
return Span(self.doc, start + self.start, end + self.start)
|
||||||
else:
|
else:
|
||||||
if i < 0:
|
if i < 0:
|
||||||
token_i = self.end + i
|
token_i = self.c.end + i
|
||||||
else:
|
else:
|
||||||
token_i = self.start + i
|
token_i = self.c.start + i
|
||||||
if self.start <= token_i < self.end:
|
if self.c.start <= token_i < self.c.end:
|
||||||
return self.doc[token_i]
|
return self.doc[token_i]
|
||||||
else:
|
else:
|
||||||
raise IndexError(Errors.E1002)
|
raise IndexError(Errors.E1002)
|
||||||
|
@ -186,7 +200,7 @@ cdef class Span:
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/span#iter
|
DOCS: https://nightly.spacy.io/api/span#iter
|
||||||
"""
|
"""
|
||||||
for i in range(self.start, self.end):
|
for i in range(self.c.start, self.c.end):
|
||||||
yield self.doc[i]
|
yield self.doc[i]
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
|
@ -196,7 +210,7 @@ cdef class Span:
|
||||||
def _(self):
|
def _(self):
|
||||||
"""Custom extension attributes registered via `set_extension`."""
|
"""Custom extension attributes registered via `set_extension`."""
|
||||||
return Underscore(Underscore.span_extensions, self,
|
return Underscore(Underscore.span_extensions, self,
|
||||||
start=self.start_char, end=self.end_char)
|
start=self.c.start_char, end=self.c.end_char)
|
||||||
|
|
||||||
def as_doc(self, *, bint copy_user_data=False):
|
def as_doc(self, *, bint copy_user_data=False):
|
||||||
"""Create a `Doc` object with a copy of the `Span`'s data.
|
"""Create a `Doc` object with a copy of the `Span`'s data.
|
||||||
|
@ -242,7 +256,7 @@ cdef class Span:
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
# if the HEAD refers to a token outside this span, find a more appropriate ancestor
|
# if the HEAD refers to a token outside this span, find a more appropriate ancestor
|
||||||
token = self[i]
|
token = self[i]
|
||||||
ancestor_i = token.head.i - self.start # span offset
|
ancestor_i = token.head.i - self.c.start # span offset
|
||||||
if ancestor_i not in range(length):
|
if ancestor_i not in range(length):
|
||||||
if DEP in attrs:
|
if DEP in attrs:
|
||||||
array[i, attrs.index(DEP)] = dep
|
array[i, attrs.index(DEP)] = dep
|
||||||
|
@ -250,7 +264,7 @@ cdef class Span:
|
||||||
# try finding an ancestor within this span
|
# try finding an ancestor within this span
|
||||||
ancestors = token.ancestors
|
ancestors = token.ancestors
|
||||||
for ancestor in ancestors:
|
for ancestor in ancestors:
|
||||||
ancestor_i = ancestor.i - self.start
|
ancestor_i = ancestor.i - self.c.start
|
||||||
if ancestor_i in range(length):
|
if ancestor_i in range(length):
|
||||||
array[i, head_col] = ancestor_i - i
|
array[i, head_col] = ancestor_i - i
|
||||||
|
|
||||||
|
@ -279,7 +293,7 @@ cdef class Span:
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/span#get_lca_matrix
|
DOCS: https://nightly.spacy.io/api/span#get_lca_matrix
|
||||||
"""
|
"""
|
||||||
return numpy.asarray(_get_lca_matrix(self.doc, self.start, self.end))
|
return numpy.asarray(_get_lca_matrix(self.doc, self.c.start, self.c.end))
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
"""Make a semantic similarity estimate. The default estimate is cosine
|
"""Make a semantic similarity estimate. The default estimate is cosine
|
||||||
|
@ -373,10 +387,14 @@ cdef class Span:
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/span#ents
|
DOCS: https://nightly.spacy.io/api/span#ents
|
||||||
"""
|
"""
|
||||||
|
cdef Span ent
|
||||||
ents = []
|
ents = []
|
||||||
for ent in self.doc.ents:
|
for ent in self.doc.ents:
|
||||||
if ent.start >= self.start and ent.end <= self.end:
|
if ent.c.start >= self.c.start:
|
||||||
ents.append(ent)
|
if ent.c.end <= self.c.end:
|
||||||
|
ents.append(ent)
|
||||||
|
else:
|
||||||
|
break
|
||||||
return ents
|
return ents
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -513,7 +531,7 @@ cdef class Span:
|
||||||
# with head==0, i.e. a sentence root. If so, we can return it. The
|
# with head==0, i.e. a sentence root. If so, we can return it. The
|
||||||
# longer the span, the more likely it contains a sentence root, and
|
# longer the span, the more likely it contains a sentence root, and
|
||||||
# in this case we return in linear time.
|
# in this case we return in linear time.
|
||||||
for i in range(self.start, self.end):
|
for i in range(self.c.start, self.c.end):
|
||||||
if self.doc.c[i].head == 0:
|
if self.doc.c[i].head == 0:
|
||||||
return self.doc[i]
|
return self.doc[i]
|
||||||
# If we don't have a sentence root, we do something that's not so
|
# If we don't have a sentence root, we do something that's not so
|
||||||
|
@ -524,15 +542,15 @@ cdef class Span:
|
||||||
# think this should be okay.
|
# think this should be okay.
|
||||||
cdef int current_best = self.doc.length
|
cdef int current_best = self.doc.length
|
||||||
cdef int root = -1
|
cdef int root = -1
|
||||||
for i in range(self.start, self.end):
|
for i in range(self.c.start, self.c.end):
|
||||||
if self.start <= (i+self.doc.c[i].head) < self.end:
|
if self.c.start <= (i+self.doc.c[i].head) < self.c.end:
|
||||||
continue
|
continue
|
||||||
words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
|
words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
|
||||||
if words_to_root < current_best:
|
if words_to_root < current_best:
|
||||||
current_best = words_to_root
|
current_best = words_to_root
|
||||||
root = i
|
root = i
|
||||||
if root == -1:
|
if root == -1:
|
||||||
return self.doc[self.start]
|
return self.doc[self.c.start]
|
||||||
else:
|
else:
|
||||||
return self.doc[root]
|
return self.doc[root]
|
||||||
|
|
||||||
|
@ -548,8 +566,8 @@ cdef class Span:
|
||||||
the span.
|
the span.
|
||||||
RETURNS (Span): The newly constructed object.
|
RETURNS (Span): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
start_idx += self.start_char
|
start_idx += self.c.start_char
|
||||||
end_idx += self.start_char
|
end_idx += self.c.start_char
|
||||||
return self.doc.char_span(start_idx, end_idx)
|
return self.doc.char_span(start_idx, end_idx)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -628,6 +646,56 @@ cdef class Span:
|
||||||
for word in self.rights:
|
for word in self.rights:
|
||||||
yield from word.subtree
|
yield from word.subtree
|
||||||
|
|
||||||
|
property start:
    # Token index where the span starts, stored on the underlying SpanC.
    def __get__(self):
        return self.c.start

    def __set__(self, int start):
        if start < 0:
            # Replaces the placeholder "TODO" message.
            raise IndexError("Span start must be >= 0, got {}.".format(start))
        self.c.start = start
|
||||||
|
|
||||||
|
property end:
    # Token index where the span ends, stored on the underlying SpanC.
    def __get__(self):
        return self.c.end

    def __set__(self, int end):
        if end < 0:
            # Replaces the placeholder "TODO" message.
            raise IndexError("Span end must be >= 0, got {}.".format(end))
        self.c.end = end
|
||||||
|
|
||||||
|
property start_char:
    # Character offset where the span starts, stored on the underlying SpanC.
    def __get__(self):
        return self.c.start_char

    def __set__(self, int start_char):
        if start_char < 0:
            # Replaces the placeholder "TODO" message.
            raise IndexError("Span start_char must be >= 0, got {}.".format(start_char))
        self.c.start_char = start_char
|
||||||
|
|
||||||
|
property end_char:
    # Character offset where the span ends, stored on the underlying SpanC.
    def __get__(self):
        return self.c.end_char

    def __set__(self, int end_char):
        if end_char < 0:
            # Replaces the placeholder "TODO" message.
            raise IndexError("Span end_char must be >= 0, got {}.".format(end_char))
        self.c.end_char = end_char
|
||||||
|
|
||||||
|
# The span's label value, read from and written to the underlying SpanC.
property label:
|
||||||
|
def __get__(self):
|
||||||
|
return self.c.label
|
||||||
|
|
||||||
|
# The setter takes the value as an attr_t; no validation is performed here.
def __set__(self, attr_t label):
|
||||||
|
self.c.label = label
|
||||||
|
|
||||||
|
# The span's kb_id value, read from and written to the underlying SpanC.
property kb_id:
|
||||||
|
def __get__(self):
|
||||||
|
return self.c.kb_id
|
||||||
|
|
||||||
|
# The setter takes the value as an attr_t; no validation is performed here.
def __set__(self, attr_t kb_id):
|
||||||
|
self.c.kb_id = kb_id
|
||||||
|
|
||||||
property ent_id:
|
property ent_id:
|
||||||
"""RETURNS (uint64): The entity ID."""
|
"""RETURNS (uint64): The entity ID."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
10
spacy/tokens/span_group.pxd
Normal file
10
spacy/tokens/span_group.pxd
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
from ..structs cimport SpanC
|
||||||
|
|
||||||
|
# C-level declaration of SpanGroup: its attributes and its one cdef method.
cdef class SpanGroup:
|
||||||
|
# Reference to the owning Doc (set to weakref.ref(doc) by __init__).
cdef public object _doc_ref
|
||||||
|
# Name of the group.
cdef public str name
|
||||||
|
# Extra attributes attached to the group.
cdef public dict attrs
|
||||||
|
# The group's spans, held as SpanC structs.
cdef vector[SpanC] c
|
||||||
|
|
||||||
|
# NOTE(review): presumably appends `span` to `c` -- the implementation is not shown here; confirm.
cdef void push_back(self, SpanC span) nogil
|
183
spacy/tokens/span_group.pyx
Normal file
183
spacy/tokens/span_group.pyx
Normal file
|
@ -0,0 +1,183 @@
|
||||||
|
import weakref
|
||||||
|
import struct
|
||||||
|
import srsly
|
||||||
|
from .span cimport Span
|
||||||
|
from libc.stdint cimport uint64_t, uint32_t, int32_t
|
||||||
|
|
||||||
|
|
||||||
|
cdef class SpanGroup:
    """A group of spans that all belong to the same Doc object. The group
    can be named, and you can attach additional attributes to it. Span groups
    are generally accessed via the `doc.spans` attribute. The `doc.spans`
    attribute will convert lists of spans into a `SpanGroup` object for you
    automatically on assignment.

    Example:
        Construction 1
        >>> doc = nlp("Their goi ng home")
        >>> doc.spans["errors"] = SpanGroup(
            doc,
            name="errors",
            spans=[doc[0:1], doc[2:4]],
            attrs={"annotator": "matt"}
        )

        Construction 2
        >>> doc = nlp("Their goi ng home")
        >>> doc.spans["errors"] = [doc[0:1], doc[2:4]]
        >>> assert isinstance(doc.spans["errors"], SpanGroup)

    DOCS: https://nightly.spacy.io/api/spangroup
    """
    def __init__(self, doc, *, name="", attrs={}, spans=[]):
        """Create a SpanGroup.

        doc (Doc): The reference Doc object.
        name (str): The group name.
        attrs (Dict[str, Any]): Optional JSON-serializable attributes to attach.
        spans (Iterable[Span]): The spans to add to the group.

        DOCS: https://nightly.spacy.io/api/spangroup#init
        """
        # We need to make this a weak reference, so that the Doc object can
        # own the SpanGroup without circular references. We do want to get
        # the Doc though, because otherwise the API gets annoying.
        self._doc_ref = weakref.ref(doc)
        self.name = name
        # The default arguments are shared between calls but never mutated:
        # `attrs` is copied here and `spans` is only iterated over.
        self.attrs = dict(attrs) if attrs is not None else {}
        cdef Span span
        for span in spans:
            self.push_back(span.c)

    def __repr__(self):
        return str(list(self))

    @property
    def doc(self):
        """RETURNS (Doc): The reference document.

        RAISES (RuntimeError): If the referenced Doc no longer exists. The
            group only holds a weak reference, so the Doc has to be kept
            alive by the caller.

        DOCS: https://nightly.spacy.io/api/spangroup#doc
        """
        doc = self._doc_ref()
        if doc is None:
            # A dead weak reference returns None; fail loudly here instead of
            # handing None on to code that expects a Doc.
            raise RuntimeError(
                "The Doc referenced by this SpanGroup no longer exists. Keep "
                "a reference to the Doc for as long as the SpanGroup is used."
            )
        return doc

    @property
    def has_overlap(self):
        """RETURNS (bool): Whether the group contains overlapping spans.

        DOCS: https://nightly.spacy.io/api/spangroup#has_overlap
        """
        if not len(self):
            return False
        # Compare each span with its predecessor in sorted order.
        sorted_spans = sorted(self)
        last_end = sorted_spans[0].end
        for span in sorted_spans[1:]:
            if span.start < last_end:
                return True
            last_end = span.end
        return False

    def __len__(self):
        """RETURNS (int): The number of spans in the group.

        DOCS: https://nightly.spacy.io/api/spangroup#len
        """
        return self.c.size()

    def append(self, Span span):
        """Add a span to the group. The span must refer to the same Doc
        object as the span group.

        span (Span): The span to append.
        RAISES (ValueError): If the span refers to a different Doc.

        DOCS: https://nightly.spacy.io/api/spangroup#append
        """
        if span.doc is not self.doc:
            raise ValueError("Cannot add span to group: refers to different Doc.")
        self.push_back(span.c)

    def extend(self, spans):
        """Add multiple spans to the group. All spans must refer to the same
        Doc object as the span group.

        spans (Iterable[Span]): The spans to add.

        DOCS: https://nightly.spacy.io/api/spangroup#extend
        """
        cdef Span span
        for span in spans:
            self.append(span)

    def __getitem__(self, int i):
        """Get a span from the group.

        i (int): The item index. Negative indices count from the end.
        RETURNS (Span): The span at the given index.
        RAISES (IndexError): If the index is out of range.

        DOCS: https://nightly.spacy.io/api/spangroup#getitem
        """
        cdef int size = self.c.size()
        if i < -size or i >= size:
            raise IndexError(f"list index {i} out of range")
        if i < 0:
            i += size
        return Span.cinit(self.doc, self.c[i])

    def to_bytes(self):
        """Serialize the SpanGroup's contents to a byte string.

        RETURNS (bytes): The serialized span group.

        DOCS: https://nightly.spacy.io/api/spangroup#to_bytes
        """
        output = {"name": self.name, "attrs": self.attrs, "spans": []}
        for i in range(self.c.size()):
            span = self.c[i]
            # The struct.pack here is probably overkill, but it might help if
            # you're saving tonnes of spans, and it doesn't really add any
            # complexity. We do take care to specify an explicit byte order
            # (">" is big-endian with standard sizes) though, to ensure the
            # message can be loaded back on a different arch. from_bytes
            # must use the same format string.
            # Q: uint64_t
            # q: int64_t
            # L: uint32_t
            # l: int32_t
            output["spans"].append(struct.pack(
                ">QQQllll",
                span.id,
                span.kb_id,
                span.label,
                span.start,
                span.end,
                span.start_char,
                span.end_char
            ))
        return srsly.msgpack_dumps(output)

    def from_bytes(self, bytes_data):
        """Deserialize the SpanGroup's contents from a byte string. Any spans
        already in the group are replaced.

        bytes_data (bytes): The span group to load.
        RETURNS (SpanGroup): The deserialized span group.

        DOCS: https://nightly.spacy.io/api/spangroup#from_bytes
        """
        msg = srsly.msgpack_loads(bytes_data)
        self.name = msg["name"]
        self.attrs = dict(msg["attrs"])
        self.c.clear()
        self.c.reserve(len(msg["spans"]))
        cdef SpanC span
        for span_data in msg["spans"]:
            # Same big-endian layout that to_bytes writes.
            items = struct.unpack(">QQQllll", span_data)
            span.id = items[0]
            span.kb_id = items[1]
            span.label = items[2]
            span.start = items[3]
            span.end = items[4]
            span.start_char = items[5]
            span.end_char = items[6]
            self.c.push_back(span)
        return self

    cdef void push_back(self, SpanC span) nogil:
        # Append the raw struct to the underlying vector.
        self.c.push_back(span)
|
|
@ -575,6 +575,39 @@ objects, if the entity recognizer has been applied.
|
||||||
| ----------- | --------------------------------------------------------------------- |
|
| ----------- | --------------------------------------------------------------------- |
|
||||||
| **RETURNS** | Entities in the document, one `Span` per entity. ~~Tuple[Span, ...]~~ |
|
| **RETURNS** | Entities in the document, one `Span` per entity. ~~Tuple[Span, ...]~~ |
|
||||||
|
|
||||||
|
## Doc.spans {#spans tag="property"}
|
||||||
|
|
||||||
|
A dictionary of named span groups, to store and access additional span
|
||||||
|
annotations. You can write to it by assigning a list of [`Span`](/api/span)
|
||||||
|
objects or a [`SpanGroup`](/api/spangroup) to a given key.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("Their goi ng home")
|
||||||
|
> doc.spans["errors"] = [doc[0:1], doc[2:4]]
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------------ |
|
||||||
|
| **RETURNS** | The span groups assigned to the document. ~~Dict[str, SpanGroup]~~ |
|
||||||
|
|
||||||
|
## Doc.cats {#cats tag="property" model="text classifier"}
|
||||||
|
|
||||||
|
Maps a label to a score for categories applied to the document. Typically set by
|
||||||
|
the [`TextCategorizer`](/api/textcategorizer).
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("This is a text about football.")
|
||||||
|
> print(doc.cats)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ---------------------------------------------------------- |
|
||||||
|
| **RETURNS** | The text categories mapped to scores. ~~Dict[str, float]~~ |
|
||||||
|
|
||||||
## Doc.noun_chunks {#noun_chunks tag="property" model="parser"}
|
## Doc.noun_chunks {#noun_chunks tag="property" model="parser"}
|
||||||
|
|
||||||
Iterate over the base noun phrases in the document. Yields base noun-phrase
|
Iterate over the base noun phrases in the document. Yields base noun-phrase
|
||||||
|
@ -668,23 +701,22 @@ The L2 norm of the document's vector representation.
|
||||||
|
|
||||||
## Attributes {#attributes}
|
## Attributes {#attributes}
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `text` | A string representation of the document text. ~~str~~ |
|
| `text` | A string representation of the document text. ~~str~~ |
|
||||||
| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
|
| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
|
||||||
| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
|
| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
|
||||||
| `vocab` | The store of lexical types. ~~Vocab~~ |
|
| `vocab` | The store of lexical types. ~~Vocab~~ |
|
||||||
| `tensor` <Tag variant="new">2</Tag> | Container for dense vector representations. ~~numpy.ndarray~~ |
|
| `tensor` <Tag variant="new">2</Tag> | Container for dense vector representations. ~~numpy.ndarray~~ |
|
||||||
| `cats` <Tag variant="new">2</Tag> | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ |
|
| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
|
||||||
| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
|
| `lang` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~int~~ |
|
||||||
| `lang` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~int~~ |
|
| `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~ |
|
||||||
| `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~ |
|
| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
|
||||||
| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
|
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
|
||||||
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
|
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
|
||||||
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
|
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
|
||||||
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
|
| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
|
||||||
| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
|
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
|
||||||
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
|
|
||||||
|
|
||||||
## Serialization fields {#serialization-fields}
|
## Serialization fields {#serialization-fields}
|
||||||
|
|
||||||
|
|
185
website/docs/api/spangroup.md
Normal file
185
website/docs/api/spangroup.md
Normal file
|
@ -0,0 +1,185 @@
|
||||||
|
---
|
||||||
|
title: SpanGroup
|
||||||
|
tag: class
|
||||||
|
source: spacy/tokens/span_group.pyx
|
||||||
|
new: 3
|
||||||
|
---
|
||||||
|
|
||||||
|
A group of arbitrary, potentially overlapping [`Span`](/api/span) objects that
|
||||||
|
all belong to the same [`Doc`](/api/doc) object. The group can be named, and you
|
||||||
|
can attach additional attributes to it. Span groups are generally accessed via
|
||||||
|
the [`Doc.spans`](/api/doc#spans) attribute, which will convert lists of spans
|
||||||
|
into a `SpanGroup` object for you automatically on assignment. `SpanGroup`
|
||||||
|
objects behave similarly to `list`s, so you can append `Span` objects to them or
|
||||||
|
access a member at a given index.
|
||||||
|
|
||||||
|
## SpanGroup.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
Create a `SpanGroup`.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("Their goi ng home")
|
||||||
|
> spans = [doc[0:1], doc[2:4]]
|
||||||
|
>
|
||||||
|
> # Construction 1
|
||||||
|
> from spacy.tokens import SpanGroup
|
||||||
|
>
|
||||||
|
> group = SpanGroup(doc, name="errors", spans=spans, attrs={"annotator": "matt"})
|
||||||
|
> doc.spans["errors"] = group
|
||||||
|
>
|
||||||
|
> # Construction 2
|
||||||
|
> doc.spans["errors"] = spans
|
||||||
|
> assert isinstance(doc.spans["errors"], SpanGroup)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `doc` | The document the span group belongs to. ~~Doc~~ |
|
||||||
|
| _keyword-only_ | |
|
||||||
|
| `name` | The name of the span group. If the span group is created automatically on assignment to `doc.spans`, the key name is used. Defaults to `""`. ~~str~~ |
|
||||||
|
| `attrs` | Optional JSON-serializable attributes to attach to the span group. ~~Dict[str, Any]~~ |
|
||||||
|
| `spans` | The spans to add to the span group. ~~Iterable[Span]~~ |
|
||||||
|
|
||||||
|
## SpanGroup.doc {#doc tag="property"}
|
||||||
|
|
||||||
|
The [`Doc`](/api/doc) object the span group is referring to.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("Their goi ng home")
|
||||||
|
> doc.spans["errors"] = [doc[0:1], doc[2:4]]
|
||||||
|
> assert doc.spans["errors"].doc == doc
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------- |
|
||||||
|
| **RETURNS** | The reference document. ~~Doc~~ |
|
||||||
|
|
||||||
|
## SpanGroup.has_overlap {#has_overlap tag="property"}
|
||||||
|
|
||||||
|
Check whether the span group contains overlapping spans.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("Their goi ng home")
|
||||||
|
> doc.spans["errors"] = [doc[0:1], doc[2:4]]
|
||||||
|
> assert not doc.spans["errors"].has_overlap
|
||||||
|
> doc.spans["errors"].append(doc[1:2])
|
||||||
|
> assert doc.spans["errors"].has_overlap
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | -------------------------------------------------- |
|
||||||
|
| **RETURNS** | Whether the span group contains overlaps. ~~bool~~ |
|
||||||
|
|
||||||
|
## SpanGroup.\_\_len\_\_ {#len tag="method"}
|
||||||
|
|
||||||
|
Get the number of spans in the group.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("Their goi ng home")
|
||||||
|
> doc.spans["errors"] = [doc[0:1], doc[2:4]]
|
||||||
|
> assert len(doc.spans["errors"]) == 2
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ----------------------------------------- |
|
||||||
|
| **RETURNS** | The number of spans in the group. ~~int~~ |
|
||||||
|
|
||||||
|
## SpanGroup.\_\_getitem\_\_ {#getitem tag="method"}
|
||||||
|
|
||||||
|
Get a span from the group.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("Their goi ng home")
|
||||||
|
> doc.spans["errors"] = [doc[0:1], doc[2:4]]
|
||||||
|
> span = doc.spans["errors"][1]
|
||||||
|
> assert span.text == "goi ng"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------- |
|
||||||
|
| `i` | The item index. ~~int~~ |
|
||||||
|
| **RETURNS** | The span at the given index. ~~Span~~ |
|
||||||
|
|
||||||
|
## SpanGroup.append {#append tag="method"}
|
||||||
|
|
||||||
|
Add a [`Span`](/api/span) object to the group. The span must refer to the same
|
||||||
|
[`Doc`](/api/doc) object as the span group.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("Their goi ng home")
|
||||||
|
> doc.spans["errors"] = [doc[0:1]]
|
||||||
|
> doc.spans["errors"].append(doc[2:4])
|
||||||
|
> assert len(doc.spans["errors"]) == 2
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------ | ---------------------------- |
|
||||||
|
| `span` | The span to append. ~~Span~~ |
|
||||||
|
|
||||||
|
## SpanGroup.extend {#extend tag="method"}
|
||||||
|
|
||||||
|
Add multiple [`Span`](/api/span) objects to the group. All spans must refer to
|
||||||
|
the same [`Doc`](/api/doc) object as the span group.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("Their goi ng home")
|
||||||
|
> doc.spans["errors"] = []
|
||||||
|
> doc.spans["errors"].extend([doc[2:4], doc[0:1]])
|
||||||
|
> assert len(doc.spans["errors"]) == 2
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------- | ------------------------------------ |
|
||||||
|
| `spans` | The spans to add. ~~Iterable[Span]~~ |
|
||||||
|
|
||||||
|
## SpanGroup.to_bytes {#to_bytes tag="method"}
|
||||||
|
|
||||||
|
Serialize the span group to a bytestring.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> doc = nlp("Their goi ng home")
|
||||||
|
> doc.spans["errors"] = [doc[0:1], doc[2:4]]
|
||||||
|
> group_bytes = doc.spans["errors"].to_bytes()
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------- |
|
||||||
|
| **RETURNS** | The serialized `SpanGroup`. ~~bytes~~ |
|
||||||
|
|
||||||
|
## SpanGroup.from_bytes {#from_bytes tag="method"}
|
||||||
|
|
||||||
|
Load the span group from a bytestring. Modifies the object in place and returns
|
||||||
|
it.
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.tokens import SpanGroup
|
||||||
|
>
|
||||||
|
> doc = nlp("Their goi ng home")
|
||||||
|
> doc.spans["errors"] = [doc[0:1], doc[2:4]]
|
||||||
|
> group_bytes = doc.spans["errors"].to_bytes()
|
||||||
|
> new_group = SpanGroup(doc)
|
||||||
|
> new_group.from_bytes(group_bytes)
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------------ | ------------------------------------- |
|
||||||
|
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||||
|
| **RETURNS** | The `SpanGroup` object. ~~SpanGroup~~ |
|
|
@ -18,15 +18,16 @@ It also orchestrates training and serialization.
|
||||||
|
|
||||||
### Container objects {#architecture-containers}
|
### Container objects {#architecture-containers}
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| [`Doc`](/api/doc) | A container for accessing linguistic annotations. |
|
| [`Doc`](/api/doc) | A container for accessing linguistic annotations. |
|
||||||
| [`DocBin`](/api/docbin) | A collection of `Doc` objects for efficient binary serialization. Also used for [training data](/api/data-formats#binary-training). |
|
| [`DocBin`](/api/docbin) | A collection of `Doc` objects for efficient binary serialization. Also used for [training data](/api/data-formats#binary-training). |
|
||||||
| [`Example`](/api/example) | A collection of training annotations, containing two `Doc` objects: the reference data and the predictions. |
|
| [`Example`](/api/example) | A collection of training annotations, containing two `Doc` objects: the reference data and the predictions. |
|
||||||
| [`Language`](/api/language) | Processing class that turns text into `Doc` objects. Different languages implement their own subclasses of it. The variable is typically called `nlp`. |
|
| [`Language`](/api/language) | Processing class that turns text into `Doc` objects. Different languages implement their own subclasses of it. The variable is typically called `nlp`. |
|
||||||
| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. |
|
| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. |
|
||||||
| [`Span`](/api/span) | A slice from a `Doc` object. |
|
| [`Span`](/api/span) | A slice from a `Doc` object. |
|
||||||
| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. |
|
| [`SpanGroup`](/api/spangroup) | A named collection of spans belonging to a `Doc`. |
|
||||||
|
| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. |
|
||||||
|
|
||||||
### Processing pipeline {#architecture-pipeline}
|
### Processing pipeline {#architecture-pipeline}
|
||||||
|
|
||||||
|
|
|
@ -501,7 +501,7 @@ format for documenting argument and return types.
|
||||||
[`AttributeRuler`](/api/attributeruler),
|
[`AttributeRuler`](/api/attributeruler),
|
||||||
[`SentenceRecognizer`](/api/sentencerecognizer),
|
[`SentenceRecognizer`](/api/sentencerecognizer),
|
||||||
[`DependencyMatcher`](/api/dependencymatcher), [`TrainablePipe`](/api/pipe),
|
[`DependencyMatcher`](/api/dependencymatcher), [`TrainablePipe`](/api/pipe),
|
||||||
[`Corpus`](/api/corpus)
|
[`Corpus`](/api/corpus), [`SpanGroup`](/api/spangroup)
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
|
|
@ -77,6 +77,7 @@
|
||||||
{ "text": "Language", "url": "/api/language" },
|
{ "text": "Language", "url": "/api/language" },
|
||||||
{ "text": "Lexeme", "url": "/api/lexeme" },
|
{ "text": "Lexeme", "url": "/api/lexeme" },
|
||||||
{ "text": "Span", "url": "/api/span" },
|
{ "text": "Span", "url": "/api/span" },
|
||||||
|
{ "text": "SpanGroup", "url": "/api/spangroup" },
|
||||||
{ "text": "Token", "url": "/api/token" }
|
{ "text": "Token", "url": "/api/token" }
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
"Doc": "/api/doc",
|
"Doc": "/api/doc",
|
||||||
"Token": "/api/token",
|
"Token": "/api/token",
|
||||||
"Span": "/api/span",
|
"Span": "/api/span",
|
||||||
|
"SpanGroup": "/api/spangroup",
|
||||||
"Lexeme": "/api/lexeme",
|
"Lexeme": "/api/lexeme",
|
||||||
"Example": "/api/example",
|
"Example": "/api/example",
|
||||||
"Alignment": "/api/example#alignment-object",
|
"Alignment": "/api/example#alignment-object",
|
||||||
|
|
Loading…
Reference in New Issue
Block a user