Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 00:46:28 +03:00)
Add SpanGroup and Graph container types to represent arbitrary annotations (#6696)

* Draft out initial Spans data structure
* Initial span group commit
* Basic span group support on Doc
* Basic test for span group
* Compile span_group.pyx
* Draft addition of SpanGroup to DocBin
* Add deserialization for SpanGroup
* Add tests for serializing SpanGroup
* Fix serialization of SpanGroup
* Add EdgeC and GraphC structs
* Add draft Graph data structure
* Compile graph
* More work on Graph
* Update GraphC
* Upd graph
* Fix walk functions
* Let Graph take nodes and edges on construction
* Fix walking and getting
* Add graph tests
* Fix import
* Add module with the SpanGroups dict thingy
* Update test
* Rename 'span_groups' attribute
* Try to fix c++11 compilation
* Fix test
* Update DocBin
* Try to fix compilation
* Try to fix graph
* Improve SpanGroup docstrings
* Add doc.spans to documentation
* Fix serialization
* Tidy up and add docs
* Update docs [ci skip]
* Add SpanGroup.has_overlap
* WIP updated Graph API
* Start testing new Graph API
* Update Graph tests
* Update Graph
* Add docstring

Co-authored-by: Ines Montani <ines@ines.io>
This commit is contained in:
parent 54e8e3c208
commit f277bfdf0f
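For orientation, a minimal sketch of what the two new containers look like in use, pieced together from the tests in this commit (a bare `Vocab()` stands in for a real pipeline; illustrative only):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.tokens.graph import Graph

    doc = Doc(Vocab(), words=["Some", "text", "about", "Colombia"])

    # SpanGroup: a named, serializable collection of spans on the Doc
    doc.spans["interesting"] = [doc[3:4]]
    assert doc.spans["interesting"][0].text == "Colombia"

    # Graph: labelled relationships between (tuples of) token indices
    graph = Graph(doc, name="example")
    node_a = graph.add_node((0,))
    node_b = graph.add_node((1, 3))
    graph.add_edge(node_a, node_b, label="rel", weight=1.0)
    assert graph.has_edge(node_a, node_b, label="rel")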
setup.py (6 lines changed)

@@ -55,6 +55,8 @@ MOD_NAMES = [
     "spacy.tokens.doc",
     "spacy.tokens.span",
     "spacy.tokens.token",
+    "spacy.tokens.span_group",
+    "spacy.tokens.graph",
     "spacy.tokens.morphanalysis",
     "spacy.tokens._retokenize",
     "spacy.matcher.matcher",
@@ -68,7 +70,7 @@ COMPILE_OPTIONS = {
     "mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
     "other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
 }
-LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}
+LINK_OPTIONS = {"msvc": ["-std=c++11"], "mingw32": ["-std=c++11"], "other": []}
 COMPILER_DIRECTIVES = {
     "language_level": -3,
     "embedsignature": True,
@@ -201,7 +203,7 @@ def setup_package():
     ext_modules = []
     for name in MOD_NAMES:
         mod_path = name.replace(".", "/") + ".pyx"
-        ext = Extension(name, [mod_path], language="c++")
+        ext = Extension(name, [mod_path], language="c++", extra_compile_args=["-std=c++11"])
         ext_modules.append(ext)
     print("Cythonizing sources")
     ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)
@@ -1,5 +1,7 @@
 from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
 from libcpp.vector cimport vector
+from libcpp.unordered_set cimport unordered_set
+from libcpp.unordered_map cimport unordered_map
 from libc.stdint cimport int32_t, int64_t

 from .typedefs cimport flags_t, attr_t, hash_t
@@ -91,3 +93,22 @@ cdef struct AliasC:
     # Prior probability P(entity|alias) - should sum up to (at most) 1.
     vector[float] probs


+cdef struct EdgeC:
+    hash_t label
+    int32_t head
+    int32_t tail
+
+
+cdef struct GraphC:
+    vector[vector[int32_t]] nodes
+    vector[EdgeC] edges
+    vector[float] weights
+    vector[int] n_heads
+    vector[int] n_tails
+    vector[int] first_head
+    vector[int] first_tail
+    unordered_set[int]* roots
+    unordered_map[hash_t, int]* node_map
+    unordered_map[hash_t, int]* edge_map
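The struct above encodes adjacency compactly: edges are stored once, in insertion order, and each node only records how many edges point into or out of it plus the index of its first such edge, so lookups scan forward from `first_head`/`first_tail` instead of from the start of the edge list. A pure-Python mirror of that bookkeeping (illustrative only — names follow the struct fields, nothing here is part of the commit):

    # Illustrative pure-Python mirror of the GraphC adjacency bookkeeping.
    class MiniGraph:
        def __init__(self):
            self.edges = []        # list of (head, tail) tuples, append-only
            self.n_heads = []      # per node: number of incoming edges
            self.n_tails = []      # per node: number of outgoing edges
            self.first_head = []   # per node: index of its first incoming edge
            self.first_tail = []   # per node: index of its first outgoing edge

        def add_node(self):
            for seq in (self.n_heads, self.n_tails, self.first_head, self.first_tail):
                seq.append(0)
            return len(self.n_heads) - 1

        def add_edge(self, head, tail):
            i = len(self.edges)
            self.edges.append((head, tail))
            if self.n_tails[head] == 0:
                self.first_tail[head] = i
            if self.n_heads[tail] == 0:
                self.first_head[tail] = i
            self.n_tails[head] += 1
            self.n_heads[tail] += 1

        def heads(self, node):
            # Scan forward from the first incoming edge; stop once all
            # n_heads[node] incoming edges have been found.
            todo = self.n_heads[node]
            out = []
            for head, tail in self.edges[self.first_head[node]:]:
                if todo <= 0:
                    break
                if tail == node:
                    out.append(head)
                    todo -= 1
            return out

    g = MiniGraph()
    a, b = g.add_node(), g.add_node()
    g.add_edge(a, b)
    assert g.heads(b) == [a] and g.n_tails[a] == 1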
@@ -631,3 +631,24 @@ def test_doc_set_ents_invalid_spans(en_tokenizer):
             retokenizer.merge(span)
     with pytest.raises(IndexError):
         doc.ents = spans


+def test_span_groups(en_tokenizer):
+    doc = en_tokenizer("Some text about Colombia and the Czech Republic")
+    doc.spans["hi"] = [Span(doc, 3, 4, label="bye")]
+    assert "hi" in doc.spans
+    assert "bye" not in doc.spans
+    assert len(doc.spans["hi"]) == 1
+    assert doc.spans["hi"][0].label_ == "bye"
+    doc.spans["hi"].append(doc[0:3])
+    assert len(doc.spans["hi"]) == 2
+    assert doc.spans["hi"][1].text == "Some text about"
+    assert [span.text for span in doc.spans["hi"]] == ["Colombia", "Some text about"]
+    assert not doc.spans["hi"].has_overlap
+    doc.ents = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
+    doc.spans["hi"].extend(doc.ents)
+    assert len(doc.spans["hi"]) == 4
+    assert [span.label_ for span in doc.spans["hi"]] == ["bye", "", "GPE", "GPE"]
+    assert doc.spans["hi"].has_overlap
+    del doc.spans["hi"]
+    assert "hi" not in doc.spans
spacy/tests/doc/test_graph.py (new file, 57 lines)

@@ -0,0 +1,57 @@
from spacy.vocab import Vocab
from spacy.tokens.doc import Doc
from spacy.tokens.graph import Graph


def test_graph_init():
    doc = Doc(Vocab(), words=["a", "b", "c", "d"])
    graph = Graph(doc, name="hello")
    assert graph.name == "hello"
    assert graph.doc is doc


def test_graph_edges_and_nodes():
    doc = Doc(Vocab(), words=["a", "b", "c", "d"])
    graph = Graph(doc, name="hello")
    node1 = graph.add_node((0,))
    assert graph.get_node((0,)) == node1
    node2 = graph.add_node((1, 3))
    assert list(node2) == [1, 3]
    graph.add_edge(
        node1,
        node2,
        label="one",
        weight=-10.5
    )
    assert graph.has_edge(
        node1,
        node2,
        label="one"
    )
    assert node1.heads() == []
    assert [tuple(h) for h in node2.heads()] == [(0,)]
    assert [tuple(t) for t in node1.tails()] == [(1, 3)]
    assert [tuple(t) for t in node2.tails()] == []


def test_graph_walk():
    doc = Doc(Vocab(), words=["a", "b", "c", "d"])
    graph = Graph(
        doc,
        name="hello",
        nodes=[(0,), (1,), (2,), (3,)],
        edges=[(0, 1), (0, 2), (0, 3), (3, 0)],
        labels=None,
        weights=None
    )
    node0, node1, node2, node3 = list(graph.nodes)
    assert [tuple(h) for h in node0.heads()] == [(3,)]
    assert [tuple(h) for h in node1.heads()] == [(0,)]
    assert [tuple(h) for h in node0.walk_heads()] == [(3,), (0,)]
    assert [tuple(h) for h in node1.walk_heads()] == [(0,), (3,), (0,)]
    assert [tuple(h) for h in node2.walk_heads()] == [(0,), (3,), (0,)]
    assert [tuple(h) for h in node3.walk_heads()] == [(0,), (3,)]
    assert [tuple(t) for t in node0.walk_tails()] == [(1,), (2,), (3,), (0,)]
    assert [tuple(t) for t in node1.walk_tails()] == []
    assert [tuple(t) for t in node2.walk_tails()] == []
    assert [tuple(t) for t in node3.walk_tails()] == [(0,), (1,), (2,), (3,)]
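The expected walk orders follow from how `walk_heads`/`walk_tails` expand the frontier: every discovered neighbour is appended to the output, but only nodes not yet seen are expanded further, so a node can appear more than once in the result (as in `node1.walk_heads()` above). A small Python sketch of that traversal (illustrative, not the commit's Cython):

    # Breadth-first walk: append every discovered neighbour, but only
    # expand nodes that haven't been seen yet.
    def walk(neighbours, start):
        output = list(neighbours(start))
        seen = {start}
        i = 0
        while i < len(output):
            if output[i] not in seen:
                seen.add(output[i])
                output.extend(neighbours(output[i]))
            i += 1
        return output

    # Edges from test_graph_walk: (0, 1), (0, 2), (0, 3), (3, 0)
    tails = {0: [1, 2, 3], 1: [], 2: [], 3: [0]}
    heads = {0: [3], 1: [0], 2: [0], 3: [0]}
    assert walk(tails.__getitem__, 0) == [1, 2, 3, 0]
    assert walk(heads.__getitem__, 1) == [0, 3, 0]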
@@ -56,6 +56,13 @@ def test_serialize_doc_exclude(en_vocab):
     assert not new_doc.user_data


+def test_serialize_doc_span_groups(en_vocab):
+    doc = Doc(en_vocab, words=["hello", "world", "!"])
+    doc.spans["content"] = [doc[0:2]]
+    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
+    assert len(new_doc.spans["content"]) == 1
+
+
 def test_serialize_doc_bin():
     doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
     texts = ["Some text", "Lots of texts...", "..."]
@@ -63,6 +70,7 @@ def test_serialize_doc_bin():
     nlp = English()
     for doc in nlp.pipe(texts):
         doc.cats = cats
+        doc.spans["start"] = [doc[0:2]]
         doc_bin.add(doc)
     bytes_data = doc_bin.to_bytes()
@@ -73,6 +81,7 @@ def test_serialize_doc_bin():
     for i, doc in enumerate(reloaded_docs):
         assert doc.text == texts[i]
         assert doc.cats == cats
+        assert len(doc.spans) == 1


 def test_serialize_doc_bin_unknown_spaces(en_vocab):
spacy/tokens/_dict_proxies.py (new file, 49 lines)

@@ -0,0 +1,49 @@
from typing import Iterable, Tuple, Union, TYPE_CHECKING
import weakref
from collections import UserDict
import srsly

from .span_group import SpanGroup

if TYPE_CHECKING:
    # This lets us add type hints for mypy etc. without causing circular imports
    from .doc import Doc  # noqa: F401
    from .span import Span  # noqa: F401


# Why inherit from UserDict instead of dict here?
# Well, the 'dict' class doesn't necessarily delegate everything nicely,
# for performance reasons. The UserDict is slower but better behaved.
# See https://treyhunner.com/2019/04/why-you-shouldnt-inherit-from-list-and-dict-in-python/
class SpanGroups(UserDict):
    """A dict-like proxy held by the Doc, to control access to span groups."""

    def __init__(
        self, doc: "Doc", items: Iterable[Tuple[str, SpanGroup]] = tuple()
    ) -> None:
        self.doc_ref = weakref.ref(doc)
        UserDict.__init__(self, items)

    def __setitem__(self, key: str, value: Union[SpanGroup, Iterable["Span"]]) -> None:
        if not isinstance(value, SpanGroup):
            value = self._make_span_group(key, value)
        assert value.doc is self.doc_ref()
        UserDict.__setitem__(self, key, value)

    def _make_span_group(self, name: str, spans: Iterable["Span"]) -> SpanGroup:
        return SpanGroup(self.doc_ref(), name=name, spans=spans)

    def to_bytes(self) -> bytes:
        # We don't need to serialize this as a dict, because the groups
        # know their names.
        msg = [value.to_bytes() for value in self.values()]
        return srsly.msgpack_dumps(msg)

    def from_bytes(self, bytes_data: bytes) -> "SpanGroups":
        msg = srsly.msgpack_loads(bytes_data)
        self.clear()
        doc = self.doc_ref()
        for value_bytes in msg:
            group = SpanGroup(doc).from_bytes(value_bytes)
            self[group.name] = group
        return self
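A usage sketch of the proxy's behaviour (hypothetical standalone snippet; a bare `Vocab()` stands in for a pipeline's vocab):

    from spacy.tokens import Doc
    from spacy.tokens.span_group import SpanGroup
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=["hello", "world", "!"])
    doc.spans["content"] = [doc[0:2]]                   # a plain list of spans...
    assert isinstance(doc.spans["content"], SpanGroup)  # ...arrives as a SpanGroup

    # The proxy serializes all groups as one msgpack list; each group's
    # name travels inside its own payload, so no dict wrapper is needed.
    data = doc.spans.to_bytes()
    doc.spans.from_bytes(data)                          # clears and repopulates
    assert len(doc.spans["content"]) == 1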
@@ -33,6 +33,7 @@ class DocBin:
     {
         "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
         "tokens": bytes, # Serialized numpy uint64 array with the token data
+        "spans": List[Dict[str, bytes]], # SpanGroups data for each doc
         "spaces": bytes, # Serialized numpy boolean array with spaces data
         "lengths": bytes, # Serialized numpy int32 array with the doc lengths
         "strings": List[unicode] # List of unique strings in the token data
@@ -70,6 +71,7 @@ class DocBin:
         self.tokens = []
         self.spaces = []
         self.cats = []
+        self.span_groups = []
         self.user_data = []
         self.flags = []
         self.strings = set()
@@ -107,6 +109,10 @@ class DocBin:
             self.strings.add(token.ent_kb_id_)
         self.cats.append(doc.cats)
         self.user_data.append(srsly.msgpack_dumps(doc.user_data))
+        self.span_groups.append(doc.spans.to_bytes())
+        for key, group in doc.spans.items():
+            for span in group:
+                self.strings.add(span.label_)

     def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
         """Recover Doc objects from the annotations, using the given vocab.
@@ -130,6 +136,10 @@ class DocBin:
             doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
             doc = doc.from_array(self.attrs, tokens)
             doc.cats = self.cats[i]
+            if self.span_groups[i]:
+                doc.spans.from_bytes(self.span_groups[i])
+            else:
+                doc.spans.clear()
             if i < len(self.user_data) and self.user_data[i] is not None:
                 user_data = srsly.msgpack_loads(self.user_data[i], use_list=False)
                 doc.user_data.update(user_data)
@@ -161,6 +171,7 @@ class DocBin:
         self.spaces.extend(other.spaces)
         self.strings.update(other.strings)
         self.cats.extend(other.cats)
+        self.span_groups.extend(other.span_groups)
         self.flags.extend(other.flags)
         self.user_data.extend(other.user_data)

@@ -185,6 +196,7 @@ class DocBin:
             "strings": list(sorted(self.strings)),
             "cats": self.cats,
             "flags": self.flags,
+            "span_groups": self.span_groups,
         }
         if self.store_user_data:
             msg["user_data"] = self.user_data
@@ -213,6 +225,7 @@ class DocBin:
         self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
         self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
         self.cats = msg["cats"]
+        self.span_groups = msg.get("span_groups", [b"" for _ in lengths])
         self.flags = msg.get("flags", [{} for _ in lengths])
         if "user_data" in msg:
             self.user_data = list(msg["user_data"])
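A minimal round trip showing span groups surviving `DocBin` serialization, along the lines of the updated test (a bare `Vocab()` stands in for `en_vocab`; illustrative only):

    from spacy.tokens import Doc, DocBin
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=["Some", "text"])
    doc.spans["start"] = [doc[0:2]]

    doc_bin = DocBin()
    doc_bin.add(doc)              # add() also stores doc.spans.to_bytes()
    data = doc_bin.to_bytes()

    reloaded = DocBin().from_bytes(data)
    new_doc = list(reloaded.get_docs(vocab))[0]
    assert len(new_doc.spans["start"]) == 1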
@@ -2,7 +2,7 @@ from cymem.cymem cimport Pool
 cimport numpy as np

 from ..vocab cimport Vocab
-from ..structs cimport TokenC, LexemeC
+from ..structs cimport TokenC, LexemeC, SpanC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t

@@ -33,6 +33,7 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2

 cdef int [:,:] _get_lca_matrix(Doc, int start, int end)


 cdef class Doc:
     cdef readonly Pool mem
     cdef readonly Vocab vocab
@@ -43,6 +44,7 @@ cdef class Doc:
     cdef public object tensor
     cdef public object cats
     cdef public object user_data
+    cdef readonly object spans

     cdef TokenC* c
@@ -16,6 +16,7 @@ from thinc.util import copy_array
 import warnings

 from .span cimport Span
+from ._dict_proxies import SpanGroups
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
@@ -222,6 +223,7 @@ cdef class Doc:
         self.vocab = vocab
         size = max(20, (len(words) if words is not None else 0))
         self.mem = Pool()
+        self.spans = SpanGroups(self)
         # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
         # However, we need to remember the true starting places, so that we can
         # realloc.
@@ -1255,6 +1257,9 @@ cdef class Doc:
             strings.add(token.ent_kb_id_)
             strings.add(token.ent_id_)
             strings.add(token.norm_)
+        for group in self.spans.values():
+            for span in group:
+                strings.add(span.label_)
         # Msgpack doesn't distinguish between lists and tuples, which is
         # vexing for user data. As a best guess, we *know* that within
         # keys, we must have tuples. In values we just have to hope
@@ -1266,6 +1271,7 @@ cdef class Doc:
             "sentiment": lambda: self.sentiment,
             "tensor": lambda: self.tensor,
             "cats": lambda: self.cats,
+            "spans": lambda: self.spans.to_bytes(),
             "strings": lambda: list(strings),
             "has_unknown_spaces": lambda: self.has_unknown_spaces
         }
@@ -1290,18 +1296,6 @@ cdef class Doc:
         """
         if self.length != 0:
             raise ValueError(Errors.E033.format(length=self.length))
         deserializers = {
             "text": lambda b: None,
             "array_head": lambda b: None,
             "array_body": lambda b: None,
             "sentiment": lambda b: None,
             "tensor": lambda b: None,
             "cats": lambda b: None,
             "strings": lambda b: None,
             "user_data_keys": lambda b: None,
             "user_data_values": lambda b: None,
             "has_unknown_spaces": lambda b: None
         }
         # Msgpack doesn't distinguish between lists and tuples, which is
         # vexing for user data. As a best guess, we *know* that within
         # keys, we must have tuples. In values we just have to hope
@@ -1336,9 +1330,12 @@ cdef class Doc:
             self.push_back(lex, has_space)
             start = end + has_space
         self.from_array(msg["array_head"][2:], attrs[:, 2:])
+        if "spans" in msg:
+            self.spans.from_bytes(msg["spans"])
+        else:
+            self.spans.clear()
         return self

     def extend_tensor(self, tensor):
         """Concatenate a new tensor onto the doc.tensor object.
spacy/tokens/graph.pxd (new file, 13 lines)

@@ -0,0 +1,13 @@
from libcpp.vector cimport vector
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from ..structs cimport GraphC, EdgeC


cdef class Graph:
    cdef GraphC c
    cdef Pool mem
    cdef PreshMap node_map
    cdef PreshMap edge_map
    cdef object doc_ref
    cdef public str name
spacy/tokens/graph.pyx (new file, 709 lines)

@@ -0,0 +1,709 @@
# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
from typing import List, Tuple, Generator
from libc.stdint cimport int32_t, int64_t
from libcpp.pair cimport pair
from libcpp.unordered_map cimport unordered_map
from libcpp.unordered_set cimport unordered_set
from cython.operator cimport dereference
cimport cython
import weakref
from preshed.maps cimport map_get_unless_missing
from murmurhash.mrmr cimport hash64
from ..typedefs cimport hash_t
from ..strings import get_string_id
from ..structs cimport EdgeC, GraphC
from .token import Token


@cython.freelist(8)
cdef class Edge:
    cdef readonly Graph graph
    cdef readonly int i

    def __init__(self, Graph graph, int i):
        self.graph = graph
        self.i = i

    @property
    def is_none(self) -> bool:
        return False

    @property
    def doc(self) -> "Doc":
        return self.graph.doc

    @property
    def head(self) -> "Node":
        return Node(self.graph, self.graph.c.edges[self.i].head)

    @property
    def tail(self) -> "Node":
        return Node(self.graph, self.graph.c.edges[self.i].tail)

    @property
    def label(self) -> int:
        return self.graph.c.edges[self.i].label

    @property
    def weight(self) -> float:
        return self.graph.c.weights[self.i]

    @property
    def label_(self) -> str:
        return self.doc.vocab.strings[self.label]
@cython.freelist(8)
cdef class Node:
    cdef readonly Graph graph
    cdef readonly int i

    def __init__(self, Graph graph, int i):
        """A reference to a node of an annotation graph. Each node is made up of
        an ordered set of zero or more token indices.

        Node references are usually created by the Graph object itself, or from
        the Node or Edge objects. You usually won't need to instantiate this
        class yourself.
        """
        cdef int length = graph.c.nodes.size()
        if i >= length or -i >= length:
            raise IndexError(f"Node index {i} out of bounds ({length})")
        if i < 0:
            i += length
        self.graph = graph
        self.i = i

    def __eq__(self, other):
        if self.graph is not other.graph:
            return False
        else:
            return self.i == other.i

    def __iter__(self) -> Generator[int]:
        for i in self.graph.c.nodes[self.i]:
            yield i

    def __getitem__(self, int i) -> int:
        """Get a token index from the node's set of tokens."""
        length = self.graph.c.nodes[self.i].size()
        if i >= length or -i >= length:
            raise IndexError(f"Token index {i} out of bounds ({length})")
        if i < 0:
            i += length
        return self.graph.c.nodes[self.i][i]

    def __len__(self) -> int:
        """The number of tokens that make up the node."""
        return self.graph.c.nodes[self.i].size()

    @property
    def is_none(self) -> bool:
        """Whether the node is a special value, indicating 'none'.

        The NoneNode type is returned by the Graph, Edge and Node objects when
        there is no match to a query. It has the same API as Node, but it always
        returns NoneNode, NoneEdge or empty lists for its queries.
        """
        return False

    @property
    def doc(self) -> "Doc":
        """The Doc object that the graph refers to."""
        return self.graph.doc

    @property
    def tokens(self) -> Tuple[Token]:
        """A tuple of Token objects that make up the node."""
        doc = self.doc
        return tuple([doc[i] for i in self])

    def head(self, i=None, label=None) -> "Node":
        """Get the head of the first matching edge, searching by index, label,
        both or neither.

        For instance, `node.head(i=1)` will get the head of the second edge that
        this node is a tail of. `node.head(i=1, label="ARG0")` will further
        check that the second edge has the label `"ARG0"`.

        If no matching node can be found, the graph's NoneNode is returned.
        """
        return self.headed(i=i, label=label).head

    def tail(self, i=None, label=None) -> "Node":
        """Get the tail of the first matching edge, searching by index, label,
        both or neither.

        If no matching node can be found, the graph's NoneNode is returned.
        """
        return self.tailed(i=i, label=label).tail

    def sibling(self, i=None, label=None):
        """Get the first matching sibling node. Two nodes are siblings if they
        are both tails of the same head.
        If no matching node can be found, the graph's NoneNode is returned.
        """
        if i is None:
            siblings = self.siblings(label=label)
            return siblings[0] if siblings else NoneNode(self)
        else:
            edges = []
            for h in self.heads():
                edges.extend([e for e in h.taileds() if e.tail.i != self.i])
            if i >= len(edges):
                return NoneNode(self)
            elif label is not None and edges[i].label != label:
                return NoneNode(self)
            else:
                return edges[i].tail

    def heads(self, label=None) -> List["Node"]:
        """Find all matching heads of this node."""
        cdef vector[int] edge_indices
        self._find_edges(edge_indices, "head", label)
        return [Node(self.graph, self.graph.c.edges[i].head) for i in edge_indices]

    def tails(self, label=None) -> List["Node"]:
        """Find all matching tails of this node."""
        cdef vector[int] edge_indices
        self._find_edges(edge_indices, "tail", label)
        return [Node(self.graph, self.graph.c.edges[i].tail) for i in edge_indices]

    def siblings(self, label=None) -> List["Node"]:
        """Find all matching siblings of this node. Two nodes are siblings if they
        are tails of the same head.
        """
        edges = []
        for h in self.heads():
            edges.extend([e for e in h.taileds() if e.tail.i != self.i])
        if label is None:
            return [e.tail for e in edges]
        else:
            return [e.tail for e in edges if e.label == label]

    def headed(self, i=None, label=None) -> Edge:
        """Find the first matching edge headed by this node.
        If no matching edge can be found, the graph's NoneEdge is returned.
        """
        start, end = self._get_range(i, self.graph.c.n_heads[self.i])
        idx = self._find_edge("head", start, end, label)
        if idx == -1:
            return NoneEdge(self.graph)
        else:
            return Edge(self.graph, idx)

    def tailed(self, i=None, label=None) -> Edge:
        """Find the first matching edge tailed by this node.
        If no matching edge can be found, the graph's NoneEdge is returned.
        """
        start, end = self._get_range(i, self.graph.c.n_tails[self.i])
        idx = self._find_edge("tail", start, end, label)
        if idx == -1:
            return NoneEdge(self.graph)
        else:
            return Edge(self.graph, idx)

    def headeds(self, label=None) -> List[Edge]:
        """Find all matching edges headed by this node."""
        cdef vector[int] edge_indices
        self._find_edges(edge_indices, "head", label)
        return [Edge(self.graph, i) for i in edge_indices]

    def taileds(self, label=None) -> List["Edge"]:
        """Find all matching edges tailed by this node."""
        cdef vector[int] edge_indices
        self._find_edges(edge_indices, "tail", label)
        return [Edge(self.graph, i) for i in edge_indices]

    def walk_heads(self):
        cdef vector[int] node_indices
        walk_head_nodes(node_indices, &self.graph.c, self.i)
        for i in node_indices:
            yield Node(self.graph, i)

    def walk_tails(self):
        cdef vector[int] node_indices
        walk_tail_nodes(node_indices, &self.graph.c, self.i)
        for i in node_indices:
            yield Node(self.graph, i)

    cdef (int, int) _get_range(self, i, n):
        if i is None:
            return (0, n)
        elif i < n:
            return (i, i+1)
        else:
            return (0, 0)

    cdef int _find_edge(self, str direction, int start, int end, label) except -2:
        if direction == "head":
            get_edges = get_head_edges
        else:
            get_edges = get_tail_edges
        cdef vector[int] edge_indices
        get_edges(edge_indices, &self.graph.c, self.i)
        if label is None:
            return edge_indices[start] if start < end else -1
        for edge_index in edge_indices[start:end]:
            if self.graph.c.edges[edge_index].label == label:
                return edge_index
        else:
            return -1

    cdef int _find_edges(self, vector[int]& edge_indices, str direction, label):
        if direction == "head":
            get_edges = get_head_edges
        else:
            get_edges = get_tail_edges
        if label is None:
            get_edges(edge_indices, &self.graph.c, self.i)
            return edge_indices.size()
        cdef vector[int] unfiltered
        get_edges(unfiltered, &self.graph.c, self.i)
        for edge_index in unfiltered:
            if self.graph.c.edges[edge_index].label == label:
                edge_indices.push_back(edge_index)
        return edge_indices.size()
cdef class NoneEdge(Edge):
    """An Edge subclass, representing a non-result. The NoneEdge has the same
    API as other Edge instances, but always returns NoneEdge, NoneNode, or empty
    lists.
    """
    def __init__(self, graph):
        self.graph = graph
        self.i = -1

    @property
    def is_none(self) -> bool:
        return True

    @property
    def doc(self) -> "Doc":
        return self.graph.doc

    @property
    def head(self) -> "NoneNode":
        return NoneNode(self.graph)

    @property
    def tail(self) -> "NoneNode":
        return NoneNode(self.graph)

    @property
    def label(self) -> int:
        return 0

    @property
    def weight(self) -> float:
        return 0.0

    @property
    def label_(self) -> str:
        return ""


cdef class NoneNode(Node):
    def __init__(self, graph):
        self.graph = graph
        self.i = -1

    def __getitem__(self, int i):
        raise IndexError("Cannot index into NoneNode.")

    def __len__(self):
        return 0

    @property
    def is_none(self):
        return True

    @property
    def doc(self):
        return self.graph.doc

    @property
    def tokens(self):
        return tuple()

    def head(self, i=None, label=None):
        return self

    def tail(self, i=None, label=None):
        return self

    def walk_heads(self):
        yield from []

    def walk_tails(self):
        yield from []
cdef class Graph:
    """A set of directed labelled relationships between sets of tokens.

    EXAMPLE:
        Construction 1
        >>> graph = Graph(doc, name="srl")

        Construction 2
        >>> graph = Graph(
            doc,
            name="srl",
            nodes=[(0,), (1, 3), (2,)],
            edges=[(0, 2), (2, 1)]
        )

        Construction 3
        >>> graph = Graph(
            doc,
            name="srl",
            nodes=[(0,), (1, 3), (2,)],
            edges=[(2, 0), (0, 1)],
            labels=["word sense ID 1675", "agent"],
            weights=[-42.6, -1.7]
        )
        >>> assert graph.has_node((0,))
        >>> assert graph.has_edge((0,), (1, 3), label="agent")
    """
    def __init__(self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None):
        """Create a Graph object.

        doc (Doc): The Doc object the graph will refer to.
        name (str): A string name to help identify the graph. Defaults to "".
        nodes (List[Tuple[int]]): A list of token-index tuples to add to the graph
            as nodes. Defaults to [].
        edges (List[Tuple[int, int]]): A list of edges between the provided nodes.
            Each edge should be a (head, tail) tuple, where `head` and `tail`
            are integers pointing into the `nodes` list. Defaults to [].
        labels (Optional[List[str]]): A list of labels for the provided edges.
            If None, all of the edges specified by the edges argument will
            be labelled with the empty string (""). If `labels` is not `None`,
            it must have the same length as the `edges` argument.
        weights (Optional[List[float]]): A list of weights for the provided edges.
            If None, all of the edges specified by the edges argument will
            have the weight 0.0. If `weights` is not `None`, it must have the
            same length as the `edges` argument.
        """
        if weights is not None:
            assert len(weights) == len(edges)
        else:
            weights = [0.0] * len(edges)
        if labels is not None:
            assert len(labels) == len(edges)
        else:
            labels = [""] * len(edges)
        self.c.node_map = new unordered_map[hash_t, int]()
        self.c.edge_map = new unordered_map[hash_t, int]()
        self.c.roots = new unordered_set[int]()
        self.name = name
        self.doc_ref = weakref.ref(doc)
        for node in nodes:
            self.add_node(node)
        for (head, tail), label, weight in zip(edges, labels, weights):
            self.add_edge(
                Node(self, head),
                Node(self, tail),
                label=label,
                weight=weight
            )

    def __dealloc__(self):
        del self.c.node_map
        del self.c.edge_map
        del self.c.roots

    @property
    def doc(self) -> "Doc":
        """The Doc object the graph refers to."""
        return self.doc_ref()

    @property
    def edges(self) -> Generator[Edge]:
        """Iterate over the edges in the graph."""
        for i in range(self.c.edges.size()):
            yield Edge(self, i)

    @property
    def nodes(self) -> Generator[Node]:
        """Iterate over the nodes in the graph."""
        for i in range(self.c.nodes.size()):
            yield Node(self, i)

    def add_edge(self, head, tail, *, label="", weight=None) -> Edge:
        """Add an edge to the graph, connecting two groups of tokens.

        If there is already an edge for the (head, tail, label) triple, it will
        be returned, and no new edge will be created. The weight of the edge
        will be updated if a weight is specified.
        """
        label_hash = self.doc.vocab.strings.as_int(label)
        weight_float = weight if weight is not None else 0.0
        edge_index = add_edge(
            &self.c,
            EdgeC(
                head=self.add_node(head).i,
                tail=self.add_node(tail).i,
                label=label_hash,
            ),
            weight=weight_float
        )
        return Edge(self, edge_index)

    def get_edge(self, head, tail, *, label="") -> Edge:
        """Look up an edge in the graph. If the graph has no matching edge,
        the NoneEdge object is returned.
        """
        head_node = self.get_node(head)
        if head_node.is_none:
            return NoneEdge(self)
        tail_node = self.get_node(tail)
        if tail_node.is_none:
            return NoneEdge(self)
        edge_index = get_edge(
            &self.c,
            EdgeC(head=head_node.i, tail=tail_node.i, label=get_string_id(label))
        )
        if edge_index < 0:
            return NoneEdge(self)
        else:
            return Edge(self, edge_index)

    def has_edge(self, head, tail, label) -> bool:
        """Check whether a (head, tail, label) triple is an edge in the graph."""
        return not self.get_edge(head, tail, label=label).is_none

    def add_node(self, indices) -> Node:
        """Add a node to the graph and return it. Nodes refer to ordered sets
        of token indices.

        This method is idempotent: if there is already a node for the given
        indices, it is returned without a new node being created.
        """
        if isinstance(indices, Node):
            return indices
        cdef vector[int32_t] node
        node.reserve(len(indices))
        for idx in indices:
            node.push_back(idx)
        i = add_node(&self.c, node)
        return Node(self, i)

    def get_node(self, indices) -> Node:
        """Get a node from the graph, or the NoneNode if there is no node for
        the given indices.
        """
        if isinstance(indices, Node):
            return indices
        cdef vector[int32_t] node
        node.reserve(len(indices))
        for idx in indices:
            node.push_back(idx)
        node_index = get_node(&self.c, node)
        if node_index < 0:
            return NoneNode(self)
        else:
            return Node(self, node_index)

    def has_node(self, tuple indices) -> bool:
        """Check whether the graph has a node for the given indices."""
        return not self.get_node(indices).is_none
cdef int add_edge(GraphC* graph, EdgeC edge, float weight) nogil:
    key = hash64(&edge, sizeof(edge), 0)
    it = graph.edge_map.find(key)
    if it != graph.edge_map.end():
        edge_index = dereference(it).second
        graph.weights[edge_index] = weight
        return edge_index
    else:
        edge_index = graph.edges.size()
        graph.edge_map.insert(pair[hash_t, int](key, edge_index))
        graph.edges.push_back(edge)
        if graph.n_tails[edge.head] == 0:
            graph.first_tail[edge.head] = edge_index
        if graph.n_heads[edge.tail] == 0:
            graph.first_head[edge.tail] = edge_index
        graph.n_tails[edge.head] += 1
        graph.n_heads[edge.tail] += 1
        graph.weights.push_back(weight)
        # If we had the tail marked as a root, remove it.
        tail_root_index = graph.roots.find(edge.tail)
        if tail_root_index != graph.roots.end():
            graph.roots.erase(tail_root_index)
        return edge_index


cdef int get_edge(const GraphC* graph, EdgeC edge) nogil:
    key = hash64(&edge, sizeof(edge), 0)
    it = graph.edge_map.find(key)
    if it == graph.edge_map.end():
        return -1
    else:
        return dereference(it).second


cdef int has_edge(const GraphC* graph, EdgeC edge) nogil:
    return get_edge(graph, edge) >= 0


cdef int add_node(GraphC* graph, vector[int32_t]& node) nogil:
    key = hash64(&node[0], node.size() * sizeof(node[0]), 0)
    it = graph.node_map.find(key)
    if it != graph.node_map.end():
        # Item found. Convert the iterator to an index value.
        return dereference(it).second
    else:
        index = graph.nodes.size()
        graph.nodes.push_back(node)
        graph.n_heads.push_back(0)
        graph.n_tails.push_back(0)
        graph.first_head.push_back(0)
        graph.first_tail.push_back(0)
        graph.roots.insert(index)
        graph.node_map.insert(pair[hash_t, int](key, index))
        return index


cdef int get_node(const GraphC* graph, vector[int32_t] node) nogil:
    key = hash64(&node[0], node.size() * sizeof(node[0]), 0)
    it = graph.node_map.find(key)
    if it == graph.node_map.end():
        return -1
    else:
        return dereference(it).second


cdef int has_node(const GraphC* graph, vector[int32_t] node) nogil:
    return get_node(graph, node) >= 0


cdef int get_head_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
    todo = graph.n_heads[node]
    if todo == 0:
        return 0
    output.reserve(output.size() + todo)
    start = graph.first_head[node]
    end = graph.edges.size()
    for i in range(start, end):
        if todo <= 0:
            break
        elif graph.edges[i].tail == node:
            output.push_back(graph.edges[i].head)
            todo -= 1
    return todo


cdef int get_tail_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
    todo = graph.n_tails[node]
    if todo == 0:
        return 0
    output.reserve(output.size() + todo)
    start = graph.first_tail[node]
    end = graph.edges.size()
    for i in range(start, end):
        if todo <= 0:
            break
        elif graph.edges[i].head == node:
            output.push_back(graph.edges[i].tail)
            todo -= 1
    return todo


cdef int get_sibling_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
    cdef vector[int] heads
    cdef vector[int] tails
    get_head_nodes(heads, graph, node)
    for i in range(heads.size()):
        get_tail_nodes(tails, graph, heads[i])
        for j in range(tails.size()):
            if tails[j] != node:
                output.push_back(tails[j])
        tails.clear()
    return output.size()


cdef int get_head_edges(vector[int]& output, const GraphC* graph, int node) nogil:
    todo = graph.n_heads[node]
    if todo == 0:
        return 0
    output.reserve(output.size() + todo)
    start = graph.first_head[node]
    end = graph.edges.size()
    for i in range(start, end):
        if todo <= 0:
            break
        elif graph.edges[i].tail == node:
            output.push_back(i)
            todo -= 1
    return todo


cdef int get_tail_edges(vector[int]& output, const GraphC* graph, int node) nogil:
    todo = graph.n_tails[node]
    if todo == 0:
        return 0
    output.reserve(output.size() + todo)
    start = graph.first_tail[node]
    end = graph.edges.size()
    for i in range(start, end):
        if todo <= 0:
            break
        elif graph.edges[i].head == node:
            output.push_back(i)
            todo -= 1
    return todo


cdef int walk_head_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
    cdef unordered_set[int] seen = unordered_set[int]()
    get_head_nodes(output, graph, node)
    seen.insert(node)
    i = 0
    while i < output.size():
        if seen.find(output[i]) == seen.end():
            seen.insert(output[i])
            get_head_nodes(output, graph, output[i])
        i += 1
    return i


cdef int walk_tail_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
    cdef unordered_set[int] seen = unordered_set[int]()
    get_tail_nodes(output, graph, node)
    seen.insert(node)
    i = 0
    while i < output.size():
        if seen.find(output[i]) == seen.end():
            seen.insert(output[i])
            get_tail_nodes(output, graph, output[i])
        i += 1
    return i


cdef int walk_head_edges(vector[int]& output, const GraphC* graph, int node) nogil:
    cdef unordered_set[int] seen = unordered_set[int]()
    get_head_edges(output, graph, node)
    seen.insert(node)
    i = 0
    while i < output.size():
        if seen.find(output[i]) == seen.end():
            seen.insert(output[i])
            get_head_edges(output, graph, output[i])
        i += 1
    return i


cdef int walk_tail_edges(vector[int]& output, const GraphC* graph, int node) nogil:
    cdef unordered_set[int] seen = unordered_set[int]()
    get_tail_edges(output, graph, node)
    seen.insert(node)
    i = 0
    while i < output.size():
        if seen.find(output[i]) == seen.end():
            seen.insert(output[i])
            get_tail_edges(output, graph, output[i])
        i += 1
    return i
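Pulling the pieces together, a sketch of the lookup API and its none-object convention — misses return `NoneEdge`/`NoneNode` instead of raising (illustrative snippet, assumes the `is_none` fixes above):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.tokens.graph import Graph

    doc = Doc(Vocab(), words=["a", "b", "c"])
    graph = Graph(doc, name="demo", nodes=[(0,), (1,), (2,)],
                  edges=[(0, 1), (1, 2)], labels=["x", "y"], weights=None)

    # Existing queries return real Edge/Node objects...
    assert not graph.get_edge((0,), (1,), label="x").is_none
    # ...while misses come back as NoneEdge/NoneNode rather than raising
    missing = graph.get_edge((0,), (2,), label="x")
    assert missing.is_none
    assert missing.label_ == ""
    assert graph.get_node((5,)).is_none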
@@ -2,18 +2,24 @@ cimport numpy as np

 from .doc cimport Doc
 from ..typedefs cimport attr_t
+from ..structs cimport SpanC


 cdef class Span:
     cdef readonly Doc doc
-    cdef readonly int start
-    cdef readonly int end
-    cdef readonly int start_char
-    cdef readonly int end_char
-    cdef readonly attr_t label
-    cdef readonly attr_t kb_id
+    cdef SpanC c
     cdef public _vector
     cdef public _vector_norm

+    @staticmethod
+    cdef inline Span cinit(Doc doc, SpanC span):
+        cdef Span self = Span.__new__(
+            Span,
+            doc,
+            start=span.start,
+            end=span.end
+        )
+        self.c = span
+        return self
+
     cpdef np.ndarray to_array(self, object features)
@@ -97,23 +97,23 @@ cdef class Span:
         if not (0 <= start <= end <= len(doc)):
             raise IndexError(Errors.E035.format(start=start, end=end, length=len(doc)))
         self.doc = doc
-        self.start = start
-        self.start_char = self.doc[start].idx if start < self.doc.length else 0
-        self.end = end
-        if end >= 1:
-            self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1])
-        else:
-            self.end_char = 0
         if isinstance(label, str):
             label = doc.vocab.strings.add(label)
         if isinstance(kb_id, str):
             kb_id = doc.vocab.strings.add(kb_id)
         if label not in doc.vocab.strings:
             raise ValueError(Errors.E084.format(label=label))
-        self.label = label
+        self.c = SpanC(
+            label=label,
+            kb_id=kb_id,
+            start=start,
+            end=end,
+            start_char=doc[start].idx if start < doc.length else 0,
+            end_char=doc[end - 1].idx + len(doc[end - 1]) if end >= 1 else 0,
+        )
         self._vector = vector
         self._vector_norm = vector_norm
-        self.kb_id = kb_id

     def __richcmp__(self, Span other, int op):
         if other is None:
@@ -123,25 +123,39 @@ cdef class Span:
             return True
         # <
         if op == 0:
-            return self.start_char < other.start_char
+            return self.c.start_char < other.c.start_char
         # <=
         elif op == 1:
-            return self.start_char <= other.start_char
+            return self.c.start_char <= other.c.start_char
         # ==
         elif op == 2:
-            return (self.doc, self.start_char, self.end_char, self.label, self.kb_id) == (other.doc, other.start_char, other.end_char, other.label, other.kb_id)
+            # Do the cheap comparisons first
+            return (
+                (self.c.start_char == other.c.start_char) and \
+                (self.c.end_char == other.c.end_char) and \
+                (self.c.label == other.c.label) and \
+                (self.c.kb_id == other.c.kb_id) and \
+                (self.doc == other.doc)
+            )
         # !=
         elif op == 3:
-            return (self.doc, self.start_char, self.end_char, self.label, self.kb_id) != (other.doc, other.start_char, other.end_char, other.label, other.kb_id)
+            # Do the cheap comparisons first
+            return not (
+                (self.c.start_char == other.c.start_char) and \
+                (self.c.end_char == other.c.end_char) and \
+                (self.c.label == other.c.label) and \
+                (self.c.kb_id == other.c.kb_id) and \
+                (self.doc == other.doc)
+            )
         # >
         elif op == 4:
-            return self.start_char > other.start_char
+            return self.c.start_char > other.c.start_char
         # >=
         elif op == 5:
-            return self.start_char >= other.start_char
+            return self.c.start_char >= other.c.start_char

     def __hash__(self):
-        return hash((self.doc, self.start_char, self.end_char, self.label, self.kb_id))
+        return hash((self.doc, self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id))

     def __len__(self):
         """Get the number of tokens in the span.
@@ -150,9 +164,9 @@ cdef class Span:

         DOCS: https://nightly.spacy.io/api/span#len
         """
-        if self.end < self.start:
+        if self.c.end < self.c.start:
             return 0
-        return self.end - self.start
+        return self.c.end - self.c.start

     def __repr__(self):
         return self.text
@@ -171,10 +185,10 @@ cdef class Span:
             return Span(self.doc, start + self.start, end + self.start)
         else:
             if i < 0:
-                token_i = self.end + i
+                token_i = self.c.end + i
             else:
-                token_i = self.start + i
-            if self.start <= token_i < self.end:
+                token_i = self.c.start + i
+            if self.c.start <= token_i < self.c.end:
                 return self.doc[token_i]
             else:
                 raise IndexError(Errors.E1002)
@@ -186,7 +200,7 @@ cdef class Span:

         DOCS: https://nightly.spacy.io/api/span#iter
         """
-        for i in range(self.start, self.end):
+        for i in range(self.c.start, self.c.end):
             yield self.doc[i]

     def __reduce__(self):
@@ -196,7 +210,7 @@ cdef class Span:
     def _(self):
         """Custom extension attributes registered via `set_extension`."""
         return Underscore(Underscore.span_extensions, self,
-                          start=self.start_char, end=self.end_char)
+                          start=self.c.start_char, end=self.c.end_char)

     def as_doc(self, *, bint copy_user_data=False):
         """Create a `Doc` object with a copy of the `Span`'s data.
@@ -242,7 +256,7 @@ cdef class Span:
         for i in range(length):
             # if the HEAD refers to a token outside this span, find a more appropriate ancestor
             token = self[i]
-            ancestor_i = token.head.i - self.start  # span offset
+            ancestor_i = token.head.i - self.c.start  # span offset
             if ancestor_i not in range(length):
                 if DEP in attrs:
                     array[i, attrs.index(DEP)] = dep
@@ -250,7 +264,7 @@ cdef class Span:
                 # try finding an ancestor within this span
                 ancestors = token.ancestors
                 for ancestor in ancestors:
-                    ancestor_i = ancestor.i - self.start
+                    ancestor_i = ancestor.i - self.c.start
                     if ancestor_i in range(length):
                         array[i, head_col] = ancestor_i - i

@@ -279,7 +293,7 @@ cdef class Span:

         DOCS: https://nightly.spacy.io/api/span#get_lca_matrix
         """
-        return numpy.asarray(_get_lca_matrix(self.doc, self.start, self.end))
+        return numpy.asarray(_get_lca_matrix(self.doc, self.c.start, self.c.end))

     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
@@ -373,10 +387,14 @@ cdef class Span:

         DOCS: https://nightly.spacy.io/api/span#ents
         """
+        cdef Span ent
         ents = []
         for ent in self.doc.ents:
-            if ent.start >= self.start and ent.end <= self.end:
-                ents.append(ent)
+            if ent.c.start >= self.c.start:
+                if ent.c.end <= self.c.end:
+                    ents.append(ent)
+                else:
+                    break
         return ents

     @property
@@ -513,7 +531,7 @@ cdef class Span:
         # with head==0, i.e. a sentence root. If so, we can return it. The
         # longer the span, the more likely it contains a sentence root, and
         # in this case we return in linear time.
-        for i in range(self.start, self.end):
+        for i in range(self.c.start, self.c.end):
             if self.doc.c[i].head == 0:
                 return self.doc[i]
         # If we don't have a sentence root, we do something that's not so
@@ -524,15 +542,15 @@ cdef class Span:
         # think this should be okay.
         cdef int current_best = self.doc.length
         cdef int root = -1
-        for i in range(self.start, self.end):
-            if self.start <= (i+self.doc.c[i].head) < self.end:
+        for i in range(self.c.start, self.c.end):
+            if self.c.start <= (i+self.doc.c[i].head) < self.c.end:
                 continue
             words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
             if words_to_root < current_best:
                 current_best = words_to_root
                 root = i
         if root == -1:
-            return self.doc[self.start]
+            return self.doc[self.c.start]
         else:
             return self.doc[root]

@@ -548,8 +566,8 @@ cdef class Span:
         the span.
         RETURNS (Span): The newly constructed object.
         """
-        start_idx += self.start_char
-        end_idx += self.start_char
+        start_idx += self.c.start_char
+        end_idx += self.c.start_char
         return self.doc.char_span(start_idx, end_idx)

     @property
@@ -628,6 +646,56 @@ cdef class Span:
         for word in self.rights:
             yield from word.subtree

+    property start:
+        def __get__(self):
+            return self.c.start
+
+        def __set__(self, int start):
+            if start < 0:
+                raise IndexError("TODO")
+            self.c.start = start
+
+    property end:
+        def __get__(self):
+            return self.c.end
+
+        def __set__(self, int end):
+            if end < 0:
+                raise IndexError("TODO")
+            self.c.end = end
+
+    property start_char:
+        def __get__(self):
+            return self.c.start_char
+
+        def __set__(self, int start_char):
+            if start_char < 0:
+                raise IndexError("TODO")
+            self.c.start_char = start_char
+
+    property end_char:
+        def __get__(self):
+            return self.c.end_char
+
+        def __set__(self, int end_char):
+            if end_char < 0:
+                raise IndexError("TODO")
+            self.c.end_char = end_char
+
+    property label:
+        def __get__(self):
+            return self.c.label
+
+        def __set__(self, attr_t label):
+            self.c.label = label
+
+    property kb_id:
+        def __get__(self):
+            return self.c.kb_id
+
+        def __set__(self, attr_t kb_id):
+            self.c.kb_id = kb_id
+
     property ent_id:
         """RETURNS (uint64): The entity ID."""
         def __get__(self):
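A sketch of the semantics this refactor implies: the offsets now live in the `SpanC` struct, the `start`/`end` properties are writable, and comparison/hashing go through the struct's character offsets (illustrative snippet, not from the commit):

    from spacy.tokens import Doc, Span
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=["a", "b", "c", "d"])
    span_a = Span(doc, 0, 2, label="X")
    span_b = Span(doc, 0, 2, label="X")
    assert span_a == span_b                 # doc, offsets, label and kb_id match
    assert hash(span_a) == hash(span_b)

    span_b.end = 3                          # token offsets are now writable...
    assert len(span_b) == 3                 # ...and __len__ reads c.end - c.start
    # Note: the setter does not recompute end_char, and __eq__/__hash__ compare
    # character offsets, so span_a == span_b still holds here.
    assert span_a == span_b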
spacy/tokens/span_group.pxd (new file, 10 lines)

@@ -0,0 +1,10 @@
from libcpp.vector cimport vector
from ..structs cimport SpanC

cdef class SpanGroup:
    cdef public object _doc_ref
    cdef public str name
    cdef public dict attrs
    cdef vector[SpanC] c

    cdef void push_back(self, SpanC span) nogil
spacy/tokens/span_group.pyx (new file, 183 lines)

@@ -0,0 +1,183 @@
import weakref
import struct
import srsly
from .span cimport Span
from libc.stdint cimport uint64_t, uint32_t, int32_t


cdef class SpanGroup:
    """A group of spans that all belong to the same Doc object. The group
    can be named, and you can attach additional attributes to it. Span groups
    are generally accessed via the `doc.spans` attribute. The `doc.spans`
    attribute will convert lists of spans into a `SpanGroup` object for you
    automatically on assignment.

    Example:
        Construction 1
        >>> doc = nlp("Their goi ng home")
        >>> doc.spans["errors"] = SpanGroup(
            doc,
            name="errors",
            spans=[doc[0:1], doc[2:4]],
            attrs={"annotator": "matt"}
        )

        Construction 2
        >>> doc = nlp("Their goi ng home")
        >>> doc.spans["errors"] = [doc[0:1], doc[2:4]]
        >>> assert isinstance(doc.spans["errors"], SpanGroup)

    DOCS: https://nightly.spacy.io/api/spangroup
    """
    def __init__(self, doc, *, name="", attrs={}, spans=[]):
        """Create a SpanGroup.

        doc (Doc): The reference Doc object.
        name (str): The group name.
        attrs (Dict[str, Any]): Optional JSON-serializable attributes to attach.
        spans (Iterable[Span]): The spans to add to the group.

        DOCS: https://nightly.spacy.io/api/spangroup#init
        """
        # We need to make this a weak reference, so that the Doc object can
        # own the SpanGroup without circular references. We do want to get
        # the Doc though, because otherwise the API gets annoying.
        self._doc_ref = weakref.ref(doc)
        self.name = name
        self.attrs = dict(attrs) if attrs is not None else {}
        cdef Span span
        for span in spans:
            self.push_back(span.c)

    def __repr__(self):
        return str(list(self))

    @property
    def doc(self):
        """RETURNS (Doc): The reference document.

        DOCS: https://nightly.spacy.io/api/spangroup#doc
        """
        return self._doc_ref()

    @property
    def has_overlap(self):
        """RETURNS (bool): Whether the group contains overlapping spans.

        DOCS: https://nightly.spacy.io/api/spangroup#has_overlap
        """
        if not len(self):
            return False
        sorted_spans = list(sorted(self))
        last_end = sorted_spans[0].end
        for span in sorted_spans[1:]:
            if span.start < last_end:
                return True
            last_end = span.end
        return False

    def __len__(self):
        """RETURNS (int): The number of spans in the group.

        DOCS: https://nightly.spacy.io/api/spangroup#len
        """
        return self.c.size()

    def append(self, Span span):
        """Add a span to the group. The span must refer to the same Doc
        object as the span group.

        span (Span): The span to append.

        DOCS: https://nightly.spacy.io/api/spangroup#append
        """
        if span.doc is not self.doc:
            raise ValueError("Cannot add span to group: refers to different Doc.")
        self.push_back(span.c)

    def extend(self, spans):
        """Add multiple spans to the group. All spans must refer to the same
        Doc object as the span group.

        spans (Iterable[Span]): The spans to add.

        DOCS: https://nightly.spacy.io/api/spangroup#extend
        """
        cdef Span span
        for span in spans:
            self.append(span)

    def __getitem__(self, int i):
        """Get a span from the group.

        i (int): The item index.
        RETURNS (Span): The span at the given index.

        DOCS: https://nightly.spacy.io/api/spangroup#getitem
        """
        cdef int size = self.c.size()
        if i < -size or i >= size:
            raise IndexError(f"list index {i} out of range")
        if i < 0:
            i += size
        return Span.cinit(self.doc, self.c[i])

    def to_bytes(self):
        """Serialize the SpanGroup's contents to a byte string.

        RETURNS (bytes): The serialized span group.

        DOCS: https://nightly.spacy.io/api/spangroup#to_bytes
        """
        output = {"name": self.name, "attrs": self.attrs, "spans": []}
        for i in range(self.c.size()):
            span = self.c[i]
            # The struct.pack here is probably overkill, but it might help if
            # you're saving tonnes of spans, and it doesn't really add any
            # complexity. We do take care to specify an explicit (big-endian)
            # byte order though, to ensure the message can be loaded back on
            # a different arch.
            # Q: uint64_t
            # q: int64_t
            # L: uint32_t
            # l: int32_t
            output["spans"].append(struct.pack(
                ">QQQllll",
                span.id,
                span.kb_id,
                span.label,
                span.start,
                span.end,
                span.start_char,
                span.end_char
            ))
        return srsly.msgpack_dumps(output)

    def from_bytes(self, bytes_data):
        """Deserialize the SpanGroup's contents from a byte string.

        bytes_data (bytes): The span group to load.
        RETURNS (SpanGroup): The deserialized span group.

        DOCS: https://nightly.spacy.io/api/spangroup#from_bytes
        """
        msg = srsly.msgpack_loads(bytes_data)
        self.name = msg["name"]
        self.attrs = dict(msg["attrs"])
        self.c.clear()
        self.c.reserve(len(msg["spans"]))
        cdef SpanC span
        for span_data in msg["spans"]:
            items = struct.unpack(">QQQllll", span_data)
            span.id = items[0]
            span.kb_id = items[1]
            span.label = items[2]
            span.start = items[3]
            span.end = items[4]
            span.start_char = items[5]
            span.end_char = items[6]
            self.c.push_back(span)
        return self

    cdef void push_back(self, SpanC span) nogil:
        self.c.push_back(span)
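A round-trip sketch for the packed-span serialization (illustrative; mirrors the docstring example above, with a bare `Vocab()` in place of a pipeline):

    from spacy.tokens import Doc
    from spacy.tokens.span_group import SpanGroup
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=["Their", "goi", "ng", "home"])
    group = SpanGroup(doc, name="errors", spans=[doc[0:1], doc[2:4]],
                      attrs={"annotator": "matt"})
    data = group.to_bytes()           # msgpack dict; spans packed with struct

    new_group = SpanGroup(doc).from_bytes(data)
    assert new_group.name == "errors"
    assert new_group.attrs == {"annotator": "matt"}
    assert len(new_group) == 2
    assert new_group[0].text == "Their"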
@ -575,6 +575,39 @@ objects, if the entity recognizer has been applied.
| ----------- | --------------------------------------------------------------------- |
| **RETURNS** | Entities in the document, one `Span` per entity. ~~Tuple[Span, ...]~~ |

## Doc.spans {#spans tag="property"}

A dictionary of named span groups, used to store and access additional span
annotations. You can write to it by assigning a list of [`Span`](/api/span)
objects or a [`SpanGroup`](/api/spangroup) to a given key.

> #### Example
>
> ```python
> doc = nlp("Their goi ng home")
> doc.spans["errors"] = [doc[0:1], doc[1:3]]
> ```

| Name        | Description                                                         |
| ----------- | ------------------------------------------------------------------- |
| **RETURNS** | The span groups assigned to the document. ~~Dict[str, SpanGroup]~~  |
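
As a quick sketch of reading a group back (assuming an `nlp` pipeline is
loaded; the `"errors"` key is just an example): lists assigned to `Doc.spans`
come back as `SpanGroup` objects, which support `len()` and integer indexing.

```python
doc = nlp("Their goi ng home")
doc.spans["errors"] = [doc[0:1], doc[1:3]]
group = doc.spans["errors"]  # the assigned list was wrapped in a SpanGroup
for i in range(len(group)):
    print(group[i].text)  # "Their", "goi ng"
```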

## Doc.cats {#cats tag="property" model="text classifier"}

Maps a label to a score for categories applied to the document. Typically set by
the [`TextCategorizer`](/api/textcategorizer).

> #### Example
>
> ```python
> doc = nlp("This is a text about football.")
> print(doc.cats)
> ```

| Name        | Description                                                |
| ----------- | ---------------------------------------------------------- |
| **RETURNS** | The text categories mapped to scores. ~~Dict[str, float]~~ |

## Doc.noun_chunks {#noun_chunks tag="property" model="parser"}

Iterate over the base noun phrases in the document. Yields base noun-phrase

@ -668,23 +701,22 @@ The L2 norm of the document's vector representation.

## Attributes {#attributes}

| Name | Description |
| ------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
| `text` | A string representation of the document text. ~~str~~ |
| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
| `vocab` | The store of lexical types. ~~Vocab~~ |
| `tensor` <Tag variant="new">2</Tag> | Container for dense vector representations. ~~numpy.ndarray~~ |
| `cats` <Tag variant="new">2</Tag> | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ |
| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
| `lang` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~int~~ |
| `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~ |
| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
| Name | Description |
| ------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
| `text` | A string representation of the document text. ~~str~~ |
| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
| `vocab` | The store of lexical types. ~~Vocab~~ |
| `tensor` <Tag variant="new">2</Tag> | Container for dense vector representations. ~~numpy.ndarray~~ |
| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
| `lang` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~int~~ |
| `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~ |
| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |

## Serialization fields {#serialization-fields}

185
website/docs/api/spangroup.md
Normal file
@ -0,0 +1,185 @@
---
title: SpanGroup
tag: class
source: spacy/tokens/span_group.pyx
new: 3
---

A group of arbitrary, potentially overlapping [`Span`](/api/span) objects that
all belong to the same [`Doc`](/api/doc) object. The group can be named, and you
can attach additional attributes to it. Span groups are generally accessed via
the [`Doc.spans`](/api/doc#spans) attribute, which will convert lists of spans
into a `SpanGroup` object for you automatically on assignment. `SpanGroup`
objects behave similarly to `list`s, so you can append `Span` objects to them or
access a member at a given index.
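
As a minimal sketch of that list-like behavior (assuming an `nlp` pipeline is
loaded; the `"errors"` key is just an example):

```python
doc = nlp("Their goi ng home")
doc.spans["errors"] = [doc[0:1]]       # the list is converted to a SpanGroup
doc.spans["errors"].append(doc[1:3])   # append like a list
assert len(doc.spans["errors"]) == 2
assert doc.spans["errors"][1].text == "goi ng"  # index like a list
```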

## SpanGroup.\_\_init\_\_ {#init tag="method"}

Create a `SpanGroup`.

> #### Example
>
> ```python
> doc = nlp("Their goi ng home")
> spans = [doc[0:1], doc[1:3]]
>
> # Construction 1
> from spacy.tokens import SpanGroup
>
> group = SpanGroup(doc, name="errors", spans=spans, attrs={"annotator": "matt"})
> doc.spans["errors"] = group
>
> # Construction 2
> doc.spans["errors"] = spans
> assert isinstance(doc.spans["errors"], SpanGroup)
> ```

| Name           | Description                                                                                                                                           |
| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc`          | The document the span group belongs to. ~~Doc~~                                                                                                       |
| _keyword-only_ |                                                                                                                                                       |
| `name`         | The name of the span group. If the span group is created automatically on assignment to `doc.spans`, the key name is used. Defaults to `""`. ~~str~~ |
| `attrs`        | Optional JSON-serializable attributes to attach to the span group. ~~Dict[str, Any]~~                                                                |
| `spans`        | The spans to add to the span group. ~~Iterable[Span]~~                                                                                               |

## SpanGroup.doc {#doc tag="property"}

The [`Doc`](/api/doc) object the span group is referring to.

> #### Example
>
> ```python
> doc = nlp("Their goi ng home")
> doc.spans["errors"] = [doc[0:1], doc[1:3]]
> assert doc.spans["errors"].doc == doc
> ```

| Name        | Description                     |
| ----------- | ------------------------------- |
| **RETURNS** | The reference document. ~~Doc~~ |

## SpanGroup.has_overlap {#has_overlap tag="property"}

Check whether the span group contains overlapping spans.

> #### Example
>
> ```python
> doc = nlp("Their goi ng home")
> doc.spans["errors"] = [doc[0:1], doc[1:3]]
> assert not doc.spans["errors"].has_overlap
> doc.spans["errors"].append(doc[1:2])
> assert doc.spans["errors"].has_overlap
> ```

| Name        | Description                                        |
| ----------- | -------------------------------------------------- |
| **RETURNS** | Whether the span group contains overlaps. ~~bool~~ |

## SpanGroup.\_\_len\_\_ {#len tag="method"}

Get the number of spans in the group.

> #### Example
>
> ```python
> doc = nlp("Their goi ng home")
> doc.spans["errors"] = [doc[0:1], doc[1:3]]
> assert len(doc.spans["errors"]) == 2
> ```

| Name        | Description                               |
| ----------- | ----------------------------------------- |
| **RETURNS** | The number of spans in the group. ~~int~~ |

## SpanGroup.\_\_getitem\_\_ {#getitem tag="method"}

Get a span from the group.

> #### Example
>
> ```python
> doc = nlp("Their goi ng home")
> doc.spans["errors"] = [doc[0:1], doc[1:3]]
> span = doc.spans["errors"][1]
> assert span.text == "goi ng"
> ```

| Name        | Description                           |
| ----------- | ------------------------------------- |
| `i`         | The item index. ~~int~~               |
| **RETURNS** | The span at the given index. ~~Span~~ |
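
Note from the implementation in `span_group.pyx` above: negative indices are
normalized the way `list` indices are, so the last span can also be fetched
with `-1`. A small illustrative sketch:

```python
group = doc.spans["errors"]
assert group[-1].text == group[len(group) - 1].text  # negative indices wrap around
```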

## SpanGroup.append {#append tag="method"}

Add a [`Span`](/api/span) object to the group. The span must refer to the same
[`Doc`](/api/doc) object as the span group.

> #### Example
>
> ```python
> doc = nlp("Their goi ng home")
> doc.spans["errors"] = [doc[0:1]]
> doc.spans["errors"].append(doc[1:3])
> assert len(doc.spans["errors"]) == 2
> ```

| Name   | Description                  |
| ------ | ---------------------------- |
| `span` | The span to append. ~~Span~~ |

## SpanGroup.extend {#extend tag="method"}

Add multiple [`Span`](/api/span) objects to the group. All spans must refer to
the same [`Doc`](/api/doc) object as the span group.

> #### Example
>
> ```python
> doc = nlp("Their goi ng home")
> doc.spans["errors"] = []
> doc.spans["errors"].extend([doc[1:3], doc[0:1]])
> assert len(doc.spans["errors"]) == 2
> ```

| Name    | Description                          |
| ------- | ------------------------------------ |
| `spans` | The spans to add. ~~Iterable[Span]~~ |

## SpanGroup.to_bytes {#to_bytes tag="method"}

Serialize the span group to a bytestring.

> #### Example
>
> ```python
> doc = nlp("Their goi ng home")
> doc.spans["errors"] = [doc[0:1], doc[1:3]]
> group_bytes = doc.spans["errors"].to_bytes()
> ```

| Name        | Description                           |
| ----------- | ------------------------------------- |
| **RETURNS** | The serialized `SpanGroup`. ~~bytes~~ |
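
The bytestring is self-contained, so one way to persist a group (a sketch, not
an official helper; the file name is arbitrary) is to write it straight to
disk:

```python
from pathlib import Path

Path("span_group.bin").write_bytes(doc.spans["errors"].to_bytes())
```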

## SpanGroup.from_bytes {#from_bytes tag="method"}

Load the span group from a bytestring. Modifies the object in place and returns
it.

> #### Example
>
> ```python
> from spacy.tokens import SpanGroup
>
> doc = nlp("Their goi ng home")
> doc.spans["errors"] = [doc[0:1], doc[1:3]]
> group_bytes = doc.spans["errors"].to_bytes()
> new_group = SpanGroup(doc)
> new_group.from_bytes(group_bytes)
> ```

| Name         | Description                           |
| ------------ | ------------------------------------- |
| `bytes_data` | The data to load from. ~~bytes~~      |
| **RETURNS**  | The `SpanGroup` object. ~~SpanGroup~~ |
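
Because `from_bytes` returns the group itself (note the `return self` in the
implementation above), construction and loading can also be chained in one
line:

```python
new_group = SpanGroup(doc).from_bytes(group_bytes)
```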

@ -18,15 +18,16 @@ It also orchestrates training and serialization.

### Container objects {#architecture-containers}

| Name                        | Description                                                                                                                                              |
| --------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`Doc`](/api/doc)           | A container for accessing linguistic annotations.                                                                                                        |
| [`DocBin`](/api/docbin)     | A collection of `Doc` objects for efficient binary serialization. Also used for [training data](/api/data-formats#binary-training).                      |
| [`Example`](/api/example)   | A collection of training annotations, containing two `Doc` objects: the reference data and the predictions.                                              |
| [`Language`](/api/language) | Processing class that turns text into `Doc` objects. Different languages implement their own subclasses of it. The variable is typically called `nlp`.   |
| [`Lexeme`](/api/lexeme)     | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc.  |
| [`Span`](/api/span)         | A slice from a `Doc` object.                                                                                                                             |
| [`Token`](/api/token)       | An individual token — i.e. a word, punctuation symbol, whitespace, etc.                                                                                  |
| Name                          | Description                                                                                                                                              |
| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`Doc`](/api/doc)             | A container for accessing linguistic annotations.                                                                                                        |
| [`DocBin`](/api/docbin)       | A collection of `Doc` objects for efficient binary serialization. Also used for [training data](/api/data-formats#binary-training).                      |
| [`Example`](/api/example)     | A collection of training annotations, containing two `Doc` objects: the reference data and the predictions.                                              |
| [`Language`](/api/language)   | Processing class that turns text into `Doc` objects. Different languages implement their own subclasses of it. The variable is typically called `nlp`.   |
| [`Lexeme`](/api/lexeme)       | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc.  |
| [`Span`](/api/span)           | A slice from a `Doc` object.                                                                                                                             |
| [`SpanGroup`](/api/spangroup) | A named collection of spans belonging to a `Doc`.                                                                                                        |
| [`Token`](/api/token)         | An individual token — i.e. a word, punctuation symbol, whitespace, etc.                                                                                  |

### Processing pipeline {#architecture-pipeline}


@ -501,7 +501,7 @@ format for documenting argument and return types.
[`AttributeRuler`](/api/attributeruler),
[`SentenceRecognizer`](/api/sentencerecognizer),
[`DependencyMatcher`](/api/dependencymatcher), [`TrainablePipe`](/api/pipe),
[`Corpus`](/api/corpus)
[`Corpus`](/api/corpus), [`SpanGroup`](/api/spangroup)

</Infobox>


@ -77,6 +77,7 @@
{ "text": "Language", "url": "/api/language" },
{ "text": "Lexeme", "url": "/api/lexeme" },
{ "text": "Span", "url": "/api/span" },
{ "text": "SpanGroup", "url": "/api/spangroup" },
{ "text": "Token", "url": "/api/token" }
]
},


@ -2,6 +2,7 @@
"Doc": "/api/doc",
"Token": "/api/token",
"Span": "/api/span",
"SpanGroup": "/api/spangroup",
"Lexeme": "/api/lexeme",
"Example": "/api/example",
"Alignment": "/api/example#alignment-object",