spaCy/spacy/tokens/graph.pyx
Madeesh Kannan 446a3ecf34
StringStore refactoring (#11344)
* `strings`: Remove unused `hash32_utf8` function

* `strings`: Make `hash_utf8` and `decode_Utf8Str` private

* `strings`: Reorganize private functions

* 'strings': Raise error when non-string/-int types are passed to functions that don't accept them

* `strings`: Add `items()` method, add type hints, remove unused methods, restrict inputs to specific types, reorganize methods

* `Morphology`: Use `StringStore.items()` to enumerate features when pickling

* `test_stringstore`: Update pre-Python 3 tests

* Update `StringStore` docs

* Fix `get_string_id` imports

* Replace redundant test with tests for type checking

* Rename `_retrieve_interned_str`, remove `.get` default arg

* Add `get_string_id` to `strings.pyi`
Remove `mypy` ignore directives from imports of the above

* `strings.pyi`: Replace functions that consume `Union`-typed params with overloads

* `strings.pyi`: Revert some function signatures

* Update `SYMBOLS_BY_INT` lookups and error codes post-merge

* Revert clobbered change introduced in a previous merge

* Remove unnecessary type hint

* Invert tuple order in `StringStore.items()`

* Add test for `StringStore.items()`

* Revert "`Morphology`: Use `StringStore.items()` to enumerate features when pickling"

This reverts commit 1af9510ceb.

* Rename `keys` and `key_map`

* Add `keys()` and `values()`

* Add comment about the inverted key-value semantics in the API

* Fix type hints

* Implement `keys()`, `values()`, `items()` without generators

* Fix type hints, remove unnecessary boxing

* Update docs

* Simplify `keys/values/items()` impl

* `mypy` fix

* Fix error message, doc fixes
2022-10-06 10:51:06 +02:00

708 lines
23 KiB
Cython

# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
from typing import List, Tuple, Generator
from libc.stdint cimport int32_t, int64_t
from libcpp.pair cimport pair
from libcpp.unordered_map cimport unordered_map
from libcpp.unordered_set cimport unordered_set
from cython.operator cimport dereference
cimport cython
import weakref
from preshed.maps cimport map_get_unless_missing
from murmurhash.mrmr cimport hash64
from .. import Errors
from ..typedefs cimport hash_t
from ..strings cimport get_string_id
from ..structs cimport EdgeC, GraphC
from .token import Token
@cython.freelist(8)
cdef class Edge:
cdef readonly Graph graph
cdef readonly int i
def __init__(self, Graph graph, int i):
self.graph = graph
self.i = i
@property
def is_none(self) -> bool:
return False
@property
def doc(self) -> "Doc":
return self.graph.doc
@property
def head(self) -> "Node":
return Node(self.graph, self.graph.c.edges[self.i].head)
@property
def tail(self) -> "Tail":
return Node(self.graph, self.graph.c.edges[self.i].tail)
@property
def label(self) -> int:
return self.graph.c.edges[self.i].label
@property
def weight(self) -> float:
return self.graph.c.weights[self.i]
@property
def label_(self) -> str:
return self.doc.vocab.strings[self.label]
@cython.freelist(8)
cdef class Node:
cdef readonly Graph graph
cdef readonly int i
def __init__(self, Graph graph, int i):
"""A reference to a node of an annotation graph. Each node is made up of
an ordered set of zero or more token indices.
Node references are usually created by the Graph object itself, or from
the Node or Edge objects. You usually won't need to instantiate this
class yourself.
"""
cdef int length = graph.c.nodes.size()
if i >= length or -i >= length:
raise IndexError(Errors.E1034.format(i=i, length=length))
if i < 0:
i += length
self.graph = graph
self.i = i
def __eq__(self, other):
if self.graph is not other.graph:
return False
else:
return self.i == other.i
def __iter__(self) -> Generator[int]:
for i in self.graph.c.nodes[self.i]:
yield i
def __getitem__(self, int i) -> int:
"""Get a token index from the node's set of tokens."""
length = self.graph.c.nodes[self.i].size()
if i >= length or -i >= length:
raise IndexError(Errors.E1035.format(i=i, length=length))
if i < 0:
i += length
return self.graph.c.nodes[self.i][i]
def __len__(self) -> int:
"""The number of tokens that make up the node."""
return self.graph.c.nodes[self.i].size()
@property
def is_none(self) -> bool:
"""Whether the node is a special value, indicating 'none'.
The NoneNode type is returned by the Graph, Edge and Node objects when
there is no match to a query. It has the same API as Node, but it always
returns NoneNode, NoneEdge or empty lists for its queries.
"""
return False
@property
def doc(self) -> "Doc":
"""The Doc object that the graph refers to."""
return self.graph.doc
@property
def tokens(self) -> Tuple[Token]:
"""A tuple of Token objects that make up the node."""
doc = self.doc
return tuple([doc[i] for i in self])
def head(self, i=None, label=None) -> "Node":
"""Get the head of the first matching edge, searching by index, label,
both or neither.
For instance, `node.head(i=1)` will get the head of the second edge that
this node is a tail of. `node.head(i=1, label="ARG0")` will further
check that the second edge has the label `"ARG0"`.
If no matching node can be found, the graph's NoneNode is returned.
"""
return self.headed(i=i, label=label)
def tail(self, i=None, label=None) -> "Node":
"""Get the tail of the first matching edge, searching by index, label,
both or neither.
If no matching node can be found, the graph's NoneNode is returned.
"""
return self.tailed(i=i, label=label).tail
def sibling(self, i=None, label=None):
"""Get the first matching sibling node. Two nodes are siblings if they
are both tails of the same head.
If no matching node can be found, the graph's NoneNode is returned.
"""
if i is None:
siblings = self.siblings(label=label)
return siblings[0] if siblings else NoneNode(self)
else:
edges = []
for h in self.headed():
edges.extend([e for e in h.tailed() if e.tail.i != self.i])
if i >= len(edges):
return NoneNode(self)
elif label is not None and edges[i].label != label:
return NoneNode(self)
else:
return edges[i].tail
def heads(self, label=None) -> List["Node"]:
"""Find all matching heads of this node."""
cdef vector[int] edge_indices
self._find_edges(edge_indices, "head", label)
return [Node(self.graph, self.graph.c.edges[i].head) for i in edge_indices]
def tails(self, label=None) -> List["Node"]:
"""Find all matching tails of this node."""
cdef vector[int] edge_indices
self._find_edges(edge_indices, "tail", label)
return [Node(self.graph, self.graph.c.edges[i].tail) for i in edge_indices]
def siblings(self, label=None) -> List["Node"]:
"""Find all maching siblings of this node. Two nodes are siblings if they
are tails of the same head.
"""
edges = []
for h in self.headed():
edges.extend([e for e in h.tailed() if e.tail.i != self.i])
if label is None:
return [e.tail for e in edges]
else:
return [e.tail for e in edges if e.label == label]
def headed(self, i=None, label=None) -> Edge:
"""Find the first matching edge headed by this node.
If no matching edge can be found, the graph's NoneEdge is returned.
"""
start, end = self._find_range(i, self.c.n_head[self.i])
idx = self._find_edge("head", start, end, label)
if idx == -1:
return NoneEdge(self.graph)
else:
return Edge(self.graph, idx)
def tailed(self, i=None, label=None) -> Edge:
"""Find the first matching edge tailed by this node.
If no matching edge can be found, the graph's NoneEdge is returned.
"""
start, end = self._find_range(i, self.c.n_tail[self.i])
idx = self._find_edge("tail", start, end, label)
if idx == -1:
return NoneEdge(self.graph)
else:
return Edge(self.graph, idx)
def headeds(self, label=None) -> List[Edge]:
"""Find all matching edges headed by this node."""
cdef vector[int] edge_indices
self._find_edges(edge_indices, "head", label)
return [Edge(self.graph, i) for i in edge_indices]
def taileds(self, label=None) -> List["Edge"]:
"""Find all matching edges headed by this node."""
cdef vector[int] edge_indices
self._find_edges(edge_indices, "tail", label)
return [Edge(self.graph, i) for i in edge_indices]
def walk_heads(self):
cdef vector[int] node_indices
walk_head_nodes(node_indices, &self.graph.c, self.i)
for i in node_indices:
yield Node(self.graph, i)
def walk_tails(self):
cdef vector[int] node_indices
walk_tail_nodes(node_indices, &self.graph.c, self.i)
for i in node_indices:
yield Node(self.graph, i)
cdef (int, int) _get_range(self, i, n):
if i is None:
return (0, n)
elif i < n:
return (i, i+1)
else:
return (0, 0)
cdef int _find_edge(self, str direction, int start, int end, label) except -2:
if direction == "head":
get_edges = get_head_edges
else:
get_edges = get_tail_edges
cdef vector[int] edge_indices
get_edges(edge_indices, &self.graph.c, self.i)
if label is None:
return edge_indices[start]
for edge_index in edge_indices[start:end]:
if self.graph.c.edges[edge_index].label == label:
return edge_index
else:
return -1
cdef int _find_edges(self, vector[int]& edge_indices, str direction, label):
if direction == "head":
get_edges = get_head_edges
else:
get_edges = get_tail_edges
if label is None:
get_edges(edge_indices, &self.graph.c, self.i)
return edge_indices.size()
cdef vector[int] unfiltered
get_edges(unfiltered, &self.graph.c, self.i)
for edge_index in unfiltered:
if self.graph.c.edges[edge_index].label == label:
edge_indices.push_back(edge_index)
return edge_indices.size()
cdef class NoneEdge(Edge):
"""An Edge subclass, representing a non-result. The NoneEdge has the same
API as other Edge instances, but always returns NoneEdge, NoneNode, or empty
lists.
"""
def __init__(self, graph):
self.graph = graph
self.i = -1
@property
def doc(self) -> "Doc":
return self.graph.doc
@property
def head(self) -> "NoneNode":
return NoneNode(self.graph)
@property
def tail(self) -> "NoneNode":
return NoneNode(self.graph)
@property
def label(self) -> int:
return 0
@property
def weight(self) -> float:
return 0.0
@property
def label_(self) -> str:
return ""
cdef class NoneNode(Node):
def __init__(self, graph):
self.graph = graph
self.i = -1
def __getitem__(self, int i):
raise IndexError(Errors.E1036)
def __len__(self):
return 0
@property
def is_none(self):
return -1
@property
def doc(self):
return self.graph.doc
@property
def tokens(self):
return tuple()
def head(self, i=None, label=None):
return self
def tail(self, i=None, label=None):
return self
def walk_heads(self):
yield from []
def walk_tails(self):
yield from []
cdef class Graph:
"""A set of directed labelled relationships between sets of tokens.
EXAMPLE:
Construction 1
>>> graph = Graph(doc, name="srl")
Construction 2
>>> graph = Graph(
doc,
name="srl",
nodes=[(0,), (1, 3), (,)],
edges=[(0, 2), (2, 1)]
)
Construction 3
>>> graph = Graph(
doc,
name="srl",
nodes=[(0,), (1, 3), (,)],
edges=[(2, 0), (0, 1)],
labels=["word sense ID 1675", "agent"],
weights=[-42.6, -1.7]
)
>>> assert graph.has_node((0,))
>>> assert graph.has_edge((0,), (1,3), label="agent")
"""
def __init__(self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None):
"""Create a Graph object.
doc (Doc): The Doc object the graph will refer to.
name (str): A string name to help identify the graph. Defaults to "".
nodes (List[Tuple[int]]): A list of token-index tuples to add to the graph
as nodes. Defaults to [].
edges (List[Tuple[int, int]]): A list of edges between the provided nodes.
Each edge should be a (head, tail) tuple, where `head` and `tail`
are integers pointing into the `nodes` list. Defaults to [].
labels (Optional[List[str]]): A list of labels for the provided edges.
If None, all of the edges specified by the edges argument will have
be labelled with the empty string (""). If `labels` is not `None`,
it must have the same length as the `edges` argument.
weights (Optional[List[float]]): A list of weights for the provided edges.
If None, all of the edges specified by the edges argument will
have the weight 0.0. If `weights` is not `None`, it must have the
same length as the `edges` argument.
"""
if weights is not None:
assert len(weights) == len(edges)
else:
weights = [0.0] * len(edges)
if labels is not None:
assert len(labels) == len(edges)
else:
labels = [""] * len(edges)
self.c.node_map = new unordered_map[hash_t, int]()
self.c.edge_map = new unordered_map[hash_t, int]()
self.c.roots = new unordered_set[int]()
self.name = name
self.doc_ref = weakref.ref(doc)
for node in nodes:
self.add_node(node)
for (head, tail), label, weight in zip(edges, labels, weights):
self.add_edge(
Node(self, head),
Node(self, tail),
label=label,
weight=weight
)
def __dealloc__(self):
del self.c.node_map
del self.c.edge_map
del self.c.roots
@property
def doc(self) -> "Doc":
"""The Doc object the graph refers to."""
return self.doc_ref()
@property
def edges(self) -> Generator[Edge]:
"""Iterate over the edges in the graph."""
for i in range(self.c.edges.size()):
yield Edge(self, i)
@property
def nodes(self) -> Generator[Node]:
"""Iterate over the nodes in the graph."""
for i in range(self.c.nodes.size()):
yield Node(self, i)
def add_edge(self, head, tail, *, label="", weight=None) -> Edge:
"""Add an edge to the graph, connecting two groups of tokens.
If there is already an edge for the (head, tail, label) triple, it will
be returned, and no new edge will be created. The weight of the edge
will be updated if a weight is specified.
"""
label_hash = self.doc.vocab.strings.as_int(label)
weight_float = weight if weight is not None else 0.0
edge_index = add_edge(
&self.c,
EdgeC(
head=self.add_node(head).i,
tail=self.add_node(tail).i,
label=self.doc.vocab.strings.as_int(label),
),
weight=weight if weight is not None else 0.0
)
return Edge(self, edge_index)
def get_edge(self, head, tail, *, label="") -> Edge:
"""Look up an edge in the graph. If the graph has no matching edge,
the NoneEdge object is returned.
"""
head_node = self.get_node(head)
if head_node.is_none:
return NoneEdge(self)
tail_node = self.get_node(tail)
if tail_node.is_none:
return NoneEdge(self)
edge_index = get_edge(
&self.c,
EdgeC(head=head_node.i, tail=tail_node.i, label=get_string_id(label))
)
if edge_index < 0:
return NoneEdge(self)
else:
return Edge(self, edge_index)
def has_edge(self, head, tail, label) -> bool:
"""Check whether a (head, tail, label) triple is an edge in the graph."""
return not self.get_edge(head, tail, label=label).is_none
def add_node(self, indices) -> Node:
"""Add a node to the graph and return it. Nodes refer to ordered sets
of token indices.
This method is idempotent: if there is already a node for the given
indices, it is returned without a new node being created.
"""
if isinstance(indices, Node):
return indices
cdef vector[int32_t] node
node.reserve(len(indices))
for idx in indices:
node.push_back(idx)
i = add_node(&self.c, node)
return Node(self, i)
def get_node(self, indices) -> Node:
"""Get a node from the graph, or the NoneNode if there is no node for
the given indices.
"""
if isinstance(indices, Node):
return indices
cdef vector[int32_t] node
node.reserve(len(indices))
for idx in indices:
node.push_back(idx)
node_index = get_node(&self.c, node)
if node_index < 0:
return NoneNode(self)
else:
return Node(self, node_index)
def has_node(self, tuple indices) -> bool:
"""Check whether the graph has a node for the given indices."""
return not self.get_node(indices).is_none
cdef int add_edge(GraphC* graph, EdgeC edge, float weight) nogil:
key = hash64(&edge, sizeof(edge), 0)
it = graph.edge_map.find(key)
if it != graph.edge_map.end():
edge_index = dereference(it).second
graph.weights[edge_index] = weight
return edge_index
else:
edge_index = graph.edges.size()
graph.edge_map.insert(pair[hash_t, int](key, edge_index))
graph.edges.push_back(edge)
if graph.n_tails[edge.head] == 0:
graph.first_tail[edge.head] = edge_index
if graph.n_heads[edge.tail] == 0:
graph.first_head[edge.tail] = edge_index
graph.n_tails[edge.head] += 1
graph.n_heads[edge.tail] += 1
graph.weights.push_back(weight)
# If we had the tail marked as a root, remove it.
tail_root_index = graph.roots.find(edge.tail)
if tail_root_index != graph.roots.end():
graph.roots.erase(tail_root_index)
return edge_index
cdef int get_edge(const GraphC* graph, EdgeC edge) nogil:
key = hash64(&edge, sizeof(edge), 0)
it = graph.edge_map.find(key)
if it == graph.edge_map.end():
return -1
else:
return dereference(it).second
cdef int has_edge(const GraphC* graph, EdgeC edge) nogil:
return get_edge(graph, edge) >= 0
cdef int add_node(GraphC* graph, vector[int32_t]& node) nogil:
key = hash64(&node[0], node.size() * sizeof(node[0]), 0)
it = graph.node_map.find(key)
if it != graph.node_map.end():
# Item found. Convert the iterator to an index value.
return dereference(it).second
else:
index = graph.nodes.size()
graph.nodes.push_back(node)
graph.n_heads.push_back(0)
graph.n_tails.push_back(0)
graph.first_head.push_back(0)
graph.first_tail.push_back(0)
graph.roots.insert(index)
graph.node_map.insert(pair[hash_t, int](key, index))
return index
cdef int get_node(const GraphC* graph, vector[int32_t] node) nogil:
key = hash64(&node[0], node.size() * sizeof(node[0]), 0)
it = graph.node_map.find(key)
if it == graph.node_map.end():
return -1
else:
return dereference(it).second
cdef int has_node(const GraphC* graph, vector[int32_t] node) nogil:
return get_node(graph, node) >= 0
cdef int get_head_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
todo = graph.n_heads[node]
if todo == 0:
return 0
output.reserve(output.size() + todo)
start = graph.first_head[node]
end = graph.edges.size()
for i in range(start, end):
if todo <= 0:
break
elif graph.edges[i].tail == node:
output.push_back(graph.edges[i].head)
todo -= 1
return todo
cdef int get_tail_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
todo = graph.n_tails[node]
if todo == 0:
return 0
output.reserve(output.size() + todo)
start = graph.first_tail[node]
end = graph.edges.size()
for i in range(start, end):
if todo <= 0:
break
elif graph.edges[i].head == node:
output.push_back(graph.edges[i].tail)
todo -= 1
return todo
cdef int get_sibling_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
cdef vector[int] heads
cdef vector[int] tails
get_head_nodes(heads, graph, node)
for i in range(heads.size()):
get_tail_nodes(tails, graph, heads[i])
for j in range(tails.size()):
if tails[j] != node:
output.push_back(tails[j])
tails.clear()
return output.size()
cdef int get_head_edges(vector[int]& output, const GraphC* graph, int node) nogil:
todo = graph.n_heads[node]
if todo == 0:
return 0
output.reserve(output.size() + todo)
start = graph.first_head[node]
end = graph.edges.size()
for i in range(start, end):
if todo <= 0:
break
elif graph.edges[i].tail == node:
output.push_back(i)
todo -= 1
return todo
cdef int get_tail_edges(vector[int]& output, const GraphC* graph, int node) nogil:
todo = graph.n_tails[node]
if todo == 0:
return 0
output.reserve(output.size() + todo)
start = graph.first_tail[node]
end = graph.edges.size()
for i in range(start, end):
if todo <= 0:
break
elif graph.edges[i].head == node:
output.push_back(i)
todo -= 1
return todo
cdef int walk_head_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
cdef unordered_set[int] seen = unordered_set[int]()
get_head_nodes(output, graph, node)
seen.insert(node)
i = 0
while i < output.size():
if seen.find(output[i]) == seen.end():
seen.insert(output[i])
get_head_nodes(output, graph, output[i])
i += 1
return i
cdef int walk_tail_nodes(vector[int]& output, const GraphC* graph, int node) nogil:
cdef unordered_set[int] seen = unordered_set[int]()
get_tail_nodes(output, graph, node)
seen.insert(node)
i = 0
while i < output.size():
if seen.find(output[i]) == seen.end():
seen.insert(output[i])
get_tail_nodes(output, graph, output[i])
i += 1
return i
cdef int walk_head_edges(vector[int]& output, const GraphC* graph, int node) nogil:
cdef unordered_set[int] seen = unordered_set[int]()
get_head_edges(output, graph, node)
seen.insert(node)
i = 0
while i < output.size():
if seen.find(output[i]) == seen.end():
seen.insert(output[i])
get_head_edges(output, graph, output[i])
i += 1
return i
cdef int walk_tail_edges(vector[int]& output, const GraphC* graph, int node) nogil:
cdef unordered_set[int] seen = unordered_set[int]()
get_tail_edges(output, graph, node)
seen.insert(node)
i = 0
while i < output.size():
if seen.find(output[i]) == seen.end():
seen.insert(output[i])
get_tail_edges(output, graph, output[i])
i += 1
return i