mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Officially support DependencyMatcher
Add official support for the `DependencyMatcher`. Redesign the pattern specification. Fix and extend operator implementations. Update API docs and add usage docs. Patterns -------- Refactor pattern structure to: ``` { "LEFT_ID": str, "REL_OP": str, "RIGHT_ID": str, "RIGHT_ATTRS": dict, } ``` The first node contains only `RIGHT_ID` and `RIGHT_ATTRS` and all subsequent nodes contain all four keys. New operators ------------- Because of the way patterns are constructed from left to right, it's helpful to have `follows` operators along with `precedes` operators. Add operators for simple precedes / follows alongside immediate precedes / follows. * `.*`: precedes * `;`: immediately follows * `;*`: follows Operator fixes -------------- * `<` and `<<` do not include the node itself * Fix reversed order for all operators involving linear precedence (`.`, all sibling operators) * Linear precedence operators do not match nodes outside the same parse Additional fixes ---------------- * Use v3 Matcher API * Support `get` and `remove` * Support pickling
This commit is contained in:
parent
6bfb1b3a29
commit
960d9cfadc
|
@ -284,12 +284,12 @@ class Errors:
|
|||
"Span objects, or dicts if set to manual=True.")
|
||||
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
|
||||
"phrase pattern (string) but got:\n{pattern}")
|
||||
E098 = ("Invalid pattern specified: expected both SPEC and PATTERN.")
|
||||
E099 = ("First node of pattern should be a root node. The root should "
|
||||
"only contain NODE_NAME.")
|
||||
E100 = ("Nodes apart from the root should contain NODE_NAME, NBOR_NAME and "
|
||||
"NBOR_RELOP.")
|
||||
E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
|
||||
E098 = ("Invalid pattern: expected both RIGHT_ID and RIGHT_ATTRS.")
|
||||
E099 = ("Invalid pattern: the first node of pattern should be an anchor "
|
||||
"node. The node should only contain RIGHT_ID and RIGHT_ATTRS.")
|
||||
E100 = ("Nodes other than the anchor node should all contain LEFT_ID, "
|
||||
"REL_OP and RIGHT_ID.")
|
||||
E101 = ("RIGHT_ID should be a new node and LEFT_ID should already have "
|
||||
"have been declared in previous edges.")
|
||||
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
|
||||
"tokens to merge. If you want to find the longest non-overlapping "
|
||||
|
@ -652,6 +652,9 @@ class Errors:
|
|||
"'{chunk}'. Tokenizer exceptions are only allowed to specify "
|
||||
"`ORTH` and `NORM`.")
|
||||
E1006 = ("Unable to initialize {name} model with 0 labels.")
|
||||
E1007 = ("Unsupported DependencyMatcher operator '{op}'.")
|
||||
E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check "
|
||||
"that you are providing a list of patterns as `List[List[dict]]`.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -1,16 +1,16 @@
|
|||
# cython: infer_types=True, profile=True
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
from libcpp cimport bool
|
||||
from typing import List
|
||||
|
||||
import numpy
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
|
||||
from .matcher cimport Matcher
|
||||
from ..vocab cimport Vocab
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
from .matcher import unpickle_matcher
|
||||
from ..errors import Errors
|
||||
from ..tokens import Span
|
||||
|
||||
|
||||
DELIMITER = "||"
|
||||
|
@ -22,36 +22,52 @@ cdef class DependencyMatcher:
|
|||
"""Match dependency parse tree based on pattern rules."""
|
||||
cdef Pool mem
|
||||
cdef readonly Vocab vocab
|
||||
cdef readonly Matcher token_matcher
|
||||
cdef readonly Matcher matcher
|
||||
cdef public object _patterns
|
||||
cdef public object _raw_patterns
|
||||
cdef public object _keys_to_token
|
||||
cdef public object _root
|
||||
cdef public object _entities
|
||||
cdef public object _callbacks
|
||||
cdef public object _nodes
|
||||
cdef public object _tree
|
||||
cdef public object _ops
|
||||
|
||||
def __init__(self, vocab):
|
||||
def __init__(self, vocab, *, validate=False):
|
||||
"""Create the DependencyMatcher.
|
||||
|
||||
vocab (Vocab): The vocabulary object, which must be shared with the
|
||||
documents the matcher will operate on.
|
||||
validate (bool): Whether patterns should be validated, passed to
|
||||
Matcher as `validate`
|
||||
"""
|
||||
size = 20
|
||||
# TODO: make matcher work with validation
|
||||
self.token_matcher = Matcher(vocab, validate=False)
|
||||
self.matcher = Matcher(vocab, validate=validate)
|
||||
self._keys_to_token = {}
|
||||
self._patterns = {}
|
||||
self._raw_patterns = {}
|
||||
self._root = {}
|
||||
self._nodes = {}
|
||||
self._tree = {}
|
||||
self._entities = {}
|
||||
self._callbacks = {}
|
||||
self.vocab = vocab
|
||||
self.mem = Pool()
|
||||
self._ops = {
|
||||
"<": self.dep,
|
||||
">": self.gov,
|
||||
"<<": self.dep_chain,
|
||||
">>": self.gov_chain,
|
||||
".": self.imm_precede,
|
||||
".*": self.precede,
|
||||
";": self.imm_follow,
|
||||
";*": self.follow,
|
||||
"$+": self.imm_right_sib,
|
||||
"$-": self.imm_left_sib,
|
||||
"$++": self.right_sib,
|
||||
"$--": self.left_sib,
|
||||
}
|
||||
|
||||
def __reduce__(self):
|
||||
data = (self.vocab, self._patterns,self._tree, self._callbacks)
|
||||
data = (self.vocab, self._raw_patterns, self._callbacks)
|
||||
return (unpickle_matcher, data, None, None)
|
||||
|
||||
def __len__(self):
|
||||
|
@ -74,54 +90,61 @@ cdef class DependencyMatcher:
|
|||
idx = 0
|
||||
visited_nodes = {}
|
||||
for relation in pattern:
|
||||
if "PATTERN" not in relation or "SPEC" not in relation:
|
||||
if not isinstance(relation, dict):
|
||||
raise ValueError(Errors.E1008)
|
||||
if "RIGHT_ATTRS" not in relation and "RIGHT_ID" not in relation:
|
||||
raise ValueError(Errors.E098.format(key=key))
|
||||
if idx == 0:
|
||||
if not(
|
||||
"NODE_NAME" in relation["SPEC"]
|
||||
and "NBOR_RELOP" not in relation["SPEC"]
|
||||
and "NBOR_NAME" not in relation["SPEC"]
|
||||
"RIGHT_ID" in relation
|
||||
and "REL_OP" not in relation
|
||||
and "LEFT_ID" not in relation
|
||||
):
|
||||
raise ValueError(Errors.E099.format(key=key))
|
||||
visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
|
||||
visited_nodes[relation["RIGHT_ID"]] = True
|
||||
else:
|
||||
if not(
|
||||
"NODE_NAME" in relation["SPEC"]
|
||||
and "NBOR_RELOP" in relation["SPEC"]
|
||||
and "NBOR_NAME" in relation["SPEC"]
|
||||
"RIGHT_ID" in relation
|
||||
and "RIGHT_ATTRS" in relation
|
||||
and "REL_OP" in relation
|
||||
and "LEFT_ID" in relation
|
||||
):
|
||||
raise ValueError(Errors.E100.format(key=key))
|
||||
if (
|
||||
relation["SPEC"]["NODE_NAME"] in visited_nodes
|
||||
or relation["SPEC"]["NBOR_NAME"] not in visited_nodes
|
||||
relation["RIGHT_ID"] in visited_nodes
|
||||
or relation["LEFT_ID"] not in visited_nodes
|
||||
):
|
||||
raise ValueError(Errors.E101.format(key=key))
|
||||
visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
|
||||
visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True
|
||||
if relation["REL_OP"] not in self._ops:
|
||||
raise ValueError(Errors.E1007.format(op=relation["REL_OP"]))
|
||||
visited_nodes[relation["RIGHT_ID"]] = True
|
||||
visited_nodes[relation["LEFT_ID"]] = True
|
||||
idx = idx + 1
|
||||
|
||||
def add(self, key, patterns, *_patterns, on_match=None):
|
||||
def add(self, key, patterns, *, on_match=None):
|
||||
"""Add a new matcher rule to the matcher.
|
||||
|
||||
key (str): The match ID.
|
||||
patterns (list): The patterns to add for the given key.
|
||||
on_match (callable): Optional callback executed on match.
|
||||
"""
|
||||
if patterns is None or hasattr(patterns, "__call__"): # old API
|
||||
on_match = patterns
|
||||
patterns = _patterns
|
||||
if on_match is not None and not hasattr(on_match, "__call__"):
|
||||
raise ValueError(Errors.E171.format(arg_type=type(on_match)))
|
||||
if patterns is None or not isinstance(patterns, List): # old API
|
||||
raise ValueError(Errors.E948.format(arg_type=type(patterns)))
|
||||
for pattern in patterns:
|
||||
if len(pattern) == 0:
|
||||
raise ValueError(Errors.E012.format(key=key))
|
||||
self.validate_input(pattern,key)
|
||||
self.validate_input(pattern, key)
|
||||
key = self._normalize_key(key)
|
||||
self._raw_patterns.setdefault(key, [])
|
||||
self._raw_patterns[key].extend(patterns)
|
||||
_patterns = []
|
||||
for pattern in patterns:
|
||||
token_patterns = []
|
||||
for i in range(len(pattern)):
|
||||
token_pattern = [pattern[i]["PATTERN"]]
|
||||
token_pattern = [pattern[i]["RIGHT_ATTRS"]]
|
||||
token_patterns.append(token_pattern)
|
||||
# self.patterns.append(token_patterns)
|
||||
_patterns.append(token_patterns)
|
||||
self._patterns.setdefault(key, [])
|
||||
self._callbacks[key] = on_match
|
||||
|
@ -135,7 +158,7 @@ cdef class DependencyMatcher:
|
|||
# TODO: Better ways to hash edges in pattern?
|
||||
for j in range(len(_patterns[i])):
|
||||
k = self._normalize_key(unicode(key) + DELIMITER + unicode(i) + DELIMITER + unicode(j))
|
||||
self.token_matcher.add(k, [_patterns[i][j]])
|
||||
self.matcher.add(k, [_patterns[i][j]])
|
||||
_keys_to_token[k] = j
|
||||
_keys_to_token_list.append(_keys_to_token)
|
||||
self._keys_to_token.setdefault(key, [])
|
||||
|
@ -144,14 +167,14 @@ cdef class DependencyMatcher:
|
|||
for pattern in patterns:
|
||||
nodes = {}
|
||||
for i in range(len(pattern)):
|
||||
nodes[pattern[i]["SPEC"]["NODE_NAME"]] = i
|
||||
nodes[pattern[i]["RIGHT_ID"]] = i
|
||||
_nodes_list.append(nodes)
|
||||
self._nodes.setdefault(key, [])
|
||||
self._nodes[key].extend(_nodes_list)
|
||||
# Create an object tree to traverse later on. This data structure
|
||||
# enables easy tree pattern match. Doc-Token based tree cannot be
|
||||
# reused since it is memory-heavy and tightly coupled with the Doc.
|
||||
self.retrieve_tree(patterns, _nodes_list,key)
|
||||
self.retrieve_tree(patterns, _nodes_list, key)
|
||||
|
||||
def retrieve_tree(self, patterns, _nodes_list, key):
|
||||
_heads_list = []
|
||||
|
@ -161,13 +184,13 @@ cdef class DependencyMatcher:
|
|||
root = -1
|
||||
for j in range(len(patterns[i])):
|
||||
token_pattern = patterns[i][j]
|
||||
if ("NBOR_RELOP" not in token_pattern["SPEC"]):
|
||||
if ("REL_OP" not in token_pattern):
|
||||
heads[j] = ('root', j)
|
||||
root = j
|
||||
else:
|
||||
heads[j] = (
|
||||
token_pattern["SPEC"]["NBOR_RELOP"],
|
||||
_nodes_list[i][token_pattern["SPEC"]["NBOR_NAME"]]
|
||||
token_pattern["REL_OP"],
|
||||
_nodes_list[i][token_pattern["LEFT_ID"]]
|
||||
)
|
||||
_heads_list.append(heads)
|
||||
_root_list.append(root)
|
||||
|
@ -202,11 +225,21 @@ cdef class DependencyMatcher:
|
|||
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
|
||||
"""
|
||||
key = self._normalize_key(key)
|
||||
if key not in self._patterns:
|
||||
if key not in self._raw_patterns:
|
||||
return default
|
||||
return (self._callbacks[key], self._patterns[key])
|
||||
return (self._callbacks[key], self._raw_patterns[key])
|
||||
|
||||
def __call__(self, Doc doc):
|
||||
def remove(self, key):
|
||||
key = self._normalize_key(key)
|
||||
if not key in self._patterns:
|
||||
raise ValueError(Errors.E175.format(key=key))
|
||||
self._patterns.pop(key)
|
||||
self._raw_patterns.pop(key)
|
||||
self._nodes.pop(key)
|
||||
self._tree.pop(key)
|
||||
self._root.pop(key)
|
||||
|
||||
def __call__(self, object doclike):
|
||||
"""Find all token sequences matching the supplied pattern.
|
||||
|
||||
doclike (Doc or Span): The document to match over.
|
||||
|
@ -214,8 +247,14 @@ cdef class DependencyMatcher:
|
|||
describing the matches. A match tuple describes a span
|
||||
`doc[start:end]`. The `label_id` and `key` are both integers.
|
||||
"""
|
||||
if isinstance(doclike, Doc):
|
||||
doc = doclike
|
||||
elif isinstance(doclike, Span):
|
||||
doc = doclike.as_doc()
|
||||
else:
|
||||
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
|
||||
matched_key_trees = []
|
||||
matches = self.token_matcher(doc)
|
||||
matches = self.matcher(doc)
|
||||
for key in list(self._patterns.keys()):
|
||||
_patterns_list = self._patterns[key]
|
||||
_keys_to_token_list = self._keys_to_token[key]
|
||||
|
@ -244,26 +283,26 @@ cdef class DependencyMatcher:
|
|||
length = len(_nodes)
|
||||
|
||||
matched_trees = []
|
||||
self.recurse(_tree,id_to_position,_node_operator_map,0,[],matched_trees)
|
||||
matched_key_trees.append((key,matched_trees))
|
||||
|
||||
for i, (ent_id, nodes) in enumerate(matched_key_trees):
|
||||
on_match = self._callbacks.get(ent_id)
|
||||
self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees)
|
||||
for matched_tree in matched_trees:
|
||||
matched_key_trees.append((key, matched_tree))
|
||||
for i, (match_id, nodes) in enumerate(matched_key_trees):
|
||||
on_match = self._callbacks.get(match_id)
|
||||
if on_match is not None:
|
||||
on_match(self, doc, i, matched_key_trees)
|
||||
return matched_key_trees
|
||||
|
||||
def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visited_nodes,matched_trees):
|
||||
cdef bool isValid;
|
||||
if(patternLength == len(id_to_position.keys())):
|
||||
def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees):
|
||||
cdef bint isValid;
|
||||
if patternLength == len(id_to_position.keys()):
|
||||
isValid = True
|
||||
for node in range(patternLength):
|
||||
if(node in tree):
|
||||
if node in tree:
|
||||
for idx, (relop,nbor) in enumerate(tree[node]):
|
||||
computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop])
|
||||
isNbor = False
|
||||
for computed_nbor in computed_nbors:
|
||||
if(computed_nbor.i == visited_nodes[nbor]):
|
||||
if computed_nbor.i == visited_nodes[nbor]:
|
||||
isNbor = True
|
||||
isValid = isValid & isNbor
|
||||
if(isValid):
|
||||
|
@ -271,14 +310,14 @@ cdef class DependencyMatcher:
|
|||
return
|
||||
allPatternNodes = numpy.asarray(id_to_position[patternLength])
|
||||
for patternNode in allPatternNodes:
|
||||
self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visited_nodes+[patternNode],matched_trees)
|
||||
self.recurse(tree, id_to_position, _node_operator_map, patternLength+1, visited_nodes+[patternNode], matched_trees)
|
||||
|
||||
# Given a node and an edge operator, to return the list of nodes
|
||||
# from the doc that belong to node+operator. This is used to store
|
||||
# all the results beforehand to prevent unnecessary computation while
|
||||
# pattern matching
|
||||
# _node_operator_map[node][operator] = [...]
|
||||
def get_node_operator_map(self,doc,tree,id_to_position,nodes,root):
|
||||
def get_node_operator_map(self, doc, tree, id_to_position, nodes, root):
|
||||
_node_operator_map = {}
|
||||
all_node_indices = nodes.values()
|
||||
all_operators = []
|
||||
|
@ -295,24 +334,14 @@ cdef class DependencyMatcher:
|
|||
_node_operator_map[node] = {}
|
||||
for operator in all_operators:
|
||||
_node_operator_map[node][operator] = []
|
||||
# Used to invoke methods for each operator
|
||||
switcher = {
|
||||
"<": self.dep,
|
||||
">": self.gov,
|
||||
"<<": self.dep_chain,
|
||||
">>": self.gov_chain,
|
||||
".": self.imm_precede,
|
||||
"$+": self.imm_right_sib,
|
||||
"$-": self.imm_left_sib,
|
||||
"$++": self.right_sib,
|
||||
"$--": self.left_sib
|
||||
}
|
||||
for operator in all_operators:
|
||||
for node in all_nodes:
|
||||
_node_operator_map[node][operator] = switcher.get(operator)(doc,node)
|
||||
_node_operator_map[node][operator] = self._ops.get(operator)(doc, node)
|
||||
return _node_operator_map
|
||||
|
||||
def dep(self, doc, node):
|
||||
if doc[node].head == doc[node]:
|
||||
return []
|
||||
return [doc[node].head]
|
||||
|
||||
def gov(self,doc,node):
|
||||
|
@ -322,36 +351,51 @@ cdef class DependencyMatcher:
|
|||
return list(doc[node].ancestors)
|
||||
|
||||
def gov_chain(self, doc, node):
|
||||
return list(doc[node].subtree)
|
||||
return [t for t in doc[node].subtree if t != doc[node]]
|
||||
|
||||
def imm_precede(self, doc, node):
|
||||
if node > 0:
|
||||
sent = self._get_sent(doc[node])
|
||||
if node < len(doc) - 1 and doc[node + 1] in sent:
|
||||
return [doc[node + 1]]
|
||||
return []
|
||||
|
||||
def precede(self, doc, node):
|
||||
sent = self._get_sent(doc[node])
|
||||
return [doc[i] for i in range(node + 1, sent.end)]
|
||||
|
||||
def imm_follow(self, doc, node):
|
||||
sent = self._get_sent(doc[node])
|
||||
if node > 0 and doc[node - 1] in sent:
|
||||
return [doc[node - 1]]
|
||||
return []
|
||||
|
||||
def follow(self, doc, node):
|
||||
sent = self._get_sent(doc[node])
|
||||
return [doc[i] for i in range(sent.start, node)]
|
||||
|
||||
def imm_right_sib(self, doc, node):
|
||||
for child in list(doc[node].head.children):
|
||||
if child.i == node - 1:
|
||||
if child.i == node + 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def imm_left_sib(self, doc, node):
|
||||
for child in list(doc[node].head.children):
|
||||
if child.i == node + 1:
|
||||
if child.i == node - 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def right_sib(self, doc, node):
|
||||
candidate_children = []
|
||||
for child in list(doc[node].head.children):
|
||||
if child.i < node:
|
||||
if child.i > node:
|
||||
candidate_children.append(doc[child.i])
|
||||
return candidate_children
|
||||
|
||||
def left_sib(self, doc, node):
|
||||
candidate_children = []
|
||||
for child in list(doc[node].head.children):
|
||||
if child.i > node:
|
||||
if child.i < node:
|
||||
candidate_children.append(doc[child.i])
|
||||
return candidate_children
|
||||
|
||||
|
@ -360,3 +404,15 @@ cdef class DependencyMatcher:
|
|||
return self.vocab.strings.add(key)
|
||||
else:
|
||||
return key
|
||||
|
||||
def _get_sent(self, token):
|
||||
root = (list(token.ancestors) or [token])[-1]
|
||||
return token.doc[root.left_edge.i:root.right_edge.i + 1]
|
||||
|
||||
|
||||
def unpickle_matcher(vocab, patterns, callbacks):
|
||||
matcher = DependencyMatcher(vocab)
|
||||
for key, pattern in patterns.items():
|
||||
callback = callbacks.get(key, None)
|
||||
matcher.add(key, pattern, on_match=callback)
|
||||
return matcher
|
||||
|
|
386
spacy/tests/matcher/test_dependency_matcher.py
Normal file
386
spacy/tests/matcher/test_dependency_matcher.py
Normal file
|
@ -0,0 +1,386 @@
|
|||
import pytest
|
||||
import pickle
|
||||
import re
|
||||
import copy
|
||||
from mock import Mock
|
||||
from spacy.matcher import DependencyMatcher
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def doc(en_vocab):
|
||||
text = "The quick brown fox jumped over the lazy fox"
|
||||
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
||||
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"]
|
||||
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
|
||||
return doc
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def patterns(en_vocab):
|
||||
def is_brown_yellow(text):
|
||||
return bool(re.compile(r"brown|yellow").match(text))
|
||||
|
||||
IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)
|
||||
|
||||
pattern1 = [
|
||||
{"RIGHT_ID": "fox", "RIGHT_ATTRS": {"ORTH": "fox"}},
|
||||
{
|
||||
"LEFT_ID": "fox",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "q",
|
||||
"RIGHT_ATTRS": {"ORTH": "quick", "DEP": "amod"},
|
||||
},
|
||||
{
|
||||
"LEFT_ID": "fox",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "r",
|
||||
"RIGHT_ATTRS": {IS_BROWN_YELLOW: True},
|
||||
},
|
||||
]
|
||||
|
||||
pattern2 = [
|
||||
{"RIGHT_ID": "jumped", "RIGHT_ATTRS": {"ORTH": "jumped"}},
|
||||
{
|
||||
"LEFT_ID": "jumped",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "fox1",
|
||||
"RIGHT_ATTRS": {"ORTH": "fox"},
|
||||
},
|
||||
{
|
||||
"LEFT_ID": "jumped",
|
||||
"REL_OP": ".",
|
||||
"RIGHT_ID": "over",
|
||||
"RIGHT_ATTRS": {"ORTH": "over"},
|
||||
},
|
||||
]
|
||||
|
||||
pattern3 = [
|
||||
{"RIGHT_ID": "jumped", "RIGHT_ATTRS": {"ORTH": "jumped"}},
|
||||
{
|
||||
"LEFT_ID": "jumped",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "fox",
|
||||
"RIGHT_ATTRS": {"ORTH": "fox"},
|
||||
},
|
||||
{
|
||||
"LEFT_ID": "fox",
|
||||
"REL_OP": ">>",
|
||||
"RIGHT_ID": "r",
|
||||
"RIGHT_ATTRS": {"ORTH": "brown"},
|
||||
},
|
||||
]
|
||||
|
||||
pattern4 = [
|
||||
{"RIGHT_ID": "jumped", "RIGHT_ATTRS": {"ORTH": "jumped"}},
|
||||
{
|
||||
"LEFT_ID": "jumped",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "fox",
|
||||
"RIGHT_ATTRS": {"ORTH": "fox"},
|
||||
}
|
||||
]
|
||||
|
||||
pattern5 = [
|
||||
{"RIGHT_ID": "jumped", "RIGHT_ATTRS": {"ORTH": "jumped"}},
|
||||
{
|
||||
"LEFT_ID": "jumped",
|
||||
"REL_OP": ">>",
|
||||
"RIGHT_ID": "fox",
|
||||
"RIGHT_ATTRS": {"ORTH": "fox"},
|
||||
},
|
||||
]
|
||||
|
||||
return [pattern1, pattern2, pattern3, pattern4, pattern5]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dependency_matcher(en_vocab, patterns, doc):
|
||||
matcher = DependencyMatcher(en_vocab)
|
||||
mock = Mock()
|
||||
for i in range(1, len(patterns) + 1):
|
||||
if i == 1:
|
||||
matcher.add("pattern1", [patterns[0]], on_match=mock)
|
||||
else:
|
||||
matcher.add("pattern" + str(i), [patterns[i - 1]])
|
||||
|
||||
return matcher
|
||||
|
||||
|
||||
def test_dependency_matcher(dependency_matcher, doc, patterns):
|
||||
assert len(dependency_matcher) == 5
|
||||
assert "pattern3" in dependency_matcher
|
||||
assert dependency_matcher.get("pattern3") == (None, [patterns[2]])
|
||||
matches = dependency_matcher(doc)
|
||||
assert len(matches) == 6
|
||||
assert matches[0][1] == [3, 1, 2]
|
||||
assert matches[1][1] == [4, 3, 5]
|
||||
assert matches[2][1] == [4, 3, 2]
|
||||
assert matches[3][1] == [4, 3]
|
||||
assert matches[4][1] == [4, 3]
|
||||
assert matches[5][1] == [4, 8]
|
||||
|
||||
span = doc[0:6]
|
||||
matches = dependency_matcher(span)
|
||||
assert len(matches) == 5
|
||||
assert matches[0][1] == [3, 1, 2]
|
||||
assert matches[1][1] == [4, 3, 5]
|
||||
assert matches[2][1] == [4, 3, 2]
|
||||
assert matches[3][1] == [4, 3]
|
||||
assert matches[4][1] == [4, 3]
|
||||
|
||||
|
||||
def test_dependency_matcher_pickle(en_vocab, patterns, doc):
|
||||
matcher = DependencyMatcher(en_vocab)
|
||||
for i in range(1, len(patterns) + 1):
|
||||
matcher.add("pattern" + str(i), [patterns[i - 1]])
|
||||
|
||||
matches = matcher(doc)
|
||||
assert matches[0][1] == [3, 1, 2]
|
||||
assert matches[1][1] == [4, 3, 5]
|
||||
assert matches[2][1] == [4, 3, 2]
|
||||
assert matches[3][1] == [4, 3]
|
||||
assert matches[4][1] == [4, 3]
|
||||
assert matches[5][1] == [4, 8]
|
||||
|
||||
b = pickle.dumps(matcher)
|
||||
matcher_r = pickle.loads(b)
|
||||
|
||||
assert len(matcher) == len(matcher_r)
|
||||
matches = matcher_r(doc)
|
||||
assert matches[0][1] == [3, 1, 2]
|
||||
assert matches[1][1] == [4, 3, 5]
|
||||
assert matches[2][1] == [4, 3, 2]
|
||||
assert matches[3][1] == [4, 3]
|
||||
assert matches[4][1] == [4, 3]
|
||||
assert matches[5][1] == [4, 8]
|
||||
|
||||
|
||||
def test_dependency_matcher_pattern_validation(en_vocab):
|
||||
pattern = [
|
||||
{"RIGHT_ID": "fox", "RIGHT_ATTRS": {"ORTH": "fox"}},
|
||||
{
|
||||
"LEFT_ID": "fox",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "q",
|
||||
"RIGHT_ATTRS": {"ORTH": "quick", "DEP": "amod"},
|
||||
},
|
||||
{
|
||||
"LEFT_ID": "fox",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "r",
|
||||
"RIGHT_ATTRS": {"ORTH": "brown"},
|
||||
},
|
||||
]
|
||||
|
||||
matcher = DependencyMatcher(en_vocab)
|
||||
# original pattern is valid
|
||||
matcher.add("FOUNDED", [pattern])
|
||||
# individual pattern not wrapped in a list
|
||||
with pytest.raises(ValueError):
|
||||
matcher.add("FOUNDED", pattern)
|
||||
# no anchor node
|
||||
with pytest.raises(ValueError):
|
||||
matcher.add("FOUNDED", [pattern[1:]])
|
||||
# required keys missing
|
||||
with pytest.raises(ValueError):
|
||||
pattern2 = copy.deepcopy(pattern)
|
||||
del pattern2[0]["RIGHT_ID"]
|
||||
matcher.add("FOUNDED", [pattern2])
|
||||
with pytest.raises(ValueError):
|
||||
pattern2 = copy.deepcopy(pattern)
|
||||
del pattern2[1]["RIGHT_ID"]
|
||||
matcher.add("FOUNDED", [pattern2])
|
||||
with pytest.raises(ValueError):
|
||||
pattern2 = copy.deepcopy(pattern)
|
||||
del pattern2[1]["RIGHT_ATTRS"]
|
||||
matcher.add("FOUNDED", [pattern2])
|
||||
with pytest.raises(ValueError):
|
||||
pattern2 = copy.deepcopy(pattern)
|
||||
del pattern2[1]["LEFT_ID"]
|
||||
matcher.add("FOUNDED", [pattern2])
|
||||
with pytest.raises(ValueError):
|
||||
pattern2 = copy.deepcopy(pattern)
|
||||
del pattern2[1]["REL_OP"]
|
||||
matcher.add("FOUNDED", [pattern2])
|
||||
# invalid operator
|
||||
with pytest.raises(ValueError):
|
||||
pattern2 = copy.deepcopy(pattern)
|
||||
pattern2[1]["REL_OP"] = "!!!"
|
||||
matcher.add("FOUNDED", [pattern2])
|
||||
# duplicate node name
|
||||
with pytest.raises(ValueError):
|
||||
pattern2 = copy.deepcopy(pattern)
|
||||
pattern2[1]["RIGHT_ID"] = "fox"
|
||||
matcher.add("FOUNDED", [pattern2])
|
||||
|
||||
|
||||
def test_dependency_matcher_callback(en_vocab, doc):
|
||||
pattern = [
|
||||
{"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "quick"}},
|
||||
]
|
||||
|
||||
matcher = DependencyMatcher(en_vocab)
|
||||
mock = Mock()
|
||||
matcher.add("pattern", [pattern], on_match=mock)
|
||||
matches = matcher(doc)
|
||||
mock.assert_called_once_with(matcher, doc, 0, matches)
|
||||
|
||||
# check that matches with and without callback are the same (#4590)
|
||||
matcher2 = DependencyMatcher(en_vocab)
|
||||
matcher2.add("pattern", [pattern])
|
||||
matches2 = matcher2(doc)
|
||||
assert matches == matches2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20),]
|
||||
)
|
||||
def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
|
||||
# two sentences to test that all matches are within the same sentence
|
||||
doc = get_doc(
|
||||
en_vocab,
|
||||
words=["a", "b", "c", "d", "e"] * 2,
|
||||
heads=[0, -1, -2, -3, -4] * 2,
|
||||
deps=["dep"] * 10,
|
||||
)
|
||||
match_count = 0
|
||||
for text in ["a", "b", "c", "d", "e"]:
|
||||
pattern = [
|
||||
{"RIGHT_ID": "1", "RIGHT_ATTRS": {"ORTH": text}},
|
||||
{"LEFT_ID": "1", "REL_OP": op, "RIGHT_ID": "2", "RIGHT_ATTRS": {},},
|
||||
]
|
||||
matcher = DependencyMatcher(en_vocab)
|
||||
matcher.add("A", [pattern])
|
||||
matches = matcher(doc)
|
||||
match_count += len(matches)
|
||||
for match in matches:
|
||||
match_id, token_ids = match
|
||||
# token_ids[0] op token_ids[1]
|
||||
if op == ".":
|
||||
assert token_ids[0] == token_ids[1] - 1
|
||||
elif op == ";":
|
||||
assert token_ids[0] == token_ids[1] + 1
|
||||
elif op == ".*":
|
||||
assert token_ids[0] < token_ids[1]
|
||||
elif op == ";*":
|
||||
assert token_ids[0] > token_ids[1]
|
||||
# all tokens are within the same sentence
|
||||
assert doc[token_ids[0]].sent == doc[token_ids[1]].sent
|
||||
assert match_count == num_matches
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"left,right,op,num_matches",
|
||||
[
|
||||
("fox", "jumped", "<", 1),
|
||||
("the", "lazy", "<", 0),
|
||||
("jumped", "jumped", "<", 0),
|
||||
("fox", "jumped", ">", 0),
|
||||
("fox", "lazy", ">", 1),
|
||||
("lazy", "lazy", ">", 0),
|
||||
("fox", "jumped", "<<", 2),
|
||||
("jumped", "fox", "<<", 0),
|
||||
("the", "fox", "<<", 2),
|
||||
("fox", "jumped", ">>", 0),
|
||||
("over", "the", ">>", 1),
|
||||
("fox", "the", ">>", 2),
|
||||
("fox", "jumped", ".", 1),
|
||||
("lazy", "fox", ".", 1),
|
||||
("the", "fox", ".", 0),
|
||||
("the", "the", ".", 0),
|
||||
("fox", "jumped", ";", 0),
|
||||
("lazy", "fox", ";", 0),
|
||||
("the", "fox", ";", 0),
|
||||
("the", "the", ";", 0),
|
||||
("quick", "fox", ".*", 2),
|
||||
("the", "fox", ".*", 3),
|
||||
("the", "the", ".*", 1),
|
||||
("fox", "jumped", ";*", 1),
|
||||
("quick", "fox", ";*", 0),
|
||||
("the", "fox", ";*", 1),
|
||||
("the", "the", ";*", 1),
|
||||
("quick", "brown", "$+", 1),
|
||||
("brown", "quick", "$+", 0),
|
||||
("brown", "brown", "$+", 0),
|
||||
("quick", "brown", "$-", 0),
|
||||
("brown", "quick", "$-", 1),
|
||||
("brown", "brown", "$-", 0),
|
||||
("the", "brown", "$++", 1),
|
||||
("brown", "the", "$++", 0),
|
||||
("brown", "brown", "$++", 0),
|
||||
("the", "brown", "$--", 0),
|
||||
("brown", "the", "$--", 1),
|
||||
("brown", "brown", "$--", 0),
|
||||
],
|
||||
)
|
||||
def test_dependency_matcher_ops(en_vocab, doc, left, right, op, num_matches):
|
||||
right_id = right
|
||||
if left == right:
|
||||
right_id = right + "2"
|
||||
pattern = [
|
||||
{"RIGHT_ID": left, "RIGHT_ATTRS": {"LOWER": left}},
|
||||
{
|
||||
"LEFT_ID": left,
|
||||
"REL_OP": op,
|
||||
"RIGHT_ID": right_id,
|
||||
"RIGHT_ATTRS": {"LOWER": right},
|
||||
},
|
||||
]
|
||||
|
||||
matcher = DependencyMatcher(en_vocab)
|
||||
matcher.add("pattern", [pattern])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == num_matches
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"pattern",
|
||||
[
|
||||
# empty
|
||||
[],
|
||||
# unsupported op
|
||||
[
|
||||
{"RIGHT_ID": "jumped", "RIGHT_ATTRS": {"ORTH": "jumped"}},
|
||||
{
|
||||
"LEFT_ID": "jumped",
|
||||
"REL_OP": "==",
|
||||
"RIGHT_ID": "fox",
|
||||
"RIGHT_ATTRS": {"ORTH": "fox"},
|
||||
},
|
||||
],
|
||||
# first dict isn't just a node
|
||||
[
|
||||
{
|
||||
"LEFT_ID": "jumped",
|
||||
"REL_OP": "==",
|
||||
"RIGHT_ID": "fox",
|
||||
"RIGHT_ATTRS": {"ORTH": "fox"},
|
||||
},
|
||||
],
|
||||
# missing op
|
||||
[
|
||||
{"RIGHT_ID": "jumped", "RIGHT_ATTRS": {"ORTH": "jumped"}},
|
||||
{"LEFT_ID": "jumped", "RIGHT_ID": "fox", "RIGHT_ATTRS": {"ORTH": "fox"},},
|
||||
],
|
||||
# missing left-hand ID
|
||||
[
|
||||
{"RIGHT_ID": "jumped", "RIGHT_ATTRS": {"ORTH": "jumped"}},
|
||||
{"REL_OP": ">", "RIGHT_ID": "fox", "RIGHT_ATTRS": {"ORTH": "fox"},},
|
||||
],
|
||||
# missing right-hand ID
|
||||
[
|
||||
{"RIGHT_ID": "jumped", "RIGHT_ATTRS": {"ORTH": "jumped"}},
|
||||
{"LEFT_ID": "jumped", "REL_OP": ">", "RIGHT_ATTRS": {"ORTH": "fox"},},
|
||||
],
|
||||
# missing right-hand attrs
|
||||
[
|
||||
{"RIGHT_ID": "jumped", "RIGHT_ATTRS": {"ORTH": "jumped"}},
|
||||
{"LEFT_ID": "jumped", "REL_OP": ">", "RIGHT_ID": "fox",},
|
||||
],
|
||||
],
|
||||
)
|
||||
def test_dependency_matcher_pattern_validation(en_vocab, pattern):
|
||||
matcher = DependencyMatcher(en_vocab)
|
||||
with pytest.raises(ValueError):
|
||||
matcher.add("pattern", [pattern])
|
|
@ -1,7 +1,6 @@
|
|||
import pytest
|
||||
import re
|
||||
from mock import Mock
|
||||
from spacy.matcher import Matcher, DependencyMatcher
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.tokens import Doc, Token, Span
|
||||
|
||||
from ..doc.test_underscore import clean_underscore # noqa: F401
|
||||
|
@ -292,84 +291,6 @@ def test_matcher_extension_set_membership(en_vocab):
|
|||
assert len(matches) == 0
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def text():
|
||||
return "The quick brown fox jumped over the lazy fox"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def heads():
|
||||
return [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def deps():
|
||||
return ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dependency_matcher(en_vocab):
|
||||
def is_brown_yellow(text):
|
||||
return bool(re.compile(r"brown|yellow|over").match(text))
|
||||
|
||||
IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)
|
||||
|
||||
pattern1 = [
|
||||
{"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
|
||||
"PATTERN": {"ORTH": "quick", "DEP": "amod"},
|
||||
},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
|
||||
"PATTERN": {IS_BROWN_YELLOW: True},
|
||||
},
|
||||
]
|
||||
|
||||
pattern2 = [
|
||||
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
||||
"PATTERN": {"ORTH": "fox"},
|
||||
},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
|
||||
"PATTERN": {"ORTH": "fox"},
|
||||
},
|
||||
]
|
||||
|
||||
pattern3 = [
|
||||
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
||||
"PATTERN": {"ORTH": "fox"},
|
||||
},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"},
|
||||
"PATTERN": {"ORTH": "brown"},
|
||||
},
|
||||
]
|
||||
|
||||
matcher = DependencyMatcher(en_vocab)
|
||||
matcher.add("pattern1", [pattern1])
|
||||
matcher.add("pattern2", [pattern2])
|
||||
matcher.add("pattern3", [pattern3])
|
||||
|
||||
return matcher
|
||||
|
||||
|
||||
def test_dependency_matcher_compile(dependency_matcher):
|
||||
assert len(dependency_matcher) == 3
|
||||
|
||||
|
||||
# def test_dependency_matcher(dependency_matcher, text, heads, deps):
|
||||
# doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
|
||||
# matches = dependency_matcher(doc)
|
||||
# assert matches[0][1] == [[3, 1, 2]]
|
||||
# assert matches[1][1] == [[4, 3, 3]]
|
||||
# assert matches[2][1] == [[4, 3, 2]]
|
||||
|
||||
|
||||
def test_matcher_basic_check(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
# Potential mistake: pass in pattern instead of list of patterns
|
||||
|
|
|
@ -38,32 +38,6 @@ def test_gold_misaligned(en_tokenizer, text, words):
|
|||
Example.from_dict(doc, {"words": words})
|
||||
|
||||
|
||||
def test_issue4590(en_vocab):
|
||||
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
|
||||
pattern = [
|
||||
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
||||
"PATTERN": {"ORTH": "fox"},
|
||||
},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
|
||||
"PATTERN": {"ORTH": "fox"},
|
||||
},
|
||||
]
|
||||
|
||||
on_match = Mock()
|
||||
matcher = DependencyMatcher(en_vocab)
|
||||
matcher.add("pattern", on_match, pattern)
|
||||
text = "The quick brown fox jumped over the lazy fox"
|
||||
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
||||
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
|
||||
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
|
||||
matches = matcher(doc)
|
||||
on_match_args = on_match.call_args
|
||||
assert on_match_args[0][3] == matches
|
||||
|
||||
|
||||
def test_issue4651_with_phrase_matcher_attr():
|
||||
"""Test that the EntityRuler PhraseMatcher is deserialized correctly using
|
||||
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
||||
|
|
|
@ -1,65 +1,135 @@
|
|||
---
|
||||
title: DependencyMatcher
|
||||
teaser: Match sequences of tokens, based on the dependency parse
|
||||
teaser: Match subtrees within a dependency parse
|
||||
tag: class
|
||||
new: 3
|
||||
source: spacy/matcher/dependencymatcher.pyx
|
||||
---
|
||||
|
||||
The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher)
|
||||
and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees
|
||||
using the
|
||||
[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
|
||||
using
|
||||
[Semgrex operators](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
|
||||
It requires a pretrained [`DependencyParser`](/api/parser) or other component
|
||||
that sets the `Token.dep` attribute.
|
||||
that sets the `Token.dep` and `Token.head` attributes.
|
||||
|
||||
## Pattern format {#patterns}
|
||||
|
||||
> ```json
|
||||
> ```python
|
||||
> ### Example
|
||||
> # pattern: "[subject] ... initially founded"
|
||||
> [
|
||||
> # anchor token: founded
|
||||
> {
|
||||
> "SPEC": {"NODE_NAME": "founded"},
|
||||
> "PATTERN": {"ORTH": "founded"}
|
||||
> "RIGHT_ID": "founded",
|
||||
> "RIGHT_ATTRS": {"ORTH": "founded"}
|
||||
> },
|
||||
> # founded -> subject
|
||||
> {
|
||||
> "SPEC": {
|
||||
> "NODE_NAME": "founder",
|
||||
> "NBOR_RELOP": ">",
|
||||
> "NBOR_NAME": "founded"
|
||||
> },
|
||||
> "PATTERN": {"DEP": "nsubj"}
|
||||
> "LEFT_ID": "founded",
|
||||
> "REL_OP": ">",
|
||||
> "RIGHT_ID": "subject",
|
||||
> "RIGHT_ATTRS": {"DEP": "nsubj"}
|
||||
> },
|
||||
> # "founded" follows "initially"
|
||||
> {
|
||||
> "SPEC": {
|
||||
> "NODE_NAME": "object",
|
||||
> "NBOR_RELOP": ">",
|
||||
> "NBOR_NAME": "founded"
|
||||
> },
|
||||
> "PATTERN": {"DEP": "dobj"}
|
||||
> "LEFT_ID": "founded",
|
||||
> "REL_OP": ";",
|
||||
> "RIGHT_ID": "initially",
|
||||
> "RIGHT_ATTRS": {"ORTH": "initially"}
|
||||
> }
|
||||
> ]
|
||||
> ```
|
||||
|
||||
A pattern added to the `DependencyMatcher` consists of a list of dictionaries,
|
||||
with each dictionary describing a node to match. Each pattern should have the
|
||||
following top-level keys:
|
||||
with each dictionary describing a token to match. Except for the first
|
||||
dictionary, which defines an anchor token using only `RIGHT_ID` and
|
||||
`RIGHT_ATTRS`, each pattern should have the following keys:
|
||||
|
||||
| Name | Description |
|
||||
| --------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `PATTERN` | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). ~~Dict[str, Any]~~ |
|
||||
| `SPEC` | The relationships of the nodes in the subtree that should be matched. ~~Dict[str, str]~~ |
|
||||
| Name | Description |
|
||||
| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `LEFT_ID` | The name of the left-hand node in the relation, which has been defined in an earlier node. |
|
||||
| `REL_OP` | An operator that describes how the two nodes are related. ~~str~~ |
|
||||
| `RIGHT_ID` | A unique name for the right-hand node in the relation. ~~str~~ |
|
||||
| `RIGHT_ATTRS` | The token attributes to match for the right-hand node in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). ~~Dict[str, Any]~~ |
|
||||
|
||||
The `SPEC` includes the following fields:
|
||||
The first pattern defines an anchor token and each additional token added to the
|
||||
pattern is linked to an existing token `LEFT_ID` by the relation `REL_OP` and is
|
||||
described by the name `RIGHT_ID` and the attributes `RIGHT_ATTRS`.
|
||||
|
||||
| Name | Description |
|
||||
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `NODE_NAME` | A unique name for this node to refer to it in other specs. ~~str~~ |
|
||||
| `NBOR_RELOP` | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. ~~str~~ |
|
||||
| `NBOR_NAME` | The unique name of the node that this node is connected to. ~~str~~ |
|
||||
Let's say we want to find sentences describing who founded what kind of company:
|
||||
|
||||
- `Smith founded a healthcare company in 2005.`
|
||||
- `Williams initially founded an insurance company in 1987.`
|
||||
- `Lee, an established CEO, founded yet another AI startup.`
|
||||
|
||||
Since it's the root of the dependency parse, `founded` is a good choice for the
|
||||
anchor token in our pattern:
|
||||
|
||||
```python
|
||||
pattern = [
|
||||
{"RIGHT_ID": "anchor_founded", "RIGHT_ATTRS": {"ORTH": "founded"}}
|
||||
]
|
||||
```
|
||||
|
||||
We can add the subject as the token with the dependency label `nsubj` that is a
|
||||
direct child `>` of the anchor token named `anchor_founded`:
|
||||
|
||||
```python
|
||||
pattern = [
|
||||
{"RIGHT_ID": "anchor_founded", "RIGHT_ATTRS": {"ORTH": "founded"}},
|
||||
{
|
||||
"LEFT_ID": "anchor_founded",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "subject",
|
||||
"RIGHT_ATTRS": {"DEP": "nsubj"},
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
And the direct object along with its modifier:
|
||||
|
||||
```python
|
||||
pattern = [ ...
|
||||
{
|
||||
"LEFT_ID": "anchor_founded",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "founded_object",
|
||||
"RIGHT_ATTRS": {"DEP": "dobj"},
|
||||
},
|
||||
{
|
||||
"LEFT_ID": "founded_object",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "founded_object_modifier",
|
||||
"RIGHT_ATTRS": {"DEP": {"IN": ["amod", "compound"]}},
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### Operators
|
||||
|
||||
The following operators are supported by the `DependencyMatcher`, most of which
|
||||
come directly from
|
||||
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
||||
|
||||
| Symbol | Description |
|
||||
| --------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B` |
|
||||
| `A > B` | `A` is the immediate head of `B` |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep->head paths |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head->dep paths |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_ |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_ |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_ |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1` |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1` |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i` |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i` |
|
||||
|
||||
## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Create a rule-based `DependencyMatcher`.
|
||||
Create a `DependencyMatcher`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -68,13 +138,15 @@ Create a rule-based `DependencyMatcher`.
|
|||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------- | ----------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ |
|
||||
| _keyword-only_ | |
|
||||
| `validate` | Validate all patterns added to this matcher. ~~bool~~ |
|
||||
|
||||
## DependencyMatcher.\_\call\_\_ {#call tag="method"}
|
||||
|
||||
Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
|
||||
Find all tokens matching the supplied patterns on the `Doc` or `Span`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -82,36 +154,32 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
|
|||
> from spacy.matcher import DependencyMatcher
|
||||
>
|
||||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> pattern = [
|
||||
> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
|
||||
> {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
|
||||
> ]
|
||||
> matcher.add("Founder", [pattern])
|
||||
> pattern = [{"RIGHT_ID": "founded_id",
|
||||
> "RIGHT_ATTRS": {"ORTH": "founded"}}]
|
||||
> matcher.add("FOUNDED", [pattern])
|
||||
> doc = nlp("Bill Gates founded Microsoft.")
|
||||
> matches = matcher(doc)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
|
||||
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
|
||||
| **RETURNS** | A list of `(match_id, token_ids)` tuples, describing the matches. The `match_id` is the ID of the match pattern and `token_ids` is a list of token indices matched by the pattern, where the position of each token in the list corresponds to the position of the node specification in the pattern. ~~List[Tuple[int, List[int]]]~~ |
|
||||
|
||||
## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
Get the number of rules (edges) added to the dependency matcher. Note that this
|
||||
only returns the number of rules (identical with the number of IDs), not the
|
||||
number of individual patterns.
|
||||
Get the number of rules added to the dependency matcher. Note that this only
|
||||
returns the number of rules (identical with the number of IDs), not the number
|
||||
of individual patterns.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> assert len(matcher) == 0
|
||||
> pattern = [
|
||||
> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
|
||||
> {"SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
|
||||
> ]
|
||||
> matcher.add("Rule", [pattern])
|
||||
> pattern = [{"RIGHT_ID": "founded_id",
|
||||
> "RIGHT_ATTRS": {"ORTH": "founded"}}]
|
||||
> matcher.add("FOUNDED", [pattern])
|
||||
> assert len(matcher) == 1
|
||||
> ```
|
||||
|
||||
|
@ -126,10 +194,10 @@ Check whether the matcher contains rules for a match ID.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> matcher = Matcher(nlp.vocab)
|
||||
> assert "Rule" not in matcher
|
||||
> matcher.add("Rule", [pattern])
|
||||
> assert "Rule" in matcher
|
||||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> assert "FOUNDED" not in matcher
|
||||
> matcher.add("FOUNDED", [pattern])
|
||||
> assert "FOUNDED" in matcher
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
|
@ -152,33 +220,15 @@ will be overwritten.
|
|||
> print('Matched!', matches)
|
||||
>
|
||||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> matcher.add("TEST_PATTERNS", patterns)
|
||||
> matcher.add("FOUNDED", patterns, on_match=on_match)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | An ID for the thing you're matching. ~~str~~ |
|
||||
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a `"PATTERN"` and `"SPEC"`. ~~List[List[Dict[str, dict]]]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[Matcher, Doc, int, List[tuple], Any]]~~ |
|
||||
|
||||
## DependencyMatcher.remove {#remove tag="method"}
|
||||
|
||||
Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
|
||||
exist.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> matcher.add("Rule", [pattern]])
|
||||
> assert "Rule" in matcher
|
||||
> matcher.remove("Rule")
|
||||
> assert "Rule" not in matcher
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----- | --------------------------------- |
|
||||
| `key` | The ID of the match rule. ~~str~~ |
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | An ID for the patterns. ~~str~~ |
|
||||
| `patterns` | A list of match patterns. A pattern consists of a list of dicts, where each dict describes a token in the tree. ~~List[List[Dict[str, Union[str, Dict]]]]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[DependencyMatcher, Doc, int, List[Tuple], Any]]~~ |
|
||||
|
||||
## DependencyMatcher.get {#get tag="method"}
|
||||
|
||||
|
@ -188,11 +238,29 @@ Retrieve the pattern stored for a key. Returns the rule as an
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> matcher.add("Rule", [pattern], on_match=on_match)
|
||||
> on_match, patterns = matcher.get("Rule")
|
||||
> matcher.add("FOUNDED", patterns, on_match=on_match)
|
||||
> on_match, patterns = matcher.get("FOUNDED")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------------------- |
|
||||
| `key` | The ID of the match rule. ~~str~~ |
|
||||
| **RETURNS** | The rule, as an `(on_match, patterns)` tuple. ~~Tuple[Optional[Callable], List[List[dict]]]~~ |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `key` | The ID of the match rule. ~~str~~ |
|
||||
| **RETURNS** | The rule, as an `(on_match, patterns)` tuple. ~~Tuple[Optional[Callable], List[List[Union[Dict, Tuple]]]]~~ |
|
||||
|
||||
## DependencyMatcher.remove {#remove tag="method"}
|
||||
|
||||
Remove a rule from the dependency matcher. A `KeyError` is raised if the match
|
||||
ID does not exist.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> matcher.add("FOUNDED", patterns)
|
||||
> assert "FOUNDED" in matcher
|
||||
> matcher.remove("FOUNDED")
|
||||
> assert "FOUNDED" not in matcher
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----- | --------------------------------- |
|
||||
| `key` | The ID of the match rule. ~~str~~ |
|
||||
|
|
64
website/docs/images/dep-match-diagram.svg
Normal file
64
website/docs/images/dep-match-diagram.svg
Normal file
|
@ -0,0 +1,64 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/PR-SVG-20010719/DTD/svg10.dtd">
|
||||
<svg width="40cm" height="9cm" viewBox="78 215 793 171" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
|
||||
<g id="Background">
|
||||
<g>
|
||||
<rect style="fill: #ffffff; fill-opacity: 1; stroke-opacity: 1; stroke-width: 2; stroke: #000000" x="79.225" y="263.65" width="169.55" height="54" rx="0" ry="0"/>
|
||||
<text font-size="12.7998" style="fill: #000000; fill-opacity: 1; stroke: none;text-anchor:middle;font-family:sans-serif;font-style:normal;font-weight:normal" x="164" y="286.55">
|
||||
<tspan x="164" y="286.55">ID: founded</tspan>
|
||||
<tspan x="164" y="302.55">ORTH: founded</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g>
|
||||
<rect style="fill: #ffffff; fill-opacity: 1; stroke-opacity: 1; stroke-width: 2; stroke: #000000" x="426.238" y="216" width="139.1" height="54" rx="0" ry="0"/>
|
||||
<text font-size="12.7998" style="fill: #000000; fill-opacity: 1; stroke: none;text-anchor:middle;font-family:sans-serif;font-style:normal;font-weight:normal" x="495.788" y="238.9">
|
||||
<tspan x="495.788" y="238.9">ID: subject</tspan>
|
||||
<tspan x="495.788" y="254.9">DEP: nsubj</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g>
|
||||
<rect style="fill: #ffffff; fill-opacity: 1; stroke-opacity: 1; stroke-width: 2; stroke: #000000" x="429.662" y="330.65" width="132.25" height="54" rx="0" ry="0"/>
|
||||
<text font-size="12.7998" style="fill: #000000; fill-opacity: 1; stroke: none;text-anchor:middle;font-family:sans-serif;font-style:normal;font-weight:normal" x="495.787" y="353.55">
|
||||
<tspan x="495.787" y="353.55">ID: object</tspan>
|
||||
<tspan x="495.787" y="369.55">DEP: dobj</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g>
|
||||
<line style="fill: none; stroke-opacity: 1; stroke-width: 2; stroke: #000000" x1="248.775" y1="290.65" x2="416.834" y2="245.525"/>
|
||||
<polygon style="fill: #000000; fill-opacity: 1; stroke-opacity: 1; stroke-width: 2; stroke: #000000" fill-rule="evenodd" points="424.078,243.58 415.717,251.002 416.834,245.525 413.123,241.344 "/>
|
||||
</g>
|
||||
<g>
|
||||
<line style="fill: none; stroke-opacity: 1; stroke-width: 2; stroke: #000000" x1="248.775" y1="290.65" x2="420.533" y2="354.268"/>
|
||||
<polygon style="fill: #000000; fill-opacity: 1; stroke-opacity: 1; stroke-width: 2; stroke: #000000" fill-rule="evenodd" points="427.566,356.873 416.452,358.089 420.533,354.268 419.925,348.711 "/>
|
||||
</g>
|
||||
<g>
|
||||
<ellipse style="fill: #f3e815; fill-opacity: 1; stroke-opacity: 1; stroke-width: 2; stroke: #000000" cx="342.393" cy="265.102" rx="20.1432" ry="22.8019"/>
|
||||
<text font-size="12.8" style="fill: #000000; fill-opacity: 1; stroke: none;text-anchor:middle;font-family:sans-serif;font-style:normal;font-weight:700" x="342.393" y="269.002">
|
||||
<tspan x="342.393" y="269.002">></tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g>
|
||||
<ellipse style="fill: #f3e815; fill-opacity: 1; stroke-opacity: 1; stroke-width: 2; stroke: #000000" cx="342.393" cy="326.65" rx="20.1432" ry="22.8019"/>
|
||||
<text font-size="12.8" style="fill: #000000; fill-opacity: 1; stroke: none;text-anchor:middle;font-family:sans-serif;font-style:normal;font-weight:700" x="342.393" y="330.55">
|
||||
<tspan x="342.393" y="330.55">></tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g>
|
||||
<rect style="fill: #ffffff; fill-opacity: 1; stroke-opacity: 1; stroke-width: 2; stroke: #000000" x="697.337" y="330.65" width="172.4" height="54" rx="0" ry="0"/>
|
||||
<text font-size="12.7998" style="fill: #000000; fill-opacity: 1; stroke: none;text-anchor:middle;font-family:sans-serif;font-style:normal;font-weight:normal" x="783.537" y="353.55">
|
||||
<tspan x="783.537" y="353.55">ID: modifier</tspan>
|
||||
<tspan x="783.537" y="369.55">DEP: amod | compound</tspan>
|
||||
</text>
|
||||
</g>
|
||||
<g>
|
||||
<line style="fill: none; stroke-opacity: 1; stroke-width: 2; stroke: #000000" x1="561.912" y1="357.65" x2="687.601" y2="357.65"/>
|
||||
<polygon style="fill: #000000; fill-opacity: 1; stroke-opacity: 1; stroke-width: 2; stroke: #000000" fill-rule="evenodd" points="695.101,357.65 685.101,362.65 687.601,357.65 685.101,352.65 "/>
|
||||
</g>
|
||||
<g>
|
||||
<ellipse style="fill: #f3e815; fill-opacity: 1; stroke-opacity: 1; stroke-width: 2; stroke: #000000" cx="629.625" cy="357.65" rx="20.1432" ry="22.8019"/>
|
||||
<text font-size="12.8" style="fill: #000000; fill-opacity: 1; stroke: none;text-anchor:middle;font-family:sans-serif;font-style:normal;font-weight:700" x="629.625" y="361.55">
|
||||
<tspan x="629.625" y="361.55">></tspan>
|
||||
</text>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 4.6 KiB |
58
website/docs/images/displacy-dep-founded.html
Normal file
58
website/docs/images/displacy-dep-founded.html
Normal file
|
@ -0,0 +1,58 @@
|
|||
<svg xmlns="http://www.w3.org/2000/svg" xlink="http://www.w3.org/1999/xlink" xml:lang="en" id="c3124cc3e661444cb9d4175a5b7c09d1-0" class="displacy" width="925" height="399.5" direction="ltr" style="max-width: none; height: 399.5px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr">
|
||||
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
|
||||
<tspan class="displacy-word" fill="currentColor" x="50">Smith</tspan>
|
||||
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="50"></tspan>
|
||||
</text>
|
||||
|
||||
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
|
||||
<tspan class="displacy-word" fill="currentColor" x="225">founded</tspan>
|
||||
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="225"></tspan>
|
||||
</text>
|
||||
|
||||
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
|
||||
<tspan class="displacy-word" fill="currentColor" x="400">a</tspan>
|
||||
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="400"></tspan>
|
||||
</text>
|
||||
|
||||
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
|
||||
<tspan class="displacy-word" fill="currentColor" x="575">healthcare</tspan>
|
||||
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="575"></tspan>
|
||||
</text>
|
||||
|
||||
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
|
||||
<tspan class="displacy-word" fill="currentColor" x="750">company.</tspan>
|
||||
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="750"></tspan>
|
||||
</text>
|
||||
|
||||
<g class="displacy-arrow">
|
||||
<path class="displacy-arc" id="arrow-c3124cc3e661444cb9d4175a5b7c09d1-0-0" stroke-width="2px" d="M70,264.5 C70,177.0 215.0,177.0 215.0,264.5" fill="none" stroke="currentColor"></path>
|
||||
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
|
||||
<textPath xlink:href="#arrow-c3124cc3e661444cb9d4175a5b7c09d1-0-0" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">nsubj</textPath>
|
||||
</text>
|
||||
<path class="displacy-arrowhead" d="M70,266.5 L62,254.5 78,254.5" fill="currentColor"></path>
|
||||
</g>
|
||||
|
||||
<g class="displacy-arrow">
|
||||
<path class="displacy-arc" id="arrow-c3124cc3e661444cb9d4175a5b7c09d1-0-1" stroke-width="2px" d="M420,264.5 C420,89.5 745.0,89.5 745.0,264.5" fill="none" stroke="currentColor"></path>
|
||||
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
|
||||
<textPath xlink:href="#arrow-c3124cc3e661444cb9d4175a5b7c09d1-0-1" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">det</textPath>
|
||||
</text>
|
||||
<path class="displacy-arrowhead" d="M420,266.5 L412,254.5 428,254.5" fill="currentColor"></path>
|
||||
</g>
|
||||
|
||||
<g class="displacy-arrow">
|
||||
<path class="displacy-arc" id="arrow-c3124cc3e661444cb9d4175a5b7c09d1-0-2" stroke-width="2px" d="M595,264.5 C595,177.0 740.0,177.0 740.0,264.5" fill="none" stroke="currentColor"></path>
|
||||
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
|
||||
<textPath xlink:href="#arrow-c3124cc3e661444cb9d4175a5b7c09d1-0-2" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">compound</textPath>
|
||||
</text>
|
||||
<path class="displacy-arrowhead" d="M595,266.5 L587,254.5 603,254.5" fill="currentColor"></path>
|
||||
</g>
|
||||
|
||||
<g class="displacy-arrow">
|
||||
<path class="displacy-arc" id="arrow-c3124cc3e661444cb9d4175a5b7c09d1-0-3" stroke-width="2px" d="M245,264.5 C245,2.0 750.0,2.0 750.0,264.5" fill="none" stroke="currentColor"></path>
|
||||
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
|
||||
<textPath xlink:href="#arrow-c3124cc3e661444cb9d4175a5b7c09d1-0-3" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">dobj</textPath>
|
||||
</text>
|
||||
<path class="displacy-arrowhead" d="M750.0,266.5 L758.0,254.5 742.0,254.5" fill="currentColor"></path>
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 3.8 KiB |
|
@ -4,6 +4,7 @@ teaser: Find phrases and tokens, and match entities
|
|||
menu:
|
||||
- ['Token Matcher', 'matcher']
|
||||
- ['Phrase Matcher', 'phrasematcher']
|
||||
- ['Dependency Matcher', 'dependencymatcher']
|
||||
- ['Entity Ruler', 'entityruler']
|
||||
- ['Models & Rules', 'models-rules']
|
||||
---
|
||||
|
@ -938,10 +939,10 @@ object patterns as efficiently as possible and without running any of the other
|
|||
pipeline components. If the token attribute you want to match on are set by a
|
||||
pipeline component, **make sure that the pipeline component runs** when you
|
||||
create the pattern. For example, to match on `POS` or `LEMMA`, the pattern `Doc`
|
||||
objects need to have part-of-speech tags set by the `tagger`. You can either
|
||||
call the `nlp` object on your pattern texts instead of `nlp.make_doc`, or use
|
||||
[`nlp.select_pipes`](/api/language#select_pipes) to disable components
|
||||
selectively.
|
||||
objects need to have part-of-speech tags set by the `tagger` or `morphologizer`.
|
||||
You can either call the `nlp` object on your pattern texts instead of
|
||||
`nlp.make_doc`, or use [`nlp.select_pipes`](/api/language#select_pipes) to
|
||||
disable components selectively.
|
||||
|
||||
</Infobox>
|
||||
|
||||
|
@ -972,10 +973,268 @@ to match phrases with the same sequence of punctuation and non-punctuation
|
|||
tokens as the pattern. But this can easily get confusing and doesn't have much
|
||||
of an advantage over writing one or two token patterns.
|
||||
|
||||
## Dependency Matcher {#dependencymatcher new="3"}
|
||||
|
||||
The [`DependencyMatcher`](/api/dependencymatcher) lets you match patterns within
|
||||
the dependency parse. It requires a model containing a parser such as the
|
||||
[`DependencyParser`](/api/dependencyparser). Instead of defining a list of
|
||||
adjacent tokens as in `Matcher` patterns, the `DependencyMatcher` patterns match
|
||||
tokens in the dependency parse and specify the relations between them.
|
||||
|
||||
> ```python
|
||||
> ### Example
|
||||
> from spacy.matcher import DependencyMatcher
|
||||
>
|
||||
> # "[subject] ... initially founded"
|
||||
> pattern = [
|
||||
> # anchor token: founded
|
||||
> {
|
||||
> "RIGHT_ID": "founded",
|
||||
> "RIGHT_ATTRS": {"ORTH": "founded"}
|
||||
> },
|
||||
> # founded -> subject
|
||||
> {
|
||||
> "LEFT_ID": "founded",
|
||||
> "REL_OP": ">",
|
||||
> "RIGHT_ID": "subject",
|
||||
> "RIGHT_ATTRS": {"DEP": "nsubj"}
|
||||
> },
|
||||
> # "founded" follows "initially"
|
||||
> {
|
||||
> "LEFT_ID": "founded",
|
||||
> "REL_OP": ";",
|
||||
> "RIGHT_ID": "initially",
|
||||
> "RIGHT_ATTRS": {"ORTH": "initially"}
|
||||
> }
|
||||
> ]
|
||||
>
|
||||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> matcher.add("FOUNDED", [pattern])
|
||||
> matches = matcher(doc)
|
||||
> ```
|
||||
|
||||
A pattern added to the `DependencyMatcher` consists of a list of dictionaries,
|
||||
with each dictionary describing a token to match and its relation to an existing
|
||||
token in the pattern. Except for the first dictionary, which defines an anchor
|
||||
token using only `RIGHT_ID` and `RIGHT_ATTRS`, each pattern should have the
|
||||
following keys:
|
||||
|
||||
| Name | Description |
|
||||
| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `LEFT_ID` | The name of the left-hand node in the relation, which has been defined in an earlier node. |
|
||||
| `REL_OP` | An operator that describes how the two nodes are related. ~~str~~ |
|
||||
| `RIGHT_ID` | A unique name for the right-hand node in the relation. ~~str~~ |
|
||||
| `RIGHT_ATTRS` | The token attributes to match for the right-hand node in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). ~~Dict[str, Any]~~ |
|
||||
|
||||
Each additional token added to the pattern is linked to an existing token
|
||||
`LEFT_ID` by the relation `REL_OP`. The new token is given the name `RIGHT_ID`
|
||||
and described by the attributes `RIGHT_ATTRS`.
|
||||
|
||||
<Infobox title="Important note" variant="warning">
|
||||
|
||||
Because the unique token **names** in `LEFT_ID` and `RIGHT_ID` are used to
|
||||
identify tokens, the order of the dicts in the patterns is important: a token
|
||||
name needs to be defined as `RIGHT_ID` in one dict in the pattern **before** it
|
||||
can be used as `LEFT_ID` in another dict.
|
||||
|
||||
</Infobox>
|
||||
|
||||
### Dependency matcher operators
|
||||
|
||||
The following operators are supported by the `DependencyMatcher`, most of which
|
||||
come directly from
|
||||
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
||||
|
||||
| Symbol | Description |
|
||||
| --------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B` |
|
||||
| `A > B` | `A` is the immediate head of `B` |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep->head paths |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head->dep paths |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_ |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_ |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_ |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1` |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1` |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i` |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i` |
|
||||
|
||||
### Designing dependency matcher patterns
|
||||
|
||||
Let's say we want to find sentences describing who founded what kind of company:
|
||||
|
||||
- `Smith founded a healthcare company in 2005.`
|
||||
- `Williams initially founded an insurance company in 1987.`
|
||||
- `Lee, an experienced CEO, has founded two AI startups.`
|
||||
|
||||
The dependency parse for `Smith founded a healthcare company` shows types of
|
||||
relations and tokens we want to match:
|
||||
|
||||
import DisplaCyDepFoundedHtml from 'images/displacy-dep-founded.html'
|
||||
|
||||
<Iframe title="displaCy visualization of dependencies" html={DisplaCyDepFoundedHtml} height={450} />
|
||||
|
||||
The relations we're interested in are:
|
||||
|
||||
- the founder is the subject (`nsubj`) of the token with the text `founded`
|
||||
- the company is the object (`dobj`) of `founded`
|
||||
- the kind of company may be an adjective (`amod`, not shown above) or a
|
||||
compound (`compound`)
|
||||
|
||||
The first step is to pick an anchor token for the pattern. Since it's the root
|
||||
of the dependency parse, `founded` is a good choice here. It is often easier to
|
||||
construct patterns when all dependency relation operators point from the head to
|
||||
the children. In this example, we'll only use `>`, which connects a head to an
|
||||
immediate dependent as `head > child`.
|
||||
|
||||
The simplest dependency matcher pattern will identify and name a single token in
|
||||
the tree:
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
import spacy
|
||||
from spacy.matcher import DependencyMatcher
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
matcher = DependencyMatcher(nlp.vocab)
|
||||
|
||||
pattern = [
|
||||
{
|
||||
"RIGHT_ID": "anchor_founded", # unique name
|
||||
"RIGHT_ATTRS": {"ORTH": "founded"} # token pattern for "founded"
|
||||
}
|
||||
]
|
||||
matcher.add("FOUNDED", [pattern])
|
||||
doc = nlp("Smith founded two companies.")
|
||||
matches = matcher(doc)
|
||||
print(matches) # [(4851363122962674176, [1])]
|
||||
```
|
||||
|
||||
Now that we have a named anchor token (`anchor_founded`), we can add the founder
|
||||
as the immediate dependent (`>`) of `founded` with the dependency label `nsubj`:
|
||||
|
||||
```python
|
||||
pattern = [
|
||||
{
|
||||
"RIGHT_ID": "anchor_founded",
|
||||
"RIGHT_ATTRS": {"ORTH": "founded"}
|
||||
},
|
||||
{
|
||||
"LEFT_ID": "anchor_founded",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "subject",
|
||||
"RIGHT_ATTRS": {"DEP": "nsubj"},
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
The direct object (`dobj`) is added in the same way:
|
||||
|
||||
```python
|
||||
pattern = [ ...
|
||||
{
|
||||
"LEFT_ID": "anchor_founded",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "founded_object",
|
||||
"RIGHT_ATTRS": {"DEP": "dobj"},
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
When the subject and object tokens are added, they are required to have names
|
||||
under the key `RIGHT_ID`, which are allowed to be any unique string, e.g.
|
||||
`founded_subject`. These names can then be used as `LEFT_ID` to link new tokens
|
||||
into the pattern. For the final part of our pattern, we'll specify that the
|
||||
token `founded_object` should have a modifier with the dependency relation
|
||||
`amod` or `compound`:
|
||||
|
||||
```python
|
||||
pattern = [ ...
|
||||
{
|
||||
"LEFT_ID": "founded_object",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "founded_object_modifier",
|
||||
"RIGHT_ATTRS": {"DEP": {"IN": ["amod", "compound"]}},
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
You can picture the process of creating a dependency matcher pattern as defining
|
||||
an anchor token on the left and building up the pattern by linking tokens
|
||||
one-by-one on the right using relation operators. To create a valid pattern,
|
||||
each new token needs to be linked to an existing token on its left. As for
|
||||
`founded` in this example, a token may be linked to more than one token on its
|
||||
right:
|
||||
|
||||
<!-- TODO: adjust for final example, prettify -->
|
||||
|
||||
![Dependency matcher pattern](../images/dep-match-diagram.svg)
|
||||
|
||||
The full pattern comes together as shown in the example below:
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
import spacy
|
||||
from spacy.matcher import DependencyMatcher
|
||||
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
matcher = DependencyMatcher(nlp.vocab)
|
||||
|
||||
pattern = [
|
||||
{
|
||||
"RIGHT_ID": "anchor_founded",
|
||||
"RIGHT_ATTRS": {"ORTH": "founded"}
|
||||
},
|
||||
{
|
||||
"LEFT_ID": "anchor_founded",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "subject",
|
||||
"RIGHT_ATTRS": {"DEP": "nsubj"},
|
||||
},
|
||||
{
|
||||
"LEFT_ID": "anchor_founded",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "founded_object",
|
||||
"RIGHT_ATTRS": {"DEP": "dobj"},
|
||||
},
|
||||
{
|
||||
"LEFT_ID": "founded_object",
|
||||
"REL_OP": ">",
|
||||
"RIGHT_ID": "founded_object_modifier",
|
||||
"RIGHT_ATTRS": {"DEP": {"IN": ["amod", "compound"]}},
|
||||
}
|
||||
]
|
||||
|
||||
matcher.add("FOUNDED", [pattern])
|
||||
doc = nlp("Lee, an experienced CEO, has founded two AI startups.")
|
||||
|
||||
matches = matcher(doc)
|
||||
print(matches) # [(4851363122962674176, [6, 0, 10, 9])]
|
||||
|
||||
# each token_id corresponds to one pattern dict
|
||||
match_id, token_ids = matches[0]
|
||||
for i in range(len(token_ids)):
|
||||
print(pattern[i]["RIGHT_ID"] + ":", doc[token_ids[i]].text)
|
||||
```
|
||||
|
||||
<Infobox title="Important note on speed" variant="warning">
|
||||
|
||||
The dependency matcher may be slow when token patterns can potentially match
|
||||
many tokens in the sentence or when relation operators allow longer paths in the
|
||||
dependency parse, e.g. `<<`, `>>`, `.*` and `;*`.
|
||||
|
||||
To improve the matcher speed, try to make your token patterns and operators as
|
||||
specific as possible. For example, use `>` instead of `>>` if possible and use
|
||||
token patterns that include dependency labels and other token attributes instead
|
||||
of patterns such as `{}` that match any token in the sentence.
|
||||
|
||||
</Infobox>
|
||||
|
||||
## Rule-based entity recognition {#entityruler new="2.1"}
|
||||
|
||||
The [`EntityRuler`](/api/entityruler) is an exciting new component that lets you
|
||||
add named entities based on pattern dictionaries, and makes it easy to combine
|
||||
The [`EntityRuler`](/api/entityruler) is a component that lets you add named
|
||||
entities based on pattern dictionaries, which makes it easy to combine
|
||||
rule-based and statistical named entity recognition for even more powerful
|
||||
models.
|
||||
|
||||
|
|
|
@ -153,6 +153,7 @@ add to your pipeline and customize for your use case:
|
|||
| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. |
|
||||
| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. |
|
||||
| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
|
||||
| [`DependencyMatcher`](/api/dependencymatcher) | Component for matching subtrees within a dependency parse. |
|
||||
| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
|
||||
|
||||
<Infobox title="Details & Documentation" emoji="📖" list>
|
||||
|
@ -314,7 +315,8 @@ format for documenting argument and return types.
|
|||
[`Transformer`](/api/transformer), [`Lemmatizer`](/api/lemmatizer),
|
||||
[`Morphologizer`](/api/morphologizer),
|
||||
[`AttributeRuler`](/api/attributeruler),
|
||||
[`SentenceRecognizer`](/api/sentencerecognizer), [`Pipe`](/api/pipe),
|
||||
[`SentenceRecognizer`](/api/sentencerecognizer),
|
||||
[`DependencyMatcher`])(/api/dependencymatcher), [`Pipe`](/api/pipe),
|
||||
[`Corpus`](/api/corpus)
|
||||
|
||||
</Infobox>
|
||||
|
|
Loading…
Reference in New Issue
Block a user