From e83c818a7897c2e4964580530671b7c0e607538c Mon Sep 17 00:00:00 2001
From: Luigi Coniglio
Date: Fri, 22 Jan 2021 00:20:08 +0000
Subject: [PATCH] DependencyMatcher improvements (fix #6678) (#6744)

* Adding contributor agreement for user werew

* [DependencyMatcher] Comment and clean code

* [DependencyMatcher] Use defaultdicts

* [DependencyMatcher] Simplify _retrieve_tree method

* [DependencyMatcher] Remove prepended underscores

* [DependencyMatcher] Address TODO and move grouping of token's positions out of the loop

* [DependencyMatcher] Remove _nodes attribute

* [DependencyMatcher] Use enumerate in _retrieve_tree method

* [DependencyMatcher] Clean unused vars and use camel_case naming

* [DependencyMatcher] Memoize node+operator map

* Add root property to Token

* [DependencyMatcher] Groups matches by root

* [DependencyMatcher] Remove unused _keys_to_token attribute

* [DependencyMatcher] Use a list to map tokens to matcher's keys

* [DependencyMatcher] Remove recursion

* [DependencyMatcher] Use a generator to retrieve matches

* [DependencyMatcher] Remove unused memory pool

* [DependencyMatcher] Hide private methods and attributes

* [DependencyMatcher] Improvements to the matches validation

* Apply suggestions from code review

Co-authored-by: Matthew Honnibal

* [DependencyMatcher] Fix keys_to_position_maps

* Remove Token.root property

* [DependencyMatcher] Remove functools' lru_cache

Co-authored-by: Matthew Honnibal
---
 .github/contributors/werew.md       | 106 ++++++++
 spacy/matcher/dependencymatcher.pyx | 369 +++++++++++++++-------------
 spacy/tokens/token.pyx              |   3 +-
 3 files changed, 306 insertions(+), 172 deletions(-)
 create mode 100644 .github/contributors/werew.md

diff --git a/.github/contributors/werew.md b/.github/contributors/werew.md
new file mode 100644
index 000000000..04a79ed9b
--- /dev/null
+++ b/.github/contributors/werew.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” next to one of the applicable statements below. Please
+do NOT mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Luigi Coniglio       |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 10/01/2021           |
+| GitHub username                | werew                |
+| Website (optional)             |                      |
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 02f7c9318..69de57026 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -1,10 +1,10 @@
 # cython: infer_types=True, profile=True
 from typing import List
+from collections import defaultdict
+from itertools import product
 
 import numpy
 
-from cymem.cymem cimport Pool
-
 from .matcher cimport Matcher
 from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc
@@ -20,15 +20,13 @@ INDEX_RELOP = 0
 
 
 cdef class DependencyMatcher:
     """Match dependency parse tree based on pattern rules."""
-    cdef Pool mem
     cdef readonly Vocab vocab
-    cdef readonly Matcher matcher
+    cdef readonly Matcher _matcher
     cdef public object _patterns
     cdef public object _raw_patterns
-    cdef public object _keys_to_token
+    cdef public object _tokens_to_key
     cdef public object _root
    cdef public object _callbacks
-    cdef public object _nodes
     cdef public object _tree
     cdef public object _ops
 
@@ -40,30 +38,50 @@ cdef class DependencyMatcher:
         validate (bool): Whether patterns should be validated, passed to
             Matcher as `validate`
         """
-        size = 20
-        self.matcher = Matcher(vocab, validate=validate)
-        self._keys_to_token = {}
-        self._patterns = {}
-        self._raw_patterns = {}
-        self._root = {}
-        self._nodes = {}
-        self._tree = {}
+        self._matcher = Matcher(vocab, validate=validate)
+
+        # Associates each key to the raw patterns list added by the user
+        # e.g. {'key': [[{'RIGHT_ID': ..., 'RIGHT_ATTRS': ... }, ... ], ... ], ... }
+        self._raw_patterns = defaultdict(list)
+
+        # Associates each key to a list of lists of 'RIGHT_ATTRS'
+        # e.g. {'key': [[{'POS': ... }, ... ], ... ], ... }
+        self._patterns = defaultdict(list)
+
+        # Associates each key to a list of lists where each list associates
+        # a token position within the pattern to the key used by the internal matcher
+        # e.g. {'key': [ ['token_key', ... ], ... ], ... }
+        self._tokens_to_key = defaultdict(list)
+
+        # Associates each key to a list of ints where each int corresponds
+        # to the position of the root token within the pattern
+        # e.g. {'key': [3, 1, 4, ... ], ... }
+        self._root = defaultdict(list)
+
+        # Associates each key to a list of dicts where each dict describes all
+        # branches from a token (identified by its position) to other tokens as
+        # a list of tuples: ('REL_OP', 'LEFT_ID')
+        # e.g. {'key': [{2: [('<', 'left_id'), ...], ...}, ... ], ...}
+        self._tree = defaultdict(list)
+
+        # Associates each key to its on_match callback
+        # e.g. {'key': on_match_callback, ...}
         self._callbacks = {}
+
         self.vocab = vocab
-        self.mem = Pool()
         self._ops = {
-            "<": self.dep,
-            ">": self.gov,
-            "<<": self.dep_chain,
-            ">>": self.gov_chain,
-            ".": self.imm_precede,
-            ".*": self.precede,
-            ";": self.imm_follow,
-            ";*": self.follow,
-            "$+": self.imm_right_sib,
-            "$-": self.imm_left_sib,
-            "$++": self.right_sib,
-            "$--": self.left_sib,
+            "<": self._dep,
+            ">": self._gov,
+            "<<": self._dep_chain,
+            ">>": self._gov_chain,
+            ".": self._imm_precede,
+            ".*": self._precede,
+            ";": self._imm_follow,
+            ";*": self._follow,
+            "$+": self._imm_right_sib,
+            "$-": self._imm_left_sib,
+            "$++": self._right_sib,
+            "$--": self._left_sib,
         }
 
     def __reduce__(self):
@@ -86,7 +104,7 @@ cdef class DependencyMatcher:
         """
         return self.has_key(key)
 
-    def validate_input(self, pattern, key):
+    def _validate_input(self, pattern, key):
         idx = 0
         visited_nodes = {}
         for relation in pattern:
@@ -121,6 +139,16 @@ cdef class DependencyMatcher:
             visited_nodes[relation["LEFT_ID"]] = True
             idx = idx + 1
 
+    def _get_matcher_key(self, key, pattern_idx, token_idx):
+        """
+        Creates a token key to be used by the matcher
+        """
+        return self._normalize_key(
+            unicode(key) + DELIMITER +
+            unicode(pattern_idx) + DELIMITER +
+            unicode(token_idx)
+        )
+
     def add(self, key, patterns, *, on_match=None):
         """Add a new matcher rule to the matcher.
 
@@ -135,10 +163,15 @@ cdef class DependencyMatcher:
         for pattern in patterns:
             if len(pattern) == 0:
                 raise ValueError(Errors.E012.format(key=key))
-            self.validate_input(pattern, key)
+            self._validate_input(pattern, key)
+
         key = self._normalize_key(key)
-        self._raw_patterns.setdefault(key, [])
+
+        # Save raw patterns and on_match callback
         self._raw_patterns[key].extend(patterns)
+        self._callbacks[key] = on_match
+
+        # Add 'RIGHT_ATTRS' to self._patterns[key]
         _patterns = []
         for pattern in patterns:
             token_patterns = []
@@ -146,69 +179,64 @@ cdef class DependencyMatcher:
                 token_pattern = [pattern[i]["RIGHT_ATTRS"]]
                 token_patterns.append(token_pattern)
             _patterns.append(token_patterns)
-        self._patterns.setdefault(key, [])
-        self._callbacks[key] = on_match
         self._patterns[key].extend(_patterns)
+
         # Add each node pattern of all the input patterns individually to the
         # matcher. This enables only a single instance of Matcher to be used.
         # Multiple adds are required to track each node pattern.
-        _keys_to_token_list = []
+        tokens_to_key_list = []
         for i in range(len(_patterns)):
-            _keys_to_token = {}
+
+            # Preallocate list space
+            tokens_to_key = [None]*len(_patterns[i])
+
             # TODO: Better ways to hash edges in pattern?
             for j in range(len(_patterns[i])):
-                k = self._normalize_key(unicode(key) + DELIMITER + unicode(i) + DELIMITER + unicode(j))
-                self.matcher.add(k, [_patterns[i][j]])
-                _keys_to_token[k] = j
-            _keys_to_token_list.append(_keys_to_token)
-        self._keys_to_token.setdefault(key, [])
-        self._keys_to_token[key].extend(_keys_to_token_list)
-        _nodes_list = []
-        for pattern in patterns:
-            nodes = {}
-            for i in range(len(pattern)):
-                nodes[pattern[i]["RIGHT_ID"]] = i
-            _nodes_list.append(nodes)
-        self._nodes.setdefault(key, [])
-        self._nodes[key].extend(_nodes_list)
+                k = self._get_matcher_key(key, i, j)
+                self._matcher.add(k, [_patterns[i][j]])
+                tokens_to_key[j] = k
+
+            tokens_to_key_list.append(tokens_to_key)
+
+        self._tokens_to_key[key].extend(tokens_to_key_list)
+
         # Create an object tree to traverse later on. This data structure
         # enables easy tree pattern match. Doc-Token based tree cannot be
         # reused since it is memory-heavy and tightly coupled with the Doc.
-        self.retrieve_tree(patterns, _nodes_list, key)
+        self._retrieve_tree(patterns, key)
 
-    def retrieve_tree(self, patterns, _nodes_list, key):
-        _heads_list = []
-        _root_list = []
-        for i in range(len(patterns)):
-            heads = {}
+    def _retrieve_tree(self, patterns, key):
+
+        # New trees belonging to this key
+        tree_list = []
+
+        # List of indices to the root nodes
+        root_list = []
+
+        # Group token ids to create trees
+        for i, pattern in enumerate(patterns):
+            tree = defaultdict(list)
             root = -1
-            for j in range(len(patterns[i])):
-                token_pattern = patterns[i][j]
-                if ("REL_OP" not in token_pattern):
-                    heads[j] = ('root', j)
-                    root = j
-                else:
-                    heads[j] = (
-                        token_pattern["REL_OP"],
-                        _nodes_list[i][token_pattern["LEFT_ID"]]
+
+            right_id_to_token = {}
+            for j, token in enumerate(pattern):
+                right_id_to_token[token["RIGHT_ID"]] = j
+
+            for j, token in enumerate(pattern):
+                if "REL_OP" in token:
+                    # Add tree branch
+                    tree[right_id_to_token[token["LEFT_ID"]]].append(
+                        (token["REL_OP"], j),
                     )
-            _heads_list.append(heads)
-            _root_list.append(root)
-        _tree_list = []
-        for i in range(len(patterns)):
-            tree = {}
-            for j in range(len(patterns[i])):
-                if(_heads_list[i][j][INDEX_HEAD] == j):
-                    continue
-                head = _heads_list[i][j][INDEX_HEAD]
-                if(head not in tree):
-                    tree[head] = []
-                tree[head].append((_heads_list[i][j][INDEX_RELOP], j))
-            _tree_list.append(tree)
-        self._tree.setdefault(key, [])
-        self._tree[key].extend(_tree_list)
-        self._root.setdefault(key, [])
-        self._root[key].extend(_root_list)
+                else:
+                    # No 'REL_OP', this is the root
+                    root = j
+
+            tree_list.append(tree)
+            root_list.append(root)
+
+        self._tree[key].extend(tree_list)
+        self._root[key].extend(root_list)
 
     def has_key(self, key):
         """Check whether the matcher has a rule with a given key.
@@ -235,9 +263,25 @@ cdef class DependencyMatcher:
             raise ValueError(Errors.E175.format(key=key))
         self._patterns.pop(key)
         self._raw_patterns.pop(key)
-        self._nodes.pop(key)
         self._tree.pop(key)
         self._root.pop(key)
+        self._tokens_to_key.pop(key)
+
+    def _get_keys_to_position_maps(self, doc):
+        """
+        Processes the doc and groups all matches by their root and match id.
+        Returns a dict mapping each (root, match id) pair to the list of
+        token indices which are descendants of root and match the token
+        pattern identified by the given match id.
+
+        e.g. keys_to_position_maps[root_index][match_id] = [...]
+        """
+        keys_to_position_maps = defaultdict(lambda: defaultdict(list))
+        for match_id, start, _ in self._matcher(doc):
+            token = doc[start]
+            root = ([token] + list(token.ancestors))[-1]
+            keys_to_position_maps[root.i][match_id].append(start)
+        return keys_to_position_maps
 
     def __call__(self, object doclike):
         """Find all token sequences matching the supplied pattern.
@@ -253,146 +297,131 @@ cdef class DependencyMatcher:
             doc = doclike.as_doc()
         else:
             raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
-        matched_key_trees = []
-        matches = self.matcher(doc)
-        for key in list(self._patterns.keys()):
-            _patterns_list = self._patterns[key]
-            _keys_to_token_list = self._keys_to_token[key]
-            _root_list = self._root[key]
-            _tree_list = self._tree[key]
-            _nodes_list = self._nodes[key]
-            length = len(_patterns_list)
-            for i in range(length):
-                _keys_to_token = _keys_to_token_list[i]
-                _root = _root_list[i]
-                _tree = _tree_list[i]
-                _nodes = _nodes_list[i]
-                id_to_position = {}
-                for i in range(len(_nodes)):
-                    id_to_position[i]=[]
-                # TODO: This could be taken outside to improve running time..?
-                for match_id, start, end in matches:
-                    if match_id in _keys_to_token:
-                        id_to_position[_keys_to_token[match_id]].append(start)
-                _node_operator_map = self.get_node_operator_map(
-                    doc,
-                    _tree,
-                    id_to_position,
-                    _nodes,_root
-                )
-                length = len(_nodes)
-                matched_trees = []
-                self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees)
-                for matched_tree in matched_trees:
-                    matched_key_trees.append((key, matched_tree))
+        matched_key_trees = []
+        keys_to_position_maps = self._get_keys_to_position_maps(doc)
+        cache = {}
+
+        for key in self._patterns.keys():
+            for root, tree, tokens_to_key in zip(
+                self._root[key],
+                self._tree[key],
+                self._tokens_to_key[key],
+            ):
+                for keys_to_position in keys_to_position_maps.values():
+                    for matched_tree in self._get_matches(cache, doc, tree, tokens_to_key, keys_to_position):
+                        matched_key_trees.append((key, matched_tree))
+
         for i, (match_id, nodes) in enumerate(matched_key_trees):
             on_match = self._callbacks.get(match_id)
             if on_match is not None:
                 on_match(self, doc, i, matched_key_trees)
+
         return matched_key_trees
 
-    def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees):
-        cdef bint isValid;
-        if patternLength == len(id_to_position.keys()):
-            isValid = True
-            for node in range(patternLength):
-                if node in tree:
-                    for idx, (relop,nbor) in enumerate(tree[node]):
-                        computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop])
-                        isNbor = False
-                        for computed_nbor in computed_nbors:
-                            if computed_nbor.i == visited_nodes[nbor]:
-                                isNbor = True
-                        isValid = isValid & isNbor
-            if(isValid):
-                matched_trees.append(visited_nodes)
-            return
-        allPatternNodes = numpy.asarray(id_to_position[patternLength])
-        for patternNode in allPatternNodes:
-            self.recurse(tree, id_to_position, _node_operator_map, patternLength+1, visited_nodes+[patternNode], matched_trees)
+    def _get_matches(self, cache, doc, tree, tokens_to_key, keys_to_position):
+        cdef bint is_valid
 
-    # Given a node and an edge operator, to return the list of nodes
-    # from the doc that belong to node+operator. This is used to store
-    # all the results beforehand to prevent unnecessary computation while
-    # pattern matching
-    # _node_operator_map[node][operator] = [...]
-    def get_node_operator_map(self, doc, tree, id_to_position, nodes, root):
-        _node_operator_map = {}
-        all_node_indices = nodes.values()
-        all_operators = []
-        for node in all_node_indices:
-            if node in tree:
-                for child in tree[node]:
-                    all_operators.append(child[INDEX_RELOP])
-        all_operators = list(set(all_operators))
-        all_nodes = []
-        for node in all_node_indices:
-            all_nodes = all_nodes + id_to_position[node]
-        all_nodes = list(set(all_nodes))
-        for node in all_nodes:
-            _node_operator_map[node] = {}
-            for operator in all_operators:
-                _node_operator_map[node][operator] = []
-        for operator in all_operators:
-            for node in all_nodes:
-                _node_operator_map[node][operator] = self._ops.get(operator)(doc, node)
-        return _node_operator_map
+        all_positions = [keys_to_position[key] for key in tokens_to_key]
 
-    def dep(self, doc, node):
+        # Generate all potential matches by computing the cartesian product of
+        # all positions of the matched tokens
+        for candidate_match in product(*all_positions):
+
+            # A potential match is a valid match if all relationships between the
+            # matched tokens are satisfied.
+            is_valid = True
+            for left_idx in range(len(candidate_match)):
+                is_valid = self._check_relationships(cache, doc, candidate_match, left_idx, tree)
+
+                if not is_valid:
+                    break
+
+            if is_valid:
+                yield list(candidate_match)
+
+    def _check_relationships(self, cache, doc, candidate_match, left_idx, tree):
+        # Position of the left operand within the document
+        left_pos = candidate_match[left_idx]
+
+        for relop, right_idx in tree[left_idx]:
+
+            # Position of the right operand within the document
+            right_pos = candidate_match[right_idx]
+
+            # List of valid right matches
+            valid_right_matches = self._resolve_node_operator(cache, doc, left_pos, relop)
+
+            # If the proposed right token is not within the valid ones, fail
+            if right_pos not in valid_right_matches:
+                return False
+
+        return True
+
+    def _resolve_node_operator(self, cache, doc, node, operator):
+        """
+        Given a doc, a node (as an index in the doc) and a REL_OP operator,
+        returns the list of nodes from the doc that belong to node+operator.
+        """
+        key = (node, operator)
+        if key not in cache:
+            cache[key] = [token.i for token in self._ops[operator](doc, node)]
+        return cache[key]
+
+    def _dep(self, doc, node):
         if doc[node].head == doc[node]:
             return []
         return [doc[node].head]
 
-    def gov(self,doc,node):
+    def _gov(self,doc,node):
         return list(doc[node].children)
 
-    def dep_chain(self, doc, node):
+    def _dep_chain(self, doc, node):
         return list(doc[node].ancestors)
 
-    def gov_chain(self, doc, node):
+    def _gov_chain(self, doc, node):
         return [t for t in doc[node].subtree if t != doc[node]]
 
-    def imm_precede(self, doc, node):
+    def _imm_precede(self, doc, node):
         sent = self._get_sent(doc[node])
         if node < len(doc) - 1 and doc[node + 1] in sent:
             return [doc[node + 1]]
         return []
 
-    def precede(self, doc, node):
+    def _precede(self, doc, node):
         sent = self._get_sent(doc[node])
         return [doc[i] for i in range(node + 1, sent.end)]
 
-    def imm_follow(self, doc, node):
+    def _imm_follow(self, doc, node):
         sent = self._get_sent(doc[node])
         if node > 0 and doc[node - 1] in sent:
             return [doc[node - 1]]
         return []
 
-    def follow(self, doc, node):
+    def _follow(self, doc, node):
         sent = self._get_sent(doc[node])
         return [doc[i] for i in range(sent.start, node)]
 
-    def imm_right_sib(self, doc, node):
+    def _imm_right_sib(self, doc, node):
         for child in list(doc[node].head.children):
             if child.i == node + 1:
                 return [doc[child.i]]
         return []
 
-    def imm_left_sib(self, doc, node):
+    def _imm_left_sib(self, doc, node):
         for child in list(doc[node].head.children):
             if child.i == node - 1:
                 return [doc[child.i]]
         return []
 
-    def right_sib(self, doc, node):
+    def _right_sib(self, doc, node):
         candidate_children = []
         for child in list(doc[node].head.children):
             if child.i > node:
                 candidate_children.append(doc[child.i])
         return candidate_children
 
-    def left_sib(self, doc, node):
+    def _left_sib(self, doc, node):
         candidate_children = []
         for child in list(doc[node].head.children):
             if child.i < node:
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index e294af823..71c0baf63 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -680,9 +680,8 @@ cdef class Token:
         # Find the widest l/r_edges of the roots of the two tokens involved
         # to limit the number of tokens for set_children_from_heads
         cdef Token self_root, new_head_root
-        self_ancestors = list(self.ancestors)
+        self_root = ([self] + list(self.ancestors))[-1]
         new_head_ancestors = list(new_head.ancestors)
-        self_root = self_ancestors[-1] if self_ancestors else self
         new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head
         start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge
         end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge
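
Not part of the patch: a minimal usage sketch of the public DependencyMatcher
API that this diff refactors, useful for exercising the new matching path end
to end. The rule name "FOUNDED", the pattern, and the "en_core_web_sm" model
are illustrative assumptions, not taken from the diff.

    import spacy
    from spacy.matcher import DependencyMatcher

    nlp = spacy.load("en_core_web_sm")  # any pipeline with a dependency parser
    matcher = DependencyMatcher(nlp.vocab)

    pattern = [
        # Anchor token: no REL_OP, so this token is the pattern's root
        {"RIGHT_ID": "founded", "RIGHT_ATTRS": {"ORTH": "founded"}},
        # "founded > subject": the anchor is the immediate head of the subject
        {
            "LEFT_ID": "founded",
            "REL_OP": ">",
            "RIGHT_ID": "subject",
            "RIGHT_ATTRS": {"DEP": "nsubj"},
        },
    ]
    matcher.add("FOUNDED", [pattern])

    doc = nlp("Smith founded a healthcare company in 2005.")
    # Each match is (match_id, token_ids), with token_ids aligned to the order
    # of the pattern's tokens. With this patch, __call__ groups the internal
    # Matcher's token hits by parse root, then validates candidates drawn from
    # their cartesian product via _check_relationships.
    matches = matcher(doc)
    print(matches)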