Added more constucts for dependency tree matcher (#2836)

2025-08-31 17:35:00 +03:00 · 2018-10-30 03:51:39 +05:30 · 2018-10-30 03:51:39 +05:30 · 0bf14082a4
commit 0bf14082a4
parent 817e1fc5e5
2 changed files with 142 additions and 39 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -261,8 +261,13 @@ class Errors(object):
             "Span objects, or dicts if set to manual=True.")
    E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
            "phrase pattern (string) but got:\n{pattern}")
-
+    E098 = ("Invalid pattern specified: expected both SPEC and PATTERN.")
-
+    E099 = ("First node of pattern should be a root node. The root should "
            "only contain NODE_NAME.")
    E100 = ("Nodes apart from the root should contain NODE_NAME, NBOR_NAME and "
            "NBOR_RELOP.")
    E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
            "have been declared in previous edges.")
@add_codes
 class TempErrors(object):
    T001 = ("Max length currently 10 for phrase matching")
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -29,7 +29,8 @@ from .attrs import FLAG41 as I4_ENT
 DELIMITER = '||'
-
+INDEX_HEAD = 1
 INDEX_RELOP = 0
 cdef enum action_t:
    REJECT = 0000
@ -731,18 +732,31 @@ cdef class DependencyTreeMatcher:
        """
        return self._normalize_key(key) in self._patterns
    def validateInput(self, pattern, key):
        idx = 0
        visitedNodes = {}
        for relation in pattern:
            if 'PATTERN' not in relation or 'SPEC' not in relation:
                raise ValueError(Errors.E098.format(key=key))
            if idx == 0:
                if not('NODE_NAME' in relation['SPEC'] and 'NBOR_RELOP' not in relation['SPEC'] and 'NBOR_NAME' not in relation['SPEC']):
                    raise ValueError(Errors.E099.format(key=key))
                visitedNodes[relation['SPEC']['NODE_NAME']] = True
            else:
                if not('NODE_NAME' in relation['SPEC'] and 'NBOR_RELOP' in relation['SPEC'] and 'NBOR_NAME' in relation['SPEC']):
                    raise ValueError(Errors.E100.format(key=key))
                if relation['SPEC']['NODE_NAME'] in visitedNodes or relation['SPEC']['NBOR_NAME'] not in visitedNodes:
                    raise ValueError(Errors.E101.format(key=key))
                visitedNodes[relation['SPEC']['NODE_NAME']] = True
                visitedNodes[relation['SPEC']['NBOR_NAME']] = True
            idx = idx + 1
    def add(self, key, on_match, *patterns):
        # TODO : validations
        # 1. check if input pattern is connected
        # 2. check if pattern format is correct
        # 3. check if atleast one root node is present
        # 4. check if node names are not repeated
        # 5. check if each node has only one head
        for pattern in patterns:
            if len(pattern) == 0:
                raise ValueError(Errors.E012.format(key=key))
            self.validateInput(pattern,key)
        key = self._normalize_key(key)
@ -792,7 +806,6 @@ cdef class DependencyTreeMatcher:
        self.retrieve_tree(patterns,_nodes_list,key)
    def retrieve_tree(self,patterns,_nodes_list,key):
        _heads_list = []
        _root_list = []
        for i in range(len(patterns)):
@ -801,16 +814,10 @@ cdef class DependencyTreeMatcher:
            for j in range(len(patterns[i])):
                token_pattern = patterns[i][j]
                if('NBOR_RELOP' not in token_pattern['SPEC']):
-                    heads[j] = j
+                    heads[j] = ('root',j)
                    root = j
                else:
-                    # TODO: Add semgrex rules
+                    heads[j] = (token_pattern['SPEC']['NBOR_RELOP'],_nodes_list[i][token_pattern['SPEC']['NBOR_NAME']])
                    # 1. >
                    if(token_pattern['SPEC']['NBOR_RELOP'] == '>'):
                        heads[j] = _nodes_list[i][token_pattern['SPEC']['NBOR_NAME']]
                    # 2. <
                    if(token_pattern['SPEC']['NBOR_RELOP'] == '<'):
                        heads[_nodes_list[i][token_pattern['SPEC']['NBOR_NAME']]] = j
            _heads_list.append(heads)
            _root_list.append(root)
@ -819,12 +826,13 @@ cdef class DependencyTreeMatcher:
        for i in range(len(patterns)):
            tree = {}
            for j in range(len(patterns[i])):
-                if(j == _heads_list[i][j]):
+                if(_heads_list[i][j][INDEX_HEAD] == j):
                    continue
-                head = _heads_list[i][j]
+
                head = _heads_list[i][j][INDEX_HEAD]
                if(head not in tree):
                    tree[head] = []
-                tree[head].append(j)
+                tree[head].append( (_heads_list[i][j][INDEX_RELOP],j) )
            _tree_list.append(tree)
        self._tree.setdefault(key, [])
@ -869,23 +877,25 @@ cdef class DependencyTreeMatcher:
                _root = _root_list[i]
                _tree = _tree_list[i]
                _nodes = _nodes_list[i]
                id_to_position = {}
                for i in range(len(_nodes)):
                    id_to_position[i]=[]
                # This could be taken outside to improve running time..?
                for match_id, start, end in matches:
                    if match_id in _keys_to_token:
                        if _keys_to_token[match_id] not in id_to_position:
                            id_to_position[_keys_to_token[match_id]] = []
                        id_to_position[_keys_to_token[match_id]].append(start)
                _node_operator_map = self.get_node_operator_map(doc,_tree,id_to_position,_nodes,_root)
                length = len(_nodes)
                if _root in id_to_position:
                    candidates = id_to_position[_root]
                    for candidate in candidates:
                        isVisited = {}
-                        self.dfs(candidate,_root,_tree,id_to_position,doc,isVisited)
+                        self.dfs(candidate,_root,_tree,id_to_position,doc,isVisited,_node_operator_map)
-                        # to check if the subtree pattern is completely identified
+                        # To check if the subtree pattern is completely identified. This is a heuristic.
                        # This is done to reduce the complexity of exponential unordered subtree matching.
                        # Will give approximate matches in some cases.
                        if(len(isVisited) == length):
                            matched_trees.append((key,list(isVisited)))
@ -896,23 +906,111 @@ cdef class DependencyTreeMatcher:
        return matched_trees
-    def dfs(self,candidate,root,tree,id_to_position,doc,isVisited):
+    def dfs(self,candidate,root,tree,id_to_position,doc,isVisited,_node_operator_map):
        if(root in id_to_position and candidate in id_to_position[root]):
            # color the node since it is valid
            isVisited[candidate] = True
            candidate_children = doc[candidate].children
            for candidate_child in candidate_children:
            if root in tree:
                for root_child in tree[root]:
-                        self.dfs(
+                    if candidate in _node_operator_map and root_child[INDEX_RELOP] in _node_operator_map[candidate]:
                        candidate_children = _node_operator_map[candidate][root_child[INDEX_RELOP]]
                        for candidate_child in candidate_children:
                            result = self.dfs(
                                candidate_child.i,
-                            root_child,
+                                root_child[INDEX_HEAD],
                                tree,
                                id_to_position,
                                doc,
-                            isVisited
+                                isVisited,
                                _node_operator_map
                            )
    # Given a node and an edge operator, to return the list of nodes
    # from the doc that belong to node+operator. This is used to store
    # all the results beforehand to prevent unnecessary computation while
    # pattern matching
    # _node_operator_map[node][operator] = [...]
    def get_node_operator_map(self,doc,tree,id_to_position,nodes,root):
        _node_operator_map = {}
        all_node_indices = nodes.values()
        all_operators = []
        for node in all_node_indices:
            if node in tree:
                for child in tree[node]:
                    all_operators.append(child[INDEX_RELOP])
        all_operators = list(set(all_operators))
        all_nodes = []
        for node in all_node_indices:
            all_nodes = all_nodes + id_to_position[node]
        all_nodes = list(set(all_nodes))
        for node in all_nodes:
            _node_operator_map[node] = {}
            for operator in all_operators:
                _node_operator_map[node][operator] = []
        # Used to invoke methods for each operator
        switcher = {
            '<':self.dep,
            '>':self.gov,
            '>>':self.dep_chain,
            '<<':self.gov_chain,
            '.':self.imm_precede,
            '$+':self.imm_right_sib,
            '$-':self.imm_left_sib,
            '$++':self.right_sib,
            '$--':self.left_sib
        }
        for operator in all_operators:
            for node in all_nodes:
                _node_operator_map[node][operator] = switcher.get(operator)(doc,node)
        return _node_operator_map
    def dep(self,doc,node):
        return list(doc[node].head)
    def gov(self,doc,node):
        return list(doc[node].children)
    def dep_chain(self,doc,node):
        return list(doc[node].ancestors)
    def gov_chain(self,doc,node):
        return list(doc[node].subtree)
    def imm_precede(self,doc,node):
        if node>0:
            return [doc[node-1]]
        return []
    def imm_right_sib(self,doc,node):
        for idx in range(list(doc[node].head.children)):
            if idx == node-1:
                return [doc[idx]]
        return []
    def imm_left_sib(self,doc,node):
        for idx in range(list(doc[node].head.children)):
            if idx == node+1:
                return [doc[idx]]
        return []
    def right_sib(self,doc,node):
        candidate_children = []
        for idx in range(list(doc[node].head.children)):
            if idx < node:
                candidate_children.append(doc[idx])
        return candidate_children
    def left_sib(self,doc,node):
        candidate_children = []
        for idx in range(list(doc[node].head.children)):
            if idx > node:
                candidate_children.append(doc[idx])
        return candidate_children
    def _normalize_key(self, key):
        if isinstance(key, basestring):
            return self.vocab.strings.add(key)