From e7cf6c7d9b1df2aa0bc6146276697b10d0866f5b Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 3 Jul 2023 12:20:04 +0200
Subject: [PATCH] Fix cython-lint issues in spacy/matcher/.

---
 spacy/matcher/dependencymatcher.pyx |  10 +-
 spacy/matcher/matcher.pyx           | 235 +++++++++++++++++-----------
 spacy/matcher/phrasematcher.pyx     |   4 +-
 3 files changed, 151 insertions(+), 98 deletions(-)

diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index a214c0668..348e000ff 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -108,7 +108,7 @@ cdef class DependencyMatcher:
         key (str): The match ID.
         RETURNS (bool): Whether the matcher contains rules for this match ID.
         """
-        return self.has_key(key)
+        return self.has_key(key)  # no-cython-lint: W601
 
     def _validate_input(self, pattern, key):
         idx = 0
@@ -264,7 +264,7 @@ cdef class DependencyMatcher:
 
     def remove(self, key):
         key = self._normalize_key(key)
-        if not key in self._patterns:
+        if key not in self._patterns:
             raise ValueError(Errors.E175.format(key=key))
         self._patterns.pop(key)
         self._raw_patterns.pop(key)
@@ -382,7 +382,7 @@ cdef class DependencyMatcher:
             return []
         return [doc[node].head]
 
-    def _gov(self,doc,node):
+    def _gov(self, doc, node):
         return list(doc[node].children)
 
     def _dep_chain(self, doc, node):
@@ -443,7 +443,7 @@ cdef class DependencyMatcher:
 
     def _right_child(self, doc, node):
         return [child for child in doc[node].rights]
-    
+
     def _left_child(self, doc, node):
         return [child for child in doc[node].lefts]
 
@@ -461,7 +461,7 @@ cdef class DependencyMatcher:
         if doc[node].head.i > node:
             return [doc[node].head]
         return []
-    
+
     def _left_parent(self, doc, node):
         if doc[node].head.i < node:
             return [doc[node].head]
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 3d03f37ae..ea35f5f73 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -19,10 +19,8 @@ from ..attrs cimport (
     LEMMA,
     MORPH,
     NULL_ATTR,
-    ORTH,
     POS,
     TAG,
-    attr_id_t,
 )
 from ..structs cimport TokenC
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
@@ -30,13 +28,11 @@ from ..tokens.morphanalysis cimport MorphAnalysis
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
-from ..vocab cimport Vocab
 
 from ..attrs import IDS
 from ..errors import Errors, MatchPatternError, Warnings
 from ..schemas import validate_token_pattern
 from ..strings import get_string_id
-from ..util import registry
 from .levenshtein import levenshtein_compare
 
 DEF PADDING = 5
@@ -87,9 +83,9 @@ cdef class Matcher:
         key (str): The match ID.
         RETURNS (bool): Whether the matcher contains rules for this match ID.
         """
-        return self.has_key(key)
+        return self.has_key(key)  # no-cython-lint: W601
 
-    def add(self, key, patterns, *, on_match=None, greedy: str=None):
+    def add(self, key, patterns, *, on_match=None, greedy: str = None):
         """Add a match-rule to the matcher. A match-rule consists of: an ID
         key, an on_match callback, and one or more patterns.
 
@@ -143,8 +139,13 @@ cdef class Matcher:
         key = self._normalize_key(key)
         for pattern in patterns:
             try:
-                specs = _preprocess_pattern(pattern, self.vocab,
-                    self._extensions, self._extra_predicates, self._fuzzy_compare)
+                specs = _preprocess_pattern(
+                    pattern,
+                    self.vocab,
+                    self._extensions,
+                    self._extra_predicates,
+                    self._fuzzy_compare
+                )
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
                 for spec in specs:
                     for attr, _ in spec[1]:
@@ -168,7 +169,7 @@ cdef class Matcher:
         key (str): The ID of the match rule.
         """
         norm_key = self._normalize_key(key)
-        if not norm_key in self._patterns:
+        if norm_key not in self._patterns:
             raise ValueError(Errors.E175.format(key=key))
         self._patterns.pop(norm_key)
         self._callbacks.pop(norm_key)
@@ -268,8 +269,15 @@ cdef class Matcher:
         if self.patterns.empty():
             matches = []
         else:
-            matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
-                                    extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
+            matches = find_matches(
+                &self.patterns[0],
+                self.patterns.size(),
+                doclike,
+                length,
+                extensions=self._extensions,
+                predicates=self._extra_predicates,
+                with_alignments=with_alignments
+            )
         final_matches = []
         pairs_by_id = {}
         # For each key, either add all matches, or only the filtered,
@@ -289,9 +297,9 @@ cdef class Matcher:
             memset(matched, 0, length * sizeof(matched[0]))
             span_filter = self._filter.get(key)
             if span_filter == "FIRST":
-                sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False) # sort by start
+                sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False)  # sort by start
             elif span_filter == "LONGEST":
-                sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length
+                sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True)  # reverse sort by length
             else:
                 raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter))
             for match in sorted_pairs:
@@ -366,7 +374,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
     cdef vector[MatchC] matches
     cdef vector[vector[MatchAlignmentC]] align_states
     cdef vector[vector[MatchAlignmentC]] align_matches
-    cdef PatternStateC state
     cdef int i, j, nr_extra_attr
     cdef Pool mem = Pool()
     output = []
@@ -388,14 +395,22 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
                 value = token.vocab.strings[value]
             extra_attr_values[i * nr_extra_attr + index] = value
     # Main loop
-    cdef int nr_predicate = len(predicates)
     for i in range(length):
         for j in range(n):
             states.push_back(PatternStateC(patterns[j], i, 0))
         if with_alignments != 0:
             align_states.resize(states.size())
-        transition_states(states, matches, align_states, align_matches, predicate_cache,
-            doclike[i], extra_attr_values, predicates, with_alignments)
+        transition_states(
+            states,
+            matches,
+            align_states,
+            align_matches,
+            predicate_cache,
+            doclike[i],
+            extra_attr_values,
+            predicates,
+            with_alignments
+        )
         extra_attr_values += nr_extra_attr
         predicate_cache += len(predicates)
     # Handle matches that end in 0-width patterns
@@ -421,18 +436,28 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
     return output
 
 
-cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
-                            vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
-                            int8_t* cached_py_predicates,
-        Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *:
+cdef void transition_states(
+    vector[PatternStateC]& states,
+    vector[MatchC]& matches,
+    vector[vector[MatchAlignmentC]]& align_states,
+    vector[vector[MatchAlignmentC]]& align_matches,
+    int8_t* cached_py_predicates,
+    Token token,
+    const attr_t* extra_attrs,
+    py_predicates,
+    bint with_alignments
+) except *:
     cdef int q = 0
     cdef vector[PatternStateC] new_states
     cdef vector[vector[MatchAlignmentC]] align_new_states
-    cdef int nr_predicate = len(py_predicates)
     for i in range(states.size()):
         if states[i].pattern.nr_py >= 1:
-            update_predicate_cache(cached_py_predicates,
-                states[i].pattern, token, py_predicates)
+            update_predicate_cache(
+                cached_py_predicates,
+                states[i].pattern,
+                token,
+                py_predicates
+            )
         action = get_action(states[i], token.c, extra_attrs,
                             cached_py_predicates)
         if action == REJECT:
@@ -468,8 +493,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
                     align_new_states.push_back(align_states[q])
             states[q].pattern += 1
             if states[q].pattern.nr_py != 0:
-                update_predicate_cache(cached_py_predicates,
-                    states[q].pattern, token, py_predicates)
+                update_predicate_cache(
+                    cached_py_predicates,
+                    states[q].pattern,
+                    token,
+                    py_predicates
+                )
             action = get_action(states[q], token.c, extra_attrs,
                                 cached_py_predicates)
         # Update alignment before the transition of current state
@@ -485,8 +514,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
             ent_id = get_ent_id(state.pattern)
             if action == MATCH:
                 matches.push_back(
-                    MatchC(pattern_id=ent_id, start=state.start,
-                            length=state.length+1))
+                    MatchC(
+                        pattern_id=ent_id,
+                        start=state.start,
+                        length=state.length+1
+                    )
+                )
                 # `align_matches` always corresponds to `matches` 1:1
                 if with_alignments != 0:
                     align_matches.push_back(align_states[q])
@@ -494,23 +527,35 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
                 # push match without last token if length > 0
                 if state.length > 0:
                     matches.push_back(
-                        MatchC(pattern_id=ent_id, start=state.start,
-                                length=state.length))
+                        MatchC(
+                            pattern_id=ent_id,
+                            start=state.start,
+                            length=state.length
+                        )
+                    )
                     # MATCH_DOUBLE emits matches twice,
                     # add one more to align_matches in order to keep 1:1 relationship
                     if with_alignments != 0:
                         align_matches.push_back(align_states[q])
                 # push match with last token
                 matches.push_back(
-                    MatchC(pattern_id=ent_id, start=state.start,
-                            length=state.length+1))
+                    MatchC(
+                        pattern_id=ent_id,
+                        start=state.start,
+                        length=state.length + 1
+                    )
+                )
                 # `align_matches` always corresponds to `matches` 1:1
                 if with_alignments != 0:
                     align_matches.push_back(align_states[q])
             elif action == MATCH_REJECT:
                 matches.push_back(
-                    MatchC(pattern_id=ent_id, start=state.start,
-                            length=state.length))
+                    MatchC(
+                        pattern_id=ent_id,
+                        start=state.start,
+                        length=state.length
+                    )
+                )
                 # `align_matches` always corresponds to `matches` 1:1
                 if with_alignments != 0:
                     align_matches.push_back(align_states[q])
@@ -533,8 +578,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
             align_states.push_back(align_new_states[i])
 
 
-cdef int update_predicate_cache(int8_t* cache,
-        const TokenPatternC* pattern, Token token, predicates) except -1:
+cdef int update_predicate_cache(
+    int8_t* cache,
+    const TokenPatternC* pattern,
+    Token token,
+    predicates
+) except -1:
     # If the state references any extra predicates, check whether they match.
     # These are cached, so that we don't call these potentially expensive
     # Python functions more than we need to.
@@ -580,10 +629,12 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
             else:
                 state.pattern += 1
 
-
-cdef action_t get_action(PatternStateC state,
-        const TokenC* token, const attr_t* extra_attrs,
-        const int8_t* predicate_matches) nogil:
+cdef action_t get_action(
+    PatternStateC state,
+    const TokenC * token,
+    const attr_t * extra_attrs,
+    const int8_t * predicate_matches
+) nogil:
     """We need to consider:
     a) Does the token match the specification? [Yes, No]
     b) What's the quantifier? [1, 0+, ?]
@@ -649,53 +700,56 @@ cdef action_t get_action(PatternStateC state,
         is_match = not is_match
         quantifier = ONE
     if quantifier == ONE:
-      if is_match and is_final:
-          # Yes, final: 1000
-          return MATCH
-      elif is_match and not is_final:
-          # Yes, non-final: 0100
-          return ADVANCE
-      elif not is_match and is_final:
-          # No, final: 0000
-          return REJECT
-      else:
-          return REJECT
+        if is_match and is_final:
+            # Yes, final: 1000
+            return MATCH
+        elif is_match and not is_final:
+            # Yes, non-final: 0100
+            return ADVANCE
+        elif not is_match and is_final:
+            # No, final: 0000
+            return REJECT
+        else:
+            return REJECT
     elif quantifier == ZERO_PLUS:
-      if is_match and is_final:
-          # Yes, final: 1001
-          return MATCH_EXTEND
-      elif is_match and not is_final:
-          # Yes, non-final: 0011
-          return RETRY_EXTEND
-      elif not is_match and is_final:
-          # No, final 2000 (note: Don't include last token!)
-          return MATCH_REJECT
-      else:
-          # No, non-final 0010
-          return RETRY
+        if is_match and is_final:
+            # Yes, final: 1001
+            return MATCH_EXTEND
+        elif is_match and not is_final:
+            # Yes, non-final: 0011
+            return RETRY_EXTEND
+        elif not is_match and is_final:
+            # No, final 2000 (note: Don't include last token!)
+            return MATCH_REJECT
+        else:
+            # No, non-final 0010
+            return RETRY
     elif quantifier == ZERO_ONE:
-      if is_match and is_final:
-          # Yes, final: 3000
-          # To cater for a pattern ending in "?", we need to add
-          # a match both with and without the last token
-          return MATCH_DOUBLE
-      elif is_match and not is_final:
-          # Yes, non-final: 0110
-          # We need both branches here, consider a pair like:
-          # pattern: .?b string: b
-          # If we 'ADVANCE' on the .?, we miss the match.
-          return RETRY_ADVANCE
-      elif not is_match and is_final:
-          # No, final 2000 (note: Don't include last token!)
-          return MATCH_REJECT
-      else:
-          # No, non-final 0010
-          return RETRY
+        if is_match and is_final:
+            # Yes, final: 3000
+            # To cater for a pattern ending in "?", we need to add
+            # a match both with and without the last token
+            return MATCH_DOUBLE
+        elif is_match and not is_final:
+            # Yes, non-final: 0110
+            # We need both branches here, consider a pair like:
+            # pattern: .?b string: b
+            # If we 'ADVANCE' on the .?, we miss the match.
+            return RETRY_ADVANCE
+        elif not is_match and is_final:
+            # No, final 2000 (note: Don't include last token!)
+            return MATCH_REJECT
+        else:
+            # No, non-final 0010
+            return RETRY
 
 
-cdef int8_t get_is_match(PatternStateC state,
-        const TokenC* token, const attr_t* extra_attrs,
-        const int8_t* predicate_matches) nogil:
+cdef int8_t get_is_match(
+    PatternStateC state,
+    const TokenC* token,
+    const attr_t* extra_attrs,
+    const int8_t* predicate_matches
+) nogil:
     for i in range(state.pattern.nr_py):
         if predicate_matches[state.pattern.py_predicates[i]] == -1:
             return 0
@@ -860,7 +914,7 @@ class _FuzzyPredicate:
         self.is_extension = is_extension
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
-        fuzz = self.predicate[len("FUZZY"):] # number after prefix
+        fuzz = self.predicate[len("FUZZY"):]  # number after prefix
         self.fuzzy = int(fuzz) if fuzz else -1
         self.fuzzy_compare = fuzzy_compare
         self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
@@ -1082,7 +1136,7 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
         elif cls == _FuzzyPredicate:
             if isinstance(value, dict):
                 # add predicates inside fuzzy operator
-                fuzz = type_[len("FUZZY"):] # number after prefix
+                fuzz = type_[len("FUZZY"):]  # number after prefix
                 fuzzy_val = int(fuzz) if fuzz else -1
                 output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
                                                          extra_predicates, seen_predicates,
@@ -1101,8 +1155,9 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
     return output
 
 
-def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
-        seen_predicates):
+def _get_extension_extra_predicates(
+    spec, extra_predicates, predicate_types, seen_predicates
+):
     output = []
     for attr, value in spec.items():
         if isinstance(value, dict):
@@ -1131,7 +1186,7 @@ def _get_operators(spec):
         return (ONE,)
     elif spec["OP"] in lookup:
         return lookup[spec["OP"]]
-    #Min_max {n,m}
+    # Min_max {n,m}
     elif spec["OP"].startswith("{") and spec["OP"].endswith("}"):
         # {n}  --> {n,n}  exactly n                 ONE,(n)
         # {n,m}--> {n,m}  min of n, max of m        ONE,(n),ZERO_ONE,(m)
@@ -1142,8 +1197,8 @@ def _get_operators(spec):
         min_max = min_max if "," in min_max else f"{min_max},{min_max}"
         n, m = min_max.split(",")
 
-        #1. Either n or m is a blank string and the other is numeric -->isdigit
-        #2. Both are numeric and n <= m
+        # 1. Either n or m is a blank string and the other is numeric -->isdigit
+        # 2. Both are numeric and n <= m
         if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)):
             keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
             raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index c407cf1cc..26633e6d6 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,14 +1,12 @@
 # cython: infer_types=True, profile=True
-from libc.stdint cimport uintptr_t
 from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
 
 import warnings
 
-from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG
+from ..attrs cimport DEP, LEMMA, MORPH, POS, TAG
 
 from ..attrs import IDS
 
-from ..structs cimport TokenC
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t