From 1f2e57eca4ccbb283cda9b82d400872cbbec81c9 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 22 Aug 2022 17:02:47 +0200 Subject: [PATCH 01/15] enable fuzzy matching --- requirements.txt | 1 + setup.cfg | 1 + spacy/matcher/matcher.pxd | 1 + spacy/matcher/matcher.pyi | 2 +- spacy/matcher/matcher.pyx | 44 ++++++++++++++--------- spacy/tests/matcher/test_matcher_api.py | 48 +++++++++++++++++++++++++ 6 files changed, 80 insertions(+), 17 deletions(-) diff --git a/requirements.txt b/requirements.txt index 437dd415a..38b4cbf0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,7 @@ tqdm>=4.38.0,<5.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 langcodes>=3.2.0,<4.0.0 +rapidfuzz>=2.4.0,<3.0.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index 708300b04..536322ab1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -63,6 +63,7 @@ install_requires = packaging>=20.0 typing_extensions>=3.7.4,<4.2.0; python_version < "3.8" langcodes>=3.2.0,<4.0.0 + rapidfuzz>=2.4.0,<3.0.0 [options.entry_points] console_scripts = diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index 455f978cc..b5e24e0e2 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -71,6 +71,7 @@ cdef class Matcher: cdef vector[TokenPatternC*] patterns cdef readonly Vocab vocab cdef public object validate + cdef public object fuzzy cdef public object _patterns cdef public object _callbacks cdef public object _filter diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index 390629ff8..c7f487450 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -5,7 +5,7 @@ from ..vocab import Vocab from ..tokens import Doc, Span class Matcher: - def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ... + def __init__(self, vocab: Vocab, validate: bool = ..., fuzzy: float = ...) -> None: ... def __reduce__(self) -> Any: ... def __len__(self) -> int: ... def __contains__(self, key: str) -> bool: ... diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 5105f69ed..0d847c219 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -10,6 +10,7 @@ from murmurhash.mrmr cimport hash64 import re import srsly import warnings +from rapidfuzz import fuzz_cpp from ..typedefs cimport attr_t from ..structs cimport TokenC @@ -19,6 +20,7 @@ from ..tokens.span cimport Span from ..tokens.token cimport Token from ..tokens.morphanalysis cimport MorphAnalysis from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB +from ..attrs cimport LOWER, NORM from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings @@ -36,7 +38,7 @@ cdef class Matcher: USAGE: https://spacy.io/usage/rule-based-matching """ - def __init__(self, vocab, validate=True): + def __init__(self, vocab, validate=True, fuzzy=None): """Create the Matcher. 
vocab (Vocab): The vocabulary object, which must be shared with the @@ -51,6 +53,7 @@ cdef class Matcher: self.vocab = vocab self.mem = Pool() self.validate = validate + self.fuzzy = fuzzy if fuzzy is not None else 0 def __reduce__(self): data = (self.vocab, self._patterns, self._callbacks) @@ -253,7 +256,8 @@ cdef class Matcher: matches = [] else: matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, - extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments) + extensions=self._extensions, predicates=self._extra_predicates, + with_alignments=with_alignments, fuzzy=self.fuzzy) final_matches = [] pairs_by_id = {} # For each key, either add all matches, or only the filtered, @@ -334,7 +338,7 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher -cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0): +cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0, float fuzzy=0): """Find matches in a doc, with a compiled array of patterns. Matches are returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0) @@ -379,7 +383,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e if with_alignments != 0: align_states.resize(states.size()) transition_states(states, matches, align_states, align_matches, predicate_cache, - doclike[i], extra_attr_values, predicates, with_alignments) + doclike[i], extra_attr_values, predicates, with_alignments, fuzzy) extra_attr_values += nr_extra_attr predicate_cache += len(predicates) # Handle matches that end in 0-width patterns @@ -408,7 +412,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches, int8_t* cached_py_predicates, - Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *: + Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments, float fuzzy) except *: cdef int q = 0 cdef vector[PatternStateC] new_states cdef vector[vector[MatchAlignmentC]] align_new_states @@ -417,8 +421,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match if states[i].pattern.nr_py >= 1: update_predicate_cache(cached_py_predicates, states[i].pattern, token, py_predicates) - action = get_action(states[i], token.c, extra_attrs, - cached_py_predicates) + action = get_action(states[i], token, extra_attrs, + cached_py_predicates, fuzzy) if action == REJECT: continue # Keep only a subset of states (the active ones). 
Index q is the @@ -454,8 +458,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match if states[q].pattern.nr_py != 0: update_predicate_cache(cached_py_predicates, states[q].pattern, token, py_predicates) - action = get_action(states[q], token.c, extra_attrs, - cached_py_predicates) + action = get_action(states[q], token, extra_attrs, + cached_py_predicates, fuzzy) # Update alignment before the transition of current state if with_alignments != 0: align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length)) @@ -566,8 +570,8 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states, cdef action_t get_action(PatternStateC state, - const TokenC* token, const attr_t* extra_attrs, - const int8_t* predicate_matches) nogil: + Token token, const attr_t* extra_attrs, + const int8_t* predicate_matches, float fuzzy) nogil: """We need to consider: a) Does the token match the specification? [Yes, No] b) What's the quantifier? [1, 0+, ?] @@ -626,7 +630,7 @@ cdef action_t get_action(PatternStateC state, Problem: If a quantifier is matching, we're adding a lot of open partials """ cdef int8_t is_match - is_match = get_is_match(state, token, extra_attrs, predicate_matches) + is_match = get_is_match(state, token, extra_attrs, predicate_matches, fuzzy) quantifier = get_quantifier(state) is_final = get_is_final(state) if quantifier == ZERO: @@ -678,16 +682,24 @@ cdef action_t get_action(PatternStateC state, cdef int8_t get_is_match(PatternStateC state, - const TokenC* token, const attr_t* extra_attrs, - const int8_t* predicate_matches) nogil: + Token token, const attr_t* extra_attrs, + const int8_t* predicate_matches, float fuzzy) nogil: for i in range(state.pattern.nr_py): if predicate_matches[state.pattern.py_predicates[i]] == -1: return 0 spec = state.pattern if spec.nr_attr > 0: for attr in spec.attrs[:spec.nr_attr]: - if get_token_attr_for_matcher(token, attr.attr) != attr.value: - return 0 + token_attr_value = get_token_attr_for_matcher(token.c, attr.attr) + if token_attr_value != attr.value: + if fuzzy != 0 and (attr.attr == ORTH or attr.attr == LEMMA + or attr.attr == LOWER or attr.attr == NORM): + with gil: + if fuzz_cpp.ratio(token.vocab.strings[token_attr_value], + token.vocab.strings[attr.value]) < fuzzy: + return 0 + else: + return 0 for i in range(spec.nr_extra_attr): if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]: return 0 diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 7c16da9f8..c29a349af 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -118,6 +118,54 @@ def test_matcher_match_multi(matcher): ] +def test_matcher_match_fuzz_all(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 2, 4), + (doc.vocab.strings["Java"], 5, 6), + ] + +def test_matcher_match_fuzz_some(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], + } + matcher = Matcher(en_vocab, fuzzy=85) + for key, patterns in rules.items(): + matcher.add(key, patterns) + 
+ words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["Java"], 5, 6), + ] + +def test_matcher_match_fuzz_none(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], + } + matcher = Matcher(en_vocab, fuzzy=90) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [] + + def test_matcher_empty_dict(en_vocab): """Test matcher allows empty token specs, meaning match on any token.""" matcher = Matcher(en_vocab) From b617382dc65432956accb91f150e52d79019dcaa Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Wed, 24 Aug 2022 13:13:27 +0200 Subject: [PATCH 02/15] add fuzzy param to EntityMatcher --- spacy/pipeline/entityruler.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 3cb1ca676..d1b05334e 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -28,6 +28,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] "overwrite_ents": False, "ent_id_sep": DEFAULT_ENT_ID_SEP, "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, + "fuzzy": None, }, default_score_weights={ "ents_f": 1.0, @@ -44,6 +45,7 @@ def make_entity_ruler( overwrite_ents: bool, ent_id_sep: str, scorer: Optional[Callable], + fuzzy: Optional[float], ): return EntityRuler( nlp, @@ -53,6 +55,7 @@ def make_entity_ruler( overwrite_ents=overwrite_ents, ent_id_sep=ent_id_sep, scorer=scorer, + fuzzy=fuzzy, ) @@ -87,6 +90,7 @@ class EntityRuler(Pipe): ent_id_sep: str = DEFAULT_ENT_ID_SEP, patterns: Optional[List[PatternType]] = None, scorer: Optional[Callable] = entity_ruler_score, + fuzzy: Optional[float] = None, ) -> None: """Initialize the entity ruler. 
If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` @@ -118,7 +122,7 @@ class EntityRuler(Pipe): self.token_patterns = defaultdict(list) # type: ignore self.phrase_patterns = defaultdict(list) # type: ignore self._validate = validate - self.matcher = Matcher(nlp.vocab, validate=validate) + self.matcher = Matcher(nlp.vocab, validate=validate, fuzzy=fuzzy) self.phrase_matcher_attr = phrase_matcher_attr self.phrase_matcher = PhraseMatcher( nlp.vocab, attr=self.phrase_matcher_attr, validate=validate @@ -128,6 +132,7 @@ class EntityRuler(Pipe): if patterns is not None: self.add_patterns(patterns) self.scorer = scorer + self.fuzzy = fuzzy def __len__(self) -> int: """The number of all patterns added to the entity ruler.""" @@ -338,7 +343,7 @@ class EntityRuler(Pipe): self.token_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list) self._ent_ids = defaultdict(tuple) - self.matcher = Matcher(self.nlp.vocab, validate=self._validate) + self.matcher = Matcher(self.nlp.vocab, validate=self._validate, fuzzy=self.fuzzy) self.phrase_matcher = PhraseMatcher( self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate ) From ee985a382e47729cba079c03c0cc5b15a618f6eb Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Wed, 24 Aug 2022 13:13:54 +0200 Subject: [PATCH 03/15] include rapidfuzz_capi not yet used --- requirements.txt | 1 + setup.cfg | 3 ++- setup.py | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 38b4cbf0d..47dcede1f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 langcodes>=3.2.0,<4.0.0 rapidfuzz>=2.4.0,<3.0.0 +rapidfuzz_capi>=1.0.5,<2.0.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index 536322ab1..658683df7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -58,12 +58,13 @@ install_requires = requests>=2.13.0,<3.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 + rapidfuzz>=2.4.0,<3.0.0 + rapidfuzz_capi>=1.0.5,<2.0.0 # Official Python utilities setuptools packaging>=20.0 typing_extensions>=3.7.4,<4.2.0; python_version < "3.8" langcodes>=3.2.0,<4.0.0 - rapidfuzz>=2.4.0,<3.0.0 [options.entry_points] console_scripts = diff --git a/setup.py b/setup.py index ec1bd35fa..413c55d22 100755 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ from Cython.Build import cythonize from Cython.Compiler import Options import os import subprocess +import rapidfuzz_capi ROOT = Path(__file__).parent @@ -202,6 +203,7 @@ def setup_package(): include_dirs = [ numpy.get_include(), + rapidfuzz_capi.get_include(), get_python_inc(plat_specific=True), ] ext_modules = [] From 9600fe1d99923d57666332d5d62399a1e7ed8873 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Wed, 24 Aug 2022 15:04:09 +0200 Subject: [PATCH 04/15] fix type --- spacy/pipeline/entityruler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index d1b05334e..e5852e4e8 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -90,7 +90,7 @@ class EntityRuler(Pipe): ent_id_sep: str = DEFAULT_ENT_ID_SEP, patterns: Optional[List[PatternType]] = None, scorer: Optional[Callable] = entity_ruler_score, - fuzzy: Optional[float] = None, + fuzzy: float = None, ) -> None: """Initialize the entity ruler. 
If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` From 3dc5b9c7be99854c146e2ab14ff3c7750a2f934e Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Wed, 24 Aug 2022 17:54:42 +0200 Subject: [PATCH 05/15] add FUZZY predicate --- spacy/matcher/matcher.pyx | 41 +++++++++++++++++++------ spacy/schemas.py | 2 ++ spacy/tests/matcher/test_matcher_api.py | 16 ++++++++++ 3 files changed, 50 insertions(+), 9 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 0d847c219..b4f0a3f5e 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -131,7 +131,7 @@ cdef class Matcher: for pattern in patterns: try: specs = _preprocess_pattern(pattern, self.vocab, - self._extensions, self._extra_predicates) + self._extensions, self._extra_predicates, self.fuzzy) self.patterns.push_back(init_pattern(self.mem, key, specs)) for spec in specs: for attr, _ in spec[1]: @@ -766,7 +766,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: return id_attr.value -def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): +def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy): """This function interprets the pattern, converting the various bits of syntactic sugar before we compile it into a struct with init_pattern. @@ -793,7 +793,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): ops = _get_operators(spec) attr_values = _get_attr_values(spec, string_store) extensions = _get_extensions(spec, string_store, extensions_table) - predicates = _get_extra_predicates(spec, extra_predicates, vocab) + predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy) for op in ops: tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx)) return tokens @@ -838,10 +838,32 @@ def _get_attr_values(spec, string_store): # These predicate helper classes are used to match the REGEX, IN, >= etc # extensions to the matcher introduced in #3173. 
+class _FuzzyPredicate: + operators = ("FUZZY",) + + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): + self.i = i + self.attr = attr + self.value = value + self.predicate = predicate + self.is_extension = is_extension + self.fuzzy = fuzzy + self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + if self.predicate not in self.operators: + raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) + + def __call__(self, Token token): + if self.is_extension: + value = token._.get(self.attr) + else: + value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] + return bool(fuzz_cpp.ratio(self.value, value) >= self.fuzzy) + + class _RegexPredicate: operators = ("REGEX",) - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): self.i = i self.attr = attr self.value = re.compile(value) @@ -862,7 +884,7 @@ class _RegexPredicate: class _SetPredicate: operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS") - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): self.i = i self.attr = attr self.vocab = vocab @@ -894,9 +916,9 @@ class _SetPredicate: else: value = set(get_string_id(v) for v in value) if self.predicate == "IN": - return value in self.value + return value in self.value # handle fuzzy elif self.predicate == "NOT_IN": - return value not in self.value + return value not in self.value # handle fuzzy elif self.predicate == "IS_SUBSET": return value <= self.value elif self.predicate == "IS_SUPERSET": @@ -940,8 +962,9 @@ class _ComparisonPredicate: return value < self.value -def _get_extra_predicates(spec, extra_predicates, vocab): +def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy): predicate_types = { + "FUZZY": _FuzzyPredicate, "REGEX": _RegexPredicate, "IN": _SetPredicate, "NOT_IN": _SetPredicate, @@ -975,7 +998,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab): value_with_upper_keys = {k.upper(): v for k, v in value.items()} for type_, cls in predicate_types.items(): if type_ in value_with_upper_keys: - predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab) + predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab, fuzzy=fuzzy) # Don't create a redundant predicates. # This helps with efficiency, as we're caching the results. 
if predicate.key in seen_predicates: diff --git a/spacy/schemas.py b/spacy/schemas.py index 9f91451a9..2677378d6 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -157,6 +157,7 @@ def validate_token_pattern(obj: list) -> List[str]: class TokenPatternString(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") + FUZZY: Optional[StrictStr] = Field(None, alias="fuzzy") IN: Optional[List[StrictStr]] = Field(None, alias="in") NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset") @@ -176,6 +177,7 @@ class TokenPatternString(BaseModel): class TokenPatternNumber(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") + FUZZY: Optional[StrictStr] = Field(None, alias="fuzzy") IN: Optional[List[StrictInt]] = Field(None, alias="in") NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in") IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset") diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index c29a349af..595488bf4 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -166,6 +166,22 @@ def test_matcher_match_fuzz_none(en_vocab): assert matcher(doc) == [] +def test_matcher_match_fuzz_pred(en_vocab): + rules = { + "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]], + "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Now", "and", "JavaScrpt", "best"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [] + + + def test_matcher_empty_dict(en_vocab): """Test matcher allows empty token specs, meaning match on any token.""" matcher = Matcher(en_vocab) From 78699ab0ce105720203c95a4bcd3e9c729090819 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Fri, 26 Aug 2022 00:10:53 +0200 Subject: [PATCH 06/15] add fuzzy attribute list --- spacy/matcher/matcher.pxd | 1 + spacy/matcher/matcher.pyi | 3 +- spacy/matcher/matcher.pyx | 66 ++++++++++----- spacy/tests/matcher/test_matcher_api.py | 105 ++++++++++++++++-------- 4 files changed, 119 insertions(+), 56 deletions(-) diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index b5e24e0e2..98041e199 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -72,6 +72,7 @@ cdef class Matcher: cdef readonly Vocab vocab cdef public object validate cdef public object fuzzy + cdef public object fuzzy_attrs cdef public object _patterns cdef public object _callbacks cdef public object _filter diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index c7f487450..676be6a45 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -5,7 +5,8 @@ from ..vocab import Vocab from ..tokens import Doc, Span class Matcher: - def __init__(self, vocab: Vocab, validate: bool = ..., fuzzy: float = ...) -> None: ... + def __init__(self, vocab: Vocab, validate: bool = ..., + fuzzy: float = ..., fuzzy_attrs: list = ...) -> None: ... def __reduce__(self) -> Any: ... def __len__(self) -> int: ... def __contains__(self, key: str) -> bool: ... 
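
For reference while reviewing: the widened constructor signature above is exercised by the tests added further down in this patch. A minimal usage sketch of the API as it stands at this commit (assuming an English Vocab such as the en_vocab test fixture; later commits in the series rework this interface):

    from spacy.matcher import Matcher
    from spacy.tokens import Doc

    # Fuzzy matching at ratio >= 80, restricted to the ORTH and LOWER attributes.
    matcher = Matcher(en_vocab, fuzzy=80, fuzzy_attrs=["ORTH", "LOWER"])
    matcher.add("GoogleNow", [[{"ORTH": "Google"}, {"ORTH": "Now"}]])
    doc = Doc(matcher.vocab, words=["I", "like", "Goggle", "Now"])
    matches = matcher(doc)  # "Goggle" ~ "Google" clears the 80 threshold
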
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index b4f0a3f5e..17d965eaa 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -38,7 +38,7 @@ cdef class Matcher: USAGE: https://spacy.io/usage/rule-based-matching """ - def __init__(self, vocab, validate=True, fuzzy=None): + def __init__(self, vocab, validate=True, fuzzy=None, fuzzy_attrs=None): """Create the Matcher. vocab (Vocab): The vocabulary object, which must be shared with the @@ -54,6 +54,7 @@ cdef class Matcher: self.mem = Pool() self.validate = validate self.fuzzy = fuzzy if fuzzy is not None else 0 + self.fuzzy_attrs = [IDS.get(attr) for attr in fuzzy_attrs] if fuzzy_attrs else [] def __reduce__(self): data = (self.vocab, self._patterns, self._callbacks) @@ -131,7 +132,8 @@ cdef class Matcher: for pattern in patterns: try: specs = _preprocess_pattern(pattern, self.vocab, - self._extensions, self._extra_predicates, self.fuzzy) + self._extensions, self._extra_predicates, + self.fuzzy, self.fuzzy_attrs) self.patterns.push_back(init_pattern(self.mem, key, specs)) for spec in specs: for attr, _ in spec[1]: @@ -257,7 +259,8 @@ cdef class Matcher: else: matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, extensions=self._extensions, predicates=self._extra_predicates, - with_alignments=with_alignments, fuzzy=self.fuzzy) + with_alignments=with_alignments, + fuzzy=self.fuzzy, fuzzy_attrs=self.fuzzy_attrs) final_matches = [] pairs_by_id = {} # For each key, either add all matches, or only the filtered, @@ -338,7 +341,7 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher -cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0, float fuzzy=0): +cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0, float fuzzy=0, list fuzzy_attrs=[]): """Find matches in a doc, with a compiled array of patterns. 
Matches are returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0) @@ -357,6 +360,9 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e cdef PatternStateC state cdef int i, j, nr_extra_attr cdef Pool mem = Pool() + cdef int8_t* fuzzy_attrs_array + cdef int n_fuzzy_attrs = len(fuzzy_attrs) + output = [] if length == 0: # avoid any processing or mem alloc if the document is empty @@ -375,6 +381,10 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e if isinstance(value, str): value = token.vocab.strings[value] extra_attr_values[i * nr_extra_attr + index] = value + if n_fuzzy_attrs > 0: + fuzzy_attrs_array = mem.alloc(n_fuzzy_attrs, sizeof(int8_t)) + for i in range(n_fuzzy_attrs): + fuzzy_attrs_array[i] = fuzzy_attrs[i] # Main loop cdef int nr_predicate = len(predicates) for i in range(length): @@ -383,7 +393,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e if with_alignments != 0: align_states.resize(states.size()) transition_states(states, matches, align_states, align_matches, predicate_cache, - doclike[i], extra_attr_values, predicates, with_alignments, fuzzy) + doclike[i], extra_attr_values, predicates, with_alignments, + fuzzy, fuzzy_attrs_array, n_fuzzy_attrs) extra_attr_values += nr_extra_attr predicate_cache += len(predicates) # Handle matches that end in 0-width patterns @@ -412,7 +423,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches, int8_t* cached_py_predicates, - Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments, float fuzzy) except *: + Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments, + float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) except *: cdef int q = 0 cdef vector[PatternStateC] new_states cdef vector[vector[MatchAlignmentC]] align_new_states @@ -422,7 +434,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match update_predicate_cache(cached_py_predicates, states[i].pattern, token, py_predicates) action = get_action(states[i], token, extra_attrs, - cached_py_predicates, fuzzy) + cached_py_predicates, + fuzzy, fuzzy_attrs, n_fuzzy_attrs) if action == REJECT: continue # Keep only a subset of states (the active ones). Index q is the @@ -459,7 +472,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match update_predicate_cache(cached_py_predicates, states[q].pattern, token, py_predicates) action = get_action(states[q], token, extra_attrs, - cached_py_predicates, fuzzy) + cached_py_predicates, + fuzzy, fuzzy_attrs, n_fuzzy_attrs) # Update alignment before the transition of current state if with_alignments != 0: align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length)) @@ -571,7 +585,8 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states, cdef action_t get_action(PatternStateC state, Token token, const attr_t* extra_attrs, - const int8_t* predicate_matches, float fuzzy) nogil: + const int8_t* predicate_matches, + float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) nogil: """We need to consider: a) Does the token match the specification? [Yes, No] b) What's the quantifier? [1, 0+, ?] 
@@ -630,7 +645,8 @@ cdef action_t get_action(PatternStateC state, Problem: If a quantifier is matching, we're adding a lot of open partials """ cdef int8_t is_match - is_match = get_is_match(state, token, extra_attrs, predicate_matches, fuzzy) + is_match = get_is_match(state, token, extra_attrs, predicate_matches, + fuzzy, fuzzy_attrs, n_fuzzy_attrs) quantifier = get_quantifier(state) is_final = get_is_final(state) if quantifier == ZERO: @@ -683,7 +699,8 @@ cdef action_t get_action(PatternStateC state, cdef int8_t get_is_match(PatternStateC state, Token token, const attr_t* extra_attrs, - const int8_t* predicate_matches, float fuzzy) nogil: + const int8_t* predicate_matches, + float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) nogil: for i in range(state.pattern.nr_py): if predicate_matches[state.pattern.py_predicates[i]] == -1: return 0 @@ -692,16 +709,22 @@ cdef int8_t get_is_match(PatternStateC state, for attr in spec.attrs[:spec.nr_attr]: token_attr_value = get_token_attr_for_matcher(token.c, attr.attr) if token_attr_value != attr.value: - if fuzzy != 0 and (attr.attr == ORTH or attr.attr == LEMMA - or attr.attr == LOWER or attr.attr == NORM): - with gil: - if fuzz_cpp.ratio(token.vocab.strings[token_attr_value], - token.vocab.strings[attr.value]) < fuzzy: - return 0 + if fuzzy != 0: # and n_fuzzy_attrs > 0: + fuzzy_match = False + for i in range(n_fuzzy_attrs): + if attr.attr == fuzzy_attrs[i]: + with gil: + if fuzz_cpp.ratio(token.vocab.strings[token_attr_value], + token.vocab.strings[attr.value]) >= fuzzy: + fuzzy_match = True + break + if not fuzzy_match: + return 0 else: return 0 for i in range(spec.nr_extra_attr): if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]: + # TODO: fuzzy match return 0 return True @@ -766,7 +789,8 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: return id_attr.value -def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy): +def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, + fuzzy, fuzzy_attrs): """This function interprets the pattern, converting the various bits of syntactic sugar before we compile it into a struct with init_pattern. 
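
The per-token FUZZY predicate from the earlier commit composes with these matcher-level settings; a sketch mirroring the test cases below, again assuming the en_vocab fixture and the API as of this commit (the final commit in the series switches to FUZZYn predicates and drops the constructor argument):

    from spacy.matcher import Matcher
    from spacy.tokens import Doc

    # FUZZY on a single token; the threshold is taken from the Matcher's fuzzy argument.
    matcher = Matcher(en_vocab, fuzzy=80)
    matcher.add("GoogleNow", [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]])
    doc = Doc(matcher.vocab, words=["I", "like", "Goggle", "Now"])
    assert matcher(doc) == [(doc.vocab.strings["GoogleNow"], 2, 4)]
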
@@ -916,9 +940,9 @@ class _SetPredicate: else: value = set(get_string_id(v) for v in value) if self.predicate == "IN": - return value in self.value # handle fuzzy + return value in self.value # TODO: handle fuzzy elif self.predicate == "NOT_IN": - return value not in self.value # handle fuzzy + return value not in self.value # TODO: handle fuzzy elif self.predicate == "IS_SUBSET": return value <= self.value elif self.predicate == "IS_SUPERSET": @@ -933,7 +957,7 @@ class _SetPredicate: class _ComparisonPredicate: operators = ("==", "!=", ">=", "<=", ">", "<") - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): self.i = i self.attr = attr self.value = value diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 595488bf4..798222cc3 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -6,15 +6,16 @@ from spacy.tokens import Doc, Token, Span from ..doc.test_underscore import clean_underscore # noqa: F401 +matcher_rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], +} + @pytest.fixture def matcher(en_vocab): - rules = { - "JS": [[{"ORTH": "JavaScript"}]], - "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": "java"}]], - } matcher = Matcher(en_vocab) - for key, patterns in rules.items(): + for key, patterns in matcher_rules.items(): matcher.add(key, patterns) return matcher @@ -118,57 +119,58 @@ def test_matcher_match_multi(matcher): ] +# fuzzy matches on specific attributes + def test_matcher_match_fuzz_all(en_vocab): - rules = { - "JS": [[{"ORTH": "JavaScript"}]], - "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": "java"}]], - } - matcher = Matcher(en_vocab, fuzzy=80) - for key, patterns in rules.items(): + matcher = Matcher(en_vocab, fuzzy=80, fuzzy_attrs=["ORTH", "LOWER"]) + for key, patterns in matcher_rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"] + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 2, 4), (doc.vocab.strings["Java"], 5, 6), + (doc.vocab.strings["JS"], 8, 9), + ] + +def test_matcher_match_fuzz_all_lower(en_vocab): + matcher = Matcher(en_vocab, fuzzy=80, fuzzy_attrs=["LOWER"]) + for key, patterns in matcher_rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["Java"], 5, 6), ] def test_matcher_match_fuzz_some(en_vocab): - rules = { - "JS": [[{"ORTH": "JavaScript"}]], - "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": "java"}]], - } - matcher = Matcher(en_vocab, fuzzy=85) - for key, patterns in rules.items(): + matcher = Matcher(en_vocab, fuzzy=85, fuzzy_attrs=["ORTH", "LOWER"]) + for key, patterns in matcher_rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"] + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["Java"], 5, 6), ] def test_matcher_match_fuzz_none(en_vocab): - rules = { - "JS": 
[[{"ORTH": "JavaScript"}]], - "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": "java"}]], - } - matcher = Matcher(en_vocab, fuzzy=90) - for key, patterns in rules.items(): + matcher = Matcher(en_vocab, fuzzy=90, fuzzy_attrs=["ORTH", "LOWER"]) + for key, patterns in matcher_rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"] + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [] -def test_matcher_match_fuzz_pred(en_vocab): +# fuzzy matches on specific tokens + +def test_matcher_match_fuzz_pred1(en_vocab): rules = { - "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]], + "JS": [[{"ORTH": "JavaScript"}]], "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], "Java": [[{"LOWER": "java"}]], } @@ -176,10 +178,45 @@ def test_matcher_match_fuzz_pred(en_vocab): for key, patterns in rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Now", "and", "JavaScrpt", "best"] + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [] + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 2, 4), + ] +def test_matcher_match_fuzz_pred2(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": {"FUZZY": "java"}}]], + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["Java"], 5, 6), + ] + +def test_matcher_match_fuzz_preds(en_vocab): + rules = { + "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]], + "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": {"FUZZY": "java"}}]], + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 2, 4), + (doc.vocab.strings["Java"], 5, 6), + (doc.vocab.strings["JS"], 8, 9), + ] def test_matcher_empty_dict(en_vocab): From c017de997a795445850e96bd2a413b2e27ea3c15 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Fri, 26 Aug 2022 01:30:44 +0200 Subject: [PATCH 07/15] fix type properly --- spacy/pipeline/entityruler.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index e5852e4e8..1e816ab16 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -26,9 +26,9 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] "phrase_matcher_attr": None, "validate": False, "overwrite_ents": False, + "fuzzy": 0.0, "ent_id_sep": DEFAULT_ENT_ID_SEP, "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, - "fuzzy": None, }, default_score_weights={ "ents_f": 1.0, @@ -43,9 +43,9 @@ def make_entity_ruler( phrase_matcher_attr: Optional[Union[int, str]], validate: bool, overwrite_ents: bool, + fuzzy: float, ent_id_sep: str, scorer: Optional[Callable], - fuzzy: Optional[float], ): return EntityRuler( nlp, @@ -53,9 +53,9 @@ def make_entity_ruler( phrase_matcher_attr=phrase_matcher_attr, validate=validate, 
overwrite_ents=overwrite_ents, + fuzzy=fuzzy, ent_id_sep=ent_id_sep, scorer=scorer, - fuzzy=fuzzy, ) @@ -87,10 +87,10 @@ class EntityRuler(Pipe): phrase_matcher_attr: Optional[Union[int, str]] = None, validate: bool = False, overwrite_ents: bool = False, + fuzzy: float = 0, ent_id_sep: str = DEFAULT_ENT_ID_SEP, patterns: Optional[List[PatternType]] = None, scorer: Optional[Callable] = entity_ruler_score, - fuzzy: float = None, ) -> None: """Initialize the entity ruler. If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` @@ -122,7 +122,8 @@ class EntityRuler(Pipe): self.token_patterns = defaultdict(list) # type: ignore self.phrase_patterns = defaultdict(list) # type: ignore self._validate = validate - self.matcher = Matcher(nlp.vocab, validate=validate, fuzzy=fuzzy) + self.fuzzy = fuzzy + self.matcher = Matcher(nlp.vocab, validate=validate, fuzzy=self.fuzzy) self.phrase_matcher_attr = phrase_matcher_attr self.phrase_matcher = PhraseMatcher( nlp.vocab, attr=self.phrase_matcher_attr, validate=validate @@ -132,7 +133,6 @@ class EntityRuler(Pipe): if patterns is not None: self.add_patterns(patterns) self.scorer = scorer - self.fuzzy = fuzzy def __len__(self) -> int: """The number of all patterns added to the entity ruler.""" From c03394810b6d46dc13e8c68ba962f75f45aeeb9c Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Fri, 26 Aug 2022 02:06:05 +0200 Subject: [PATCH 08/15] tidying --- pyproject.toml | 2 ++ setup.cfg | 1 + spacy/matcher/matcher.pyx | 7 +++---- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 317c5fdbe..37d041b6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,5 +8,7 @@ requires = [ "thinc>=8.1.0,<8.2.0", "pathy", "numpy>=1.15.0", + "rapidfuzz>=2.4.0,<3.0.0", + "rapidfuzz_capi>=1.0.5,<2.0.0", ] build-backend = "setuptools.build_meta" diff --git a/setup.cfg b/setup.cfg index 658683df7..91c73cb5c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,6 +34,7 @@ python_requires = >=3.6 setup_requires = cython>=0.25,<3.0 numpy>=1.15.0 + rapidfuzz_capi>=1.0.5,<2.0.0 # We also need our Cython packages here to compile against cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 17d965eaa..4a5468b98 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -20,7 +20,6 @@ from ..tokens.span cimport Span from ..tokens.token cimport Token from ..tokens.morphanalysis cimport MorphAnalysis from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB -from ..attrs cimport LOWER, NORM from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings @@ -258,8 +257,7 @@ cdef class Matcher: matches = [] else: matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, - extensions=self._extensions, predicates=self._extra_predicates, - with_alignments=with_alignments, + extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments, fuzzy=self.fuzzy, fuzzy_attrs=self.fuzzy_attrs) final_matches = [] pairs_by_id = {} @@ -341,7 +339,8 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher -cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0, float fuzzy=0, list fuzzy_attrs=[]): +cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint 
with_alignments=0, + float fuzzy=0, list fuzzy_attrs=[]): """Find matches in a doc, with a compiled array of patterns. Matches are returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0) From b189f25aaae1dc25b489adbe4e2a7127a49664c2 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 10:58:11 +0200 Subject: [PATCH 09/15] remove unnecessary dependency --- pyproject.toml | 1 - spacy/matcher/matcher.pyx | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 37d041b6d..b01055bdf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ requires = [ "thinc>=8.1.0,<8.2.0", "pathy", "numpy>=1.15.0", - "rapidfuzz>=2.4.0,<3.0.0", "rapidfuzz_capi>=1.0.5,<2.0.0", ] build-backend = "setuptools.build_meta" diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 4a5468b98..3badec56c 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -708,7 +708,7 @@ cdef int8_t get_is_match(PatternStateC state, for attr in spec.attrs[:spec.nr_attr]: token_attr_value = get_token_attr_for_matcher(token.c, attr.attr) if token_attr_value != attr.value: - if fuzzy != 0: # and n_fuzzy_attrs > 0: + if fuzzy: fuzzy_match = False for i in range(n_fuzzy_attrs): if attr.attr == fuzzy_attrs[i]: From 9bdccf94e5d0c46a78fa26a978dcd88867d1ef89 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 10:58:50 +0200 Subject: [PATCH 10/15] handle fuzzy sets --- spacy/matcher/matcher.pyx | 79 ++++++++++++++++++------- spacy/schemas.py | 2 +- spacy/tests/matcher/test_matcher_api.py | 44 ++++++++++++++ 3 files changed, 103 insertions(+), 22 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 3badec56c..f6a09b9f6 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -816,7 +816,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, ops = _get_operators(spec) attr_values = _get_attr_values(spec, string_store) extensions = _get_extensions(spec, string_store, extensions_table) - predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy) + predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs) for op in ops: tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx)) return tokens @@ -915,9 +915,14 @@ class _SetPredicate: # normalize morph strings self.value = set(self.vocab.morphology.add(v) for v in value) else: - self.value = set(get_string_id(v) for v in value) + if fuzzy: + # add to string store + self.value = set(self.vocab.strings.add(v) for v in value) + else: + self.value = set(get_string_id(v) for v in value) self.predicate = predicate self.is_extension = is_extension + self.fuzzy = fuzzy self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -939,9 +944,23 @@ class _SetPredicate: else: value = set(get_string_id(v) for v in value) if self.predicate == "IN": - return value in self.value # TODO: handle fuzzy + if value in self.value: + return True + elif self.fuzzy: + for v in self.value: + if fuzz_cpp.ratio(self.vocab.strings[value], + self.vocab.strings[v]) >= self.fuzzy: + return True + return False elif self.predicate == "NOT_IN": - return value not in self.value # TODO: handle fuzzy + if value in self.value: + return False + elif self.fuzzy: + for v in 
self.value: + if fuzz_cpp.ratio(self.vocab.strings[value], + self.vocab.strings[v]) >= self.fuzzy: + return False + return True elif self.predicate == "IS_SUBSET": return value <= self.value elif self.predicate == "IS_SUPERSET": @@ -985,7 +1004,7 @@ class _ComparisonPredicate: return value < self.value -def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy): +def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): predicate_types = { "FUZZY": _FuzzyPredicate, "REGEX": _RegexPredicate, @@ -1016,23 +1035,41 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy): if attr.upper() == "TEXT": attr = "ORTH" attr = IDS.get(attr.upper()) + if isinstance(value, dict): - processed = False - value_with_upper_keys = {k.upper(): v for k, v in value.items()} - for type_, cls in predicate_types.items(): - if type_ in value_with_upper_keys: - predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab, fuzzy=fuzzy) - # Don't create a redundant predicates. - # This helps with efficiency, as we're caching the results. - if predicate.key in seen_predicates: - output.append(seen_predicates[predicate.key]) - else: - extra_predicates.append(predicate) - output.append(predicate.i) - seen_predicates[predicate.key] = predicate.i - processed = True - if not processed: - warnings.warn(Warnings.W035.format(pattern=value)) + output.extend(_get_extra_predicates_helper(attr, value, vocab, fuzzy, fuzzy_attrs, + predicate_types, + extra_predicates, seen_predicates)) + return output + + +def _get_extra_predicates_helper(attr, value, vocab, fuzzy, fuzzy_attrs, + predicate_types, extra_predicates, seen_predicates): + output = [] + processed = False #TODO: not working as intended + value_with_upper_keys = {k.upper(): v for k, v in value.items()} + for type_, cls in predicate_types.items(): #TODO: switch this loop + if type_ in value_with_upper_keys: + if type_ == 'FUZZY' and isinstance(value_with_upper_keys[type_], dict): + # add predicates inside fuzzy operator + output.extend(_get_extra_predicates_helper(attr, value_with_upper_keys[type_], + vocab, fuzzy, fuzzy_attrs, + predicate_types, + extra_predicates, seen_predicates)) + else: + predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, + vocab=vocab, fuzzy=fuzzy)###??? if attr in fuzzy_attrs else 0) + # Don't create a redundant predicates. + # This helps with efficiency, as we're caching the results. 
+ if predicate.key in seen_predicates: + output.append(seen_predicates[predicate.key]) + else: + extra_predicates.append(predicate) + output.append(predicate.i) + seen_predicates[predicate.key] = predicate.i + processed = True + if not processed: + warnings.warn(Warnings.W035.format(pattern=value)) return output diff --git a/spacy/schemas.py b/spacy/schemas.py index 2677378d6..882815dfa 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -157,7 +157,7 @@ def validate_token_pattern(obj: list) -> List[str]: class TokenPatternString(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") - FUZZY: Optional[StrictStr] = Field(None, alias="fuzzy") + FUZZY: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy") IN: Optional[List[StrictStr]] = Field(None, alias="in") NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset") diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 798222cc3..22eb18245 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -218,6 +218,50 @@ def test_matcher_match_fuzz_preds(en_vocab): (doc.vocab.strings["JS"], 8, 9), ] +def test_matcher_match_fuzz_pred_in_set(en_vocab): + rules = { + "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "No"]}}, "OP": "+"}]] + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy="LONGEST") + + words = ["I", "like", "Goggle", "Now"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 2, 4), + ] + +def test_matcher_match_fuzz_pred_not_in_set(en_vocab): + rules = { + "GoogleNow": [[{"ORTH": {"FUZZY": {"NOT_IN": ["Google", "No"]}}, "OP": "+"}]], + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy="LONGEST") + + words = ["I", "like", "Goggle", "Now"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 0, 2), + ] + +def test_matcher_match_fuzz_pred_in_set_with_exclude(en_vocab): + rules = { + "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "No"]}, + "NOT_IN": ["Goggle"]}, + "OP": "+"}]] + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy="LONGEST") + + words = ["I", "like", "Goggle", "Now"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 3, 4), + ] + def test_matcher_empty_dict(en_vocab): """Test matcher allows empty token specs, meaning match on any token.""" From ecebb5b145874568e3c62263487f8c68af1ce8d7 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 12:49:14 +0200 Subject: [PATCH 11/15] simplify fuzzy sets --- spacy/matcher/matcher.pyx | 61 ++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index f6a09b9f6..7a098aac2 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -880,7 +880,7 @@ class _FuzzyPredicate: value = token._.get(self.attr) else: value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] - return bool(fuzz_cpp.ratio(self.value, value) >= self.fuzzy) + return bool(self.fuzzy and fuzz_cpp.ratio(self.value, value) >= self.fuzzy) class _RegexPredicate: @@ -1006,7 +1006,6 @@ class _ComparisonPredicate: def 
_get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): predicate_types = { - "FUZZY": _FuzzyPredicate, "REGEX": _RegexPredicate, "IN": _SetPredicate, "NOT_IN": _SetPredicate, @@ -1019,6 +1018,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): "<=": _ComparisonPredicate, ">": _ComparisonPredicate, "<": _ComparisonPredicate, + "FUZZY": _FuzzyPredicate, } seen_predicates = {pred.key: pred.i for pred in extra_predicates} output = [] @@ -1037,39 +1037,40 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): attr = IDS.get(attr.upper()) if isinstance(value, dict): - output.extend(_get_extra_predicates_helper(attr, value, vocab, fuzzy, fuzzy_attrs, - predicate_types, - extra_predicates, seen_predicates)) + fuzzy_match = attr in fuzzy_attrs # fuzzy match enabled for this attr + output.extend(_get_extra_predicates_dict(attr, value, vocab, fuzzy, fuzzy_match, + predicate_types, + extra_predicates, seen_predicates)) return output -def _get_extra_predicates_helper(attr, value, vocab, fuzzy, fuzzy_attrs, - predicate_types, extra_predicates, seen_predicates): +def _get_extra_predicates_dict(attr, value_dict, vocab, fuzzy, fuzzy_match, + predicate_types, extra_predicates, seen_predicates): output = [] - processed = False #TODO: not working as intended - value_with_upper_keys = {k.upper(): v for k, v in value.items()} - for type_, cls in predicate_types.items(): #TODO: switch this loop - if type_ in value_with_upper_keys: - if type_ == 'FUZZY' and isinstance(value_with_upper_keys[type_], dict): + for type_, value in value_dict.items(): + if type_ == 'FUZZY': + fuzzy_match = True # explicit fuzzy match + if isinstance(value, dict): # add predicates inside fuzzy operator - output.extend(_get_extra_predicates_helper(attr, value_with_upper_keys[type_], - vocab, fuzzy, fuzzy_attrs, - predicate_types, - extra_predicates, seen_predicates)) - else: - predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, - vocab=vocab, fuzzy=fuzzy)###??? if attr in fuzzy_attrs else 0) - # Don't create a redundant predicates. - # This helps with efficiency, as we're caching the results. - if predicate.key in seen_predicates: - output.append(seen_predicates[predicate.key]) - else: - extra_predicates.append(predicate) - output.append(predicate.i) - seen_predicates[predicate.key] = predicate.i - processed = True - if not processed: - warnings.warn(Warnings.W035.format(pattern=value)) + output.extend(_get_extra_predicates_dict(attr, value, vocab, fuzzy, fuzzy_match, + predicate_types, + extra_predicates, seen_predicates)) + continue + cls = predicate_types.get(type_.upper()) + if cls is None: + warnings.warn(Warnings.W035.format(pattern=value_dict)) + # ignore unrecongized predicate type + continue + predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab, + fuzzy=fuzzy if fuzzy_match else 0) + # Don't create a redundant predicates. + # This helps with efficiency, as we're caching the results. 
+ if predicate.key in seen_predicates: + output.append(seen_predicates[predicate.key]) + else: + extra_predicates.append(predicate) + output.append(predicate.i) + seen_predicates[predicate.key] = predicate.i return output From ecd0455acdadb0aface60825b5f3f301cf096cf3 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 15:49:15 +0200 Subject: [PATCH 12/15] case fix --- spacy/matcher/matcher.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 7a098aac2..54481258b 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -1048,6 +1048,7 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, fuzzy, fuzzy_match, predicate_types, extra_predicates, seen_predicates): output = [] for type_, value in value_dict.items(): + type_ = type_.upper() if type_ == 'FUZZY': fuzzy_match = True # explicit fuzzy match if isinstance(value, dict): @@ -1056,10 +1057,10 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, fuzzy, fuzzy_match, predicate_types, extra_predicates, seen_predicates)) continue - cls = predicate_types.get(type_.upper()) + cls = predicate_types.get(type_) if cls is None: warnings.warn(Warnings.W035.format(pattern=value_dict)) - # ignore unrecongized predicate type + # ignore unrecognized predicate type continue predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab, fuzzy=fuzzy if fuzzy_match else 0) From 43948f731b44b18379a0cceb41f64877e5c9cd34 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 18:10:42 +0200 Subject: [PATCH 13/15] switch to FUZZYn predicates use Levenshtein distance. remove fuzzy param. remove rapidfuzz_capi. --- pyproject.toml | 1 - requirements.txt | 1 - setup.cfg | 2 - spacy/matcher/matcher.pxd | 2 - spacy/matcher/matcher.pyi | 3 +- spacy/matcher/matcher.pyx | 127 +++++++++--------------- spacy/pipeline/entityruler.py | 9 +- spacy/schemas.py | 12 ++- spacy/tests/matcher/test_matcher_api.py | 112 ++++++--------------- 9 files changed, 93 insertions(+), 176 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b01055bdf..317c5fdbe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,5 @@ requires = [ "thinc>=8.1.0,<8.2.0", "pathy", "numpy>=1.15.0", - "rapidfuzz_capi>=1.0.5,<2.0.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 47dcede1f..38b4cbf0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,7 +19,6 @@ pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 langcodes>=3.2.0,<4.0.0 rapidfuzz>=2.4.0,<3.0.0 -rapidfuzz_capi>=1.0.5,<2.0.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index 91c73cb5c..a149b1f7e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,7 +34,6 @@ python_requires = >=3.6 setup_requires = cython>=0.25,<3.0 numpy>=1.15.0 - rapidfuzz_capi>=1.0.5,<2.0.0 # We also need our Cython packages here to compile against cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 @@ -60,7 +59,6 @@ install_requires = pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 rapidfuzz>=2.4.0,<3.0.0 - rapidfuzz_capi>=1.0.5,<2.0.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index 98041e199..455f978cc 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -71,8 +71,6 @@ cdef class Matcher: cdef vector[TokenPatternC*] patterns cdef readonly Vocab vocab cdef public object validate - cdef public object fuzzy - cdef public object fuzzy_attrs 
cdef public object _patterns cdef public object _callbacks cdef public object _filter diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index 676be6a45..390629ff8 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -5,8 +5,7 @@ from ..vocab import Vocab from ..tokens import Doc, Span class Matcher: - def __init__(self, vocab: Vocab, validate: bool = ..., - fuzzy: float = ..., fuzzy_attrs: list = ...) -> None: ... + def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ... def __reduce__(self) -> Any: ... def __len__(self) -> int: ... def __contains__(self, key: str) -> bool: ... diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 54481258b..cb6152ed0 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -10,7 +10,7 @@ from murmurhash.mrmr cimport hash64 import re import srsly import warnings -from rapidfuzz import fuzz_cpp +from rapidfuzz.distance import Levenshtein from ..typedefs cimport attr_t from ..structs cimport TokenC @@ -37,7 +37,7 @@ cdef class Matcher: USAGE: https://spacy.io/usage/rule-based-matching """ - def __init__(self, vocab, validate=True, fuzzy=None, fuzzy_attrs=None): + def __init__(self, vocab, validate=True): """Create the Matcher. vocab (Vocab): The vocabulary object, which must be shared with the @@ -52,8 +52,6 @@ cdef class Matcher: self.vocab = vocab self.mem = Pool() self.validate = validate - self.fuzzy = fuzzy if fuzzy is not None else 0 - self.fuzzy_attrs = [IDS.get(attr) for attr in fuzzy_attrs] if fuzzy_attrs else [] def __reduce__(self): data = (self.vocab, self._patterns, self._callbacks) @@ -131,8 +129,7 @@ cdef class Matcher: for pattern in patterns: try: specs = _preprocess_pattern(pattern, self.vocab, - self._extensions, self._extra_predicates, - self.fuzzy, self.fuzzy_attrs) + self._extensions, self._extra_predicates) self.patterns.push_back(init_pattern(self.mem, key, specs)) for spec in specs: for attr, _ in spec[1]: @@ -257,8 +254,7 @@ cdef class Matcher: matches = [] else: matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, - extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments, - fuzzy=self.fuzzy, fuzzy_attrs=self.fuzzy_attrs) + extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments) final_matches = [] pairs_by_id = {} # For each key, either add all matches, or only the filtered, @@ -339,8 +335,7 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher -cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0, - float fuzzy=0, list fuzzy_attrs=[]): +cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0): """Find matches in a doc, with a compiled array of patterns. 
Matches are returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0) @@ -359,8 +354,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e cdef PatternStateC state cdef int i, j, nr_extra_attr cdef Pool mem = Pool() - cdef int8_t* fuzzy_attrs_array - cdef int n_fuzzy_attrs = len(fuzzy_attrs) output = [] if length == 0: @@ -380,10 +373,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e if isinstance(value, str): value = token.vocab.strings[value] extra_attr_values[i * nr_extra_attr + index] = value - if n_fuzzy_attrs > 0: - fuzzy_attrs_array = mem.alloc(n_fuzzy_attrs, sizeof(int8_t)) - for i in range(n_fuzzy_attrs): - fuzzy_attrs_array[i] = fuzzy_attrs[i] # Main loop cdef int nr_predicate = len(predicates) for i in range(length): @@ -392,8 +381,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e if with_alignments != 0: align_states.resize(states.size()) transition_states(states, matches, align_states, align_matches, predicate_cache, - doclike[i], extra_attr_values, predicates, with_alignments, - fuzzy, fuzzy_attrs_array, n_fuzzy_attrs) + doclike[i], extra_attr_values, predicates, with_alignments) extra_attr_values += nr_extra_attr predicate_cache += len(predicates) # Handle matches that end in 0-width patterns @@ -422,8 +410,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches, int8_t* cached_py_predicates, - Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments, - float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) except *: + Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *: cdef int q = 0 cdef vector[PatternStateC] new_states cdef vector[vector[MatchAlignmentC]] align_new_states @@ -433,8 +420,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match update_predicate_cache(cached_py_predicates, states[i].pattern, token, py_predicates) action = get_action(states[i], token, extra_attrs, - cached_py_predicates, - fuzzy, fuzzy_attrs, n_fuzzy_attrs) + cached_py_predicates) if action == REJECT: continue # Keep only a subset of states (the active ones). Index q is the @@ -471,8 +457,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match update_predicate_cache(cached_py_predicates, states[q].pattern, token, py_predicates) action = get_action(states[q], token, extra_attrs, - cached_py_predicates, - fuzzy, fuzzy_attrs, n_fuzzy_attrs) + cached_py_predicates) # Update alignment before the transition of current state if with_alignments != 0: align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length)) @@ -584,8 +569,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states, cdef action_t get_action(PatternStateC state, Token token, const attr_t* extra_attrs, - const int8_t* predicate_matches, - float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) nogil: + const int8_t* predicate_matches) nogil: """We need to consider: a) Does the token match the specification? [Yes, No] b) What's the quantifier? [1, 0+, ?] 
@@ -644,8 +628,7 @@ cdef action_t get_action(PatternStateC state, Problem: If a quantifier is matching, we're adding a lot of open partials """ cdef int8_t is_match - is_match = get_is_match(state, token, extra_attrs, predicate_matches, - fuzzy, fuzzy_attrs, n_fuzzy_attrs) + is_match = get_is_match(state, token, extra_attrs, predicate_matches) quantifier = get_quantifier(state) is_final = get_is_final(state) if quantifier == ZERO: @@ -698,8 +681,7 @@ cdef action_t get_action(PatternStateC state, cdef int8_t get_is_match(PatternStateC state, Token token, const attr_t* extra_attrs, - const int8_t* predicate_matches, - float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) nogil: + const int8_t* predicate_matches) nogil: for i in range(state.pattern.nr_py): if predicate_matches[state.pattern.py_predicates[i]] == -1: return 0 @@ -708,22 +690,9 @@ cdef int8_t get_is_match(PatternStateC state, for attr in spec.attrs[:spec.nr_attr]: token_attr_value = get_token_attr_for_matcher(token.c, attr.attr) if token_attr_value != attr.value: - if fuzzy: - fuzzy_match = False - for i in range(n_fuzzy_attrs): - if attr.attr == fuzzy_attrs[i]: - with gil: - if fuzz_cpp.ratio(token.vocab.strings[token_attr_value], - token.vocab.strings[attr.value]) >= fuzzy: - fuzzy_match = True - break - if not fuzzy_match: - return 0 - else: - return 0 + return 0 for i in range(spec.nr_extra_attr): if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]: - # TODO: fuzzy match return 0 return True @@ -788,8 +757,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: return id_attr.value -def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, - fuzzy, fuzzy_attrs): +def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): """This function interprets the pattern, converting the various bits of syntactic sugar before we compile it into a struct with init_pattern. @@ -816,7 +784,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, ops = _get_operators(spec) attr_values = _get_attr_values(spec, string_store) extensions = _get_extensions(spec, string_store, extensions_table) - predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs) + predicates = _get_extra_predicates(spec, extra_predicates, vocab) for op in ops: tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx)) return tokens @@ -862,31 +830,31 @@ def _get_attr_values(spec, string_store): # extensions to the matcher introduced in #3173. 
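# NOTE (illustrative sketch, not part of this diff): the _FuzzyPredicate class
# added in the hunk just below derives its edit-distance bound from the operator
# name ("FUZZY1".."FUZZY5") and compares strings with rapidfuzz's Levenshtein
# distance. A standalone approximation of that logic, assuming rapidfuzz>=2.4.0
# as required by this patch; fuzzy_token_match is a hypothetical helper, not a
# spaCy function:
from rapidfuzz.distance import Levenshtein

def fuzzy_token_match(operator: str, pattern_value: str, token_text: str) -> bool:
    max_distance = int(operator[len("FUZZY"):])  # e.g. "FUZZY2" -> 2
    return Levenshtein.distance(pattern_value, token_text) <= max_distance

fuzzy_token_match("FUZZY2", "JavaScript", "JvvaScrpt")  # True: distance == 2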
class _FuzzyPredicate: - operators = ("FUZZY",) + operators = ("FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5") - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None): self.i = i self.attr = attr self.value = value self.predicate = predicate self.is_extension = is_extension - self.fuzzy = fuzzy self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) + self.distance = int(self.predicate[len('FUZZY'):]) # number after prefix def __call__(self, Token token): if self.is_extension: value = token._.get(self.attr) else: value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] - return bool(self.fuzzy and fuzz_cpp.ratio(self.value, value) >= self.fuzzy) + return bool(Levenshtein.distance(self.value, value) <= self.distance) class _RegexPredicate: operators = ("REGEX",) - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None): self.i = i self.attr = attr self.value = re.compile(value) @@ -907,22 +875,22 @@ class _RegexPredicate: class _SetPredicate: operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS") - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None): self.i = i self.attr = attr self.vocab = vocab + self.distance = distance if self.attr == MORPH: # normalize morph strings self.value = set(self.vocab.morphology.add(v) for v in value) else: - if fuzzy: + if self.distance: # add to string store self.value = set(self.vocab.strings.add(v) for v in value) else: self.value = set(get_string_id(v) for v in value) self.predicate = predicate self.is_extension = is_extension - self.fuzzy = fuzzy self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -946,19 +914,19 @@ class _SetPredicate: if self.predicate == "IN": if value in self.value: return True - elif self.fuzzy: + elif self.distance: for v in self.value: - if fuzz_cpp.ratio(self.vocab.strings[value], - self.vocab.strings[v]) >= self.fuzzy: + if Levenshtein.distance(self.vocab.strings[value], + self.vocab.strings[v]) <= self.distance: return True return False elif self.predicate == "NOT_IN": if value in self.value: return False - elif self.fuzzy: + elif self.distance: for v in self.value: - if fuzz_cpp.ratio(self.vocab.strings[value], - self.vocab.strings[v]) >= self.fuzzy: + if Levenshtein.distance(self.vocab.strings[value], + self.vocab.strings[v]) <= self.distance: return False return True elif self.predicate == "IS_SUBSET": @@ -975,7 +943,7 @@ class _SetPredicate: class _ComparisonPredicate: operators = ("==", "!=", ">=", "<=", ">", "<") - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None): self.i = i self.attr = attr self.value = value @@ -1004,7 +972,7 @@ class _ComparisonPredicate: return value < self.value -def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): +def 
_get_extra_predicates(spec, extra_predicates, vocab): predicate_types = { "REGEX": _RegexPredicate, "IN": _SetPredicate, @@ -1018,7 +986,11 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): "<=": _ComparisonPredicate, ">": _ComparisonPredicate, "<": _ComparisonPredicate, - "FUZZY": _FuzzyPredicate, + "FUZZY1": _FuzzyPredicate, + "FUZZY2": _FuzzyPredicate, + "FUZZY3": _FuzzyPredicate, + "FUZZY4": _FuzzyPredicate, + "FUZZY5": _FuzzyPredicate, } seen_predicates = {pred.key: pred.i for pred in extra_predicates} output = [] @@ -1037,33 +1009,30 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): attr = IDS.get(attr.upper()) if isinstance(value, dict): - fuzzy_match = attr in fuzzy_attrs # fuzzy match enabled for this attr - output.extend(_get_extra_predicates_dict(attr, value, vocab, fuzzy, fuzzy_match, - predicate_types, + output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types, extra_predicates, seen_predicates)) return output -def _get_extra_predicates_dict(attr, value_dict, vocab, fuzzy, fuzzy_match, - predicate_types, extra_predicates, seen_predicates): +def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types, + extra_predicates, seen_predicates, distance=None): output = [] for type_, value in value_dict.items(): type_ = type_.upper() - if type_ == 'FUZZY': - fuzzy_match = True # explicit fuzzy match - if isinstance(value, dict): - # add predicates inside fuzzy operator - output.extend(_get_extra_predicates_dict(attr, value, vocab, fuzzy, fuzzy_match, - predicate_types, - extra_predicates, seen_predicates)) - continue cls = predicate_types.get(type_) if cls is None: warnings.warn(Warnings.W035.format(pattern=value_dict)) # ignore unrecognized predicate type continue - predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab, - fuzzy=fuzzy if fuzzy_match else 0) + elif cls == _FuzzyPredicate: + distance = int(type_[len("FUZZY"):]) # number after prefix + if isinstance(value, dict): + # add predicates inside fuzzy operator + output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types, + extra_predicates, seen_predicates, + distance=distance)) + continue + predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab, distance=distance) # Don't create a redundant predicates. # This helps with efficiency, as we're caching the results. 
if predicate.key in seen_predicates: diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 1e816ab16..3cb1ca676 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -26,7 +26,6 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] "phrase_matcher_attr": None, "validate": False, "overwrite_ents": False, - "fuzzy": 0.0, "ent_id_sep": DEFAULT_ENT_ID_SEP, "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, }, @@ -43,7 +42,6 @@ def make_entity_ruler( phrase_matcher_attr: Optional[Union[int, str]], validate: bool, overwrite_ents: bool, - fuzzy: float, ent_id_sep: str, scorer: Optional[Callable], ): @@ -53,7 +51,6 @@ def make_entity_ruler( phrase_matcher_attr=phrase_matcher_attr, validate=validate, overwrite_ents=overwrite_ents, - fuzzy=fuzzy, ent_id_sep=ent_id_sep, scorer=scorer, ) @@ -87,7 +84,6 @@ class EntityRuler(Pipe): phrase_matcher_attr: Optional[Union[int, str]] = None, validate: bool = False, overwrite_ents: bool = False, - fuzzy: float = 0, ent_id_sep: str = DEFAULT_ENT_ID_SEP, patterns: Optional[List[PatternType]] = None, scorer: Optional[Callable] = entity_ruler_score, @@ -122,8 +118,7 @@ class EntityRuler(Pipe): self.token_patterns = defaultdict(list) # type: ignore self.phrase_patterns = defaultdict(list) # type: ignore self._validate = validate - self.fuzzy = fuzzy - self.matcher = Matcher(nlp.vocab, validate=validate, fuzzy=self.fuzzy) + self.matcher = Matcher(nlp.vocab, validate=validate) self.phrase_matcher_attr = phrase_matcher_attr self.phrase_matcher = PhraseMatcher( nlp.vocab, attr=self.phrase_matcher_attr, validate=validate @@ -343,7 +338,7 @@ class EntityRuler(Pipe): self.token_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list) self._ent_ids = defaultdict(tuple) - self.matcher = Matcher(self.nlp.vocab, validate=self._validate, fuzzy=self.fuzzy) + self.matcher = Matcher(self.nlp.vocab, validate=self._validate) self.phrase_matcher = PhraseMatcher( self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate ) diff --git a/spacy/schemas.py b/spacy/schemas.py index 882815dfa..a9012d7d9 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -157,12 +157,16 @@ def validate_token_pattern(obj: list) -> List[str]: class TokenPatternString(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") - FUZZY: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy") IN: Optional[List[StrictStr]] = Field(None, alias="in") NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset") IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset") INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects") + FUZZY1: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy1") + FUZZY2: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy2") + FUZZY3: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy3") + FUZZY4: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy4") + FUZZY5: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy5") class Config: extra = "forbid" @@ -177,7 +181,6 @@ class TokenPatternString(BaseModel): class TokenPatternNumber(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") - FUZZY: Optional[StrictStr] = Field(None, alias="fuzzy") IN: Optional[List[StrictInt]] = Field(None, alias="in") NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in") IS_SUBSET: Optional[List[StrictInt]] 
= Field(None, alias="is_subset") @@ -189,6 +192,11 @@ class TokenPatternNumber(BaseModel): LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=") GT: Union[StrictInt, StrictFloat] = Field(None, alias=">") LT: Union[StrictInt, StrictFloat] = Field(None, alias="<") + FUZZY1: Optional[StrictStr] = Field(None, alias="fuzzy1") + FUZZY2: Optional[StrictStr] = Field(None, alias="fuzzy2") + FUZZY3: Optional[StrictStr] = Field(None, alias="fuzzy3") + FUZZY4: Optional[StrictStr] = Field(None, alias="fuzzy4") + FUZZY5: Optional[StrictStr] = Field(None, alias="fuzzy5") class Config: extra = "forbid" diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 22eb18245..1b6dda273 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -6,16 +6,15 @@ from spacy.tokens import Doc, Token, Span from ..doc.test_underscore import clean_underscore # noqa: F401 -matcher_rules = { - "JS": [[{"ORTH": "JavaScript"}]], - "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": "java"}]], -} - @pytest.fixture def matcher(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], + } matcher = Matcher(en_vocab) - for key, patterns in matcher_rules.items(): + for key, patterns in rules.items(): matcher.add(key, patterns) return matcher @@ -119,98 +118,51 @@ def test_matcher_match_multi(matcher): ] -# fuzzy matches on specific attributes - -def test_matcher_match_fuzz_all(en_vocab): - matcher = Matcher(en_vocab, fuzzy=80, fuzzy_attrs=["ORTH", "LOWER"]) - for key, patterns in matcher_rules.items(): - matcher.add(key, patterns) - - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] - doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [ - (doc.vocab.strings["GoogleNow"], 2, 4), - (doc.vocab.strings["Java"], 5, 6), - (doc.vocab.strings["JS"], 8, 9), - ] - -def test_matcher_match_fuzz_all_lower(en_vocab): - matcher = Matcher(en_vocab, fuzzy=80, fuzzy_attrs=["LOWER"]) - for key, patterns in matcher_rules.items(): - matcher.add(key, patterns) - - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] - doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [ - (doc.vocab.strings["Java"], 5, 6), - ] - -def test_matcher_match_fuzz_some(en_vocab): - matcher = Matcher(en_vocab, fuzzy=85, fuzzy_attrs=["ORTH", "LOWER"]) - for key, patterns in matcher_rules.items(): - matcher.add(key, patterns) - - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] - doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [ - (doc.vocab.strings["Java"], 5, 6), - ] - -def test_matcher_match_fuzz_none(en_vocab): - matcher = Matcher(en_vocab, fuzzy=90, fuzzy_attrs=["ORTH", "LOWER"]) - for key, patterns in matcher_rules.items(): - matcher.add(key, patterns) - - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] - doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [] - - # fuzzy matches on specific tokens -def test_matcher_match_fuzz_pred1(en_vocab): +def test_matcher_match_fuzzy1(en_vocab): rules = { "JS": [[{"ORTH": "JavaScript"}]], - "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], + "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]], "Java": [[{"LOWER": "java"}]], } - matcher = Matcher(en_vocab, fuzzy=80) + matcher = Matcher(en_vocab) for key, patterns in rules.items(): 
matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 2, 4), ] -def test_matcher_match_fuzz_pred2(en_vocab): +def test_matcher_match_fuzzy2(en_vocab): rules = { "JS": [[{"ORTH": "JavaScript"}]], "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": {"FUZZY": "java"}}]], + "Java": [[{"LOWER": {"FUZZY1": "java"}}]], } - matcher = Matcher(en_vocab, fuzzy=80) + matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["Java"], 5, 6), ] -def test_matcher_match_fuzz_preds(en_vocab): +def test_matcher_match_fuzzy3(en_vocab): rules = { - "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]], - "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": {"FUZZY": "java"}}]], + "JS": [[{"ORTH": {"FUZZY2": "JavaScript"}}]], + "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": {"FUZZY1": "java"}}]], } - matcher = Matcher(en_vocab, fuzzy=80) + matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 2, 4), @@ -218,45 +170,45 @@ def test_matcher_match_fuzz_preds(en_vocab): (doc.vocab.strings["JS"], 8, 9), ] -def test_matcher_match_fuzz_pred_in_set(en_vocab): +def test_matcher_match_fuzzy_set1(en_vocab): rules = { - "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "No"]}}, "OP": "+"}]] + "GoogleNow": [[{"ORTH": {"FUZZY2": {"IN": ["Google", "No"]}}, "OP": "+"}]] } - matcher = Matcher(en_vocab, fuzzy=80) + matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns, greedy="LONGEST") - words = ["I", "like", "Goggle", "Now"] + words = ["They", "like", "Goggle", "Now"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 2, 4), ] -def test_matcher_match_fuzz_pred_not_in_set(en_vocab): +def test_matcher_match_fuzzy_set2(en_vocab): rules = { - "GoogleNow": [[{"ORTH": {"FUZZY": {"NOT_IN": ["Google", "No"]}}, "OP": "+"}]], + "GoogleNow": [[{"ORTH": {"FUZZY2": {"NOT_IN": ["Google", "No"]}}, "OP": "+"}]], } - matcher = Matcher(en_vocab, fuzzy=80) + matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns, greedy="LONGEST") - words = ["I", "like", "Goggle", "Now"] + words = ["They", "like", "Goggle", "Now"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 0, 2), ] -def test_matcher_match_fuzz_pred_in_set_with_exclude(en_vocab): +def test_matcher_match_fuzzy_set3(en_vocab): rules = { - "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "No"]}, + "GoogleNow": [[{"ORTH": {"FUZZY1": {"IN": ["Google", "No"]}, "NOT_IN": ["Goggle"]}, "OP": "+"}]] } - matcher = Matcher(en_vocab, fuzzy=80) + matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns, 
greedy="LONGEST") - words = ["I", "like", "Goggle", "Now"] + words = ["They", "like", "Goggle", "Now"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 3, 4), From a8a4d86bae2001e58a32e334840d2956ad0ad6ac Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 18:28:17 +0200 Subject: [PATCH 14/15] revert changes added for fuzzy param --- setup.py | 2 -- spacy/matcher/matcher.pyx | 13 +++++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index 413c55d22..ec1bd35fa 100755 --- a/setup.py +++ b/setup.py @@ -11,7 +11,6 @@ from Cython.Build import cythonize from Cython.Compiler import Options import os import subprocess -import rapidfuzz_capi ROOT = Path(__file__).parent @@ -203,7 +202,6 @@ def setup_package(): include_dirs = [ numpy.get_include(), - rapidfuzz_capi.get_include(), get_python_inc(plat_specific=True), ] ext_modules = [] diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index cb6152ed0..56fd11365 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -354,7 +354,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e cdef PatternStateC state cdef int i, j, nr_extra_attr cdef Pool mem = Pool() - output = [] if length == 0: # avoid any processing or mem alloc if the document is empty @@ -419,7 +418,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match if states[i].pattern.nr_py >= 1: update_predicate_cache(cached_py_predicates, states[i].pattern, token, py_predicates) - action = get_action(states[i], token, extra_attrs, + action = get_action(states[i], token.c, extra_attrs, cached_py_predicates) if action == REJECT: continue @@ -456,7 +455,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match if states[q].pattern.nr_py != 0: update_predicate_cache(cached_py_predicates, states[q].pattern, token, py_predicates) - action = get_action(states[q], token, extra_attrs, + action = get_action(states[q], token.c, extra_attrs, cached_py_predicates) # Update alignment before the transition of current state if with_alignments != 0: @@ -568,7 +567,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states, cdef action_t get_action(PatternStateC state, - Token token, const attr_t* extra_attrs, + const TokenC* token, const attr_t* extra_attrs, const int8_t* predicate_matches) nogil: """We need to consider: a) Does the token match the specification? 
[Yes, No] @@ -680,7 +679,7 @@ cdef action_t get_action(PatternStateC state, cdef int8_t get_is_match(PatternStateC state, - Token token, const attr_t* extra_attrs, + const TokenC* token, const attr_t* extra_attrs, const int8_t* predicate_matches) nogil: for i in range(state.pattern.nr_py): if predicate_matches[state.pattern.py_predicates[i]] == -1: @@ -688,8 +687,7 @@ cdef int8_t get_is_match(PatternStateC state, spec = state.pattern if spec.nr_attr > 0: for attr in spec.attrs[:spec.nr_attr]: - token_attr_value = get_token_attr_for_matcher(token.c, attr.attr) - if token_attr_value != attr.value: + if get_token_attr_for_matcher(token, attr.attr) != attr.value: return 0 for i in range(spec.nr_extra_attr): if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]: @@ -1007,7 +1005,6 @@ def _get_extra_predicates(spec, extra_predicates, vocab): if attr.upper() == "TEXT": attr = "ORTH" attr = IDS.get(attr.upper()) - if isinstance(value, dict): output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types, extra_predicates, seen_predicates)) From 59021f7d25bd8fa3c5dc8e5ab594023257b0ed5a Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 21:42:10 +0200 Subject: [PATCH 15/15] switch to polyleven (Python package) --- requirements.txt | 2 +- setup.cfg | 2 +- spacy/matcher/matcher.pyx | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index 38b4cbf0d..070ffe7a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,7 @@ tqdm>=4.38.0,<5.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 langcodes>=3.2.0,<4.0.0 -rapidfuzz>=2.4.0,<3.0.0 +polyleven>=0.7,<1.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index a149b1f7e..de58de3bc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -58,7 +58,7 @@ install_requires = requests>=2.13.0,<3.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 - rapidfuzz>=2.4.0,<3.0.0 + polyleven>=0.7,<1.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 56fd11365..d27397f8b 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -10,7 +10,7 @@ from murmurhash.mrmr cimport hash64 import re import srsly import warnings -from rapidfuzz.distance import Levenshtein +from polyleven import levenshtein from ..typedefs cimport attr_t from ..structs cimport TokenC @@ -846,7 +846,7 @@ class _FuzzyPredicate: value = token._.get(self.attr) else: value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] - return bool(Levenshtein.distance(self.value, value) <= self.distance) + return bool(levenshtein(self.value, value) <= self.distance) class _RegexPredicate: @@ -914,8 +914,8 @@ class _SetPredicate: return True elif self.distance: for v in self.value: - if Levenshtein.distance(self.vocab.strings[value], - self.vocab.strings[v]) <= self.distance: + if levenshtein(self.vocab.strings[value], + self.vocab.strings[v]) <= self.distance: return True return False elif self.predicate == "NOT_IN": @@ -923,8 +923,8 @@ class _SetPredicate: return False elif self.distance: for v in self.value: - if Levenshtein.distance(self.vocab.strings[value], - self.vocab.strings[v]) <= self.distance: + if levenshtein(self.vocab.strings[value], + self.vocab.strings[v]) <= self.distance: return False return True elif self.predicate == "IS_SUBSET":
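For reference, a minimal usage sketch of the FUZZYn token patterns introduced by
this series (illustrative only, not part of the diffs above; it assumes the
series is applied, that English() tokenizes the sample text one word per token,
and it mirrors the cases in spacy/tests/matcher/test_matcher_api.py):

from spacy.lang.en import English
from spacy.matcher import Matcher
from polyleven import levenshtein

nlp = English()
matcher = Matcher(nlp.vocab)
# FUZZY2 allows a Levenshtein distance of up to 2 against the ORTH value.
matcher.add("JS", [[{"ORTH": {"FUZZY2": "JavaScript"}}]])
# FUZZYn can also wrap set predicates: a token matches if it is within
# distance 2 of any member of the IN list.
matcher.add("GoogleNow",
            [[{"ORTH": {"FUZZY2": {"IN": ["Google", "No"]}}, "OP": "+"}]],
            greedy="LONGEST")

doc = nlp("They like Goggle Now and JvvaScrpt")
print([(nlp.vocab.strings[m_id], start, end) for m_id, start, end in matcher(doc)])
# roughly: [('GoogleNow', 2, 4), ('JS', 5, 6)]
print(levenshtein("JavaScript", "JvvaScrpt"))  # 2, within the FUZZY2 bound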