From 0da324ab5ba0463fdaf018a13d077fe9a3fc260b Mon Sep 17 00:00:00 2001
From: Kevin Humphreys
Date: Thu, 22 Sep 2022 19:26:52 -0700
Subject: [PATCH] reinstate FUZZY operator with length-based distance function

---
 spacy/matcher/matcher.pyx               |  37 +++++---
 spacy/schemas.py                        |   1 +
 spacy/tests/matcher/test_matcher_api.py | 119 +++++++++++++++++++++---
 3 files changed, 132 insertions(+), 25 deletions(-)

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 8cc022d7e..c49bb92de 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -205,6 +205,18 @@ cdef class Matcher:
         else:
             yield doc
 
+    @staticmethod
+    def fuzzy_match(s1: str, s2: str, distance: int, token: Token) -> bool:
+        if token.is_oov:  # (TODO: param?)
+            threshold = min(len(s1), len(s2)) - 1  # max edit distance
+            if distance:  # FUZZYn operators
+                threshold = min(distance, threshold)
+            else:  # FUZZY operator
+                threshold = min(5, threshold - 1)  # default fuzziness (TODO: param?)
+            if threshold > 0:
+                return levenshtein(s1, s2) <= threshold
+        return False
+
     def __call__(self, object doclike, *, as_spans=False, allow_missing=False,
                  with_alignments=False):
         """Find all token sequences matching the supplied pattern.
@@ -829,7 +841,7 @@ def _get_attr_values(spec, string_store):
 # extensions to the matcher introduced in #3173.
 
 class _FuzzyPredicate:
-    operators = ("FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5")
+    operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5")
 
     def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None):
         self.i = i
@@ -840,7 +852,8 @@ class _FuzzyPredicate:
         self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
-        self.distance = int(self.predicate[len('FUZZY'):])  # number after prefix
+        self.distance = self.predicate[len('FUZZY'):]  # number after prefix
+        self.distance = int(self.distance) if self.distance else 0
 
     def __call__(self, Token token):
         if self.is_extension:
@@ -849,9 +862,7 @@ class _FuzzyPredicate:
             value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
         if self.value == value:
             return True
-        elif self.distance and token.is_oov:
-            return bool(levenshtein(value, self.value) <= min(self.distance, min(len(value), len(self.value))-1))
-        return False
+        return Matcher.fuzzy_match(value, self.value, self.distance, token)
 
 
 class _RegexPredicate:
@@ -887,7 +898,7 @@ class _SetPredicate:
             # normalize morph strings
             self.value = set(self.vocab.morphology.add(v) for v in value)
         else:
-            if self.distance:
+            if self.distance is not None:
                 # add to string store
                 self.value = set(self.vocab.strings.add(v) for v in value)
             else:
@@ -924,21 +935,19 @@ class _SetPredicate:
         if self.predicate == "IN":
             if value in self.value:
                 return True
-            elif self.distance and token.is_oov:
+            elif self.distance is not None:
                 s1 = self.vocab.strings[value]
                 for v in self.value:
-                    s2 = self.vocab.strings[v]
-                    if levenshtein(s1, s2) <= min(self.distance, min(len(s1), len(s2))-1):
+                    if Matcher.fuzzy_match(s1, self.vocab.strings[v], self.distance, token):
                         return True
             return False
         elif self.predicate == "NOT_IN":
             if value in self.value:
                 return False
-            elif self.distance and token.is_oov:
+            elif self.distance is not None:
                 s1 = self.vocab.strings[value]
                 for v in self.value:
-                    s2 = self.vocab.strings[v]
-                    if levenshtein(s1, s2) <= min(self.distance, min(len(s1), len(s2))-1):
+                    if Matcher.fuzzy_match(s1, self.vocab.strings[v], self.distance, token):
                         return False
             return True
         elif self.predicate == "IS_SUBSET":
@@ -998,6 +1007,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
         "<=": _ComparisonPredicate,
         ">": _ComparisonPredicate,
         "<": _ComparisonPredicate,
+        "FUZZY": _FuzzyPredicate,
         "FUZZY1": _FuzzyPredicate,
         "FUZZY2": _FuzzyPredicate,
         "FUZZY3": _FuzzyPredicate,
@@ -1036,7 +1046,8 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
             # ignore unrecognized predicate type
             continue
         elif cls == _FuzzyPredicate:
-            distance = int(type_[len("FUZZY"):])  # number after prefix
+            distance = type_[len("FUZZY"):]  # number after prefix
+            distance = int(distance) if distance else 0
             if isinstance(value, dict):
                 # add predicates inside fuzzy operator
                 output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
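The new Matcher.fuzzy_match centralizes the fuzzy-matching policy: the allowed edit distance is capped by the length of the shorter string, so short tokens cannot be edited into unrelated words. Below is a standalone sketch of that thresholding, with a plain-Python Levenshtein standing in for spaCy's Cython levenshtein helper and the token.is_oov gate dropped for brevity; it is an illustration of the logic, not part of the patch.

def levenshtein(s1: str, s2: str) -> int:
    # Classic dynamic-programming edit distance (insert/delete/substitute).
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    previous = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, start=1):
        current = [i]
        for j, c2 in enumerate(s2, start=1):
            current.append(min(previous[j] + 1,                # deletion
                               current[j - 1] + 1,             # insertion
                               previous[j - 1] + (c1 != c2)))  # substitution
        previous = current
    return previous[-1]

def fuzzy_match(s1: str, s2: str, distance: int = 0) -> bool:
    threshold = min(len(s1), len(s2)) - 1   # length-based cap
    if distance:                            # FUZZYn: explicit bound, still capped
        threshold = min(distance, threshold)
    else:                                   # FUZZY: default bound
        threshold = min(5, threshold - 1)
    return threshold > 0 and levenshtein(s1, s2) <= threshold

assert fuzzy_match("Goggle", "Google")         # 1 edit, within default bound
assert fuzzy_match("JvvaScrpt", "JavaScript")  # 2 edits, within default bound
assert not fuzzy_match("Jav", "JavaScript")    # cap from "Jav" is only 1
assert fuzzy_match("jav", "java", distance=1)  # FUZZY1: exactly 1 edit allowed

Note that for the bare FUZZY operator the cap tightens to min(5, len - 2) of the shorter string, which is why a three-character token like "Jav" tolerates only a single edit in the tests below.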
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 053bd7cc1..f2be4428b 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -162,6 +162,7 @@ class TokenPatternString(BaseModel):
     IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
     IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
+    FUZZY: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy")
     FUZZY1: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy1")
     FUZZY2: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy2")
     FUZZY3: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy3")
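The added FUZZY field mirrors the existing FUZZY1-FUZZY5 entries: it accepts either a plain string or a nested TokenPatternString, which is what lets set predicates appear under it. A quick validation sketch, assuming a build that includes this patch (pydantic coerces the nested dict into a TokenPatternString):

from spacy.schemas import TokenPatternString

# Plain string value, as in the pattern {"ORTH": {"FUZZY": "Google"}}.
TokenPatternString(FUZZY="Google")

# Nested predicate value, as in {"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}}}.
TokenPatternString(FUZZY={"IN": ["Google", "Now"]})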
"GoogleNow": [[{"ORTH": {"FUZZY": {"NOT_IN": ["Google", "Now"]}}, "OP": "+"}]], + } + matcher = Matcher(en_vocab) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy="LONGEST") + + words = ["They", "like", "Goggle", "Noo"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 0, 2), + ] + +def test_matcher_match_fuzzy_set3(en_vocab): + rules = { + "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}, + "NOT_IN": ["Goggle"]}, + "OP": "+"}]] + } + matcher = Matcher(en_vocab) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy="LONGEST") + + words = ["They", "like", "Goggle", "Noo"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 3, 4), + ] + + +def test_matcher_match_fuzzyn1(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], + } + matcher = Matcher(en_vocab) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 2, 4), + ] + +def test_matcher_match_fuzzyn2(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": {"FUZZY1": "java"}}]], + } + matcher = Matcher(en_vocab) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["Java"], 5, 6), + ] + +def test_matcher_match_fuzzyn3(en_vocab): rules = { "JS": [[{"ORTH": {"FUZZY2": "JavaScript"}}]], "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]], @@ -170,37 +265,37 @@ def test_matcher_match_fuzzy3(en_vocab): (doc.vocab.strings["JS"], 8, 9), ] -def test_matcher_match_fuzzy_set1(en_vocab): +def test_matcher_match_fuzzyn_set1(en_vocab): rules = { - "GoogleNow": [[{"ORTH": {"FUZZY2": {"IN": ["Google", "No"]}}, "OP": "+"}]] + "GoogleNow": [[{"ORTH": {"FUZZY2": {"IN": ["Google", "Now"]}}, "OP": "+"}]] } matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns, greedy="LONGEST") - words = ["They", "like", "Goggle", "Now"] + words = ["They", "like", "Goggle", "Noo"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 2, 4), ] -def test_matcher_match_fuzzy_set2(en_vocab): +def test_matcher_match_fuzzyn_set2(en_vocab): rules = { - "GoogleNow": [[{"ORTH": {"FUZZY2": {"NOT_IN": ["Google", "No"]}}, "OP": "+"}]], + "GoogleNow": [[{"ORTH": {"FUZZY2": {"NOT_IN": ["Google", "Now"]}}, "OP": "+"}]], } matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns, greedy="LONGEST") - words = ["They", "like", "Goggle", "Now"] + words = ["They", "like", "Goggle", "Noo"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 0, 2), ] -def test_matcher_match_fuzzy_set3(en_vocab): +def test_matcher_match_fuzzyn_set3(en_vocab): rules = { - "GoogleNow": [[{"ORTH": {"FUZZY1": {"IN": ["Google", "No"]}, + "GoogleNow": [[{"ORTH": {"FUZZY1": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]}, "OP": "+"}]] } @@ -208,12 +303,12 @@ def test_matcher_match_fuzzy_set3(en_vocab): for key, patterns in 
rules.items(): matcher.add(key, patterns, greedy="LONGEST") - words = ["They", "like", "Goggle", "Now"] + words = ["They", "like", "Goggle", "Noo"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 3, 4), ] - + def test_matcher_empty_dict(en_vocab): """Test matcher allows empty token specs, meaning match on any token."""
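The tests above pin down the intended behavior of both operator forms. For reference, a usage sketch against a build that includes this patch; the sentence is adapted from the test fixtures, and tokens in a blank pipeline are OOV, which is what enables the fuzzy comparison:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# FUZZY applies the length-capped default; FUZZY1-FUZZY5 set an explicit
# maximum edit distance (still capped by the shorter string's length).
matcher.add("GoogleNow", [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]])
matcher.add("Java", [[{"LOWER": {"FUZZY1": "java"}}]])

doc = nlp("They like Goggle Now and Jav")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], "->", doc[start:end].text)
# GoogleNow -> Goggle Now
# Java -> Jav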