reinstate FUZZY operator

with length-based distance function
Kevin Humphreys 2022-09-22 19:26:52 -07:00
parent eab96f7c03
commit 0da324ab5b
3 changed files with 132 additions and 25 deletions
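
In pattern terms, the change restores the bare FUZZY operator alongside the explicit FUZZY1-FUZZY5 forms. A rough sketch of the two spellings, using patterns taken from the tests in this commit (FUZZY derives its edit-distance threshold from the token length, capped at 5 edits; FUZZYn fixes the cap at n):

# bare operator: length-based threshold
pattern_default = [{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]
# explicit operator: at most 1 edit
pattern_bounded = [{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]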

spacy/matcher/matcher.pyx

@@ -205,6 +205,18 @@ cdef class Matcher:
         else:
             yield doc
 
+    @staticmethod
+    def fuzzy_match(s1: str, s2: str, distance: int, token: Token) -> bool:
+        if token.is_oov:  # (TODO: param?)
+            threshold = min(len(s1), len(s2)) - 1  # max edit distance
+            if distance:  # FUZZYn operators
+                threshold = min(distance, threshold)
+            else:  # FUZZY operator
+                threshold = min(5, threshold - 1)  # default fuzziness (TODO: param?)
+            if threshold > 0:
+                return levenshtein(s1, s2) <= threshold
+        return False
+
     def __call__(self, object doclike, *, as_spans=False, allow_missing=False, with_alignments=False):
         """Find all token sequences matching the supplied pattern.
@@ -829,7 +841,7 @@ def _get_attr_values(spec, string_store):
 # extensions to the matcher introduced in #3173.
 class _FuzzyPredicate:
-    operators = ("FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5")
+    operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5")
 
     def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None):
         self.i = i
@@ -840,7 +852,8 @@ class _FuzzyPredicate:
         self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
-        self.distance = int(self.predicate[len('FUZZY'):])  # number after prefix
+        self.distance = self.predicate[len('FUZZY'):]  # number after prefix
+        self.distance = int(self.distance) if self.distance else 0
 
     def __call__(self, Token token):
         if self.is_extension:
@@ -849,9 +862,7 @@ class _FuzzyPredicate:
             value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
         if self.value == value:
             return True
-        elif self.distance and token.is_oov:
-            return bool(levenshtein(value, self.value) <= min(self.distance, min(len(value), len(self.value))-1))
-        return False
+        return Matcher.fuzzy_match(value, self.value, self.distance, token)
 
 
 class _RegexPredicate:
@@ -887,7 +898,7 @@ class _SetPredicate:
             # normalize morph strings
             self.value = set(self.vocab.morphology.add(v) for v in value)
         else:
-            if self.distance:
+            if self.distance is not None:
                 # add to string store
                 self.value = set(self.vocab.strings.add(v) for v in value)
             else:
@@ -924,21 +935,19 @@ class _SetPredicate:
         if self.predicate == "IN":
             if value in self.value:
                 return True
-            elif self.distance and token.is_oov:
+            elif self.distance is not None:
                 s1 = self.vocab.strings[value]
                 for v in self.value:
-                    s2 = self.vocab.strings[v]
-                    if levenshtein(s1, s2) <= min(self.distance, min(len(s1), len(s2))-1):
+                    if Matcher.fuzzy_match(s1, self.vocab.strings[v], self.distance, token):
                         return True
             return False
         elif self.predicate == "NOT_IN":
             if value in self.value:
                 return False
-            elif self.distance and token.is_oov:
+            elif self.distance is not None:
                 s1 = self.vocab.strings[value]
                 for v in self.value:
-                    s2 = self.vocab.strings[v]
-                    if levenshtein(s1, s2) <= min(self.distance, min(len(s1), len(s2))-1):
+                    if Matcher.fuzzy_match(s1, self.vocab.strings[v], self.distance, token):
                         return False
             return True
         elif self.predicate == "IS_SUBSET":
@@ -998,6 +1007,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
     "<=": _ComparisonPredicate,
     ">": _ComparisonPredicate,
     "<": _ComparisonPredicate,
+    "FUZZY": _FuzzyPredicate,
     "FUZZY1": _FuzzyPredicate,
     "FUZZY2": _FuzzyPredicate,
     "FUZZY3": _FuzzyPredicate,
@@ -1036,7 +1046,8 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
             # ignore unrecognized predicate type
             continue
         elif cls == _FuzzyPredicate:
-            distance = int(type_[len("FUZZY"):])  # number after prefix
+            distance = type_[len("FUZZY"):]  # number after prefix
+            distance = int(distance) if distance else 0
             if isinstance(value, dict):
                 # add predicates inside fuzzy operator
                 output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
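
A note on the two-step parse above: for the bare FUZZY operator the text after the prefix is empty, so the parsed distance is 0. That value is falsy, which is why the set-predicate checks in this commit move from `if self.distance:` to `if self.distance is not None:`. A hypothetical standalone illustration:

for op in ("FUZZY", "FUZZY3"):
    distance = op[len("FUZZY"):]             # "" for FUZZY, "3" for FUZZY3
    distance = int(distance) if distance else 0
    print(op, distance, bool(distance))      # FUZZY 0 False / FUZZY3 3 True
# a plain truthiness test would silently skip the fuzzy path for FUZZY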

spacy/schemas.py

@@ -162,6 +162,7 @@ class TokenPatternString(BaseModel):
     IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
     IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
+    FUZZY: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy")
     FUZZY1: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy1")
     FUZZY2: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy2")
     FUZZY3: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy3")
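
As with the existing FUZZYn fields, the Union[StrictStr, "TokenPatternString"] type lets FUZZY wrap either a plain string or a nested set predicate; both forms are exercised by the tests below:

pattern_str = [{"ORTH": {"FUZZY": "Google"}}]                              # plain string value
pattern_set = [{"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}}, "OP": "+"}]  # nested set predicate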

spacy/tests/matcher/test_matcher_api.py

@@ -123,7 +123,7 @@ def test_matcher_match_multi(matcher):
 def test_matcher_match_fuzzy1(en_vocab):
     rules = {
         "JS": [[{"ORTH": "JavaScript"}]],
-        "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
         "Java": [[{"LOWER": "java"}]],
     }
     matcher = Matcher(en_vocab)
@@ -140,7 +140,7 @@ def test_matcher_match_fuzzy2(en_vocab):
     rules = {
         "JS": [[{"ORTH": "JavaScript"}]],
         "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
-        "Java": [[{"LOWER": {"FUZZY1": "java"}}]],
+        "Java": [[{"LOWER": {"FUZZY": "java"}}]],
     }
     matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
@@ -153,6 +153,101 @@ def test_matcher_match_fuzzy2(en_vocab):
     ]
 
 
+def test_matcher_match_fuzzy3(en_vocab):
+    rules = {
+        "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
+        "Java": [[{"LOWER": {"FUZZY": "java"}}]],
+    }
+    matcher = Matcher(en_vocab)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns)
+    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 2, 4),
+        (doc.vocab.strings["Java"], 5, 6),
+        (doc.vocab.strings["JS"], 8, 9),
+    ]
+
+
+def test_matcher_match_fuzzy_set1(en_vocab):
+    rules = {
+        "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}}, "OP": "+"}]]
+    }
+    matcher = Matcher(en_vocab)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns, greedy="LONGEST")
+    words = ["They", "like", "Goggle", "Noo"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 2, 4),
+    ]
+
+
+def test_matcher_match_fuzzy_set2(en_vocab):
+    rules = {
+        "GoogleNow": [[{"ORTH": {"FUZZY": {"NOT_IN": ["Google", "Now"]}}, "OP": "+"}]],
+    }
+    matcher = Matcher(en_vocab)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns, greedy="LONGEST")
+    words = ["They", "like", "Goggle", "Noo"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 0, 2),
+    ]
+
+
+def test_matcher_match_fuzzy_set3(en_vocab):
+    rules = {
+        "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "Now"]},
+                                 "NOT_IN": ["Goggle"]},
+                        "OP": "+"}]]
+    }
+    matcher = Matcher(en_vocab)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns, greedy="LONGEST")
+    words = ["They", "like", "Goggle", "Noo"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 3, 4),
+    ]
+
+
+def test_matcher_match_fuzzyn1(en_vocab):
+    rules = {
+        "JS": [[{"ORTH": "JavaScript"}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]],
+        "Java": [[{"LOWER": "java"}]],
+    }
+    matcher = Matcher(en_vocab)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns)
+    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 2, 4),
+    ]
+
+
+def test_matcher_match_fuzzyn2(en_vocab):
+    rules = {
+        "JS": [[{"ORTH": "JavaScript"}]],
+        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
+        "Java": [[{"LOWER": {"FUZZY1": "java"}}]],
+    }
+    matcher = Matcher(en_vocab)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns)
+    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["Java"], 5, 6),
+    ]
+
+
-def test_matcher_match_fuzzy3(en_vocab):
+def test_matcher_match_fuzzyn3(en_vocab):
     rules = {
         "JS": [[{"ORTH": {"FUZZY2": "JavaScript"}}]],
         "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]],
@@ -170,37 +265,37 @@ def test_matcher_match_fuzzy3(en_vocab):
         (doc.vocab.strings["JS"], 8, 9),
     ]
 
 
-def test_matcher_match_fuzzy_set1(en_vocab):
+def test_matcher_match_fuzzyn_set1(en_vocab):
     rules = {
-        "GoogleNow": [[{"ORTH": {"FUZZY2": {"IN": ["Google", "No"]}}, "OP": "+"}]]
+        "GoogleNow": [[{"ORTH": {"FUZZY2": {"IN": ["Google", "Now"]}}, "OP": "+"}]]
     }
     matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns, greedy="LONGEST")
-    words = ["They", "like", "Goggle", "Now"]
+    words = ["They", "like", "Goggle", "Noo"]
     doc = Doc(matcher.vocab, words=words)
     assert matcher(doc) == [
         (doc.vocab.strings["GoogleNow"], 2, 4),
     ]
 
 
-def test_matcher_match_fuzzy_set2(en_vocab):
+def test_matcher_match_fuzzyn_set2(en_vocab):
     rules = {
-        "GoogleNow": [[{"ORTH": {"FUZZY2": {"NOT_IN": ["Google", "No"]}}, "OP": "+"}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY2": {"NOT_IN": ["Google", "Now"]}}, "OP": "+"}]],
     }
     matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns, greedy="LONGEST")
-    words = ["They", "like", "Goggle", "Now"]
+    words = ["They", "like", "Goggle", "Noo"]
     doc = Doc(matcher.vocab, words=words)
     assert matcher(doc) == [
         (doc.vocab.strings["GoogleNow"], 0, 2),
     ]
 
 
-def test_matcher_match_fuzzy_set3(en_vocab):
+def test_matcher_match_fuzzyn_set3(en_vocab):
     rules = {
-        "GoogleNow": [[{"ORTH": {"FUZZY1": {"IN": ["Google", "No"]},
+        "GoogleNow": [[{"ORTH": {"FUZZY1": {"IN": ["Google", "Now"]},
                                  "NOT_IN": ["Goggle"]},
                         "OP": "+"}]]
     }
@@ -208,12 +303,12 @@ def test_matcher_match_fuzzy_set3(en_vocab):
     for key, patterns in rules.items():
         matcher.add(key, patterns, greedy="LONGEST")
-    words = ["They", "like", "Goggle", "Now"]
+    words = ["They", "like", "Goggle", "Noo"]
     doc = Doc(matcher.vocab, words=words)
     assert matcher(doc) == [
         (doc.vocab.strings["GoogleNow"], 3, 4),
     ]
 
 
 def test_matcher_empty_dict(en_vocab):
     """Test matcher allows empty token specs, meaning match on any token."""