move fuzzy_match back inside Matcher

simplify logic and add tests
This commit is contained in:
Kevin Humphreys 2022-11-03 13:39:17 -07:00
parent 6e64a5cd0d
commit 49e93170bb
3 changed files with 47 additions and 26 deletions

View File

@@ -1,19 +0,0 @@
from .levenshtein import levenshtein
def fuzzy_match(input_string: str, rule_string: str, distance: int = 0) -> bool:
    """Return True when the two strings are within an edit-distance threshold.

    Defined in pure Python outside Matcher to allow patching, e.g.:
        import wrapt
        from spacy.matcher import fuzzy
        @wrapt.patch_function_wrapper('spacy.matcher.fuzzy', 'fuzzy_match')
    *before* import spacy.
    """
    shorter = min(len(input_string), len(rule_string))
    # FUZZYn operators carry an explicit distance, capped at one less than the
    # shorter token; the bare FUZZY operator is stricter (two less, max 5).
    cap = min(distance, shorter - 1) if distance else min(5, shorter - 2)
    if cap <= 0:
        # Tokens too short to permit any edits at all.
        return False
    return levenshtein(input_string, rule_string) <= cap

View File

@@ -25,7 +25,8 @@ from ..errors import Errors, MatchPatternError, Warnings
from ..strings import get_string_id
from ..attrs import IDS
import fuzzy
from .levenshtein import levenshtein
DEF PADDING = 5
@@ -204,6 +205,17 @@ cdef class Matcher:
else:
yield doc
@staticmethod
def fuzzy_match(input_string: str, rule_string: str, fuzzy: int = -1) -> bool:
    """Compare two strings by Levenshtein distance, bounded by token length.

    fuzzy: maximum number of edits allowed; -1 selects the default budget
    (up to 5 edits, and one edit stricter than an explicit FUZZYn would be
    for tokens of the same length).
    """
    shorter = min(len(input_string), len(rule_string))
    # Never allow every character of the shorter token to be edited away.
    budget = shorter - 1
    if fuzzy == -1:  # bare FUZZY operator with no explicit distance
        fuzzy = 5  # default maximum number of edits
        budget -= 1  # be more restrictive for the default operator
    if budget <= 0:
        budget = 1  # always permit at least one edit
    budget = min(fuzzy, budget)
    return levenshtein(input_string, rule_string, budget) <= budget
def __call__(self, object doclike, *, as_spans=False, allow_missing=False, with_alignments=False):
"""Find all token sequences matching the supplied pattern.
@@ -326,7 +338,6 @@ cdef class Matcher:
else:
return key
def unpickle_matcher(vocab, patterns, callbacks):
matcher = Matcher(vocab)
for key, pattern in patterns.items():
@@ -841,7 +852,7 @@ class _FuzzyPredicate:
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
self.fuzzy = self.predicate[len('FUZZY'):] # number after prefix
self.fuzzy = int(self.fuzzy) if self.fuzzy else 0
self.fuzzy = int(self.fuzzy) if self.fuzzy else -1
def __call__(self, Token token):
if self.is_extension:
@@ -850,7 +861,7 @@ class _FuzzyPredicate:
value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
if self.value == value:
return True
return fuzzy.fuzzy_match(value, self.value, self.fuzzy)
return Matcher.fuzzy_match(value, self.value, self.fuzzy)
class _RegexPredicate:
@@ -933,7 +944,7 @@ class _SetPredicate:
return True
elif self.fuzzy is not None:
value = self.vocab.strings[value]
return any(fuzzy.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
return any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
for v in self.value)
else:
return False
@@ -945,7 +956,7 @@ class _SetPredicate:
return False
elif self.fuzzy is not None:
value = self.vocab.strings[value]
return not any(fuzzy.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
return not any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
for v in self.value)
else:
return True
@@ -1054,7 +1065,7 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
continue
elif cls == _FuzzyPredicate:
fuzz = type_[len("FUZZY"):] # number after prefix
fuzz = int(fuzz) if fuzz else 0
fuzz = int(fuzz) if fuzz else -1
if isinstance(value, dict):
# add predicates inside fuzzy operator
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,

View File

@@ -1,5 +1,6 @@
import pytest
from spacy.matcher import levenshtein
from spacy.matcher import Matcher
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
@@ -42,3 +43,31 @@ from spacy.matcher import levenshtein
)
def test_levenshtein(dist, a, b):
assert levenshtein(a, b) == dist
@pytest.mark.parametrize(
    "a,b,fuzzy,expected",
    [
        # Identical strings always match regardless of the fuzzy budget.
        ("a", "a", 1, True),
        ("a", "a", 0, True),
        ("a", "a", -1, True),
        # fuzzy=-1 means the default budget (see Matcher.fuzzy_match).
        ("a", "ab", 1, True),
        ("a", "ab", 0, False),
        ("a", "ab", -1, True),
        ("ab", "ac", 1, True),
        ("ab", "ac", -1, True),
        ("abc", "cde", 4, False),  # 4 reduced because of token length
        ("abc", "cde", -1, False),
        ("abcdef", "cdefgh", 4, True),  # 4 not reduced because of token length
        ("abcdef", "cdefgh", 3, False),
        ("abcdef", "cdefgh", -1, True),  # default equivalent to 4
        ("abcdefgh", "cdefghijk", 5, True),
        ("abcdefgh", "cdefghijk", 4, False),
        ("abcdefgh", "cdefghijk", -1, True),  # default equivalent to 5
        ("abcdefgh", "cdefghijkl", 6, True),
        ("abcdefgh", "cdefghijkl", 5, False),
        ("abcdefgh", "cdefghijkl", -1, False),  # default equivalent to 5 (max)
    ]
)
def test_fuzzy_match(a, b, fuzzy, expected):
    """Matcher.fuzzy_match respects both explicit and default fuzzy budgets,
    reduced where the tokens are too short to allow that many edits."""
    assert Matcher.fuzzy_match(a, b, fuzzy) == expected