Rename to fuzzy_compare

2025-08-05 04:40:20 +03:00 · 2022-11-28 10:18:07 +01:00 · 2022-11-28 10:18:07 +01:00 · c3f446f71b
commit c3f446f71b
parent efa638a11c
4 changed files with 30 additions and 30 deletions
--- a/spacy/matcher/matcher.pxd
+++ b/spacy/matcher/matcher.pxd
@ -77,6 +77,6 @@ cdef class Matcher:
    cdef public object _extensions
    cdef public object _extra_predicates
    cdef public object _seen_attrs
-    cdef public object _fuzzy_match
+    cdef public object _fuzzy_compare

-cpdef bint _default_fuzzy_match(s1: str, s2: str, fuzzy: int)
+cpdef bint _default_fuzzy_compare(s1: str, s2: str, fuzzy: int)
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@ -53,4 +53,4 @@ class Matcher:
    ) -> List[Span]: ...
    def _normalize_key(self, key: Any) -> Any: ...

-def _default_fuzzy_match(s1: str, s2: str, fuzzy: int=-1) -> bool: ...
+def _default_fuzzy_compare(s1: str, s2: str, fuzzy: int) -> bool: ...
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@ -38,7 +38,7 @@ cdef class Matcher:
    USAGE: https://spacy.io/usage/rule-based-matching
    """

-    def __init__(self, vocab, validate=True, *, fuzzy_match=_default_fuzzy_match):
+    def __init__(self, vocab, validate=True, *, fuzzy_compare=_default_fuzzy_compare):
        """Create the Matcher.

        vocab (Vocab): The vocabulary object, which must be shared with the
@ -53,10 +53,10 @@ cdef class Matcher:
        self.vocab = vocab
        self.mem = Pool()
        self.validate = validate
-        self._fuzzy_match = fuzzy_match
+        self._fuzzy_compare = fuzzy_compare

    def __reduce__(self):
-        data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_match)
+        data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_compare)
        return (unpickle_matcher, data, None, None)

    def __len__(self):
@ -131,7 +131,7 @@ cdef class Matcher:
        for pattern in patterns:
            try:
                specs = _preprocess_pattern(pattern, self.vocab,
-                    self._extensions, self._extra_predicates, self._fuzzy_match)
+                    self._extensions, self._extra_predicates, self._fuzzy_compare)
                self.patterns.push_back(init_pattern(self.mem, key, specs))
                for spec in specs:
                    for attr, _ in spec[1]:
@ -329,8 +329,8 @@ cdef class Matcher:
            return key


-def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_match):
-    matcher = Matcher(vocab, validate=validate, fuzzy_match=fuzzy_match)
+def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_compare):
+    matcher = Matcher(vocab, validate=validate, fuzzy_compare=fuzzy_compare)
    for key, pattern in patterns.items():
        callback = callbacks.get(key, None)
        matcher.add(key, pattern, on_match=callback)
@ -757,7 +757,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
    return id_attr.value


-def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_match):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_compare):
    """This function interprets the pattern, converting the various bits of
    syntactic sugar before we compile it into a struct with init_pattern.

@ -784,7 +784,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates,
        ops = _get_operators(spec)
        attr_values = _get_attr_values(spec, string_store)
        extensions = _get_extensions(spec, string_store, extensions_table)
-        predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_match)
+        predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare)
        for op in ops:
            tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
    return tokens
@ -833,7 +833,7 @@ class _FuzzyPredicate:
    operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5")

    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
-                 regex=False, fuzzy=None, fuzzy_match=None):
+                 regex=False, fuzzy=None, fuzzy_compare=None):
        self.i = i
        self.attr = attr
        self.value = value
@ -844,7 +844,7 @@ class _FuzzyPredicate:
            raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
        fuzz = self.predicate[len("FUZZY"):] # number after prefix
        self.fuzzy = int(fuzz) if fuzz else -1
-        self.fuzzy_match = fuzzy_match
+        self.fuzzy_compare = fuzzy_compare

    def __call__(self, Token token):
        if self.is_extension:
@ -853,14 +853,14 @@ class _FuzzyPredicate:
            value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
        if self.value == value:
            return True
-        return self.fuzzy_match(value, self.value, self.fuzzy)
+        return self.fuzzy_compare(value, self.value, self.fuzzy)


 class _RegexPredicate:
    operators = ("REGEX",)

    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
-                 regex=False, fuzzy=None, fuzzy_match=None):
+                 regex=False, fuzzy=None, fuzzy_compare=None):
        self.i = i
        self.attr = attr
        self.value = re.compile(value)
@ -882,13 +882,13 @@ class _SetPredicate:
    operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")

    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
-                 regex=False, fuzzy=None, fuzzy_match=None):
+                 regex=False, fuzzy=None, fuzzy_compare=None):
        self.i = i
        self.attr = attr
        self.vocab = vocab
        self.regex = regex
        self.fuzzy = fuzzy
-        self.fuzzy_match = fuzzy_match
+        self.fuzzy_compare = fuzzy_compare
        if self.attr == MORPH:
            # normalize morph strings
            self.value = set(self.vocab.morphology.add(v) for v in value)
@ -937,7 +937,7 @@ class _SetPredicate:
                return True
            elif self.fuzzy is not None:
                value = self.vocab.strings[value]
-                return any(self.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
+                return any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
                           for v in self.value)
            else:
                return False
@ -949,7 +949,7 @@ class _SetPredicate:
                return False
            elif self.fuzzy is not None:
                value = self.vocab.strings[value]
-                return not any(self.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
+                return not any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
                               for v in self.value)
            else:
                return True
@ -968,7 +968,7 @@ class _ComparisonPredicate:
    operators = ("==", "!=", ">=", "<=", ">", "<")

    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
-                 regex=False, fuzzy=None, fuzzy_match=None):
+                 regex=False, fuzzy=None, fuzzy_compare=None):
        self.i = i
        self.attr = attr
        self.value = value
@ -997,7 +997,7 @@ class _ComparisonPredicate:
            return value < self.value


-def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_match):
+def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare):
    predicate_types = {
        "REGEX": _RegexPredicate,
        "IN": _SetPredicate,
@ -1035,12 +1035,12 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_match):
            attr = IDS.get(attr.upper())
        if isinstance(value, dict):
            output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
-                                                     extra_predicates, seen_predicates, fuzzy_match=fuzzy_match))
+                                                     extra_predicates, seen_predicates, fuzzy_compare=fuzzy_compare))
    return output


 def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
-                               extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_match=None):
+                               extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_compare=None):
    output = []
    for type_, value in value_dict.items():
        type_ = type_.upper()
@ -1063,10 +1063,10 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
                fuzzy_val = int(fuzz) if fuzz else -1
                output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
                                                         extra_predicates, seen_predicates,
-                                                         fuzzy=fuzzy_val, fuzzy_match=fuzzy_match))
+                                                         fuzzy=fuzzy_val, fuzzy_compare=fuzzy_compare))
                continue
        predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab,
-                        regex=regex, fuzzy=fuzzy, fuzzy_match=fuzzy_match)
+                        regex=regex, fuzzy=fuzzy, fuzzy_compare=fuzzy_compare)
        # Don't create a redundant predicates.
        # This helps with efficiency, as we're caching the results.
        if predicate.key in seen_predicates:
@ -1150,7 +1150,7 @@ def _get_extensions(spec, string_store, name2index):
    return attr_values


-cpdef bint _default_fuzzy_match(s1: str, s2: str, fuzzy: int):
+cpdef bint _default_fuzzy_compare(s1: str, s2: str, fuzzy: int):
    distance = min(len(s1), len(s2))
    distance -= 1 # don't allow completely different tokens
    if fuzzy == -1: # FUZZY operator with unspecified fuzzy
--- a/spacy/tests/matcher/test_levenshtein.py
+++ b/spacy/tests/matcher/test_levenshtein.py
@ -1,6 +1,6 @@
 import pytest
 from spacy.matcher import levenshtein
-from spacy.matcher.matcher import _default_fuzzy_match
+from spacy.matcher.matcher import _default_fuzzy_compare


 # empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
@ -69,6 +69,6 @@ def test_levenshtein(dist, a, b):
        ("abcdefgh", "cdefghijkl", -1, False),  # default equivalent to 5 (max)
    ],
 )
-def test_default_fuzzy_match(a, b, fuzzy, expected):
-    assert _default_fuzzy_match(a, b, fuzzy) == expected
-    assert _default_fuzzy_match(b, a, fuzzy) == expected
+def test_default_fuzzy_compare(a, b, fuzzy, expected):
+    assert _default_fuzzy_compare(a, b, fuzzy) == expected
+    assert _default_fuzzy_compare(b, a, fuzzy) == expected