simplify fuzzy sets

2025-09-16 00:52:38 +03:00 · 2022-08-29 12:49:14 +02:00 · 2022-08-29 12:49:14 +02:00 · e636f4941b
commit e636f4941b
parent 9c0f9368a9
1 changed files with 31 additions and 30 deletions
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@ -880,7 +880,7 @@ class _FuzzyPredicate:
            value = token._.get(self.attr)
        else:
            value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
-        return bool(fuzz_cpp.ratio(self.value, value) >= self.fuzzy)
+        return bool(self.fuzzy and fuzz_cpp.ratio(self.value, value) >= self.fuzzy)


 class _RegexPredicate:
@ -1006,7 +1006,6 @@ class _ComparisonPredicate:

 def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs):
    predicate_types = {
-        "FUZZY": _FuzzyPredicate,
        "REGEX": _RegexPredicate,
        "IN": _SetPredicate,
        "NOT_IN": _SetPredicate,
@ -1019,6 +1018,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs):
        "<=": _ComparisonPredicate,
        ">": _ComparisonPredicate,
        "<": _ComparisonPredicate,
+        "FUZZY": _FuzzyPredicate,
    }
    seen_predicates = {pred.key: pred.i for pred in extra_predicates}
    output = []
@ -1037,39 +1037,40 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs):
            attr = IDS.get(attr.upper())

        if isinstance(value, dict):
-            output.extend(_get_extra_predicates_helper(attr, value, vocab, fuzzy, fuzzy_attrs,
-                                                       predicate_types,
-                                                       extra_predicates, seen_predicates))
+            fuzzy_match = attr in fuzzy_attrs # fuzzy match enabled for this attr
+            output.extend(_get_extra_predicates_dict(attr, value, vocab, fuzzy, fuzzy_match,
+                                                     predicate_types,
+                                                     extra_predicates, seen_predicates))
    return output


-def _get_extra_predicates_helper(attr, value, vocab, fuzzy, fuzzy_attrs,
-                                 predicate_types, extra_predicates, seen_predicates):
+def _get_extra_predicates_dict(attr, value_dict, vocab, fuzzy, fuzzy_match,
+                               predicate_types, extra_predicates, seen_predicates):
    output = []
-    processed = False #TODO: not working as intended
-    value_with_upper_keys = {k.upper(): v for k, v in value.items()}
-    for type_, cls in predicate_types.items(): #TODO: switch this loop
-        if type_ in value_with_upper_keys:
-            if type_ == 'FUZZY' and isinstance(value_with_upper_keys[type_], dict):
+    for type_, value in value_dict.items():
+        if type_ == 'FUZZY':
+            fuzzy_match = True # explicit fuzzy match
+            if isinstance(value, dict):
                # add predicates inside fuzzy operator
-                output.extend(_get_extra_predicates_helper(attr, value_with_upper_keys[type_],
-                                                           vocab, fuzzy, fuzzy_attrs,
-                                                           predicate_types,
-                                                           extra_predicates, seen_predicates))
-            else:
-                predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_,
-                                vocab=vocab, fuzzy=fuzzy)###??? if attr in fuzzy_attrs else 0)
-                # Don't create a redundant predicates.
-                # This helps with efficiency, as we're caching the results.
-                if predicate.key in seen_predicates:
-                    output.append(seen_predicates[predicate.key])
-                else:
-                    extra_predicates.append(predicate)
-                    output.append(predicate.i)
-                    seen_predicates[predicate.key] = predicate.i
-            processed = True
-    if not processed:
-        warnings.warn(Warnings.W035.format(pattern=value))
+                output.extend(_get_extra_predicates_dict(attr, value, vocab, fuzzy, fuzzy_match,
+                                                         predicate_types,
+                                                         extra_predicates, seen_predicates))
+                continue
+        cls = predicate_types.get(type_.upper())
+        if cls is None:
+            warnings.warn(Warnings.W035.format(pattern=value_dict))
+            # ignore unrecognized predicate type
+            continue
+        predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab,
+                        fuzzy=fuzzy if fuzzy_match else 0)
        # Don't create redundant predicates.
+        # This helps with efficiency, as we're caching the results.
+        if predicate.key in seen_predicates:
+            output.append(seen_predicates[predicate.key])
+        else:
+            extra_predicates.append(predicate)
+            output.append(predicate.i)
+            seen_predicates[predicate.key] = predicate.i
    return output