From c3f446f71bf0a84ba974ef40acb3bf3cbad7c5eb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 28 Nov 2022 10:18:07 +0100 Subject: [PATCH] Rename to fuzzy_compare --- spacy/matcher/matcher.pxd | 4 +-- spacy/matcher/matcher.pyi | 2 +- spacy/matcher/matcher.pyx | 46 ++++++++++++------------- spacy/tests/matcher/test_levenshtein.py | 8 ++--- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index a4ae10aee..98df3ea92 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -77,6 +77,6 @@ cdef class Matcher: cdef public object _extensions cdef public object _extra_predicates cdef public object _seen_attrs - cdef public object _fuzzy_match + cdef public object _fuzzy_compare -cpdef bint _default_fuzzy_match(s1: str, s2: str, fuzzy: int) +cpdef bint _default_fuzzy_compare(s1: str, s2: str, fuzzy: int) diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index 2b17d28de..8443ddd14 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -53,4 +53,4 @@ class Matcher: ) -> List[Span]: ... def _normalize_key(self, key: Any) -> Any: ... -def _default_fuzzy_match(s1: str, s2: str, fuzzy: int=-1) -> bool: ... +def _default_fuzzy_compare(s1: str, s2: str, fuzzy: int) -> bool: ... diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index c2d9969d8..570c3bc1a 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -38,7 +38,7 @@ cdef class Matcher: USAGE: https://spacy.io/usage/rule-based-matching """ - def __init__(self, vocab, validate=True, *, fuzzy_match=_default_fuzzy_match): + def __init__(self, vocab, validate=True, *, fuzzy_compare=_default_fuzzy_compare): """Create the Matcher. vocab (Vocab): The vocabulary object, which must be shared with the @@ -53,10 +53,10 @@ cdef class Matcher: self.vocab = vocab self.mem = Pool() self.validate = validate - self._fuzzy_match = fuzzy_match + self._fuzzy_compare = fuzzy_compare def __reduce__(self): - data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_match) + data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_compare) return (unpickle_matcher, data, None, None) def __len__(self): @@ -131,7 +131,7 @@ cdef class Matcher: for pattern in patterns: try: specs = _preprocess_pattern(pattern, self.vocab, - self._extensions, self._extra_predicates, self._fuzzy_match) + self._extensions, self._extra_predicates, self._fuzzy_compare) self.patterns.push_back(init_pattern(self.mem, key, specs)) for spec in specs: for attr, _ in spec[1]: @@ -329,8 +329,8 @@ cdef class Matcher: return key -def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_match): - matcher = Matcher(vocab, validate=validate, fuzzy_match=fuzzy_match) +def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_compare): + matcher = Matcher(vocab, validate=validate, fuzzy_compare=fuzzy_compare) for key, pattern in patterns.items(): callback = callbacks.get(key, None) matcher.add(key, pattern, on_match=callback) @@ -757,7 +757,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: return id_attr.value -def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_match): +def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_compare): """This function interprets the pattern, converting the various bits of syntactic sugar before we compile it into a struct with init_pattern. @@ -784,7 +784,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, ops = _get_operators(spec) attr_values = _get_attr_values(spec, string_store) extensions = _get_extensions(spec, string_store, extensions_table) - predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_match) + predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare) for op in ops: tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx)) return tokens @@ -833,7 +833,7 @@ class _FuzzyPredicate: operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5") def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, - regex=False, fuzzy=None, fuzzy_match=None): + regex=False, fuzzy=None, fuzzy_compare=None): self.i = i self.attr = attr self.value = value @@ -844,7 +844,7 @@ class _FuzzyPredicate: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) fuzz = self.predicate[len("FUZZY"):] # number after prefix self.fuzzy = int(fuzz) if fuzz else -1 - self.fuzzy_match = fuzzy_match + self.fuzzy_compare = fuzzy_compare def __call__(self, Token token): if self.is_extension: @@ -853,14 +853,14 @@ class _FuzzyPredicate: value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] if self.value == value: return True - return self.fuzzy_match(value, self.value, self.fuzzy) + return self.fuzzy_compare(value, self.value, self.fuzzy) class _RegexPredicate: operators = ("REGEX",) def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, - regex=False, fuzzy=None, fuzzy_match=None): + regex=False, fuzzy=None, fuzzy_compare=None): self.i = i self.attr = attr self.value = re.compile(value) @@ -882,13 +882,13 @@ class _SetPredicate: operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS") def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, - regex=False, fuzzy=None, fuzzy_match=None): + regex=False, fuzzy=None, fuzzy_compare=None): self.i = i self.attr = attr self.vocab = vocab self.regex = regex self.fuzzy = fuzzy - self.fuzzy_match = fuzzy_match + self.fuzzy_compare = fuzzy_compare if self.attr == MORPH: # normalize morph strings self.value = set(self.vocab.morphology.add(v) for v in value) @@ -937,7 +937,7 @@ class _SetPredicate: return True elif self.fuzzy is not None: value = self.vocab.strings[value] - return any(self.fuzzy_match(value, self.vocab.strings[v], self.fuzzy) + return any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy) for v in self.value) else: return False @@ -949,7 +949,7 @@ class _SetPredicate: return False elif self.fuzzy is not None: value = self.vocab.strings[value] - return not any(self.fuzzy_match(value, self.vocab.strings[v], self.fuzzy) + return not any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy) for v in self.value) else: return True @@ -968,7 +968,7 @@ class _ComparisonPredicate: operators = ("==", "!=", ">=", "<=", ">", "<") def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, - regex=False, fuzzy=None, fuzzy_match=None): + regex=False, fuzzy=None, fuzzy_compare=None): self.i = i self.attr = attr self.value = value @@ -997,7 +997,7 @@ class _ComparisonPredicate: return value < self.value -def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_match): +def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare): predicate_types = { "REGEX": _RegexPredicate, "IN": _SetPredicate, @@ -1035,12 +1035,12 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_match): attr = IDS.get(attr.upper()) if isinstance(value, dict): output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types, - extra_predicates, seen_predicates, fuzzy_match=fuzzy_match)) + extra_predicates, seen_predicates, fuzzy_compare=fuzzy_compare)) return output def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types, - extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_match=None): + extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_compare=None): output = [] for type_, value in value_dict.items(): type_ = type_.upper() @@ -1063,10 +1063,10 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types, fuzzy_val = int(fuzz) if fuzz else -1 output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types, extra_predicates, seen_predicates, - fuzzy=fuzzy_val, fuzzy_match=fuzzy_match)) + fuzzy=fuzzy_val, fuzzy_compare=fuzzy_compare)) continue predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab, - regex=regex, fuzzy=fuzzy, fuzzy_match=fuzzy_match) + regex=regex, fuzzy=fuzzy, fuzzy_compare=fuzzy_compare) # Don't create a redundant predicates. # This helps with efficiency, as we're caching the results. if predicate.key in seen_predicates: @@ -1150,7 +1150,7 @@ def _get_extensions(spec, string_store, name2index): return attr_values -cpdef bint _default_fuzzy_match(s1: str, s2: str, fuzzy: int): +cpdef bint _default_fuzzy_compare(s1: str, s2: str, fuzzy: int): distance = min(len(s1), len(s2)) distance -= 1 # don't allow completely different tokens if fuzzy == -1: # FUZZY operator with unspecified fuzzy diff --git a/spacy/tests/matcher/test_levenshtein.py b/spacy/tests/matcher/test_levenshtein.py index 8b3c28cb9..f84da75d7 100644 --- a/spacy/tests/matcher/test_levenshtein.py +++ b/spacy/tests/matcher/test_levenshtein.py @@ -1,6 +1,6 @@ import pytest from spacy.matcher import levenshtein -from spacy.matcher.matcher import _default_fuzzy_match +from spacy.matcher.matcher import _default_fuzzy_compare # empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests @@ -69,6 +69,6 @@ def test_levenshtein(dist, a, b): ("abcdefgh", "cdefghijkl", -1, False), # default equivalent to 5 (max) ], ) -def test_default_fuzzy_match(a, b, fuzzy, expected): - assert _default_fuzzy_match(a, b, fuzzy) == expected - assert _default_fuzzy_match(b, a, fuzzy) == expected +def test_default_fuzzy_compare(a, b, fuzzy, expected): + assert _default_fuzzy_compare(a, b, fuzzy) == expected + assert _default_fuzzy_compare(b, a, fuzzy) == expected