diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index f7720cca2..5c313a020 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -205,17 +205,6 @@ cdef class Matcher:
                 else:
                     yield doc
 
-    @staticmethod
-    def fuzzy_match(input_string: str, rule_string: str, fuzzy: int=-1) -> bool:
-        distance = min(len(input_string), len(rule_string))
-        distance -= 1 # don't allow completely different tokens
-        if fuzzy == -1: # FUZZY operator with unspecified fuzzy
-            fuzzy = 5 # default max fuzzy
-            distance -= 1 # be more restrictive
-        distance = min(fuzzy, distance if distance > 0 else 1)
-        return levenshtein(input_string, rule_string, distance) <= distance
-
-
     def __call__(self, object doclike, *, as_spans=False, allow_missing=False,
                  with_alignments=False):
         """Find all token sequences matching the supplied pattern.
@@ -862,7 +851,7 @@ class _FuzzyPredicate:
             value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
         if self.value == value:
             return True
-        return Matcher.fuzzy_match(value, self.value, self.fuzzy)
+        return fuzzy_match(value, self.value, self.fuzzy)
 
 
 class _RegexPredicate:
@@ -945,7 +934,7 @@ class _SetPredicate:
                 return True
            elif self.fuzzy is not None:
                value = self.vocab.strings[value]
-                return any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
+                return any(fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
                           for v in self.value)
            else:
                return False
@@ -957,7 +946,7 @@ class _SetPredicate:
                return False
            elif self.fuzzy is not None:
                value = self.vocab.strings[value]
-                return not any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
+                return not any(fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
                               for v in self.value)
            else:
                return True
@@ -1156,3 +1145,13 @@ def _get_extensions(spec, string_store, name2index):
             name2index[name] = len(name2index)
         attr_values.append((name2index[name], value))
     return attr_values
+
+
+def fuzzy_match(input_string: str, rule_string: str, fuzzy: int=-1) -> bool:
+    distance = min(len(input_string), len(rule_string))
+    distance -= 1 # don't allow completely different tokens
+    if fuzzy == -1: # FUZZY operator with unspecified fuzzy
+        fuzzy = 5 # default max fuzzy
+        distance -= 1 # be more restrictive
+    distance = min(fuzzy, distance if distance > 0 else 1)
+    return levenshtein(input_string, rule_string, distance) <= distance
diff --git a/spacy/tests/matcher/test_levenshtein.py b/spacy/tests/matcher/test_levenshtein.py
index d7f242f67..6159fb97e 100644
--- a/spacy/tests/matcher/test_levenshtein.py
+++ b/spacy/tests/matcher/test_levenshtein.py
@@ -1,6 +1,6 @@
 import pytest
 from spacy.matcher import levenshtein
-from spacy.matcher import Matcher
+from spacy.matcher.matcher import fuzzy_match
 
 
 # empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
@@ -70,4 +70,4 @@ def test_levenshtein(dist, a, b):
     ],
 )
 def test_fuzzy_match(a, b, fuzzy, expected):
-    assert Matcher.fuzzy_match(a, b, fuzzy) == expected
+    assert fuzzy_match(a, b, fuzzy) == expected
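
Reviewer note: a minimal usage sketch of the now module-level fuzzy_match, assuming this branch is installed; the import path mirrors the updated test file, and the example strings are illustrative only, not taken from the test suite.

    from spacy.matcher.matcher import fuzzy_match

    # Explicit budget: at most `fuzzy` edits are allowed, further capped at one
    # less than the shorter string's length so completely different tokens
    # never match.
    fuzzy_match("kitten", "sitten", 1)    # one substitution -> True
    fuzzy_match("kitten", "sitting", 1)   # three edits, budget of 1 -> False

    # fuzzy=-1 (FUZZY operator without an explicit number): default budget of
    # 5 edits, with the length-based cap tightened by one extra edit.
    fuzzy_match("definately", "definitely", -1)  # one substitution -> True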