From 49e93170bbc17920ab971e4a37579cb12afa022f Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Thu, 3 Nov 2022 13:39:17 -0700 Subject: [PATCH] move fuzzy_match back inside Matcher simplify logic and add tests --- spacy/matcher/fuzzy.py | 19 ---------------- spacy/matcher/matcher.pyx | 25 +++++++++++++++------ spacy/tests/matcher/test_levenshtein.py | 29 +++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 26 deletions(-) delete mode 100644 spacy/matcher/fuzzy.py diff --git a/spacy/matcher/fuzzy.py b/spacy/matcher/fuzzy.py deleted file mode 100644 index 3b29303be..000000000 --- a/spacy/matcher/fuzzy.py +++ /dev/null @@ -1,19 +0,0 @@ -from .levenshtein import levenshtein - -def fuzzy_match(input_string: str, rule_string: str, distance: int=0) -> bool: - """Define in pure Python outside Matcher to allow patching. - - Patch with e.g.: - import wrapt - from spacy.matcher import fuzzy - @wrapt.patch_function_wrapper('spacy.matcher.fuzzy', 'fuzzy_match') - *before* import spacy - """ - min_length = min(len(input_string), len(rule_string)) - if distance: # FUZZYn operators with explicit distance - threshold = min(distance, min_length - 1) - else: # FUZZY operator with default distance - threshold = min(5, min_length - 2) - if threshold > 0: - return levenshtein(input_string, rule_string) <= threshold - return False diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 257fd3ea4..e4283f138 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -25,7 +25,8 @@ from ..errors import Errors, MatchPatternError, Warnings from ..strings import get_string_id from ..attrs import IDS -import fuzzy +from .levenshtein import levenshtein + DEF PADDING = 5 @@ -204,6 +205,17 @@ cdef class Matcher: else: yield doc + @staticmethod + def fuzzy_match(input_string: str, rule_string: str, fuzzy: int=-1) -> bool: + distance = min(len(input_string), len(rule_string)) + distance -= 1 # don't allow completely different tokens + if fuzzy == -1: # FUZZY operator with unspecified fuzzy + fuzzy = 5 # default max fuzzy + distance -= 1 # be more restrictive + distance = min(fuzzy, distance if distance > 0 else 1) + return levenshtein(input_string, rule_string, distance) <= distance + + def __call__(self, object doclike, *, as_spans=False, allow_missing=False, with_alignments=False): """Find all token sequences matching the supplied pattern. @@ -326,7 +338,6 @@ cdef class Matcher: else: return key - def unpickle_matcher(vocab, patterns, callbacks): matcher = Matcher(vocab) for key, pattern in patterns.items(): @@ -841,7 +852,7 @@ class _FuzzyPredicate: if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) self.fuzzy = self.predicate[len('FUZZY'):] # number after prefix - self.fuzzy = int(self.fuzzy) if self.fuzzy else 0 + self.fuzzy = int(self.fuzzy) if self.fuzzy else -1 def __call__(self, Token token): if self.is_extension: @@ -850,7 +861,7 @@ class _FuzzyPredicate: value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] if self.value == value: return True - return fuzzy.fuzzy_match(value, self.value, self.fuzzy) + return Matcher.fuzzy_match(value, self.value, self.fuzzy) class _RegexPredicate: @@ -933,7 +944,7 @@ class _SetPredicate: return True elif self.fuzzy is not None: value = self.vocab.strings[value] - return any(fuzzy.fuzzy_match(value, self.vocab.strings[v], self.fuzzy) + return any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy) for v in self.value) else: return False @@ -945,7 +956,7 @@ class _SetPredicate: return False elif self.fuzzy is not None: value = self.vocab.strings[value] - return not any(fuzzy.fuzzy_match(value, self.vocab.strings[v], self.fuzzy) + return not any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy) for v in self.value) else: return True @@ -1054,7 +1065,7 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types, continue elif cls == _FuzzyPredicate: fuzz = type_[len("FUZZY"):] # number after prefix - fuzz = int(fuzz) if fuzz else 0 + fuzz = int(fuzz) if fuzz else -1 if isinstance(value, dict): # add predicates inside fuzzy operator output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types, diff --git a/spacy/tests/matcher/test_levenshtein.py b/spacy/tests/matcher/test_levenshtein.py index d30e36132..1e7a6710f 100644 --- a/spacy/tests/matcher/test_levenshtein.py +++ b/spacy/tests/matcher/test_levenshtein.py @@ -1,5 +1,6 @@ import pytest from spacy.matcher import levenshtein +from spacy.matcher import Matcher # empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests @@ -42,3 +43,31 @@ from spacy.matcher import levenshtein ) def test_levenshtein(dist, a, b): assert levenshtein(a, b) == dist + + +@pytest.mark.parametrize( + "a,b,fuzzy,expected", + [ + ("a", "a", 1, True), + ("a", "a", 0, True), + ("a", "a", -1, True), + ("a", "ab", 1, True), + ("a", "ab", 0, False), + ("a", "ab", -1, True), + ("ab", "ac", 1, True), + ("ab", "ac", -1, True), + ("abc", "cde", 4, False), # 4 reduced because of token length + ("abc", "cde", -1, False), + ("abcdef", "cdefgh", 4, True), # 4 not reduced because of token length + ("abcdef", "cdefgh", 3, False), + ("abcdef", "cdefgh", -1, True), # default equivalent to 4 + ("abcdefgh", "cdefghijk", 5, True), + ("abcdefgh", "cdefghijk", 4, False), + ("abcdefgh", "cdefghijk", -1, True), # default equivalent to 5 + ("abcdefgh", "cdefghijkl", 6, True), + ("abcdefgh", "cdefghijkl", 5, False), + ("abcdefgh", "cdefghijkl", -1, False), # default equivalent to 5 (max) + ] +) +def test_fuzzy_match(a, b, fuzzy, expected): + assert Matcher.fuzzy_match(a, b, fuzzy) == expected