From 6e64a5cd0d49437592bc15c3a7272a1494d18af6 Mon Sep 17 00:00:00 2001
From: Kevin Humphreys
Date: Mon, 31 Oct 2022 15:52:49 -0700
Subject: [PATCH] move fuzzy_match to its own Python module to allow patching

Matcher.fuzzy_match was a static method on the Cython `Matcher` cdef
class. Move it, with its logic unchanged, to a plain Python function
`spacy.matcher.fuzzy.fuzzy_match` (parameters renamed from s1/s2 to
input_string/rule_string; `distance` now defaults to 0).

The call sites in _FuzzyPredicate and _SetPredicate now go through the
`fuzzy` module attribute (`fuzzy.fuzzy_match(...)`) instead of
`Matcher.fuzzy_match(...)`, so the function is looked up on the module
each time it is called.

The new module is imported with an explicit relative import
(`from . import fuzzy`) so that the name always resolves to
spacy/matcher/fuzzy.py and never to an unrelated top-level module that
happens to be called `fuzzy`.

---
Notes (text in this section is ignored by `git am`):
- Line breaks, blank lines and indentation in this patch were
  reconstructed from a whitespace-collapsed copy. Verify the context
  whitespace against the target tree, or apply with
  `git apply --ignore-whitespace`.
- The `index` blob ids below are those of the original patch and do not
  reflect the changed import line in matcher.pyx.

 spacy/matcher/fuzzy.py    | 19 +++++++++++++++++++
 spacy/matcher/matcher.pyx | 20 ++++----------------
 2 files changed, 23 insertions(+), 16 deletions(-)
 create mode 100644 spacy/matcher/fuzzy.py

diff --git a/spacy/matcher/fuzzy.py b/spacy/matcher/fuzzy.py
new file mode 100644
index 000000000..3b29303be
--- /dev/null
+++ b/spacy/matcher/fuzzy.py
@@ -0,0 +1,19 @@
+from .levenshtein import levenshtein
+
+def fuzzy_match(input_string: str, rule_string: str, distance: int=0) -> bool:
+    """Define in pure Python outside Matcher to allow patching.
+
+    Patch with e.g.:
+        import wrapt
+        from spacy.matcher import fuzzy
+        @wrapt.patch_function_wrapper('spacy.matcher.fuzzy', 'fuzzy_match')
+    *before* import spacy
+    """
+    min_length = min(len(input_string), len(rule_string))
+    if distance: # FUZZYn operators with explicit distance
+        threshold = min(distance, min_length - 1)
+    else: # FUZZY operator with default distance
+        threshold = min(5, min_length - 2)
+    if threshold > 0:
+        return levenshtein(input_string, rule_string) <= threshold
+    return False
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 443947cf1..257fd3ea4 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -25,8 +25,7 @@ from ..errors import Errors, MatchPatternError, Warnings
 from ..strings import get_string_id
 from ..attrs import IDS
 
-from .levenshtein import levenshtein
-
+from . import fuzzy
 
 DEF PADDING = 5
 
@@ -205,17 +204,6 @@ cdef class Matcher:
                 else:
                     yield doc
 
-    @staticmethod
-    def fuzzy_match(s1: str, s2: str, distance: int) -> bool:
-        min_length = min(len(s1), len(s2))
-        if distance: # FUZZYn operators with explicit distance
-            threshold = min(distance, min_length - 1)
-        else: # FUZZY operator with default distance
-            threshold = min(5, min_length - 2)
-        if threshold > 0:
-            return levenshtein(s1, s2) <= threshold
-        return False
-
     def __call__(self, object doclike, *, as_spans=False, allow_missing=False, with_alignments=False):
         """Find all token sequences matching the supplied pattern.
 
@@ -862,7 +850,7 @@ class _FuzzyPredicate:
             value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
         if self.value == value:
             return True
-        return Matcher.fuzzy_match(value, self.value, self.fuzzy)
+        return fuzzy.fuzzy_match(value, self.value, self.fuzzy)
 
 
 class _RegexPredicate:
@@ -945,7 +933,7 @@ class _SetPredicate:
                 return True
             elif self.fuzzy is not None:
                 value = self.vocab.strings[value]
-                return any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
+                return any(fuzzy.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
                            for v in self.value)
             else:
                 return False
@@ -957,7 +945,7 @@ class _SetPredicate:
                 return False
             elif self.fuzzy is not None:
                 value = self.vocab.strings[value]
-                return not any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
+                return not any(fuzzy.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
                                for v in self.value)
             else:
                 return True