move fuzzy_match

to its own Python module to allow patching
This commit is contained in:
Kevin Humphreys 2022-10-31 15:52:49 -07:00
parent 1bfbd29160
commit 6e64a5cd0d
2 changed files with 23 additions and 16 deletions

19
spacy/matcher/fuzzy.py Normal file
View File

@ -0,0 +1,19 @@
from .levenshtein import levenshtein
def fuzzy_match(input_string: str, rule_string: str, distance: int=0) -> bool:
    """Return True if the two strings are within the allowed edit distance.

    The number of edits tolerated is capped by the length of the shorter
    string: an explicit (non-zero) ``distance`` is limited to that length
    minus one, while the default (``distance`` of 0) allows at most 5 edits
    and never more than that length minus two. When the resulting cap is
    not positive the function returns False without comparing the strings,
    so very short strings never match here, even if they are identical.

    Defined in pure Python, outside Matcher, to allow patching, e.g.:

        import wrapt
        from spacy.matcher import fuzzy

        @wrapt.patch_function_wrapper('spacy.matcher.fuzzy', 'fuzzy_match')

    applied *before* ``import spacy``.
    """
    shorter = min(len(input_string), len(rule_string))
    # FUZZYn operators pass an explicit distance; the plain FUZZY operator
    # passes 0 and gets the default cap of 5.
    max_edits = min(distance, shorter - 1) if distance else min(5, shorter - 2)
    if max_edits <= 0:
        # No edit budget left for strings this short.
        return False
    return levenshtein(input_string, rule_string) <= max_edits

View File

@ -25,8 +25,7 @@ from ..errors import Errors, MatchPatternError, Warnings
from ..strings import get_string_id
from ..attrs import IDS
from .levenshtein import levenshtein
# Explicit relative import of the sibling module: under Python 3 import rules
# a bare `import fuzzy` is absolute and would look for a top-level `fuzzy`
# module instead of the one in this package.
from . import fuzzy
DEF PADDING = 5
@ -205,17 +204,6 @@ cdef class Matcher:
else:
yield doc
@staticmethod
def fuzzy_match(s1: str, s2: str, distance: int) -> bool:
    """Return True if s1 and s2 are within the allowed edit distance.

    A non-zero ``distance`` (FUZZYn operators) is capped at the shorter
    string's length minus one; a zero ``distance`` (FUZZY operator) uses a
    default of 5, capped at the shorter string's length minus two. If the
    resulting limit is not positive, the result is False and the strings
    are not compared.
    """
    shortest = min(len(s1), len(s2))
    if not distance:
        # FUZZY operator with default distance
        limit = min(5, shortest - 2)
    else:
        # FUZZYn operators with explicit distance
        limit = min(distance, shortest - 1)
    # Short-circuit keeps levenshtein from running when there is no budget.
    return limit > 0 and levenshtein(s1, s2) <= limit
def __call__(self, object doclike, *, as_spans=False, allow_missing=False, with_alignments=False):
"""Find all token sequences matching the supplied pattern.
@ -862,7 +850,7 @@ class _FuzzyPredicate:
value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
if self.value == value:
return True
return Matcher.fuzzy_match(value, self.value, self.fuzzy)
return fuzzy.fuzzy_match(value, self.value, self.fuzzy)
class _RegexPredicate:
@ -945,7 +933,7 @@ class _SetPredicate:
return True
elif self.fuzzy is not None:
value = self.vocab.strings[value]
return any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
return any(fuzzy.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
for v in self.value)
else:
return False
@ -957,7 +945,7 @@ class _SetPredicate:
return False
elif self.fuzzy is not None:
value = self.vocab.strings[value]
return not any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
return not any(fuzzy.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
for v in self.value)
else:
return True