mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-05 21:00:19 +03:00
move fuzzy_match
to its own Python module to allow patching
This commit is contained in:
parent
1bfbd29160
commit
6e64a5cd0d
19
spacy/matcher/fuzzy.py
Normal file
19
spacy/matcher/fuzzy.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
from .levenshtein import levenshtein
|
||||
|
||||
def fuzzy_match(input_string: str, rule_string: str, distance: int = 0) -> bool:
    """Return whether two strings match within the allowed edit distance.

    Defined in pure Python outside Matcher to allow patching.

    Patch with e.g.:

        import wrapt
        from spacy.matcher import fuzzy

        @wrapt.patch_function_wrapper('spacy.matcher.fuzzy', 'fuzzy_match')
        def custom_fuzzy_match(wrapped, instance, args, kwargs):
            ...

    *before* import spacy

    input_string: The string to test.
    rule_string: The string to compare it against.
    distance: Maximum number of edits allowed. A falsy value (0, the
        default) selects the default limit of 5 edits.

    Returns True if the strings are identical, or if their Levenshtein
    distance does not exceed the effective threshold. The threshold is
    capped by the length of the shorter string (length - 1 for an explicit
    distance, length - 2 for the default), so very short strings never
    match fuzzily.
    """
    # An exact match has edit distance 0 and must succeed even when the
    # strings are too short to be given a positive fuzzy threshold.
    if input_string == rule_string:
        return True
    min_length = min(len(input_string), len(rule_string))
    if distance:  # FUZZYn operators with explicit distance
        threshold = min(distance, min_length - 1)
    else:  # FUZZY operator with default distance
        threshold = min(5, min_length - 2)
    # A non-positive threshold means the shorter string is too short for
    # any edits to be tolerated; skip the distance computation entirely.
    if threshold <= 0:
        return False
    return levenshtein(input_string, rule_string) <= threshold
|
|
@ -25,8 +25,7 @@ from ..errors import Errors, MatchPatternError, Warnings
|
|||
from ..strings import get_string_id
|
||||
from ..attrs import IDS
|
||||
|
||||
from .levenshtein import levenshtein
|
||||
|
||||
import fuzzy
|
||||
|
||||
DEF PADDING = 5
|
||||
|
||||
|
@ -205,17 +204,6 @@ cdef class Matcher:
|
|||
else:
|
||||
yield doc
|
||||
|
||||
@staticmethod
|
||||
def fuzzy_match(s1: str, s2: str, distance: int) -> bool:
|
||||
min_length = min(len(s1), len(s2))
|
||||
if distance: # FUZZYn operators with explicit distance
|
||||
threshold = min(distance, min_length - 1)
|
||||
else: # FUZZY operator with default distance
|
||||
threshold = min(5, min_length - 2)
|
||||
if threshold > 0:
|
||||
return levenshtein(s1, s2) <= threshold
|
||||
return False
|
||||
|
||||
def __call__(self, object doclike, *, as_spans=False, allow_missing=False, with_alignments=False):
|
||||
"""Find all token sequences matching the supplied pattern.
|
||||
|
||||
|
@ -862,7 +850,7 @@ class _FuzzyPredicate:
|
|||
value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
|
||||
if self.value == value:
|
||||
return True
|
||||
return Matcher.fuzzy_match(value, self.value, self.fuzzy)
|
||||
return fuzzy.fuzzy_match(value, self.value, self.fuzzy)
|
||||
|
||||
|
||||
class _RegexPredicate:
|
||||
|
@ -945,7 +933,7 @@ class _SetPredicate:
|
|||
return True
|
||||
elif self.fuzzy is not None:
|
||||
value = self.vocab.strings[value]
|
||||
return any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
|
||||
return any(fuzzy.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
|
||||
for v in self.value)
|
||||
else:
|
||||
return False
|
||||
|
@ -957,7 +945,7 @@ class _SetPredicate:
|
|||
return False
|
||||
elif self.fuzzy is not None:
|
||||
value = self.vocab.strings[value]
|
||||
return not any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
|
||||
return not any(fuzzy.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
|
||||
for v in self.value)
|
||||
else:
|
||||
return True
|
||||
|
|
Loading…
Reference in New Issue
Block a user