Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-07 05:40:20 +03:00)
move fuzzy_match back inside Matcher
simplify logic and add tests
parent 6e64a5cd0d
commit 49e93170bb
@@ -1,19 +0,0 @@
-from .levenshtein import levenshtein
-
-def fuzzy_match(input_string: str, rule_string: str, distance: int=0) -> bool:
-    """Define in pure Python outside Matcher to allow patching.
-
-    Patch with e.g.:
-    import wrapt
-    from spacy.matcher import fuzzy
-    @wrapt.patch_function_wrapper('spacy.matcher.fuzzy', 'fuzzy_match')
-    *before* import spacy
-    """
-    min_length = min(len(input_string), len(rule_string))
-    if distance: # FUZZYn operators with explicit distance
-        threshold = min(distance, min_length - 1)
-    else: # FUZZY operator with default distance
-        threshold = min(5, min_length - 2)
-    if threshold > 0:
-        return levenshtein(input_string, rule_string) <= threshold
-    return False
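For context, the docstring of the deleted module pointed at wrapt-based monkey-patching as the reason for keeping fuzzy_match outside the Matcher. A minimal sketch of that patching pattern, assuming the pre-commit state where spacy.matcher.fuzzy still exists (the wrapper name custom_fuzzy_match is hypothetical):

import wrapt

# Wrap spacy.matcher.fuzzy.fuzzy_match; as the removed docstring advised,
# apply the patch before the rest of your code imports spaCy and builds Matchers.
@wrapt.patch_function_wrapper('spacy.matcher.fuzzy', 'fuzzy_match')
def custom_fuzzy_match(wrapped, instance, args, kwargs):
    # Delegate to the original implementation; a custom string-similarity
    # measure could be substituted here instead.
    return wrapped(*args, **kwargs)

import spacy  # imported only after the patch is in place

With fuzzy_match now a static method on the compiled Matcher class, this patch point goes away, which is part of what this commit trades for the simpler layout.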
@@ -25,7 +25,8 @@ from ..errors import Errors, MatchPatternError, Warnings
 from ..strings import get_string_id
 from ..attrs import IDS
 
-import fuzzy
+from .levenshtein import levenshtein
+
 
 DEF PADDING = 5
 
@@ -204,6 +205,17 @@ cdef class Matcher:
                 else:
                     yield doc
 
+    @staticmethod
+    def fuzzy_match(input_string: str, rule_string: str, fuzzy: int=-1) -> bool:
+        distance = min(len(input_string), len(rule_string))
+        distance -= 1 # don't allow completely different tokens
+        if fuzzy == -1: # FUZZY operator with unspecified fuzzy
+            fuzzy = 5 # default max fuzzy
+            distance -= 1 # be more restrictive
+        distance = min(fuzzy, distance if distance > 0 else 1)
+        return levenshtein(input_string, rule_string, distance) <= distance
+
+
     def __call__(self, object doclike, *, as_spans=False, allow_missing=False, with_alignments=False):
         """Find all token sequences matching the supplied pattern.
 
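To make the new logic concrete: the method derives an edit-distance budget from the shorter token's length and the FUZZY setting, then accepts a match when the Levenshtein distance stays within that budget. A pure-Python mirror of the budget calculation (a sketch for illustration, not the compiled spaCy code):

def fuzzy_budget(input_string: str, rule_string: str, fuzzy: int = -1) -> int:
    # Mirrors the distance cap computed in Matcher.fuzzy_match above.
    distance = min(len(input_string), len(rule_string))
    distance -= 1                  # never allow completely different tokens
    if fuzzy == -1:                # bare FUZZY operator: pick a default budget
        fuzzy = 5                  # default max edit distance
        distance -= 1              # and be a little more restrictive
    return min(fuzzy, distance if distance > 0 else 1)

# fuzzy_budget("abcdef", "cdefgh")       -> 4  (default, capped by token length)
# fuzzy_budget("abcdefgh", "cdefghijk")  -> 5  (default hits the max of 5)
# fuzzy_budget("a", "ab", 1)             -> 1  (explicit FUZZY1)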
@@ -326,7 +338,6 @@ cdef class Matcher:
         else:
             return key
 
-
 def unpickle_matcher(vocab, patterns, callbacks):
     matcher = Matcher(vocab)
     for key, pattern in patterns.items():
@@ -841,7 +852,7 @@ class _FuzzyPredicate:
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
         self.fuzzy = self.predicate[len('FUZZY'):] # number after prefix
-        self.fuzzy = int(self.fuzzy) if self.fuzzy else 0
+        self.fuzzy = int(self.fuzzy) if self.fuzzy else -1
 
     def __call__(self, Token token):
         if self.is_extension:
@@ -850,7 +861,7 @@ class _FuzzyPredicate:
         value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
         if self.value == value:
             return True
-        return fuzzy.fuzzy_match(value, self.value, self.fuzzy)
+        return Matcher.fuzzy_match(value, self.value, self.fuzzy)
 
 
 class _RegexPredicate:
@@ -933,7 +944,7 @@ class _SetPredicate:
                 return True
             elif self.fuzzy is not None:
                 value = self.vocab.strings[value]
-                return any(fuzzy.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
+                return any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
                            for v in self.value)
             else:
                 return False
@@ -945,7 +956,7 @@ class _SetPredicate:
                 return False
             elif self.fuzzy is not None:
                 value = self.vocab.strings[value]
-                return not any(fuzzy.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
+                return not any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
                                for v in self.value)
             else:
                 return True
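These two branches route the IN and NOT_IN set predicates through the new Matcher.fuzzy_match as well. A hedged usage sketch, assuming the nested FUZZY-around-IN pattern layout this branch works toward (the exact dict syntax may differ at this intermediate commit):

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Accept tokens whose lowercase form fuzzily matches any entry in the set.
matcher.add("PRAISE", [[{"LOWER": {"FUZZY": {"IN": ["awesome", "amazing"]}}}]])

doc = nlp("This library is awseome")
# "awseome" is two edits from "awesome", within the default budget of 5.
print([doc[start:end].text for _, start, end in matcher(doc)])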
@@ -1054,7 +1065,7 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
             continue
         elif cls == _FuzzyPredicate:
             fuzz = type_[len("FUZZY"):] # number after prefix
-            fuzz = int(fuzz) if fuzz else 0
+            fuzz = int(fuzz) if fuzz else -1
             if isinstance(value, dict):
                 # add predicates inside fuzzy operator
                 output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
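With this change, a bare FUZZY in a pattern now parses to the sentinel -1 (default budget), while FUZZY1 through FUZZY9 carry an explicit edit-distance budget. A usage sketch under the assumption that token patterns take the {"ATTR": {"FUZZY": ...}} / {"ATTR": {"FUZZYn": ...}} form added on this branch ("wrold" needs two edits, so FUZZY1 would not catch it):

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Bare FUZZY: default budget derived in Matcher.fuzzy_match (at most 5 edits).
matcher.add("GREETING", [[{"LOWER": {"FUZZY": "hello"}}]])
# FUZZY2: allow at most two edits ("wrold" -> "world" is two).
matcher.add("PLACE", [[{"LOWER": {"FUZZY2": "world"}}]])

doc = nlp("helo wrold")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)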
@@ -1,5 +1,6 @@
 import pytest
 from spacy.matcher import levenshtein
+from spacy.matcher import Matcher
 
 
 # empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
@@ -42,3 +43,31 @@ from spacy.matcher import levenshtein
 )
 def test_levenshtein(dist, a, b):
     assert levenshtein(a, b) == dist
+
+
+@pytest.mark.parametrize(
+    "a,b,fuzzy,expected",
+    [
+        ("a", "a", 1, True),
+        ("a", "a", 0, True),
+        ("a", "a", -1, True),
+        ("a", "ab", 1, True),
+        ("a", "ab", 0, False),
+        ("a", "ab", -1, True),
+        ("ab", "ac", 1, True),
+        ("ab", "ac", -1, True),
+        ("abc", "cde", 4, False), # 4 reduced because of token length
+        ("abc", "cde", -1, False),
+        ("abcdef", "cdefgh", 4, True), # 4 not reduced because of token length
+        ("abcdef", "cdefgh", 3, False),
+        ("abcdef", "cdefgh", -1, True), # default equivalent to 4
+        ("abcdefgh", "cdefghijk", 5, True),
+        ("abcdefgh", "cdefghijk", 4, False),
+        ("abcdefgh", "cdefghijk", -1, True), # default equivalent to 5
+        ("abcdefgh", "cdefghijkl", 6, True),
+        ("abcdefgh", "cdefghijkl", 5, False),
+        ("abcdefgh", "cdefghijkl", -1, False), # default equivalent to 5 (max)
+    ]
+)
+def test_fuzzy_match(a, b, fuzzy, expected):
+    assert Matcher.fuzzy_match(a, b, fuzzy) == expected
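The new parametrized cases exercise the static method directly, so the same checks can be run interactively (values taken from the table above):

from spacy.matcher import Matcher

assert Matcher.fuzzy_match("a", "ab", 1)                      # one edit allowed
assert not Matcher.fuzzy_match("a", "ab", 0)                  # no edits allowed
assert Matcher.fuzzy_match("abcdef", "cdefgh", -1)            # default budget of 4
assert not Matcher.fuzzy_match("abcdefgh", "cdefghijkl", -1)  # needs 6 edits, over the max of 5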