mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-05 21:00:19 +03:00
Rename to fuzzy_compare
This commit is contained in:
parent
efa638a11c
commit
c3f446f71b
|
@ -77,6 +77,6 @@ cdef class Matcher:
|
||||||
cdef public object _extensions
|
cdef public object _extensions
|
||||||
cdef public object _extra_predicates
|
cdef public object _extra_predicates
|
||||||
cdef public object _seen_attrs
|
cdef public object _seen_attrs
|
||||||
cdef public object _fuzzy_match
|
cdef public object _fuzzy_compare
|
||||||
|
|
||||||
cpdef bint _default_fuzzy_match(s1: str, s2: str, fuzzy: int)
|
cpdef bint _default_fuzzy_compare(s1: str, s2: str, fuzzy: int)
|
||||||
|
|
|
@ -53,4 +53,4 @@ class Matcher:
|
||||||
) -> List[Span]: ...
|
) -> List[Span]: ...
|
||||||
def _normalize_key(self, key: Any) -> Any: ...
|
def _normalize_key(self, key: Any) -> Any: ...
|
||||||
|
|
||||||
def _default_fuzzy_match(s1: str, s2: str, fuzzy: int=-1) -> bool: ...
|
def _default_fuzzy_compare(s1: str, s2: str, fuzzy: int) -> bool: ...
|
||||||
|
|
|
@ -38,7 +38,7 @@ cdef class Matcher:
|
||||||
USAGE: https://spacy.io/usage/rule-based-matching
|
USAGE: https://spacy.io/usage/rule-based-matching
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vocab, validate=True, *, fuzzy_match=_default_fuzzy_match):
|
def __init__(self, vocab, validate=True, *, fuzzy_compare=_default_fuzzy_compare):
|
||||||
"""Create the Matcher.
|
"""Create the Matcher.
|
||||||
|
|
||||||
vocab (Vocab): The vocabulary object, which must be shared with the
|
vocab (Vocab): The vocabulary object, which must be shared with the
|
||||||
|
@ -53,10 +53,10 @@ cdef class Matcher:
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.validate = validate
|
self.validate = validate
|
||||||
self._fuzzy_match = fuzzy_match
|
self._fuzzy_compare = fuzzy_compare
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_match)
|
data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_compare)
|
||||||
return (unpickle_matcher, data, None, None)
|
return (unpickle_matcher, data, None, None)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
|
@ -131,7 +131,7 @@ cdef class Matcher:
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
try:
|
try:
|
||||||
specs = _preprocess_pattern(pattern, self.vocab,
|
specs = _preprocess_pattern(pattern, self.vocab,
|
||||||
self._extensions, self._extra_predicates, self._fuzzy_match)
|
self._extensions, self._extra_predicates, self._fuzzy_compare)
|
||||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||||
for spec in specs:
|
for spec in specs:
|
||||||
for attr, _ in spec[1]:
|
for attr, _ in spec[1]:
|
||||||
|
@ -329,8 +329,8 @@ cdef class Matcher:
|
||||||
return key
|
return key
|
||||||
|
|
||||||
|
|
||||||
def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_match):
|
def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_compare):
|
||||||
matcher = Matcher(vocab, validate=validate, fuzzy_match=fuzzy_match)
|
matcher = Matcher(vocab, validate=validate, fuzzy_compare=fuzzy_compare)
|
||||||
for key, pattern in patterns.items():
|
for key, pattern in patterns.items():
|
||||||
callback = callbacks.get(key, None)
|
callback = callbacks.get(key, None)
|
||||||
matcher.add(key, pattern, on_match=callback)
|
matcher.add(key, pattern, on_match=callback)
|
||||||
|
@ -757,7 +757,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
||||||
return id_attr.value
|
return id_attr.value
|
||||||
|
|
||||||
|
|
||||||
def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_match):
|
def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_compare):
|
||||||
"""This function interprets the pattern, converting the various bits of
|
"""This function interprets the pattern, converting the various bits of
|
||||||
syntactic sugar before we compile it into a struct with init_pattern.
|
syntactic sugar before we compile it into a struct with init_pattern.
|
||||||
|
|
||||||
|
@ -784,7 +784,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates,
|
||||||
ops = _get_operators(spec)
|
ops = _get_operators(spec)
|
||||||
attr_values = _get_attr_values(spec, string_store)
|
attr_values = _get_attr_values(spec, string_store)
|
||||||
extensions = _get_extensions(spec, string_store, extensions_table)
|
extensions = _get_extensions(spec, string_store, extensions_table)
|
||||||
predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_match)
|
predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare)
|
||||||
for op in ops:
|
for op in ops:
|
||||||
tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
|
tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
|
||||||
return tokens
|
return tokens
|
||||||
|
@ -833,7 +833,7 @@ class _FuzzyPredicate:
|
||||||
operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5")
|
operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5")
|
||||||
|
|
||||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
||||||
regex=False, fuzzy=None, fuzzy_match=None):
|
regex=False, fuzzy=None, fuzzy_compare=None):
|
||||||
self.i = i
|
self.i = i
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
self.value = value
|
self.value = value
|
||||||
|
@ -844,7 +844,7 @@ class _FuzzyPredicate:
|
||||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||||
fuzz = self.predicate[len("FUZZY"):] # number after prefix
|
fuzz = self.predicate[len("FUZZY"):] # number after prefix
|
||||||
self.fuzzy = int(fuzz) if fuzz else -1
|
self.fuzzy = int(fuzz) if fuzz else -1
|
||||||
self.fuzzy_match = fuzzy_match
|
self.fuzzy_compare = fuzzy_compare
|
||||||
|
|
||||||
def __call__(self, Token token):
|
def __call__(self, Token token):
|
||||||
if self.is_extension:
|
if self.is_extension:
|
||||||
|
@ -853,14 +853,14 @@ class _FuzzyPredicate:
|
||||||
value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
|
value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
|
||||||
if self.value == value:
|
if self.value == value:
|
||||||
return True
|
return True
|
||||||
return self.fuzzy_match(value, self.value, self.fuzzy)
|
return self.fuzzy_compare(value, self.value, self.fuzzy)
|
||||||
|
|
||||||
|
|
||||||
class _RegexPredicate:
|
class _RegexPredicate:
|
||||||
operators = ("REGEX",)
|
operators = ("REGEX",)
|
||||||
|
|
||||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
||||||
regex=False, fuzzy=None, fuzzy_match=None):
|
regex=False, fuzzy=None, fuzzy_compare=None):
|
||||||
self.i = i
|
self.i = i
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
self.value = re.compile(value)
|
self.value = re.compile(value)
|
||||||
|
@ -882,13 +882,13 @@ class _SetPredicate:
|
||||||
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
|
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
|
||||||
|
|
||||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
||||||
regex=False, fuzzy=None, fuzzy_match=None):
|
regex=False, fuzzy=None, fuzzy_compare=None):
|
||||||
self.i = i
|
self.i = i
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.regex = regex
|
self.regex = regex
|
||||||
self.fuzzy = fuzzy
|
self.fuzzy = fuzzy
|
||||||
self.fuzzy_match = fuzzy_match
|
self.fuzzy_compare = fuzzy_compare
|
||||||
if self.attr == MORPH:
|
if self.attr == MORPH:
|
||||||
# normalize morph strings
|
# normalize morph strings
|
||||||
self.value = set(self.vocab.morphology.add(v) for v in value)
|
self.value = set(self.vocab.morphology.add(v) for v in value)
|
||||||
|
@ -937,7 +937,7 @@ class _SetPredicate:
|
||||||
return True
|
return True
|
||||||
elif self.fuzzy is not None:
|
elif self.fuzzy is not None:
|
||||||
value = self.vocab.strings[value]
|
value = self.vocab.strings[value]
|
||||||
return any(self.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
|
return any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
|
||||||
for v in self.value)
|
for v in self.value)
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
@ -949,7 +949,7 @@ class _SetPredicate:
|
||||||
return False
|
return False
|
||||||
elif self.fuzzy is not None:
|
elif self.fuzzy is not None:
|
||||||
value = self.vocab.strings[value]
|
value = self.vocab.strings[value]
|
||||||
return not any(self.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
|
return not any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
|
||||||
for v in self.value)
|
for v in self.value)
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
@ -968,7 +968,7 @@ class _ComparisonPredicate:
|
||||||
operators = ("==", "!=", ">=", "<=", ">", "<")
|
operators = ("==", "!=", ">=", "<=", ">", "<")
|
||||||
|
|
||||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
||||||
regex=False, fuzzy=None, fuzzy_match=None):
|
regex=False, fuzzy=None, fuzzy_compare=None):
|
||||||
self.i = i
|
self.i = i
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
self.value = value
|
self.value = value
|
||||||
|
@ -997,7 +997,7 @@ class _ComparisonPredicate:
|
||||||
return value < self.value
|
return value < self.value
|
||||||
|
|
||||||
|
|
||||||
def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_match):
|
def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare):
|
||||||
predicate_types = {
|
predicate_types = {
|
||||||
"REGEX": _RegexPredicate,
|
"REGEX": _RegexPredicate,
|
||||||
"IN": _SetPredicate,
|
"IN": _SetPredicate,
|
||||||
|
@ -1035,12 +1035,12 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_match):
|
||||||
attr = IDS.get(attr.upper())
|
attr = IDS.get(attr.upper())
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
|
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
|
||||||
extra_predicates, seen_predicates, fuzzy_match=fuzzy_match))
|
extra_predicates, seen_predicates, fuzzy_compare=fuzzy_compare))
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
|
def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
|
||||||
extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_match=None):
|
extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_compare=None):
|
||||||
output = []
|
output = []
|
||||||
for type_, value in value_dict.items():
|
for type_, value in value_dict.items():
|
||||||
type_ = type_.upper()
|
type_ = type_.upper()
|
||||||
|
@ -1063,10 +1063,10 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
|
||||||
fuzzy_val = int(fuzz) if fuzz else -1
|
fuzzy_val = int(fuzz) if fuzz else -1
|
||||||
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
|
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
|
||||||
extra_predicates, seen_predicates,
|
extra_predicates, seen_predicates,
|
||||||
fuzzy=fuzzy_val, fuzzy_match=fuzzy_match))
|
fuzzy=fuzzy_val, fuzzy_compare=fuzzy_compare))
|
||||||
continue
|
continue
|
||||||
predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab,
|
predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab,
|
||||||
regex=regex, fuzzy=fuzzy, fuzzy_match=fuzzy_match)
|
regex=regex, fuzzy=fuzzy, fuzzy_compare=fuzzy_compare)
|
||||||
# Don't create redundant predicates.
|
# Don't create redundant predicates.
|
||||||
# This helps with efficiency, as we're caching the results.
|
# This helps with efficiency, as we're caching the results.
|
||||||
if predicate.key in seen_predicates:
|
if predicate.key in seen_predicates:
|
||||||
|
@ -1150,7 +1150,7 @@ def _get_extensions(spec, string_store, name2index):
|
||||||
return attr_values
|
return attr_values
|
||||||
|
|
||||||
|
|
||||||
cpdef bint _default_fuzzy_match(s1: str, s2: str, fuzzy: int):
|
cpdef bint _default_fuzzy_compare(s1: str, s2: str, fuzzy: int):
|
||||||
distance = min(len(s1), len(s2))
|
distance = min(len(s1), len(s2))
|
||||||
distance -= 1 # don't allow completely different tokens
|
distance -= 1 # don't allow completely different tokens
|
||||||
if fuzzy == -1: # FUZZY operator with unspecified fuzzy
|
if fuzzy == -1: # FUZZY operator with unspecified fuzzy
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.matcher import levenshtein
|
from spacy.matcher import levenshtein
|
||||||
from spacy.matcher.matcher import _default_fuzzy_match
|
from spacy.matcher.matcher import _default_fuzzy_compare
|
||||||
|
|
||||||
|
|
||||||
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
|
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
|
||||||
|
@ -69,6 +69,6 @@ def test_levenshtein(dist, a, b):
|
||||||
("abcdefgh", "cdefghijkl", -1, False), # default equivalent to 5 (max)
|
("abcdefgh", "cdefghijkl", -1, False), # default equivalent to 5 (max)
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_default_fuzzy_match(a, b, fuzzy, expected):
|
def test_default_fuzzy_compare(a, b, fuzzy, expected):
|
||||||
assert _default_fuzzy_match(a, b, fuzzy) == expected
|
assert _default_fuzzy_compare(a, b, fuzzy) == expected
|
||||||
assert _default_fuzzy_match(b, a, fuzzy) == expected
|
assert _default_fuzzy_compare(b, a, fuzzy) == expected
|
||||||
|
|
Loading…
Reference in New Issue
Block a user