Rename to fuzzy_compare

Adriane Boyd 2022-11-28 10:18:07 +01:00
parent efa638a11c
commit c3f446f71b
4 changed files with 30 additions and 30 deletions

View File

@@ -77,6 +77,6 @@ cdef class Matcher:
     cdef public object _extensions
     cdef public object _extra_predicates
     cdef public object _seen_attrs
-    cdef public object _fuzzy_match
+    cdef public object _fuzzy_compare
 
-cpdef bint _default_fuzzy_match(s1: str, s2: str, fuzzy: int)
+cpdef bint _default_fuzzy_compare(s1: str, s2: str, fuzzy: int)

View File

@@ -53,4 +53,4 @@ class Matcher:
     ) -> List[Span]: ...
     def _normalize_key(self, key: Any) -> Any: ...
 
-def _default_fuzzy_match(s1: str, s2: str, fuzzy: int=-1) -> bool: ...
+def _default_fuzzy_compare(s1: str, s2: str, fuzzy: int) -> bool: ...

View File

@@ -38,7 +38,7 @@ cdef class Matcher:
     USAGE: https://spacy.io/usage/rule-based-matching
     """
 
-    def __init__(self, vocab, validate=True, *, fuzzy_match=_default_fuzzy_match):
+    def __init__(self, vocab, validate=True, *, fuzzy_compare=_default_fuzzy_compare):
         """Create the Matcher.
 
         vocab (Vocab): The vocabulary object, which must be shared with the
@@ -53,10 +53,10 @@ cdef class Matcher:
         self.vocab = vocab
         self.mem = Pool()
         self.validate = validate
-        self._fuzzy_match = fuzzy_match
+        self._fuzzy_compare = fuzzy_compare
 
     def __reduce__(self):
-        data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_match)
+        data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_compare)
         return (unpickle_matcher, data, None, None)
 
     def __len__(self):
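
Note: with this change, a custom comparison callable is supplied through the fuzzy_compare keyword shown above and is carried through pickling in __reduce__. A minimal sketch of wiring one in, using an illustrative difflib-based comparator rather than the library's _default_fuzzy_compare (the (s1, s2, fuzzy) -> bool contract follows the .pxd/.pyi stubs in this commit):

    import spacy
    from difflib import SequenceMatcher
    from spacy.matcher import Matcher

    def ratio_compare(s1: str, s2: str, fuzzy: int) -> bool:
        # Illustrative stand-in: map the edit allowance (`fuzzy`, -1 = unspecified)
        # onto a similarity-ratio threshold instead of a true edit-distance bound.
        threshold = 0.8 if fuzzy == -1 else max(0.5, 1.0 - 0.1 * fuzzy)
        return SequenceMatcher(None, s1, s2).ratio() >= threshold

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab, fuzzy_compare=ratio_compare)
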
@@ -131,7 +131,7 @@ cdef class Matcher:
         for pattern in patterns:
             try:
                 specs = _preprocess_pattern(pattern, self.vocab,
-                                            self._extensions, self._extra_predicates, self._fuzzy_match)
+                                            self._extensions, self._extra_predicates, self._fuzzy_compare)
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
                 for spec in specs:
                     for attr, _ in spec[1]:
@@ -329,8 +329,8 @@ cdef class Matcher:
         return key
 
 
-def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_match):
-    matcher = Matcher(vocab, validate=validate, fuzzy_match=fuzzy_match)
+def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_compare):
+    matcher = Matcher(vocab, validate=validate, fuzzy_compare=fuzzy_compare)
     for key, pattern in patterns.items():
         callback = callbacks.get(key, None)
         matcher.add(key, pattern, on_match=callback)
@@ -757,7 +757,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
     return id_attr.value
 
 
-def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_match):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_compare):
     """This function interprets the pattern, converting the various bits of
     syntactic sugar before we compile it into a struct with init_pattern.
 
@@ -784,7 +784,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates,
         ops = _get_operators(spec)
         attr_values = _get_attr_values(spec, string_store)
         extensions = _get_extensions(spec, string_store, extensions_table)
-        predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_match)
+        predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare)
         for op in ops:
             tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
     return tokens
@@ -833,7 +833,7 @@ class _FuzzyPredicate:
     operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5")
 
     def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
-                 regex=False, fuzzy=None, fuzzy_match=None):
+                 regex=False, fuzzy=None, fuzzy_compare=None):
         self.i = i
         self.attr = attr
         self.value = value
@@ -844,7 +844,7 @@ class _FuzzyPredicate:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
         fuzz = self.predicate[len("FUZZY"):]  # number after prefix
         self.fuzzy = int(fuzz) if fuzz else -1
-        self.fuzzy_match = fuzzy_match
+        self.fuzzy_compare = fuzzy_compare
 
     def __call__(self, Token token):
         if self.is_extension:
@@ -853,14 +853,14 @@ class _FuzzyPredicate:
             value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
         if self.value == value:
             return True
-        return self.fuzzy_match(value, self.value, self.fuzzy)
+        return self.fuzzy_compare(value, self.value, self.fuzzy)
 
 
 class _RegexPredicate:
     operators = ("REGEX",)
 
     def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
-                 regex=False, fuzzy=None, fuzzy_match=None):
+                 regex=False, fuzzy=None, fuzzy_compare=None):
         self.i = i
         self.attr = attr
         self.value = re.compile(value)
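
_FuzzyPredicate is what backs the FUZZY and FUZZY1 through FUZZY5 operators listed above: an exact match short-circuits to True, otherwise the configured fuzzy_compare decides. A short usage sketch (the {"LOWER": {"FUZZY": ...}} pattern spelling and the misspelled example text are illustrative, assuming a spaCy build that includes this fuzzy-matching work):

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)  # falls back to _default_fuzzy_compare

    # FUZZY leaves the edit allowance unspecified (fuzzy == -1);
    # FUZZY1..FUZZY5 pin it explicitly.
    matcher.add("APPROX", [[{"LOWER": {"FUZZY": "definitely"}}]])
    matcher.add("GREETING", [[{"LOWER": {"FUZZY1": "hello"}}]])

    doc = nlp("helo, I definitly agree")
    matches = matcher(doc)
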
@@ -882,13 +882,13 @@ class _SetPredicate:
     operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
 
     def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
-                 regex=False, fuzzy=None, fuzzy_match=None):
+                 regex=False, fuzzy=None, fuzzy_compare=None):
         self.i = i
         self.attr = attr
         self.vocab = vocab
         self.regex = regex
         self.fuzzy = fuzzy
-        self.fuzzy_match = fuzzy_match
+        self.fuzzy_compare = fuzzy_compare
         if self.attr == MORPH:
             # normalize morph strings
             self.value = set(self.vocab.morphology.add(v) for v in value)
@@ -937,7 +937,7 @@ class _SetPredicate:
                 return True
         elif self.fuzzy is not None:
             value = self.vocab.strings[value]
-            return any(self.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
+            return any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
                        for v in self.value)
         else:
             return False
@@ -949,7 +949,7 @@ class _SetPredicate:
                 return False
         elif self.fuzzy is not None:
             value = self.vocab.strings[value]
-            return not any(self.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
+            return not any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
                            for v in self.value)
         else:
             return True
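
The two branches above make the set operators fuzzy-aware: with a fuzzy bound set, IN accepts a token when any set member is close enough under fuzzy_compare, and NOT_IN rejects it under the same test. A hedged sketch of the nested spelling that _get_extra_predicates_dict recurses into (pattern and text are illustrative):

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)
    # Membership is checked per set member with fuzzy_compare(value, member, fuzzy)
    # instead of exact equality.
    matcher.add("FRUIT", [[{"LOWER": {"FUZZY": {"IN": ["apple", "banana", "cherry"]}}}]])
    matches = matcher(nlp("an appel and a bananna"))
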
@@ -968,7 +968,7 @@ class _ComparisonPredicate:
     operators = ("==", "!=", ">=", "<=", ">", "<")
 
     def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
-                 regex=False, fuzzy=None, fuzzy_match=None):
+                 regex=False, fuzzy=None, fuzzy_compare=None):
         self.i = i
         self.attr = attr
         self.value = value
@@ -997,7 +997,7 @@ class _ComparisonPredicate:
             return value < self.value
 
 
-def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_match):
+def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare):
     predicate_types = {
         "REGEX": _RegexPredicate,
         "IN": _SetPredicate,
@@ -1035,12 +1035,12 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_match):
         attr = IDS.get(attr.upper())
         if isinstance(value, dict):
             output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
-                                                      extra_predicates, seen_predicates, fuzzy_match=fuzzy_match))
+                                                      extra_predicates, seen_predicates, fuzzy_compare=fuzzy_compare))
     return output
 
 
 def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
-                               extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_match=None):
+                               extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_compare=None):
     output = []
     for type_, value in value_dict.items():
         type_ = type_.upper()
@@ -1063,10 +1063,10 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
                 fuzzy_val = int(fuzz) if fuzz else -1
                 output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
                                                          extra_predicates, seen_predicates,
-                                                         fuzzy=fuzzy_val, fuzzy_match=fuzzy_match))
+                                                         fuzzy=fuzzy_val, fuzzy_compare=fuzzy_compare))
                 continue
         predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab,
-                        regex=regex, fuzzy=fuzzy, fuzzy_match=fuzzy_match)
+                        regex=regex, fuzzy=fuzzy, fuzzy_compare=fuzzy_compare)
         # Don't create a redundant predicates.
         # This helps with efficiency, as we're caching the results.
         if predicate.key in seen_predicates:
@@ -1150,7 +1150,7 @@ def _get_extensions(spec, string_store, name2index):
     return attr_values
 
 
-cpdef bint _default_fuzzy_match(s1: str, s2: str, fuzzy: int):
+cpdef bint _default_fuzzy_compare(s1: str, s2: str, fuzzy: int):
     distance = min(len(s1), len(s2))
     distance -= 1  # don't allow completely different tokens
     if fuzzy == -1:  # FUZZY operator with unspecified fuzzy

View File

@@ -1,6 +1,6 @@
 import pytest
 
 from spacy.matcher import levenshtein
-from spacy.matcher.matcher import _default_fuzzy_match
+from spacy.matcher.matcher import _default_fuzzy_compare
 
 # empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
@@ -69,6 +69,6 @@ def test_levenshtein(dist, a, b):
         ("abcdefgh", "cdefghijkl", -1, False),  # default equivalent to 5 (max)
     ],
 )
-def test_default_fuzzy_match(a, b, fuzzy, expected):
-    assert _default_fuzzy_match(a, b, fuzzy) == expected
-    assert _default_fuzzy_match(b, a, fuzzy) == expected
+def test_default_fuzzy_compare(a, b, fuzzy, expected):
+    assert _default_fuzzy_compare(a, b, fuzzy) == expected
+    assert _default_fuzzy_compare(b, a, fuzzy) == expected