reinstate FUZZY operator

with length-based distance function
Kevin Humphreys 2022-09-22 19:26:52 -07:00
parent eab96f7c03
commit 0da324ab5b
3 changed files with 132 additions and 25 deletions
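
In pattern terms, the change restores the bare FUZZY operator alongside the explicit FUZZY1-FUZZY5 forms. A rough sketch of the two spellings, using patterns taken from the tests in this commit (FUZZY derives its edit-distance threshold from the token length, capped at 5 edits; FUZZYn fixes the cap at n):

# bare operator: length-based threshold
pattern_default = [{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]
# explicit operator: at most 1 edit
pattern_bounded = [{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]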

spacy/matcher/matcher.pyx

@@ -205,6 +205,18 @@ cdef class Matcher:
         else:
             yield doc
 
+    @staticmethod
+    def fuzzy_match(s1: str, s2: str, distance: int, token: Token) -> bool:
+        if token.is_oov:  # (TODO: param?)
+            threshold = min(len(s1), len(s2)) - 1  # max edit distance
+            if distance:  # FUZZYn operators
+                threshold = min(distance, threshold)
+            else:  # FUZZY operator
+                threshold = min(5, threshold - 1)  # default fuzziness (TODO: param?)
+            if threshold > 0:
+                return levenshtein(s1, s2) <= threshold
+        return False
+
     def __call__(self, object doclike, *, as_spans=False, allow_missing=False, with_alignments=False):
         """Find all token sequences matching the supplied pattern.
@@ -829,7 +841,7 @@ def _get_attr_values(spec, string_store):
 # extensions to the matcher introduced in #3173.
 class _FuzzyPredicate:
-    operators = ("FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5")
+    operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5")
 
     def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None):
         self.i = i
@@ -840,7 +852,8 @@ class _FuzzyPredicate:
         self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
-        self.distance = int(self.predicate[len('FUZZY'):])  # number after prefix
+        self.distance = self.predicate[len('FUZZY'):]  # number after prefix
+        self.distance = int(self.distance) if self.distance else 0
 
     def __call__(self, Token token):
         if self.is_extension:
@@ -849,9 +862,7 @@ class _FuzzyPredicate:
             value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
         if self.value == value:
             return True
-        elif self.distance and token.is_oov:
-            return bool(levenshtein(value, self.value) <= min(self.distance, min(len(value), len(self.value))-1))
-        return False
+        return Matcher.fuzzy_match(value, self.value, self.distance, token)
 
 
 class _RegexPredicate:
@@ -887,7 +898,7 @@ class _SetPredicate:
             # normalize morph strings
             self.value = set(self.vocab.morphology.add(v) for v in value)
         else:
-            if self.distance:
+            if self.distance is not None:
                 # add to string store
                 self.value = set(self.vocab.strings.add(v) for v in value)
             else:
@@ -924,21 +935,19 @@ class _SetPredicate:
         if self.predicate == "IN":
             if value in self.value:
                 return True
-            elif self.distance and token.is_oov:
+            elif self.distance is not None:
                 s1 = self.vocab.strings[value]
                 for v in self.value:
-                    s2 = self.vocab.strings[v]
-                    if levenshtein(s1, s2) <= min(self.distance, min(len(s1), len(s2))-1):
+                    if Matcher.fuzzy_match(s1, self.vocab.strings[v], self.distance, token):
                         return True
             return False
         elif self.predicate == "NOT_IN":
             if value in self.value:
                 return False
-            elif self.distance and token.is_oov:
+            elif self.distance is not None:
                 s1 = self.vocab.strings[value]
                 for v in self.value:
-                    s2 = self.vocab.strings[v]
-                    if levenshtein(s1, s2) <= min(self.distance, min(len(s1), len(s2))-1):
+                    if Matcher.fuzzy_match(s1, self.vocab.strings[v], self.distance, token):
                         return False
             return True
         elif self.predicate == "IS_SUBSET":
@@ -998,6 +1007,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
     "<=": _ComparisonPredicate,
     ">": _ComparisonPredicate,
     "<": _ComparisonPredicate,
+    "FUZZY": _FuzzyPredicate,
     "FUZZY1": _FuzzyPredicate,
     "FUZZY2": _FuzzyPredicate,
     "FUZZY3": _FuzzyPredicate,
@@ -1036,7 +1046,8 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
             # ignore unrecognized predicate type
             continue
         elif cls == _FuzzyPredicate:
-            distance = int(type_[len("FUZZY"):])  # number after prefix
+            distance = type_[len("FUZZY"):]  # number after prefix
+            distance = int(distance) if distance else 0
             if isinstance(value, dict):
                 # add predicates inside fuzzy operator
                 output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
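
A note on the two-step parse above: for the bare FUZZY operator the text after the prefix is empty, so the parsed distance is 0. That value is falsy, which is why the set-predicate checks in this commit move from `if self.distance:` to `if self.distance is not None:`. A hypothetical standalone illustration:

for op in ("FUZZY", "FUZZY3"):
    distance = op[len("FUZZY"):]             # "" for FUZZY, "3" for FUZZY3
    distance = int(distance) if distance else 0
    print(op, distance, bool(distance))      # FUZZY 0 False / FUZZY3 3 True
# a plain truthiness test would silently skip the fuzzy path for FUZZY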

spacy/schemas.py

@@ -162,6 +162,7 @@ class TokenPatternString(BaseModel):
     IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
     IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
+    FUZZY: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy")
     FUZZY1: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy1")
     FUZZY2: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy2")
     FUZZY3: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy3")
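
As with the existing FUZZYn fields, the Union[StrictStr, "TokenPatternString"] type lets FUZZY wrap either a plain string or a nested set predicate; both forms are exercised by the tests below:

pattern_str = [{"ORTH": {"FUZZY": "Google"}}]                              # plain string value
pattern_set = [{"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}}, "OP": "+"}]  # nested set predicate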

spacy/tests/matcher/test_matcher_api.py

@@ -123,7 +123,7 @@ def test_matcher_match_multi(matcher):
 def test_matcher_match_fuzzy1(en_vocab):
     rules = {
         "JS": [[{"ORTH": "JavaScript"}]],
-        "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
         "Java": [[{"LOWER": "java"}]],
     }
     matcher = Matcher(en_vocab)
@@ -140,7 +140,7 @@ def test_matcher_match_fuzzy2(en_vocab):
     rules = {
         "JS": [[{"ORTH": "JavaScript"}]],
         "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
-        "Java": [[{"LOWER": {"FUZZY1": "java"}}]],
+        "Java": [[{"LOWER": {"FUZZY": "java"}}]],
     }
     matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
@@ -153,6 +153,101 @@ def test_matcher_match_fuzzy2(en_vocab):
     ]
 
 
+def test_matcher_match_fuzzy3(en_vocab):
+    rules = {
+        "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
+        "Java": [[{"LOWER": {"FUZZY": "java"}}]],
+    }
+    matcher = Matcher(en_vocab)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns)
+    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 2, 4),
+        (doc.vocab.strings["Java"], 5, 6),
+        (doc.vocab.strings["JS"], 8, 9),
+    ]
+
+
+def test_matcher_match_fuzzy_set1(en_vocab):
+    rules = {
+        "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}}, "OP": "+"}]]
+    }
+    matcher = Matcher(en_vocab)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns, greedy="LONGEST")
+    words = ["They", "like", "Goggle", "Noo"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 2, 4),
+    ]
+
+
+def test_matcher_match_fuzzy_set2(en_vocab):
+    rules = {
+        "GoogleNow": [[{"ORTH": {"FUZZY": {"NOT_IN": ["Google", "Now"]}}, "OP": "+"}]],
+    }
+    matcher = Matcher(en_vocab)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns, greedy="LONGEST")
+    words = ["They", "like", "Goggle", "Noo"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 0, 2),
+    ]
+
+
+def test_matcher_match_fuzzy_set3(en_vocab):
+    rules = {
+        "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "Now"]},
+                                 "NOT_IN": ["Goggle"]},
+                        "OP": "+"}]]
+    }
+    matcher = Matcher(en_vocab)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns, greedy="LONGEST")
+    words = ["They", "like", "Goggle", "Noo"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 3, 4),
+    ]
+
+
+def test_matcher_match_fuzzyn1(en_vocab):
+    rules = {
+        "JS": [[{"ORTH": "JavaScript"}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]],
+        "Java": [[{"LOWER": "java"}]],
+    }
+    matcher = Matcher(en_vocab)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns)
+    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 2, 4),
+    ]
+
+
+def test_matcher_match_fuzzyn2(en_vocab):
+    rules = {
+        "JS": [[{"ORTH": "JavaScript"}]],
+        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
+        "Java": [[{"LOWER": {"FUZZY1": "java"}}]],
+    }
+    matcher = Matcher(en_vocab)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns)
+    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["Java"], 5, 6),
+    ]
+
+
-def test_matcher_match_fuzzy3(en_vocab):
+def test_matcher_match_fuzzyn3(en_vocab):
     rules = {
         "JS": [[{"ORTH": {"FUZZY2": "JavaScript"}}]],
         "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]],
@@ -170,37 +265,37 @@ def test_matcher_match_fuzzy3(en_vocab):
         (doc.vocab.strings["JS"], 8, 9),
     ]
 
 
-def test_matcher_match_fuzzy_set1(en_vocab):
+def test_matcher_match_fuzzyn_set1(en_vocab):
     rules = {
-        "GoogleNow": [[{"ORTH": {"FUZZY2": {"IN": ["Google", "No"]}}, "OP": "+"}]]
+        "GoogleNow": [[{"ORTH": {"FUZZY2": {"IN": ["Google", "Now"]}}, "OP": "+"}]]
     }
     matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns, greedy="LONGEST")
-    words = ["They", "like", "Goggle", "Now"]
+    words = ["They", "like", "Goggle", "Noo"]
     doc = Doc(matcher.vocab, words=words)
     assert matcher(doc) == [
         (doc.vocab.strings["GoogleNow"], 2, 4),
     ]
 
 
-def test_matcher_match_fuzzy_set2(en_vocab):
+def test_matcher_match_fuzzyn_set2(en_vocab):
     rules = {
-        "GoogleNow": [[{"ORTH": {"FUZZY2": {"NOT_IN": ["Google", "No"]}}, "OP": "+"}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY2": {"NOT_IN": ["Google", "Now"]}}, "OP": "+"}]],
     }
     matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns, greedy="LONGEST")
-    words = ["They", "like", "Goggle", "Now"]
+    words = ["They", "like", "Goggle", "Noo"]
     doc = Doc(matcher.vocab, words=words)
     assert matcher(doc) == [
         (doc.vocab.strings["GoogleNow"], 0, 2),
     ]
 
 
-def test_matcher_match_fuzzy_set3(en_vocab):
+def test_matcher_match_fuzzyn_set3(en_vocab):
     rules = {
-        "GoogleNow": [[{"ORTH": {"FUZZY1": {"IN": ["Google", "No"]},
+        "GoogleNow": [[{"ORTH": {"FUZZY1": {"IN": ["Google", "Now"]},
                                  "NOT_IN": ["Goggle"]},
                         "OP": "+"}]]
     }
@@ -208,12 +303,12 @@ def test_matcher_match_fuzzy_set3(en_vocab):
     for key, patterns in rules.items():
         matcher.add(key, patterns, greedy="LONGEST")
-    words = ["They", "like", "Goggle", "Now"]
+    words = ["They", "like", "Goggle", "Noo"]
     doc = Doc(matcher.vocab, words=words)
     assert matcher(doc) == [
         (doc.vocab.strings["GoogleNow"], 3, 4),
     ]
 
 
 def test_matcher_empty_dict(en_vocab):
     """Test matcher allows empty token specs, meaning match on any token."""