From 0da324ab5ba0463fdaf018a13d077fe9a3fc260b Mon Sep 17 00:00:00 2001
From: Kevin Humphreys
Date: Thu, 22 Sep 2022 19:26:52 -0700
Subject: [PATCH] reinstate FUZZY operator with length-based distance function

---
 spacy/matcher/matcher.pyx               |  37 +++++---
 spacy/schemas.py                        |   1 +
 spacy/tests/matcher/test_matcher_api.py | 119 +++++++++++++++++++++---
 3 files changed, 132 insertions(+), 25 deletions(-)

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 8cc022d7e..c49bb92de 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -205,6 +205,18 @@ cdef class Matcher:
         else:
             yield doc
 
+    @staticmethod
+    def fuzzy_match(s1: str, s2: str, distance: int, token: Token) -> bool:
+        if token.is_oov:  # (TODO: param?)
+            threshold = min(len(s1), len(s2)) - 1  # max edit distance
+            if distance:  # FUZZYn operators
+                threshold = min(distance, threshold)
+            else:  # FUZZY operator
+                threshold = min(5, threshold - 1)  # default fuzziness (TODO: param?)
+            if threshold > 0:
+                return levenshtein(s1, s2) <= threshold
+        return False
+
     def __call__(self, object doclike, *, as_spans=False, allow_missing=False,
                  with_alignments=False):
         """Find all token sequences matching the supplied pattern.
@@ -829,7 +841,7 @@ def _get_attr_values(spec, string_store):
 # extensions to the matcher introduced in #3173.
 
 class _FuzzyPredicate:
-    operators = ("FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5")
+    operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5")
 
     def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None):
         self.i = i
@@ -840,7 +852,8 @@ class _FuzzyPredicate:
         self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
-        self.distance = int(self.predicate[len('FUZZY'):])  # number after prefix
+        self.distance = self.predicate[len('FUZZY'):]  # number after prefix
+        self.distance = int(self.distance) if self.distance else 0
 
     def __call__(self, Token token):
         if self.is_extension:
@@ -849,9 +862,7 @@ class _FuzzyPredicate:
             value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
         if self.value == value:
             return True
-        elif self.distance and token.is_oov:
-            return bool(levenshtein(value, self.value) <= min(self.distance, min(len(value), len(self.value))-1))
-        return False
+        return Matcher.fuzzy_match(value, self.value, self.distance, token)
 
 
 class _RegexPredicate:
@@ -887,7 +898,7 @@ class _SetPredicate:
             # normalize morph strings
             self.value = set(self.vocab.morphology.add(v) for v in value)
         else:
-            if self.distance:
+            if self.distance is not None:
                 # add to string store
                 self.value = set(self.vocab.strings.add(v) for v in value)
             else:
@@ -924,21 +935,19 @@ class _SetPredicate:
         if self.predicate == "IN":
             if value in self.value:
                 return True
-            elif self.distance and token.is_oov:
+            elif self.distance is not None:
                 s1 = self.vocab.strings[value]
                 for v in self.value:
-                    s2 = self.vocab.strings[v]
-                    if levenshtein(s1, s2) <= min(self.distance, min(len(s1), len(s2))-1):
+                    if Matcher.fuzzy_match(s1, self.vocab.strings[v], self.distance, token):
                         return True
             return False
         elif self.predicate == "NOT_IN":
             if value in self.value:
                 return False
-            elif self.distance and token.is_oov:
+            elif self.distance is not None:
                 s1 = self.vocab.strings[value]
                 for v in self.value:
-                    s2 = self.vocab.strings[v]
-                    if levenshtein(s1, s2) <= min(self.distance, min(len(s1), len(s2))-1):
+                    if Matcher.fuzzy_match(s1, self.vocab.strings[v], self.distance, token):
                         return False
             return True
         elif self.predicate == "IS_SUBSET":
@@ -998,6 +1007,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
         "<=": _ComparisonPredicate,
         ">": _ComparisonPredicate,
         "<": _ComparisonPredicate,
+        "FUZZY": _FuzzyPredicate,
         "FUZZY1": _FuzzyPredicate,
         "FUZZY2": _FuzzyPredicate,
         "FUZZY3": _FuzzyPredicate,
@@ -1036,7 +1046,8 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
             # ignore unrecognized predicate type
             continue
         elif cls == _FuzzyPredicate:
-            distance = int(type_[len("FUZZY"):])  # number after prefix
+            distance = type_[len("FUZZY"):]  # number after prefix
+            distance = int(distance) if distance else 0
             if isinstance(value, dict):
                 # add predicates inside fuzzy operator
                 output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
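The new Matcher.fuzzy_match centralizes the fuzzy-matching policy: the allowed edit distance is capped by the length of the shorter string, so short tokens cannot be edited into unrelated words. Below is a standalone sketch of that thresholding, with a plain-Python Levenshtein standing in for spaCy's Cython levenshtein helper and the token.is_oov gate dropped for brevity; it is an illustration of the logic, not part of the patch.

def levenshtein(s1: str, s2: str) -> int:
    # Classic dynamic-programming edit distance (insert/delete/substitute).
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    previous = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, start=1):
        current = [i]
        for j, c2 in enumerate(s2, start=1):
            current.append(min(previous[j] + 1,                # deletion
                               current[j - 1] + 1,             # insertion
                               previous[j - 1] + (c1 != c2)))  # substitution
        previous = current
    return previous[-1]

def fuzzy_match(s1: str, s2: str, distance: int = 0) -> bool:
    threshold = min(len(s1), len(s2)) - 1   # length-based cap
    if distance:                            # FUZZYn: explicit bound, still capped
        threshold = min(distance, threshold)
    else:                                   # FUZZY: default bound
        threshold = min(5, threshold - 1)
    return threshold > 0 and levenshtein(s1, s2) <= threshold

assert fuzzy_match("Goggle", "Google")         # 1 edit, within default bound
assert fuzzy_match("JvvaScrpt", "JavaScript")  # 2 edits, within default bound
assert not fuzzy_match("Jav", "JavaScript")    # cap from "Jav" is only 1
assert fuzzy_match("jav", "java", distance=1)  # FUZZY1: exactly 1 edit allowed

Note that for the bare FUZZY operator the cap tightens to min(5, len - 2) of the shorter string, which is why a three-character token like "Jav" tolerates only a single edit in the tests below.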
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 053bd7cc1..f2be4428b 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -162,6 +162,7 @@ class TokenPatternString(BaseModel):
     IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
     IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
+    FUZZY: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy")
     FUZZY1: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy1")
     FUZZY2: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy2")
     FUZZY3: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy3")
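The added FUZZY field mirrors the existing FUZZY1-FUZZY5 entries: it accepts either a plain string or a nested TokenPatternString, which is what lets set predicates appear under it. A quick validation sketch, assuming a build that includes this patch (pydantic coerces the nested dict into a TokenPatternString):

from spacy.schemas import TokenPatternString

# Plain string value, as in the pattern {"ORTH": {"FUZZY": "Google"}}.
TokenPatternString(FUZZY="Google")

# Nested predicate value, as in {"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}}}.
TokenPatternString(FUZZY={"IN": ["Google", "Now"]})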
"GoogleNow": [[{"ORTH": {"FUZZY": {"NOT_IN": ["Google", "Now"]}}, "OP": "+"}]], + } + matcher = Matcher(en_vocab) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy="LONGEST") + + words = ["They", "like", "Goggle", "Noo"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 0, 2), + ] + +def test_matcher_match_fuzzy_set3(en_vocab): + rules = { + "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}, + "NOT_IN": ["Goggle"]}, + "OP": "+"}]] + } + matcher = Matcher(en_vocab) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy="LONGEST") + + words = ["They", "like", "Goggle", "Noo"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 3, 4), + ] + + +def test_matcher_match_fuzzyn1(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], + } + matcher = Matcher(en_vocab) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 2, 4), + ] + +def test_matcher_match_fuzzyn2(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": {"FUZZY1": "java"}}]], + } + matcher = Matcher(en_vocab) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["Java"], 5, 6), + ] + +def test_matcher_match_fuzzyn3(en_vocab): rules = { "JS": [[{"ORTH": {"FUZZY2": "JavaScript"}}]], "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]], @@ -170,37 +265,37 @@ def test_matcher_match_fuzzy3(en_vocab): (doc.vocab.strings["JS"], 8, 9), ] -def test_matcher_match_fuzzy_set1(en_vocab): +def test_matcher_match_fuzzyn_set1(en_vocab): rules = { - "GoogleNow": [[{"ORTH": {"FUZZY2": {"IN": ["Google", "No"]}}, "OP": "+"}]] + "GoogleNow": [[{"ORTH": {"FUZZY2": {"IN": ["Google", "Now"]}}, "OP": "+"}]] } matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns, greedy="LONGEST") - words = ["They", "like", "Goggle", "Now"] + words = ["They", "like", "Goggle", "Noo"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 2, 4), ] -def test_matcher_match_fuzzy_set2(en_vocab): +def test_matcher_match_fuzzyn_set2(en_vocab): rules = { - "GoogleNow": [[{"ORTH": {"FUZZY2": {"NOT_IN": ["Google", "No"]}}, "OP": "+"}]], + "GoogleNow": [[{"ORTH": {"FUZZY2": {"NOT_IN": ["Google", "Now"]}}, "OP": "+"}]], } matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns, greedy="LONGEST") - words = ["They", "like", "Goggle", "Now"] + words = ["They", "like", "Goggle", "Noo"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 0, 2), ] -def test_matcher_match_fuzzy_set3(en_vocab): +def test_matcher_match_fuzzyn_set3(en_vocab): rules = { - "GoogleNow": [[{"ORTH": {"FUZZY1": {"IN": ["Google", "No"]}, + "GoogleNow": [[{"ORTH": {"FUZZY1": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]}, "OP": "+"}]] } @@ -208,12 +303,12 @@ def test_matcher_match_fuzzy_set3(en_vocab): for key, patterns in 
rules.items(): matcher.add(key, patterns, greedy="LONGEST") - words = ["They", "like", "Goggle", "Now"] + words = ["They", "like", "Goggle", "Noo"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 3, 4), ] - + def test_matcher_empty_dict(en_vocab): """Test matcher allows empty token specs, meaning match on any token."""
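The tests above pin down the intended behavior of both operator forms. For reference, a usage sketch against a build that includes this patch; the sentence is adapted from the test fixtures, and tokens in a blank pipeline are OOV, which is what enables the fuzzy comparison:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# FUZZY applies the length-capped default; FUZZY1-FUZZY5 set an explicit
# maximum edit distance (still capped by the shorter string's length).
matcher.add("GoogleNow", [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]])
matcher.add("Java", [[{"LOWER": {"FUZZY1": "java"}}]])

doc = nlp("They like Goggle Now and Jav")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], "->", doc[start:end].text)
# GoogleNow -> Goggle Now
# Java -> Jav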