From 63f5e1331d489dfa535fc7ebacde0142717c8167 Mon Sep 17 00:00:00 2001
From: Kevin Humphreys
Date: Fri, 26 Aug 2022 00:10:53 +0200
Subject: [PATCH] add fuzzy attribute list

---
 spacy/matcher/matcher.pxd               |   1 +
 spacy/matcher/matcher.pyi               |   3 +-
 spacy/matcher/matcher.pyx               |  66 ++++++++++-----
 spacy/tests/matcher/test_matcher_api.py | 105 ++++++++++++++++--------
 4 files changed, 119 insertions(+), 56 deletions(-)
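A minimal usage sketch of the API this patch adds (reviewer note, outside the diff: `git am` ignores text between the diffstat and the first `diff --git` line). The `fuzzy` and `fuzzy_attrs` keyword arguments exist only with this patch series applied; the sentence and threshold are illustrative:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
# Fuzzy matching only applies where an exact comparison fails, only for the
# attributes listed in fuzzy_attrs, and only at or above the 0-100 threshold.
matcher = Matcher(nlp.vocab, fuzzy=80, fuzzy_attrs=["ORTH", "LOWER"])
matcher.add("GoogleNow", [[{"ORTH": "Google"}, {"ORTH": "Now"}]])

doc = nlp("I like Goggle Now best")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
# GoogleNow Goggle Now   ("Goggle" matches "Google" at a ratio of ~83)
```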
diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd
index b5e24e0e2..98041e199 100644
--- a/spacy/matcher/matcher.pxd
+++ b/spacy/matcher/matcher.pxd
@@ -72,6 +72,7 @@ cdef class Matcher:
     cdef readonly Vocab vocab
     cdef public object validate
     cdef public object fuzzy
+    cdef public object fuzzy_attrs
     cdef public object _patterns
     cdef public object _callbacks
     cdef public object _filter
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index c7f487450..676be6a45 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -5,7 +5,8 @@ from ..vocab import Vocab
 from ..tokens import Doc, Span

 class Matcher:
-    def __init__(self, vocab: Vocab, validate: bool = ..., fuzzy: float = ...) -> None: ...
+    def __init__(self, vocab: Vocab, validate: bool = ...,
+                 fuzzy: float = ..., fuzzy_attrs: list = ...) -> None: ...
     def __reduce__(self) -> Any: ...
     def __len__(self) -> int: ...
     def __contains__(self, key: str) -> bool: ...
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index b4f0a3f5e..17d965eaa 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -38,7 +38,7 @@ cdef class Matcher:
     USAGE: https://spacy.io/usage/rule-based-matching
     """

-    def __init__(self, vocab, validate=True, fuzzy=None):
+    def __init__(self, vocab, validate=True, fuzzy=None, fuzzy_attrs=None):
         """Create the Matcher.

         vocab (Vocab): The vocabulary object, which must be shared with the
@@ -54,6 +54,7 @@ cdef class Matcher:
         self.mem = Pool()
         self.validate = validate
         self.fuzzy = fuzzy if fuzzy is not None else 0
+        self.fuzzy_attrs = [IDS.get(attr) for attr in fuzzy_attrs] if fuzzy_attrs else []

     def __reduce__(self):
         data = (self.vocab, self._patterns, self._callbacks)
@@ -131,7 +132,8 @@ cdef class Matcher:
         for pattern in patterns:
             try:
                 specs = _preprocess_pattern(pattern, self.vocab,
-                                            self._extensions, self._extra_predicates, self.fuzzy)
+                                            self._extensions, self._extra_predicates,
+                                            self.fuzzy, self.fuzzy_attrs)
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
                 for spec in specs:
                     for attr, _ in spec[1]:
@@ -257,7 +259,8 @@ cdef class Matcher:
         else:
             matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
                                    extensions=self._extensions, predicates=self._extra_predicates,
-                                   with_alignments=with_alignments, fuzzy=self.fuzzy)
+                                   with_alignments=with_alignments,
+                                   fuzzy=self.fuzzy, fuzzy_attrs=self.fuzzy_attrs)
         final_matches = []
         pairs_by_id = {}
         # For each key, either add all matches, or only the filtered,
@@ -338,7 +341,7 @@ def unpickle_matcher(vocab, patterns, callbacks):
     return matcher


-cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0, float fuzzy=0):
+cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0, float fuzzy=0, list fuzzy_attrs=[]):
     """Find matches in a doc, with a compiled array of patterns.
     Matches are returned as a list of (id, start, end) tuples or
     (id, start, end, alignments) tuples (if with_alignments != 0)
@@ -357,6 +360,9 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
     cdef PatternStateC state
     cdef int i, j, nr_extra_attr
     cdef Pool mem = Pool()
+    cdef int8_t* fuzzy_attrs_array = NULL
+    cdef int n_fuzzy_attrs = len(fuzzy_attrs)
+
     output = []
     if length == 0:
         # avoid any processing or mem alloc if the document is empty
@@ -375,6 +381,10 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
                 if isinstance(value, str):
                     value = token.vocab.strings[value]
                 extra_attr_values[i * nr_extra_attr + index] = value
+    if n_fuzzy_attrs > 0:
+        fuzzy_attrs_array = <int8_t*>mem.alloc(n_fuzzy_attrs, sizeof(int8_t))
+        for i in range(n_fuzzy_attrs):
+            fuzzy_attrs_array[i] = fuzzy_attrs[i]
     # Main loop
     cdef int nr_predicate = len(predicates)
     for i in range(length):
@@ -383,7 +393,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
         if with_alignments != 0:
             align_states.resize(states.size())
         transition_states(states, matches, align_states, align_matches, predicate_cache,
-                          doclike[i], extra_attr_values, predicates, with_alignments, fuzzy)
+                          doclike[i], extra_attr_values, predicates, with_alignments,
+                          fuzzy, fuzzy_attrs_array, n_fuzzy_attrs)
         extra_attr_values += nr_extra_attr
         predicate_cache += len(predicates)
     # Handle matches that end in 0-width patterns
@@ -412,7 +423,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
 cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
                             vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
                             int8_t* cached_py_predicates,
-                            Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments, float fuzzy) except *:
+                            Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments,
+                            float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) except *:
     cdef int q = 0
     cdef vector[PatternStateC] new_states
     cdef vector[vector[MatchAlignmentC]] align_new_states
@@ -422,7 +434,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
             update_predicate_cache(cached_py_predicates,
                 states[i].pattern, token, py_predicates)
         action = get_action(states[i], token, extra_attrs,
-                            cached_py_predicates, fuzzy)
+                            cached_py_predicates,
+                            fuzzy, fuzzy_attrs, n_fuzzy_attrs)
         if action == REJECT:
             continue
         # Keep only a subset of states (the active ones). Index q is the
@@ -459,7 +472,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
                 update_predicate_cache(cached_py_predicates,
                     states[q].pattern, token, py_predicates)
             action = get_action(states[q], token, extra_attrs,
-                                cached_py_predicates, fuzzy)
+                                cached_py_predicates,
+                                fuzzy, fuzzy_attrs, n_fuzzy_attrs)
             # Update alignment before the transition of current state
             if with_alignments != 0:
                 align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length))
@@ -571,7 +585,8 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,

 cdef action_t get_action(PatternStateC state,
                          Token token, const attr_t* extra_attrs,
-                         const int8_t* predicate_matches, float fuzzy) nogil:
+                         const int8_t* predicate_matches,
+                         float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) nogil:
     """We need to consider:
     a) Does the token match the specification? [Yes, No]
     b) What's the quantifier? [1, 0+, ?]
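An aside on the `int8_t` plumbing above: `__init__` stores `fuzzy_attrs` as integer attribute IDs via `spacy.attrs.IDS`, and `find_matches` copies them into an `int8_t` array. A standalone sketch of that resolution step (the printed values are illustrative; note that `IDS.get` returns `None` for a misspelled name, and attributes aliased to high symbol values would overflow `int8_t`, though the string attributes this feature targets, such as `ORTH` and `LOWER`, have small IDs):

```python
from spacy.attrs import IDS

# Mirrors `[IDS.get(attr) for attr in fuzzy_attrs]` in Matcher.__init__.
fuzzy_attrs = ["ORTH", "LOWER"]
fuzzy_attr_ids = [IDS.get(attr) for attr in fuzzy_attrs]
print(fuzzy_attr_ids)   # two small ints; exact values come from the attrs enum
print(IDS.get("ORHT"))  # None: a typo fails silently here, not at construction
```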
@@ -630,7 +645,8 @@ cdef action_t get_action(PatternStateC state,
     Problem: If a quantifier is matching, we're adding a lot of open partials
     """
     cdef int8_t is_match
-    is_match = get_is_match(state, token, extra_attrs, predicate_matches, fuzzy)
+    is_match = get_is_match(state, token, extra_attrs, predicate_matches,
+                            fuzzy, fuzzy_attrs, n_fuzzy_attrs)
     quantifier = get_quantifier(state)
     is_final = get_is_final(state)
     if quantifier == ZERO:
@@ -683,7 +699,8 @@ cdef action_t get_action(PatternStateC state,

 cdef int8_t get_is_match(PatternStateC state,
                          Token token, const attr_t* extra_attrs,
-                         const int8_t* predicate_matches, float fuzzy) nogil:
+                         const int8_t* predicate_matches,
+                         float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) nogil:
     for i in range(state.pattern.nr_py):
         if predicate_matches[state.pattern.py_predicates[i]] == -1:
             return 0
@@ -692,16 +709,22 @@ cdef int8_t get_is_match(PatternStateC state,
         for attr in spec.attrs[:spec.nr_attr]:
             token_attr_value = get_token_attr_for_matcher(token.c, attr.attr)
             if token_attr_value != attr.value:
-                if fuzzy != 0 and (attr.attr == ORTH or attr.attr == LEMMA
-                        or attr.attr == LOWER or attr.attr == NORM):
-                    with gil:
-                        if fuzz_cpp.ratio(token.vocab.strings[token_attr_value],
-                                          token.vocab.strings[attr.value]) < fuzzy:
-                            return 0
+                if fuzzy != 0:  # only attributes listed in fuzzy_attrs may match fuzzily
+                    fuzzy_match = False
+                    for i in range(n_fuzzy_attrs):
+                        if attr.attr == fuzzy_attrs[i]:
+                            with gil:
+                                if fuzz_cpp.ratio(token.vocab.strings[token_attr_value],
+                                                  token.vocab.strings[attr.value]) >= fuzzy:
+                                    fuzzy_match = True
+                            break
+                    if not fuzzy_match:
+                        return 0
                 else:
                     return 0
     for i in range(spec.nr_extra_attr):
         if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]:
+            # TODO: fuzzy match extension attributes as well
             return 0
     return True
@@ -766,7 +789,8 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
     return id_attr.value


-def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates,
+                        fuzzy, fuzzy_attrs):
     """This function interprets the pattern, converting the various bits of
     syntactic sugar before we compile it into a struct with init_pattern.
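A pure-Python paraphrase of the gating the rewritten `get_is_match` applies above (illustrative only: the real code runs without the GIL over C structs and resolves hash values through `token.vocab.strings`; `rapidfuzz.fuzz.ratio` is assumed to behave like the `fuzz_cpp.ratio` binding):

```python
from rapidfuzz import fuzz  # stand-in for the fuzz_cpp binding

def attrs_match(token_attrs, pattern_attrs, fuzzy, fuzzy_attr_ids):
    """Exact comparison first; a fuzzy rescue only for whitelisted attributes."""
    for attr_id, wanted in pattern_attrs.items():
        if token_attrs.get(attr_id) == wanted:
            continue
        fuzzy_match = False
        if fuzzy and attr_id in fuzzy_attr_ids:
            fuzzy_match = fuzz.ratio(token_attrs.get(attr_id, ""), wanted) >= fuzzy
        if not fuzzy_match:
            return False
    return True

# ORTH mismatch rescued: ORTH is whitelisted and ratio("Goggle", "Google") ~83 >= 80
print(attrs_match({"ORTH": "Goggle"}, {"ORTH": "Google"}, 80, {"ORTH"}))  # True
print(attrs_match({"ORTH": "Goggle"}, {"ORTH": "Google"}, 85, {"ORTH"}))  # False
print(attrs_match({"ORTH": "Goggle"}, {"ORTH": "Google"}, 80, set()))    # False
```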
@@ -916,9 +940,9 @@ class _SetPredicate:
         else:
             value = set(get_string_id(v) for v in value)
         if self.predicate == "IN":
-            return value in self.value  # handle fuzzy
+            return value in self.value  # TODO: handle fuzzy
         elif self.predicate == "NOT_IN":
-            return value not in self.value  # handle fuzzy
+            return value not in self.value  # TODO: handle fuzzy
         elif self.predicate == "IS_SUBSET":
             return value <= self.value
         elif self.predicate == "IS_SUPERSET":
@@ -933,7 +957,7 @@ class _ComparisonPredicate:
     operators = ("==", "!=", ">=", "<=", ">", "<")

-    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None):
         self.i = i
         self.attr = attr
         self.value = value
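The thresholds in the tests below (80, 85, 90) are chosen to bracket the similarity scores of the planted misspellings. Assuming `fuzz_cpp.ratio` agrees with rapidfuzz's Python `fuzz.ratio`, the approximate scores are:

```python
from rapidfuzz import fuzz

# Approximate; exact values may vary slightly across rapidfuzz versions.
print(fuzz.ratio("Goggle", "Google"))         # ~83.3: passes 80, fails 85 and 90
print(fuzz.ratio("jav", "java"))              # ~85.7: passes 80 and 85, fails 90
print(fuzz.ratio("JvvaScrpt", "JavaScript"))  # ~84.2: passes 80, fails 85 and 90
```

This is why `fuzz_all` (threshold 80) finds all three patterns, `fuzz_some` (85) only "Java", and `fuzz_none` (90) nothing.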
[[{"ORTH": "JavaScript"}]], - "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": "java"}]], - } - matcher = Matcher(en_vocab, fuzzy=90) - for key, patterns in rules.items(): + matcher = Matcher(en_vocab, fuzzy=90, fuzzy_attrs=["ORTH", "LOWER"]) + for key, patterns in matcher_rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"] + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [] -def test_matcher_match_fuzz_pred(en_vocab): +# fuzzy matches on specific tokens + +def test_matcher_match_fuzz_pred1(en_vocab): rules = { - "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]], + "JS": [[{"ORTH": "JavaScript"}]], "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], "Java": [[{"LOWER": "java"}]], } @@ -176,10 +178,45 @@ def test_matcher_match_fuzz_pred(en_vocab): for key, patterns in rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Now", "and", "JavaScrpt", "best"] + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [] + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 2, 4), + ] +def test_matcher_match_fuzz_pred2(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": {"FUZZY": "java"}}]], + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["Java"], 5, 6), + ] + +def test_matcher_match_fuzz_preds(en_vocab): + rules = { + "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]], + "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": {"FUZZY": "java"}}]], + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 2, 4), + (doc.vocab.strings["Java"], 5, 6), + (doc.vocab.strings["JS"], 8, 9), + ] def test_matcher_empty_dict(en_vocab):