From 1f2e57eca4ccbb283cda9b82d400872cbbec81c9 Mon Sep 17 00:00:00 2001
From: Kevin Humphreys
Date: Mon, 22 Aug 2022 17:02:47 +0200
Subject: [PATCH] enable fuzzy matching

---
 requirements.txt                        |  1 +
 setup.cfg                               |  1 +
 spacy/matcher/matcher.pxd               |  1 +
 spacy/matcher/matcher.pyi               |  2 +-
 spacy/matcher/matcher.pyx               | 44 ++++++++++++++---
 spacy/tests/matcher/test_matcher_api.py | 48 +++++++++++++++++++++++++
 6 files changed, 80 insertions(+), 17 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 437dd415a..38b4cbf0d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,6 +18,7 @@ tqdm>=4.38.0,<5.0.0
 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
 jinja2
 langcodes>=3.2.0,<4.0.0
+rapidfuzz>=2.4.0,<3.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
diff --git a/setup.cfg b/setup.cfg
index 708300b04..536322ab1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -63,6 +63,7 @@ install_requires =
     packaging>=20.0
     typing_extensions>=3.7.4,<4.2.0; python_version < "3.8"
     langcodes>=3.2.0,<4.0.0
+    rapidfuzz>=2.4.0,<3.0.0
 
 [options.entry_points]
 console_scripts =
diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd
index 455f978cc..b5e24e0e2 100644
--- a/spacy/matcher/matcher.pxd
+++ b/spacy/matcher/matcher.pxd
@@ -71,6 +71,7 @@ cdef class Matcher:
     cdef vector[TokenPatternC*] patterns
     cdef readonly Vocab vocab
     cdef public object validate
+    cdef public object fuzzy
     cdef public object _patterns
     cdef public object _callbacks
     cdef public object _filter
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index 390629ff8..c7f487450 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -5,7 +5,7 @@ from ..vocab import Vocab
 from ..tokens import Doc, Span
 
 class Matcher:
-    def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ...
+    def __init__(self, vocab: Vocab, validate: bool = ..., fuzzy: float = ...) -> None: ...
     def __reduce__(self) -> Any: ...
     def __len__(self) -> int: ...
     def __contains__(self, key: str) -> bool: ...
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 5105f69ed..0d847c219 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -10,6 +10,7 @@ from murmurhash.mrmr cimport hash64
 import re
 import srsly
 import warnings
+from rapidfuzz import fuzz_cpp
 
 from ..typedefs cimport attr_t
 from ..structs cimport TokenC
@@ -19,6 +20,7 @@ from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..tokens.morphanalysis cimport MorphAnalysis
 from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
+from ..attrs cimport LOWER, NORM
 
 from ..schemas import validate_token_pattern
 from ..errors import Errors, MatchPatternError, Warnings
@@ -36,7 +38,7 @@ cdef class Matcher:
     USAGE: https://spacy.io/usage/rule-based-matching
     """
 
-    def __init__(self, vocab, validate=True):
+    def __init__(self, vocab, validate=True, fuzzy=None):
         """Create the Matcher.
 
         vocab (Vocab): The vocabulary object, which must be shared with the
@@ -51,6 +53,7 @@ cdef class Matcher:
         self.vocab = vocab
         self.mem = Pool()
         self.validate = validate
+        self.fuzzy = fuzzy if fuzzy is not None else 0
 
     def __reduce__(self):
         data = (self.vocab, self._patterns, self._callbacks)
@@ -253,7 +256,8 @@ cdef class Matcher:
             matches = []
         else:
             matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
-                                   extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
+                                   extensions=self._extensions, predicates=self._extra_predicates,
+                                   with_alignments=with_alignments, fuzzy=self.fuzzy)
         final_matches = []
         pairs_by_id = {}
         # For each key, either add all matches, or only the filtered,
@@ -334,7 +338,7 @@ def unpickle_matcher(vocab, patterns, callbacks):
     return matcher
 
 
-cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0):
+cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0, float fuzzy=0):
     """Find matches in a doc, with a compiled array of patterns. Matches are
     returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0)
 
@@ -379,7 +383,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
         if with_alignments != 0:
             align_states.resize(states.size())
         transition_states(states, matches, align_states, align_matches, predicate_cache,
-                          doclike[i], extra_attr_values, predicates, with_alignments)
+                          doclike[i], extra_attr_values, predicates, with_alignments, fuzzy)
         extra_attr_values += nr_extra_attr
         predicate_cache += len(predicates)
     # Handle matches that end in 0-width patterns
@@ -408,7 +412,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
 cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
                             vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
                             int8_t* cached_py_predicates,
-                            Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *:
+                            Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments, float fuzzy) except *:
     cdef int q = 0
     cdef vector[PatternStateC] new_states
     cdef vector[vector[MatchAlignmentC]] align_new_states
@@ -417,8 +421,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
         if states[i].pattern.nr_py >= 1:
             update_predicate_cache(cached_py_predicates,
                 states[i].pattern, token, py_predicates)
-        action = get_action(states[i], token.c, extra_attrs,
-                            cached_py_predicates)
+        action = get_action(states[i], token, extra_attrs,
+                            cached_py_predicates, fuzzy)
        if action == REJECT:
            continue
        # Keep only a subset of states (the active ones). Index q is the
@@ -454,8 +458,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
         if states[q].pattern.nr_py != 0:
             update_predicate_cache(cached_py_predicates,
                 states[q].pattern, token, py_predicates)
-        action = get_action(states[q], token.c, extra_attrs,
-                            cached_py_predicates)
+        action = get_action(states[q], token, extra_attrs,
+                            cached_py_predicates, fuzzy)
         # Update alignment before the transition of current state
         if with_alignments != 0:
             align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length))
@@ -566,8 +570,8 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
 
 
 cdef action_t get_action(PatternStateC state,
-                         const TokenC* token, const attr_t* extra_attrs,
-                         const int8_t* predicate_matches) nogil:
+                         Token token, const attr_t* extra_attrs,
+                         const int8_t* predicate_matches, float fuzzy) nogil:
     """We need to consider:
     a) Does the token match the specification? [Yes, No]
     b) What's the quantifier? [1, 0+, ?]
@@ -626,7 +630,7 @@ cdef action_t get_action(PatternStateC state,
       Problem: If a quantifier is matching, we're adding a lot of open partials
     """
     cdef int8_t is_match
-    is_match = get_is_match(state, token, extra_attrs, predicate_matches)
+    is_match = get_is_match(state, token, extra_attrs, predicate_matches, fuzzy)
     quantifier = get_quantifier(state)
     is_final = get_is_final(state)
     if quantifier == ZERO:
@@ -678,16 +682,24 @@ cdef action_t get_action(PatternStateC state,
 
 
 cdef int8_t get_is_match(PatternStateC state,
-                         const TokenC* token, const attr_t* extra_attrs,
-                         const int8_t* predicate_matches) nogil:
+                         Token token, const attr_t* extra_attrs,
+                         const int8_t* predicate_matches, float fuzzy) nogil:
     for i in range(state.pattern.nr_py):
         if predicate_matches[state.pattern.py_predicates[i]] == -1:
             return 0
     spec = state.pattern
     if spec.nr_attr > 0:
         for attr in spec.attrs[:spec.nr_attr]:
-            if get_token_attr_for_matcher(token, attr.attr) != attr.value:
-                return 0
+            token_attr_value = get_token_attr_for_matcher(token.c, attr.attr)
+            if token_attr_value != attr.value:
+                if fuzzy != 0 and (attr.attr == ORTH or attr.attr == LEMMA
+                                   or attr.attr == LOWER or attr.attr == NORM):
+                    with gil:
+                        if fuzz_cpp.ratio(token.vocab.strings[token_attr_value],
+                                          token.vocab.strings[attr.value]) < fuzzy:
+                            return 0
+                else:
+                    return 0
     for i in range(spec.nr_extra_attr):
         if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]:
             return 0
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 7c16da9f8..c29a349af 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -118,6 +118,54 @@ def test_matcher_match_multi(matcher):
     ]
 
 
+def test_matcher_match_fuzz_all(en_vocab):
+    rules = {
+        "JS": [[{"ORTH": "JavaScript"}]],
+        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
+        "Java": [[{"LOWER": "java"}]],
+    }
+    matcher = Matcher(en_vocab, fuzzy=80)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns)
+
+    words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["GoogleNow"], 2, 4),
+        (doc.vocab.strings["Java"], 5, 6),
+    ]
+
+def test_matcher_match_fuzz_some(en_vocab):
+    rules = {
+        "JS": [[{"ORTH": "JavaScript"}]],
+        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
+        "Java": [[{"LOWER": "java"}]],
+    }
+    matcher = Matcher(en_vocab, fuzzy=85)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns)
+
+    words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == [
+        (doc.vocab.strings["Java"], 5, 6),
+    ]
+
+def test_matcher_match_fuzz_none(en_vocab):
+    rules = {
+        "JS": [[{"ORTH": "JavaScript"}]],
+        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
+        "Java": [[{"LOWER": "java"}]],
+    }
+    matcher = Matcher(en_vocab, fuzzy=90)
+    for key, patterns in rules.items():
+        matcher.add(key, patterns)
+
+    words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"]
+    doc = Doc(matcher.vocab, words=words)
+    assert matcher(doc) == []
+
+
 def test_matcher_empty_dict(en_vocab):
     """Test matcher allows empty token specs, meaning match on any token."""
     matcher = Matcher(en_vocab)
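A quick usage sketch of what the patch enables, mirroring the tests above. This is illustrative only, not part of the diff; it assumes a spaCy build with the patch applied, so that Matcher accepts the new fuzzy threshold:

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    # fuzzy=80: a token may deviate from the pattern text as long as the
    # rapidfuzz similarity ratio stays at or above 80 (scale 0-100).
    matcher = Matcher(nlp.vocab, fuzzy=80)
    matcher.add("GoogleNow", [[{"ORTH": "Google"}, {"ORTH": "Now"}]])

    doc = nlp("I like Goggle Noww best")  # both tokens misspelled
    for match_id, start, end in matcher(doc):
        print(nlp.vocab.strings[match_id], doc[start:end].text)
    # per the tests above, this should print: GoogleNow Goggle Noww

Note that the threshold is set once per Matcher instance and applies to every ORTH, LOWER, NORM, and LEMMA comparison in every pattern added to it; there is no per-pattern override in this patch.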
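The thresholds exercised by the tests (80, 85, 90) follow directly from how rapidfuzz scores string pairs: ratio is a normalized InDel similarity on a 0-100 scale, 100 * (1 - distance / (len(a) + len(b))), where a substitution costs one deletion plus one insertion. The patch calls this through the fuzz_cpp backend module; the public rapidfuzz.fuzz module exposes the same ratio function and can be used to check the numbers:

    from rapidfuzz import fuzz

    # "Google" -> "Goggle": one substitution = 2 edits over 12 chars
    print(fuzz.ratio("Google", "Goggle"))  # 83.33 -> passes 80, fails 85
    # "Now" -> "Noww": one insertion = 1 edit over 7 chars
    print(fuzz.ratio("Now", "Noww"))       # 85.71 -> passes 85, fails 90
    # "java" -> "jav": one deletion = 1 edit over 7 chars
    print(fuzz.ratio("java", "jav"))       # 85.71 -> passes 85, fails 90

So fuzzy=80 matches both "GoogleNow" and "Java"; fuzzy=85 keeps only "Java", because "Goggle" scores 83.33 and knocks out the two-token "GoogleNow" pattern even though "Noww" still passes; and fuzzy=90 matches nothing.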
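For readers who would rather not trace the Cython, the heart of the get_is_match change can be paraphrased in pure Python as below. The names here (attrs_match, pattern_specs, FUZZABLE) are illustrative, not spaCy API: the real code compares interned hash values on C structs, resolves them to strings through token.vocab.strings, and reacquires the GIL only for the ratio call:

    from rapidfuzz import fuzz

    # The four attributes the patch whitelists for fuzzy comparison.
    FUZZABLE = {"ORTH", "LEMMA", "LOWER", "NORM"}

    def attrs_match(token_attrs, pattern_specs, fuzzy=0):
        for attr, wanted in pattern_specs:
            if token_attrs[attr] == wanted:
                continue  # exact match, same as before the patch
            # New behavior: tolerate near-misses on string attributes.
            if fuzzy != 0 and attr in FUZZABLE:
                if fuzz.ratio(token_attrs[attr], wanted) >= fuzzy:
                    continue
            return False  # mismatch: the token rejects this pattern state
        return True

    assert attrs_match({"ORTH": "Goggle"}, [("ORTH", "Google")], fuzzy=80)
    assert not attrs_match({"ORTH": "Goggle"}, [("ORTH", "Google")])  # fuzzy off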
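One behavioral detail visible in the diff: __reduce__ still pickles only the vocab, patterns, and callbacks, and unpickle_matcher reconstructs the Matcher without passing fuzzy, so a matcher that round-trips through pickle comes back with the default of 0, i.e. exact matching.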