From 1f2e57eca4ccbb283cda9b82d400872cbbec81c9 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 22 Aug 2022 17:02:47 +0200 Subject: [PATCH 01/15] enable fuzzy matching --- requirements.txt | 1 + setup.cfg | 1 + spacy/matcher/matcher.pxd | 1 + spacy/matcher/matcher.pyi | 2 +- spacy/matcher/matcher.pyx | 44 ++++++++++++++--------- spacy/tests/matcher/test_matcher_api.py | 48 +++++++++++++++++++++++++ 6 files changed, 80 insertions(+), 17 deletions(-) diff --git a/requirements.txt b/requirements.txt index 437dd415a..38b4cbf0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,7 @@ tqdm>=4.38.0,<5.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 langcodes>=3.2.0,<4.0.0 +rapidfuzz>=2.4.0,<3.0.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index 708300b04..536322ab1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -63,6 +63,7 @@ install_requires = packaging>=20.0 typing_extensions>=3.7.4,<4.2.0; python_version < "3.8" langcodes>=3.2.0,<4.0.0 + rapidfuzz>=2.4.0,<3.0.0 [options.entry_points] console_scripts = diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index 455f978cc..b5e24e0e2 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -71,6 +71,7 @@ cdef class Matcher: cdef vector[TokenPatternC*] patterns cdef readonly Vocab vocab cdef public object validate + cdef public object fuzzy cdef public object _patterns cdef public object _callbacks cdef public object _filter diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index 390629ff8..c7f487450 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -5,7 +5,7 @@ from ..vocab import Vocab from ..tokens import Doc, Span class Matcher: - def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ... + def __init__(self, vocab: Vocab, validate: bool = ..., fuzzy: float = ...) -> None: ... def __reduce__(self) -> Any: ... def __len__(self) -> int: ... def __contains__(self, key: str) -> bool: ... diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 5105f69ed..0d847c219 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -10,6 +10,7 @@ from murmurhash.mrmr cimport hash64 import re import srsly import warnings +from rapidfuzz import fuzz_cpp from ..typedefs cimport attr_t from ..structs cimport TokenC @@ -19,6 +20,7 @@ from ..tokens.span cimport Span from ..tokens.token cimport Token from ..tokens.morphanalysis cimport MorphAnalysis from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB +from ..attrs cimport LOWER, NORM from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings @@ -36,7 +38,7 @@ cdef class Matcher: USAGE: https://spacy.io/usage/rule-based-matching """ - def __init__(self, vocab, validate=True): + def __init__(self, vocab, validate=True, fuzzy=None): """Create the Matcher. 
vocab (Vocab): The vocabulary object, which must be shared with the @@ -51,6 +53,7 @@ cdef class Matcher: self.vocab = vocab self.mem = Pool() self.validate = validate + self.fuzzy = fuzzy if fuzzy is not None else 0 def __reduce__(self): data = (self.vocab, self._patterns, self._callbacks) @@ -253,7 +256,8 @@ cdef class Matcher: matches = [] else: matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, - extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments) + extensions=self._extensions, predicates=self._extra_predicates, + with_alignments=with_alignments, fuzzy=self.fuzzy) final_matches = [] pairs_by_id = {} # For each key, either add all matches, or only the filtered, @@ -334,7 +338,7 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher -cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0): +cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0, float fuzzy=0): """Find matches in a doc, with a compiled array of patterns. Matches are returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0) @@ -379,7 +383,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e if with_alignments != 0: align_states.resize(states.size()) transition_states(states, matches, align_states, align_matches, predicate_cache, - doclike[i], extra_attr_values, predicates, with_alignments) + doclike[i], extra_attr_values, predicates, with_alignments, fuzzy) extra_attr_values += nr_extra_attr predicate_cache += len(predicates) # Handle matches that end in 0-width patterns @@ -408,7 +412,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches, int8_t* cached_py_predicates, - Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *: + Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments, float fuzzy) except *: cdef int q = 0 cdef vector[PatternStateC] new_states cdef vector[vector[MatchAlignmentC]] align_new_states @@ -417,8 +421,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match if states[i].pattern.nr_py >= 1: update_predicate_cache(cached_py_predicates, states[i].pattern, token, py_predicates) - action = get_action(states[i], token.c, extra_attrs, - cached_py_predicates) + action = get_action(states[i], token, extra_attrs, + cached_py_predicates, fuzzy) if action == REJECT: continue # Keep only a subset of states (the active ones). 
Index q is the @@ -454,8 +458,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match if states[q].pattern.nr_py != 0: update_predicate_cache(cached_py_predicates, states[q].pattern, token, py_predicates) - action = get_action(states[q], token.c, extra_attrs, - cached_py_predicates) + action = get_action(states[q], token, extra_attrs, + cached_py_predicates, fuzzy) # Update alignment before the transition of current state if with_alignments != 0: align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length)) @@ -566,8 +570,8 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states, cdef action_t get_action(PatternStateC state, - const TokenC* token, const attr_t* extra_attrs, - const int8_t* predicate_matches) nogil: + Token token, const attr_t* extra_attrs, + const int8_t* predicate_matches, float fuzzy) nogil: """We need to consider: a) Does the token match the specification? [Yes, No] b) What's the quantifier? [1, 0+, ?] @@ -626,7 +630,7 @@ cdef action_t get_action(PatternStateC state, Problem: If a quantifier is matching, we're adding a lot of open partials """ cdef int8_t is_match - is_match = get_is_match(state, token, extra_attrs, predicate_matches) + is_match = get_is_match(state, token, extra_attrs, predicate_matches, fuzzy) quantifier = get_quantifier(state) is_final = get_is_final(state) if quantifier == ZERO: @@ -678,16 +682,24 @@ cdef action_t get_action(PatternStateC state, cdef int8_t get_is_match(PatternStateC state, - const TokenC* token, const attr_t* extra_attrs, - const int8_t* predicate_matches) nogil: + Token token, const attr_t* extra_attrs, + const int8_t* predicate_matches, float fuzzy) nogil: for i in range(state.pattern.nr_py): if predicate_matches[state.pattern.py_predicates[i]] == -1: return 0 spec = state.pattern if spec.nr_attr > 0: for attr in spec.attrs[:spec.nr_attr]: - if get_token_attr_for_matcher(token, attr.attr) != attr.value: - return 0 + token_attr_value = get_token_attr_for_matcher(token.c, attr.attr) + if token_attr_value != attr.value: + if fuzzy != 0 and (attr.attr == ORTH or attr.attr == LEMMA + or attr.attr == LOWER or attr.attr == NORM): + with gil: + if fuzz_cpp.ratio(token.vocab.strings[token_attr_value], + token.vocab.strings[attr.value]) < fuzzy: + return 0 + else: + return 0 for i in range(spec.nr_extra_attr): if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]: return 0 diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 7c16da9f8..c29a349af 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -118,6 +118,54 @@ def test_matcher_match_multi(matcher): ] +def test_matcher_match_fuzz_all(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 2, 4), + (doc.vocab.strings["Java"], 5, 6), + ] + +def test_matcher_match_fuzz_some(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], + } + matcher = Matcher(en_vocab, fuzzy=85) + for key, patterns in rules.items(): + matcher.add(key, patterns) + 
+ words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["Java"], 5, 6), + ] + +def test_matcher_match_fuzz_none(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], + } + matcher = Matcher(en_vocab, fuzzy=90) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [] + + def test_matcher_empty_dict(en_vocab): """Test matcher allows empty token specs, meaning match on any token.""" matcher = Matcher(en_vocab) From b617382dc65432956accb91f150e52d79019dcaa Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Wed, 24 Aug 2022 13:13:27 +0200 Subject: [PATCH 02/15] add fuzzy param to EntityMatcher --- spacy/pipeline/entityruler.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 3cb1ca676..d1b05334e 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -28,6 +28,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] "overwrite_ents": False, "ent_id_sep": DEFAULT_ENT_ID_SEP, "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, + "fuzzy": None, }, default_score_weights={ "ents_f": 1.0, @@ -44,6 +45,7 @@ def make_entity_ruler( overwrite_ents: bool, ent_id_sep: str, scorer: Optional[Callable], + fuzzy: Optional[float], ): return EntityRuler( nlp, @@ -53,6 +55,7 @@ def make_entity_ruler( overwrite_ents=overwrite_ents, ent_id_sep=ent_id_sep, scorer=scorer, + fuzzy=fuzzy, ) @@ -87,6 +90,7 @@ class EntityRuler(Pipe): ent_id_sep: str = DEFAULT_ENT_ID_SEP, patterns: Optional[List[PatternType]] = None, scorer: Optional[Callable] = entity_ruler_score, + fuzzy: Optional[float] = None, ) -> None: """Initialize the entity ruler. 
If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` @@ -118,7 +122,7 @@ class EntityRuler(Pipe): self.token_patterns = defaultdict(list) # type: ignore self.phrase_patterns = defaultdict(list) # type: ignore self._validate = validate - self.matcher = Matcher(nlp.vocab, validate=validate) + self.matcher = Matcher(nlp.vocab, validate=validate, fuzzy=fuzzy) self.phrase_matcher_attr = phrase_matcher_attr self.phrase_matcher = PhraseMatcher( nlp.vocab, attr=self.phrase_matcher_attr, validate=validate @@ -128,6 +132,7 @@ class EntityRuler(Pipe): if patterns is not None: self.add_patterns(patterns) self.scorer = scorer + self.fuzzy = fuzzy def __len__(self) -> int: """The number of all patterns added to the entity ruler.""" @@ -338,7 +343,7 @@ class EntityRuler(Pipe): self.token_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list) self._ent_ids = defaultdict(tuple) - self.matcher = Matcher(self.nlp.vocab, validate=self._validate) + self.matcher = Matcher(self.nlp.vocab, validate=self._validate, fuzzy=self.fuzzy) self.phrase_matcher = PhraseMatcher( self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate ) From ee985a382e47729cba079c03c0cc5b15a618f6eb Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Wed, 24 Aug 2022 13:13:54 +0200 Subject: [PATCH 03/15] include rapidfuzz_capi not yet used --- requirements.txt | 1 + setup.cfg | 3 ++- setup.py | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 38b4cbf0d..47dcede1f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 langcodes>=3.2.0,<4.0.0 rapidfuzz>=2.4.0,<3.0.0 +rapidfuzz_capi>=1.0.5,<2.0.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index 536322ab1..658683df7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -58,12 +58,13 @@ install_requires = requests>=2.13.0,<3.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 + rapidfuzz>=2.4.0,<3.0.0 + rapidfuzz_capi>=1.0.5,<2.0.0 # Official Python utilities setuptools packaging>=20.0 typing_extensions>=3.7.4,<4.2.0; python_version < "3.8" langcodes>=3.2.0,<4.0.0 - rapidfuzz>=2.4.0,<3.0.0 [options.entry_points] console_scripts = diff --git a/setup.py b/setup.py index ec1bd35fa..413c55d22 100755 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ from Cython.Build import cythonize from Cython.Compiler import Options import os import subprocess +import rapidfuzz_capi ROOT = Path(__file__).parent @@ -202,6 +203,7 @@ def setup_package(): include_dirs = [ numpy.get_include(), + rapidfuzz_capi.get_include(), get_python_inc(plat_specific=True), ] ext_modules = [] From 9600fe1d99923d57666332d5d62399a1e7ed8873 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Wed, 24 Aug 2022 15:04:09 +0200 Subject: [PATCH 04/15] fix type --- spacy/pipeline/entityruler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index d1b05334e..e5852e4e8 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -90,7 +90,7 @@ class EntityRuler(Pipe): ent_id_sep: str = DEFAULT_ENT_ID_SEP, patterns: Optional[List[PatternType]] = None, scorer: Optional[Callable] = entity_ruler_score, - fuzzy: Optional[float] = None, + fuzzy: float = None, ) -> None: """Initialize the entity ruler. 
If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` From 3dc5b9c7be99854c146e2ab14ff3c7750a2f934e Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Wed, 24 Aug 2022 17:54:42 +0200 Subject: [PATCH 05/15] add FUZZY predicate --- spacy/matcher/matcher.pyx | 41 +++++++++++++++++++------ spacy/schemas.py | 2 ++ spacy/tests/matcher/test_matcher_api.py | 16 ++++++++++ 3 files changed, 50 insertions(+), 9 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 0d847c219..b4f0a3f5e 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -131,7 +131,7 @@ cdef class Matcher: for pattern in patterns: try: specs = _preprocess_pattern(pattern, self.vocab, - self._extensions, self._extra_predicates) + self._extensions, self._extra_predicates, self.fuzzy) self.patterns.push_back(init_pattern(self.mem, key, specs)) for spec in specs: for attr, _ in spec[1]: @@ -766,7 +766,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: return id_attr.value -def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): +def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy): """This function interprets the pattern, converting the various bits of syntactic sugar before we compile it into a struct with init_pattern. @@ -793,7 +793,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): ops = _get_operators(spec) attr_values = _get_attr_values(spec, string_store) extensions = _get_extensions(spec, string_store, extensions_table) - predicates = _get_extra_predicates(spec, extra_predicates, vocab) + predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy) for op in ops: tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx)) return tokens @@ -838,10 +838,32 @@ def _get_attr_values(spec, string_store): # These predicate helper classes are used to match the REGEX, IN, >= etc # extensions to the matcher introduced in #3173. 
+class _FuzzyPredicate: + operators = ("FUZZY",) + + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): + self.i = i + self.attr = attr + self.value = value + self.predicate = predicate + self.is_extension = is_extension + self.fuzzy = fuzzy + self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + if self.predicate not in self.operators: + raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) + + def __call__(self, Token token): + if self.is_extension: + value = token._.get(self.attr) + else: + value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] + return bool(fuzz_cpp.ratio(self.value, value) >= self.fuzzy) + + class _RegexPredicate: operators = ("REGEX",) - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): self.i = i self.attr = attr self.value = re.compile(value) @@ -862,7 +884,7 @@ class _RegexPredicate: class _SetPredicate: operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS") - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): self.i = i self.attr = attr self.vocab = vocab @@ -894,9 +916,9 @@ class _SetPredicate: else: value = set(get_string_id(v) for v in value) if self.predicate == "IN": - return value in self.value + return value in self.value # handle fuzzy elif self.predicate == "NOT_IN": - return value not in self.value + return value not in self.value # handle fuzzy elif self.predicate == "IS_SUBSET": return value <= self.value elif self.predicate == "IS_SUPERSET": @@ -940,8 +962,9 @@ class _ComparisonPredicate: return value < self.value -def _get_extra_predicates(spec, extra_predicates, vocab): +def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy): predicate_types = { + "FUZZY": _FuzzyPredicate, "REGEX": _RegexPredicate, "IN": _SetPredicate, "NOT_IN": _SetPredicate, @@ -975,7 +998,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab): value_with_upper_keys = {k.upper(): v for k, v in value.items()} for type_, cls in predicate_types.items(): if type_ in value_with_upper_keys: - predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab) + predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab, fuzzy=fuzzy) # Don't create a redundant predicates. # This helps with efficiency, as we're caching the results. 
if predicate.key in seen_predicates: diff --git a/spacy/schemas.py b/spacy/schemas.py index 9f91451a9..2677378d6 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -157,6 +157,7 @@ def validate_token_pattern(obj: list) -> List[str]: class TokenPatternString(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") + FUZZY: Optional[StrictStr] = Field(None, alias="fuzzy") IN: Optional[List[StrictStr]] = Field(None, alias="in") NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset") @@ -176,6 +177,7 @@ class TokenPatternString(BaseModel): class TokenPatternNumber(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") + FUZZY: Optional[StrictStr] = Field(None, alias="fuzzy") IN: Optional[List[StrictInt]] = Field(None, alias="in") NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in") IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset") diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index c29a349af..595488bf4 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -166,6 +166,22 @@ def test_matcher_match_fuzz_none(en_vocab): assert matcher(doc) == [] +def test_matcher_match_fuzz_pred(en_vocab): + rules = { + "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]], + "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Now", "and", "JavaScrpt", "best"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [] + + + def test_matcher_empty_dict(en_vocab): """Test matcher allows empty token specs, meaning match on any token.""" matcher = Matcher(en_vocab) From 78699ab0ce105720203c95a4bcd3e9c729090819 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Fri, 26 Aug 2022 00:10:53 +0200 Subject: [PATCH 06/15] add fuzzy attribute list --- spacy/matcher/matcher.pxd | 1 + spacy/matcher/matcher.pyi | 3 +- spacy/matcher/matcher.pyx | 66 ++++++++++----- spacy/tests/matcher/test_matcher_api.py | 105 ++++++++++++++++-------- 4 files changed, 119 insertions(+), 56 deletions(-) diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index b5e24e0e2..98041e199 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -72,6 +72,7 @@ cdef class Matcher: cdef readonly Vocab vocab cdef public object validate cdef public object fuzzy + cdef public object fuzzy_attrs cdef public object _patterns cdef public object _callbacks cdef public object _filter diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index c7f487450..676be6a45 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -5,7 +5,8 @@ from ..vocab import Vocab from ..tokens import Doc, Span class Matcher: - def __init__(self, vocab: Vocab, validate: bool = ..., fuzzy: float = ...) -> None: ... + def __init__(self, vocab: Vocab, validate: bool = ..., + fuzzy: float = ..., fuzzy_attrs: list = ...) -> None: ... def __reduce__(self) -> Any: ... def __len__(self) -> int: ... def __contains__(self, key: str) -> bool: ... 
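
For reference while reviewing: the widened constructor signature above is exercised by the tests added further down in this patch. A minimal usage sketch of the API as it stands at this commit (assuming an English Vocab such as the en_vocab test fixture; later commits in the series rework this interface):

    from spacy.matcher import Matcher
    from spacy.tokens import Doc

    # Fuzzy matching at ratio >= 80, restricted to the ORTH and LOWER attributes.
    matcher = Matcher(en_vocab, fuzzy=80, fuzzy_attrs=["ORTH", "LOWER"])
    matcher.add("GoogleNow", [[{"ORTH": "Google"}, {"ORTH": "Now"}]])
    doc = Doc(matcher.vocab, words=["I", "like", "Goggle", "Now"])
    matches = matcher(doc)  # "Goggle" ~ "Google" clears the 80 threshold
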
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index b4f0a3f5e..17d965eaa 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -38,7 +38,7 @@ cdef class Matcher: USAGE: https://spacy.io/usage/rule-based-matching """ - def __init__(self, vocab, validate=True, fuzzy=None): + def __init__(self, vocab, validate=True, fuzzy=None, fuzzy_attrs=None): """Create the Matcher. vocab (Vocab): The vocabulary object, which must be shared with the @@ -54,6 +54,7 @@ cdef class Matcher: self.mem = Pool() self.validate = validate self.fuzzy = fuzzy if fuzzy is not None else 0 + self.fuzzy_attrs = [IDS.get(attr) for attr in fuzzy_attrs] if fuzzy_attrs else [] def __reduce__(self): data = (self.vocab, self._patterns, self._callbacks) @@ -131,7 +132,8 @@ cdef class Matcher: for pattern in patterns: try: specs = _preprocess_pattern(pattern, self.vocab, - self._extensions, self._extra_predicates, self.fuzzy) + self._extensions, self._extra_predicates, + self.fuzzy, self.fuzzy_attrs) self.patterns.push_back(init_pattern(self.mem, key, specs)) for spec in specs: for attr, _ in spec[1]: @@ -257,7 +259,8 @@ cdef class Matcher: else: matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, extensions=self._extensions, predicates=self._extra_predicates, - with_alignments=with_alignments, fuzzy=self.fuzzy) + with_alignments=with_alignments, + fuzzy=self.fuzzy, fuzzy_attrs=self.fuzzy_attrs) final_matches = [] pairs_by_id = {} # For each key, either add all matches, or only the filtered, @@ -338,7 +341,7 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher -cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0, float fuzzy=0): +cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0, float fuzzy=0, list fuzzy_attrs=[]): """Find matches in a doc, with a compiled array of patterns. 
Matches are returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0) @@ -357,6 +360,9 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e cdef PatternStateC state cdef int i, j, nr_extra_attr cdef Pool mem = Pool() + cdef int8_t* fuzzy_attrs_array + cdef int n_fuzzy_attrs = len(fuzzy_attrs) + output = [] if length == 0: # avoid any processing or mem alloc if the document is empty @@ -375,6 +381,10 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e if isinstance(value, str): value = token.vocab.strings[value] extra_attr_values[i * nr_extra_attr + index] = value + if n_fuzzy_attrs > 0: + fuzzy_attrs_array = mem.alloc(n_fuzzy_attrs, sizeof(int8_t)) + for i in range(n_fuzzy_attrs): + fuzzy_attrs_array[i] = fuzzy_attrs[i] # Main loop cdef int nr_predicate = len(predicates) for i in range(length): @@ -383,7 +393,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e if with_alignments != 0: align_states.resize(states.size()) transition_states(states, matches, align_states, align_matches, predicate_cache, - doclike[i], extra_attr_values, predicates, with_alignments, fuzzy) + doclike[i], extra_attr_values, predicates, with_alignments, + fuzzy, fuzzy_attrs_array, n_fuzzy_attrs) extra_attr_values += nr_extra_attr predicate_cache += len(predicates) # Handle matches that end in 0-width patterns @@ -412,7 +423,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches, int8_t* cached_py_predicates, - Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments, float fuzzy) except *: + Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments, + float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) except *: cdef int q = 0 cdef vector[PatternStateC] new_states cdef vector[vector[MatchAlignmentC]] align_new_states @@ -422,7 +434,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match update_predicate_cache(cached_py_predicates, states[i].pattern, token, py_predicates) action = get_action(states[i], token, extra_attrs, - cached_py_predicates, fuzzy) + cached_py_predicates, + fuzzy, fuzzy_attrs, n_fuzzy_attrs) if action == REJECT: continue # Keep only a subset of states (the active ones). Index q is the @@ -459,7 +472,8 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match update_predicate_cache(cached_py_predicates, states[q].pattern, token, py_predicates) action = get_action(states[q], token, extra_attrs, - cached_py_predicates, fuzzy) + cached_py_predicates, + fuzzy, fuzzy_attrs, n_fuzzy_attrs) # Update alignment before the transition of current state if with_alignments != 0: align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length)) @@ -571,7 +585,8 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states, cdef action_t get_action(PatternStateC state, Token token, const attr_t* extra_attrs, - const int8_t* predicate_matches, float fuzzy) nogil: + const int8_t* predicate_matches, + float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) nogil: """We need to consider: a) Does the token match the specification? [Yes, No] b) What's the quantifier? [1, 0+, ?] 
@@ -630,7 +645,8 @@ cdef action_t get_action(PatternStateC state, Problem: If a quantifier is matching, we're adding a lot of open partials """ cdef int8_t is_match - is_match = get_is_match(state, token, extra_attrs, predicate_matches, fuzzy) + is_match = get_is_match(state, token, extra_attrs, predicate_matches, + fuzzy, fuzzy_attrs, n_fuzzy_attrs) quantifier = get_quantifier(state) is_final = get_is_final(state) if quantifier == ZERO: @@ -683,7 +699,8 @@ cdef action_t get_action(PatternStateC state, cdef int8_t get_is_match(PatternStateC state, Token token, const attr_t* extra_attrs, - const int8_t* predicate_matches, float fuzzy) nogil: + const int8_t* predicate_matches, + float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) nogil: for i in range(state.pattern.nr_py): if predicate_matches[state.pattern.py_predicates[i]] == -1: return 0 @@ -692,16 +709,22 @@ cdef int8_t get_is_match(PatternStateC state, for attr in spec.attrs[:spec.nr_attr]: token_attr_value = get_token_attr_for_matcher(token.c, attr.attr) if token_attr_value != attr.value: - if fuzzy != 0 and (attr.attr == ORTH or attr.attr == LEMMA - or attr.attr == LOWER or attr.attr == NORM): - with gil: - if fuzz_cpp.ratio(token.vocab.strings[token_attr_value], - token.vocab.strings[attr.value]) < fuzzy: - return 0 + if fuzzy != 0: # and n_fuzzy_attrs > 0: + fuzzy_match = False + for i in range(n_fuzzy_attrs): + if attr.attr == fuzzy_attrs[i]: + with gil: + if fuzz_cpp.ratio(token.vocab.strings[token_attr_value], + token.vocab.strings[attr.value]) >= fuzzy: + fuzzy_match = True + break + if not fuzzy_match: + return 0 else: return 0 for i in range(spec.nr_extra_attr): if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]: + # TODO: fuzzy match return 0 return True @@ -766,7 +789,8 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: return id_attr.value -def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy): +def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, + fuzzy, fuzzy_attrs): """This function interprets the pattern, converting the various bits of syntactic sugar before we compile it into a struct with init_pattern. 
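
The per-token FUZZY predicate from the earlier commit composes with these matcher-level settings; a sketch mirroring the test cases below, again assuming the en_vocab fixture and the API as of this commit (the final commit in the series switches to FUZZYn predicates and drops the constructor argument):

    from spacy.matcher import Matcher
    from spacy.tokens import Doc

    # FUZZY on a single token; the threshold is taken from the Matcher's fuzzy argument.
    matcher = Matcher(en_vocab, fuzzy=80)
    matcher.add("GoogleNow", [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]])
    doc = Doc(matcher.vocab, words=["I", "like", "Goggle", "Now"])
    assert matcher(doc) == [(doc.vocab.strings["GoogleNow"], 2, 4)]
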
@@ -916,9 +940,9 @@ class _SetPredicate: else: value = set(get_string_id(v) for v in value) if self.predicate == "IN": - return value in self.value # handle fuzzy + return value in self.value # TODO: handle fuzzy elif self.predicate == "NOT_IN": - return value not in self.value # handle fuzzy + return value not in self.value # TODO: handle fuzzy elif self.predicate == "IS_SUBSET": return value <= self.value elif self.predicate == "IS_SUPERSET": @@ -933,7 +957,7 @@ class _SetPredicate: class _ComparisonPredicate: operators = ("==", "!=", ">=", "<=", ">", "<") - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): self.i = i self.attr = attr self.value = value diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 595488bf4..798222cc3 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -6,15 +6,16 @@ from spacy.tokens import Doc, Token, Span from ..doc.test_underscore import clean_underscore # noqa: F401 +matcher_rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], +} + @pytest.fixture def matcher(en_vocab): - rules = { - "JS": [[{"ORTH": "JavaScript"}]], - "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": "java"}]], - } matcher = Matcher(en_vocab) - for key, patterns in rules.items(): + for key, patterns in matcher_rules.items(): matcher.add(key, patterns) return matcher @@ -118,57 +119,58 @@ def test_matcher_match_multi(matcher): ] +# fuzzy matches on specific attributes + def test_matcher_match_fuzz_all(en_vocab): - rules = { - "JS": [[{"ORTH": "JavaScript"}]], - "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": "java"}]], - } - matcher = Matcher(en_vocab, fuzzy=80) - for key, patterns in rules.items(): + matcher = Matcher(en_vocab, fuzzy=80, fuzzy_attrs=["ORTH", "LOWER"]) + for key, patterns in matcher_rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"] + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 2, 4), (doc.vocab.strings["Java"], 5, 6), + (doc.vocab.strings["JS"], 8, 9), + ] + +def test_matcher_match_fuzz_all_lower(en_vocab): + matcher = Matcher(en_vocab, fuzzy=80, fuzzy_attrs=["LOWER"]) + for key, patterns in matcher_rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["Java"], 5, 6), ] def test_matcher_match_fuzz_some(en_vocab): - rules = { - "JS": [[{"ORTH": "JavaScript"}]], - "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": "java"}]], - } - matcher = Matcher(en_vocab, fuzzy=85) - for key, patterns in rules.items(): + matcher = Matcher(en_vocab, fuzzy=85, fuzzy_attrs=["ORTH", "LOWER"]) + for key, patterns in matcher_rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"] + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["Java"], 5, 6), ] def test_matcher_match_fuzz_none(en_vocab): - rules = { - "JS": 
[[{"ORTH": "JavaScript"}]], - "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": "java"}]], - } - matcher = Matcher(en_vocab, fuzzy=90) - for key, patterns in rules.items(): + matcher = Matcher(en_vocab, fuzzy=90, fuzzy_attrs=["ORTH", "LOWER"]) + for key, patterns in matcher_rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Noww", "and", "Jav", "best"] + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [] -def test_matcher_match_fuzz_pred(en_vocab): +# fuzzy matches on specific tokens + +def test_matcher_match_fuzz_pred1(en_vocab): rules = { - "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]], + "JS": [[{"ORTH": "JavaScript"}]], "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], "Java": [[{"LOWER": "java"}]], } @@ -176,10 +178,45 @@ def test_matcher_match_fuzz_pred(en_vocab): for key, patterns in rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Now", "and", "JavaScrpt", "best"] + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [] + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 2, 4), + ] +def test_matcher_match_fuzz_pred2(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": {"FUZZY": "java"}}]], + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["Java"], 5, 6), + ] + +def test_matcher_match_fuzz_preds(en_vocab): + rules = { + "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]], + "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": {"FUZZY": "java"}}]], + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns) + + words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 2, 4), + (doc.vocab.strings["Java"], 5, 6), + (doc.vocab.strings["JS"], 8, 9), + ] def test_matcher_empty_dict(en_vocab): From c017de997a795445850e96bd2a413b2e27ea3c15 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Fri, 26 Aug 2022 01:30:44 +0200 Subject: [PATCH 07/15] fix type properly --- spacy/pipeline/entityruler.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index e5852e4e8..1e816ab16 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -26,9 +26,9 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] "phrase_matcher_attr": None, "validate": False, "overwrite_ents": False, + "fuzzy": 0.0, "ent_id_sep": DEFAULT_ENT_ID_SEP, "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, - "fuzzy": None, }, default_score_weights={ "ents_f": 1.0, @@ -43,9 +43,9 @@ def make_entity_ruler( phrase_matcher_attr: Optional[Union[int, str]], validate: bool, overwrite_ents: bool, + fuzzy: float, ent_id_sep: str, scorer: Optional[Callable], - fuzzy: Optional[float], ): return EntityRuler( nlp, @@ -53,9 +53,9 @@ def make_entity_ruler( phrase_matcher_attr=phrase_matcher_attr, validate=validate, 
overwrite_ents=overwrite_ents, + fuzzy=fuzzy, ent_id_sep=ent_id_sep, scorer=scorer, - fuzzy=fuzzy, ) @@ -87,10 +87,10 @@ class EntityRuler(Pipe): phrase_matcher_attr: Optional[Union[int, str]] = None, validate: bool = False, overwrite_ents: bool = False, + fuzzy: float = 0, ent_id_sep: str = DEFAULT_ENT_ID_SEP, patterns: Optional[List[PatternType]] = None, scorer: Optional[Callable] = entity_ruler_score, - fuzzy: float = None, ) -> None: """Initialize the entity ruler. If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` @@ -122,7 +122,8 @@ class EntityRuler(Pipe): self.token_patterns = defaultdict(list) # type: ignore self.phrase_patterns = defaultdict(list) # type: ignore self._validate = validate - self.matcher = Matcher(nlp.vocab, validate=validate, fuzzy=fuzzy) + self.fuzzy = fuzzy + self.matcher = Matcher(nlp.vocab, validate=validate, fuzzy=self.fuzzy) self.phrase_matcher_attr = phrase_matcher_attr self.phrase_matcher = PhraseMatcher( nlp.vocab, attr=self.phrase_matcher_attr, validate=validate @@ -132,7 +133,6 @@ class EntityRuler(Pipe): if patterns is not None: self.add_patterns(patterns) self.scorer = scorer - self.fuzzy = fuzzy def __len__(self) -> int: """The number of all patterns added to the entity ruler.""" From c03394810b6d46dc13e8c68ba962f75f45aeeb9c Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Fri, 26 Aug 2022 02:06:05 +0200 Subject: [PATCH 08/15] tidying --- pyproject.toml | 2 ++ setup.cfg | 1 + spacy/matcher/matcher.pyx | 7 +++---- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 317c5fdbe..37d041b6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,5 +8,7 @@ requires = [ "thinc>=8.1.0,<8.2.0", "pathy", "numpy>=1.15.0", + "rapidfuzz>=2.4.0,<3.0.0", + "rapidfuzz_capi>=1.0.5,<2.0.0", ] build-backend = "setuptools.build_meta" diff --git a/setup.cfg b/setup.cfg index 658683df7..91c73cb5c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,6 +34,7 @@ python_requires = >=3.6 setup_requires = cython>=0.25,<3.0 numpy>=1.15.0 + rapidfuzz_capi>=1.0.5,<2.0.0 # We also need our Cython packages here to compile against cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 17d965eaa..4a5468b98 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -20,7 +20,6 @@ from ..tokens.span cimport Span from ..tokens.token cimport Token from ..tokens.morphanalysis cimport MorphAnalysis from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB -from ..attrs cimport LOWER, NORM from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings @@ -258,8 +257,7 @@ cdef class Matcher: matches = [] else: matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, - extensions=self._extensions, predicates=self._extra_predicates, - with_alignments=with_alignments, + extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments, fuzzy=self.fuzzy, fuzzy_attrs=self.fuzzy_attrs) final_matches = [] pairs_by_id = {} @@ -341,7 +339,8 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher -cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0, float fuzzy=0, list fuzzy_attrs=[]): +cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint 
with_alignments=0, + float fuzzy=0, list fuzzy_attrs=[]): """Find matches in a doc, with a compiled array of patterns. Matches are returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0) From b189f25aaae1dc25b489adbe4e2a7127a49664c2 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 10:58:11 +0200 Subject: [PATCH 09/15] remove unnecessary dependency --- pyproject.toml | 1 - spacy/matcher/matcher.pyx | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 37d041b6d..b01055bdf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ requires = [ "thinc>=8.1.0,<8.2.0", "pathy", "numpy>=1.15.0", - "rapidfuzz>=2.4.0,<3.0.0", "rapidfuzz_capi>=1.0.5,<2.0.0", ] build-backend = "setuptools.build_meta" diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 4a5468b98..3badec56c 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -708,7 +708,7 @@ cdef int8_t get_is_match(PatternStateC state, for attr in spec.attrs[:spec.nr_attr]: token_attr_value = get_token_attr_for_matcher(token.c, attr.attr) if token_attr_value != attr.value: - if fuzzy != 0: # and n_fuzzy_attrs > 0: + if fuzzy: fuzzy_match = False for i in range(n_fuzzy_attrs): if attr.attr == fuzzy_attrs[i]: From 9bdccf94e5d0c46a78fa26a978dcd88867d1ef89 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 10:58:50 +0200 Subject: [PATCH 10/15] handle fuzzy sets --- spacy/matcher/matcher.pyx | 79 ++++++++++++++++++------- spacy/schemas.py | 2 +- spacy/tests/matcher/test_matcher_api.py | 44 ++++++++++++++ 3 files changed, 103 insertions(+), 22 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 3badec56c..f6a09b9f6 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -816,7 +816,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, ops = _get_operators(spec) attr_values = _get_attr_values(spec, string_store) extensions = _get_extensions(spec, string_store, extensions_table) - predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy) + predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs) for op in ops: tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx)) return tokens @@ -915,9 +915,14 @@ class _SetPredicate: # normalize morph strings self.value = set(self.vocab.morphology.add(v) for v in value) else: - self.value = set(get_string_id(v) for v in value) + if fuzzy: + # add to string store + self.value = set(self.vocab.strings.add(v) for v in value) + else: + self.value = set(get_string_id(v) for v in value) self.predicate = predicate self.is_extension = is_extension + self.fuzzy = fuzzy self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -939,9 +944,23 @@ class _SetPredicate: else: value = set(get_string_id(v) for v in value) if self.predicate == "IN": - return value in self.value # TODO: handle fuzzy + if value in self.value: + return True + elif self.fuzzy: + for v in self.value: + if fuzz_cpp.ratio(self.vocab.strings[value], + self.vocab.strings[v]) >= self.fuzzy: + return True + return False elif self.predicate == "NOT_IN": - return value not in self.value # TODO: handle fuzzy + if value in self.value: + return False + elif self.fuzzy: + for v in 
self.value: + if fuzz_cpp.ratio(self.vocab.strings[value], + self.vocab.strings[v]) >= self.fuzzy: + return False + return True elif self.predicate == "IS_SUBSET": return value <= self.value elif self.predicate == "IS_SUPERSET": @@ -985,7 +1004,7 @@ class _ComparisonPredicate: return value < self.value -def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy): +def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): predicate_types = { "FUZZY": _FuzzyPredicate, "REGEX": _RegexPredicate, @@ -1016,23 +1035,41 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy): if attr.upper() == "TEXT": attr = "ORTH" attr = IDS.get(attr.upper()) + if isinstance(value, dict): - processed = False - value_with_upper_keys = {k.upper(): v for k, v in value.items()} - for type_, cls in predicate_types.items(): - if type_ in value_with_upper_keys: - predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab, fuzzy=fuzzy) - # Don't create a redundant predicates. - # This helps with efficiency, as we're caching the results. - if predicate.key in seen_predicates: - output.append(seen_predicates[predicate.key]) - else: - extra_predicates.append(predicate) - output.append(predicate.i) - seen_predicates[predicate.key] = predicate.i - processed = True - if not processed: - warnings.warn(Warnings.W035.format(pattern=value)) + output.extend(_get_extra_predicates_helper(attr, value, vocab, fuzzy, fuzzy_attrs, + predicate_types, + extra_predicates, seen_predicates)) + return output + + +def _get_extra_predicates_helper(attr, value, vocab, fuzzy, fuzzy_attrs, + predicate_types, extra_predicates, seen_predicates): + output = [] + processed = False #TODO: not working as intended + value_with_upper_keys = {k.upper(): v for k, v in value.items()} + for type_, cls in predicate_types.items(): #TODO: switch this loop + if type_ in value_with_upper_keys: + if type_ == 'FUZZY' and isinstance(value_with_upper_keys[type_], dict): + # add predicates inside fuzzy operator + output.extend(_get_extra_predicates_helper(attr, value_with_upper_keys[type_], + vocab, fuzzy, fuzzy_attrs, + predicate_types, + extra_predicates, seen_predicates)) + else: + predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, + vocab=vocab, fuzzy=fuzzy)###??? if attr in fuzzy_attrs else 0) + # Don't create a redundant predicates. + # This helps with efficiency, as we're caching the results. 
+ if predicate.key in seen_predicates: + output.append(seen_predicates[predicate.key]) + else: + extra_predicates.append(predicate) + output.append(predicate.i) + seen_predicates[predicate.key] = predicate.i + processed = True + if not processed: + warnings.warn(Warnings.W035.format(pattern=value)) return output diff --git a/spacy/schemas.py b/spacy/schemas.py index 2677378d6..882815dfa 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -157,7 +157,7 @@ def validate_token_pattern(obj: list) -> List[str]: class TokenPatternString(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") - FUZZY: Optional[StrictStr] = Field(None, alias="fuzzy") + FUZZY: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy") IN: Optional[List[StrictStr]] = Field(None, alias="in") NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset") diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 798222cc3..22eb18245 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -218,6 +218,50 @@ def test_matcher_match_fuzz_preds(en_vocab): (doc.vocab.strings["JS"], 8, 9), ] +def test_matcher_match_fuzz_pred_in_set(en_vocab): + rules = { + "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "No"]}}, "OP": "+"}]] + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy="LONGEST") + + words = ["I", "like", "Goggle", "Now"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 2, 4), + ] + +def test_matcher_match_fuzz_pred_not_in_set(en_vocab): + rules = { + "GoogleNow": [[{"ORTH": {"FUZZY": {"NOT_IN": ["Google", "No"]}}, "OP": "+"}]], + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy="LONGEST") + + words = ["I", "like", "Goggle", "Now"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 0, 2), + ] + +def test_matcher_match_fuzz_pred_in_set_with_exclude(en_vocab): + rules = { + "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "No"]}, + "NOT_IN": ["Goggle"]}, + "OP": "+"}]] + } + matcher = Matcher(en_vocab, fuzzy=80) + for key, patterns in rules.items(): + matcher.add(key, patterns, greedy="LONGEST") + + words = ["I", "like", "Goggle", "Now"] + doc = Doc(matcher.vocab, words=words) + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 3, 4), + ] + def test_matcher_empty_dict(en_vocab): """Test matcher allows empty token specs, meaning match on any token.""" From ecebb5b145874568e3c62263487f8c68af1ce8d7 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 12:49:14 +0200 Subject: [PATCH 11/15] simplify fuzzy sets --- spacy/matcher/matcher.pyx | 61 ++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index f6a09b9f6..7a098aac2 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -880,7 +880,7 @@ class _FuzzyPredicate: value = token._.get(self.attr) else: value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] - return bool(fuzz_cpp.ratio(self.value, value) >= self.fuzzy) + return bool(self.fuzzy and fuzz_cpp.ratio(self.value, value) >= self.fuzzy) class _RegexPredicate: @@ -1006,7 +1006,6 @@ class _ComparisonPredicate: def 
_get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): predicate_types = { - "FUZZY": _FuzzyPredicate, "REGEX": _RegexPredicate, "IN": _SetPredicate, "NOT_IN": _SetPredicate, @@ -1019,6 +1018,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): "<=": _ComparisonPredicate, ">": _ComparisonPredicate, "<": _ComparisonPredicate, + "FUZZY": _FuzzyPredicate, } seen_predicates = {pred.key: pred.i for pred in extra_predicates} output = [] @@ -1037,39 +1037,40 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): attr = IDS.get(attr.upper()) if isinstance(value, dict): - output.extend(_get_extra_predicates_helper(attr, value, vocab, fuzzy, fuzzy_attrs, - predicate_types, - extra_predicates, seen_predicates)) + fuzzy_match = attr in fuzzy_attrs # fuzzy match enabled for this attr + output.extend(_get_extra_predicates_dict(attr, value, vocab, fuzzy, fuzzy_match, + predicate_types, + extra_predicates, seen_predicates)) return output -def _get_extra_predicates_helper(attr, value, vocab, fuzzy, fuzzy_attrs, - predicate_types, extra_predicates, seen_predicates): +def _get_extra_predicates_dict(attr, value_dict, vocab, fuzzy, fuzzy_match, + predicate_types, extra_predicates, seen_predicates): output = [] - processed = False #TODO: not working as intended - value_with_upper_keys = {k.upper(): v for k, v in value.items()} - for type_, cls in predicate_types.items(): #TODO: switch this loop - if type_ in value_with_upper_keys: - if type_ == 'FUZZY' and isinstance(value_with_upper_keys[type_], dict): + for type_, value in value_dict.items(): + if type_ == 'FUZZY': + fuzzy_match = True # explicit fuzzy match + if isinstance(value, dict): # add predicates inside fuzzy operator - output.extend(_get_extra_predicates_helper(attr, value_with_upper_keys[type_], - vocab, fuzzy, fuzzy_attrs, - predicate_types, - extra_predicates, seen_predicates)) - else: - predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, - vocab=vocab, fuzzy=fuzzy)###??? if attr in fuzzy_attrs else 0) - # Don't create a redundant predicates. - # This helps with efficiency, as we're caching the results. - if predicate.key in seen_predicates: - output.append(seen_predicates[predicate.key]) - else: - extra_predicates.append(predicate) - output.append(predicate.i) - seen_predicates[predicate.key] = predicate.i - processed = True - if not processed: - warnings.warn(Warnings.W035.format(pattern=value)) + output.extend(_get_extra_predicates_dict(attr, value, vocab, fuzzy, fuzzy_match, + predicate_types, + extra_predicates, seen_predicates)) + continue + cls = predicate_types.get(type_.upper()) + if cls is None: + warnings.warn(Warnings.W035.format(pattern=value_dict)) + # ignore unrecongized predicate type + continue + predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab, + fuzzy=fuzzy if fuzzy_match else 0) + # Don't create a redundant predicates. + # This helps with efficiency, as we're caching the results. 
+ if predicate.key in seen_predicates: + output.append(seen_predicates[predicate.key]) + else: + extra_predicates.append(predicate) + output.append(predicate.i) + seen_predicates[predicate.key] = predicate.i return output From ecd0455acdadb0aface60825b5f3f301cf096cf3 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 15:49:15 +0200 Subject: [PATCH 12/15] case fix --- spacy/matcher/matcher.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 7a098aac2..54481258b 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -1048,6 +1048,7 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, fuzzy, fuzzy_match, predicate_types, extra_predicates, seen_predicates): output = [] for type_, value in value_dict.items(): + type_ = type_.upper() if type_ == 'FUZZY': fuzzy_match = True # explicit fuzzy match if isinstance(value, dict): @@ -1056,10 +1057,10 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, fuzzy, fuzzy_match, predicate_types, extra_predicates, seen_predicates)) continue - cls = predicate_types.get(type_.upper()) + cls = predicate_types.get(type_) if cls is None: warnings.warn(Warnings.W035.format(pattern=value_dict)) - # ignore unrecongized predicate type + # ignore unrecognized predicate type continue predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab, fuzzy=fuzzy if fuzzy_match else 0) From 43948f731b44b18379a0cceb41f64877e5c9cd34 Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 18:10:42 +0200 Subject: [PATCH 13/15] switch to FUZZYn predicates use Levenshtein distance. remove fuzzy param. remove rapidfuzz_capi. --- pyproject.toml | 1 - requirements.txt | 1 - setup.cfg | 2 - spacy/matcher/matcher.pxd | 2 - spacy/matcher/matcher.pyi | 3 +- spacy/matcher/matcher.pyx | 127 +++++++++--------------- spacy/pipeline/entityruler.py | 9 +- spacy/schemas.py | 12 ++- spacy/tests/matcher/test_matcher_api.py | 112 ++++++--------------- 9 files changed, 93 insertions(+), 176 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b01055bdf..317c5fdbe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,5 @@ requires = [ "thinc>=8.1.0,<8.2.0", "pathy", "numpy>=1.15.0", - "rapidfuzz_capi>=1.0.5,<2.0.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 47dcede1f..38b4cbf0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,7 +19,6 @@ pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 langcodes>=3.2.0,<4.0.0 rapidfuzz>=2.4.0,<3.0.0 -rapidfuzz_capi>=1.0.5,<2.0.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index 91c73cb5c..a149b1f7e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,7 +34,6 @@ python_requires = >=3.6 setup_requires = cython>=0.25,<3.0 numpy>=1.15.0 - rapidfuzz_capi>=1.0.5,<2.0.0 # We also need our Cython packages here to compile against cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 @@ -60,7 +59,6 @@ install_requires = pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 rapidfuzz>=2.4.0,<3.0.0 - rapidfuzz_capi>=1.0.5,<2.0.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index 98041e199..455f978cc 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -71,8 +71,6 @@ cdef class Matcher: cdef vector[TokenPatternC*] patterns cdef readonly Vocab vocab cdef public object validate - cdef public object fuzzy - cdef public object fuzzy_attrs 
cdef public object _patterns cdef public object _callbacks cdef public object _filter diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index 676be6a45..390629ff8 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -5,8 +5,7 @@ from ..vocab import Vocab from ..tokens import Doc, Span class Matcher: - def __init__(self, vocab: Vocab, validate: bool = ..., - fuzzy: float = ..., fuzzy_attrs: list = ...) -> None: ... + def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ... def __reduce__(self) -> Any: ... def __len__(self) -> int: ... def __contains__(self, key: str) -> bool: ... diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 54481258b..cb6152ed0 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -10,7 +10,7 @@ from murmurhash.mrmr cimport hash64 import re import srsly import warnings -from rapidfuzz import fuzz_cpp +from rapidfuzz.distance import Levenshtein from ..typedefs cimport attr_t from ..structs cimport TokenC @@ -37,7 +37,7 @@ cdef class Matcher: USAGE: https://spacy.io/usage/rule-based-matching """ - def __init__(self, vocab, validate=True, fuzzy=None, fuzzy_attrs=None): + def __init__(self, vocab, validate=True): """Create the Matcher. vocab (Vocab): The vocabulary object, which must be shared with the @@ -52,8 +52,6 @@ cdef class Matcher: self.vocab = vocab self.mem = Pool() self.validate = validate - self.fuzzy = fuzzy if fuzzy is not None else 0 - self.fuzzy_attrs = [IDS.get(attr) for attr in fuzzy_attrs] if fuzzy_attrs else [] def __reduce__(self): data = (self.vocab, self._patterns, self._callbacks) @@ -131,8 +129,7 @@ cdef class Matcher: for pattern in patterns: try: specs = _preprocess_pattern(pattern, self.vocab, - self._extensions, self._extra_predicates, - self.fuzzy, self.fuzzy_attrs) + self._extensions, self._extra_predicates) self.patterns.push_back(init_pattern(self.mem, key, specs)) for spec in specs: for attr, _ in spec[1]: @@ -257,8 +254,7 @@ cdef class Matcher: matches = [] else: matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, - extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments, - fuzzy=self.fuzzy, fuzzy_attrs=self.fuzzy_attrs) + extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments) final_matches = [] pairs_by_id = {} # For each key, either add all matches, or only the filtered, @@ -339,8 +335,7 @@ def unpickle_matcher(vocab, patterns, callbacks): return matcher -cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0, - float fuzzy=0, list fuzzy_attrs=[]): +cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0): """Find matches in a doc, with a compiled array of patterns. 
Matches are returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0) @@ -359,8 +354,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e cdef PatternStateC state cdef int i, j, nr_extra_attr cdef Pool mem = Pool() - cdef int8_t* fuzzy_attrs_array - cdef int n_fuzzy_attrs = len(fuzzy_attrs) output = [] if length == 0: @@ -380,10 +373,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e if isinstance(value, str): value = token.vocab.strings[value] extra_attr_values[i * nr_extra_attr + index] = value - if n_fuzzy_attrs > 0: - fuzzy_attrs_array = mem.alloc(n_fuzzy_attrs, sizeof(int8_t)) - for i in range(n_fuzzy_attrs): - fuzzy_attrs_array[i] = fuzzy_attrs[i] # Main loop cdef int nr_predicate = len(predicates) for i in range(length): @@ -392,8 +381,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e if with_alignments != 0: align_states.resize(states.size()) transition_states(states, matches, align_states, align_matches, predicate_cache, - doclike[i], extra_attr_values, predicates, with_alignments, - fuzzy, fuzzy_attrs_array, n_fuzzy_attrs) + doclike[i], extra_attr_values, predicates, with_alignments) extra_attr_values += nr_extra_attr predicate_cache += len(predicates) # Handle matches that end in 0-width patterns @@ -422,8 +410,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches, int8_t* cached_py_predicates, - Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments, - float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) except *: + Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *: cdef int q = 0 cdef vector[PatternStateC] new_states cdef vector[vector[MatchAlignmentC]] align_new_states @@ -433,8 +420,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match update_predicate_cache(cached_py_predicates, states[i].pattern, token, py_predicates) action = get_action(states[i], token, extra_attrs, - cached_py_predicates, - fuzzy, fuzzy_attrs, n_fuzzy_attrs) + cached_py_predicates) if action == REJECT: continue # Keep only a subset of states (the active ones). Index q is the @@ -471,8 +457,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match update_predicate_cache(cached_py_predicates, states[q].pattern, token, py_predicates) action = get_action(states[q], token, extra_attrs, - cached_py_predicates, - fuzzy, fuzzy_attrs, n_fuzzy_attrs) + cached_py_predicates) # Update alignment before the transition of current state if with_alignments != 0: align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length)) @@ -584,8 +569,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states, cdef action_t get_action(PatternStateC state, Token token, const attr_t* extra_attrs, - const int8_t* predicate_matches, - float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) nogil: + const int8_t* predicate_matches) nogil: """We need to consider: a) Does the token match the specification? [Yes, No] b) What's the quantifier? [1, 0+, ?] 
@@ -644,8 +628,7 @@ cdef action_t get_action(PatternStateC state, Problem: If a quantifier is matching, we're adding a lot of open partials """ cdef int8_t is_match - is_match = get_is_match(state, token, extra_attrs, predicate_matches, - fuzzy, fuzzy_attrs, n_fuzzy_attrs) + is_match = get_is_match(state, token, extra_attrs, predicate_matches) quantifier = get_quantifier(state) is_final = get_is_final(state) if quantifier == ZERO: @@ -698,8 +681,7 @@ cdef action_t get_action(PatternStateC state, cdef int8_t get_is_match(PatternStateC state, Token token, const attr_t* extra_attrs, - const int8_t* predicate_matches, - float fuzzy, int8_t* fuzzy_attrs, int n_fuzzy_attrs) nogil: + const int8_t* predicate_matches) nogil: for i in range(state.pattern.nr_py): if predicate_matches[state.pattern.py_predicates[i]] == -1: return 0 @@ -708,22 +690,9 @@ cdef int8_t get_is_match(PatternStateC state, for attr in spec.attrs[:spec.nr_attr]: token_attr_value = get_token_attr_for_matcher(token.c, attr.attr) if token_attr_value != attr.value: - if fuzzy: - fuzzy_match = False - for i in range(n_fuzzy_attrs): - if attr.attr == fuzzy_attrs[i]: - with gil: - if fuzz_cpp.ratio(token.vocab.strings[token_attr_value], - token.vocab.strings[attr.value]) >= fuzzy: - fuzzy_match = True - break - if not fuzzy_match: - return 0 - else: - return 0 + return 0 for i in range(spec.nr_extra_attr): if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]: - # TODO: fuzzy match return 0 return True @@ -788,8 +757,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: return id_attr.value -def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, - fuzzy, fuzzy_attrs): +def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): """This function interprets the pattern, converting the various bits of syntactic sugar before we compile it into a struct with init_pattern. @@ -816,7 +784,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, ops = _get_operators(spec) attr_values = _get_attr_values(spec, string_store) extensions = _get_extensions(spec, string_store, extensions_table) - predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs) + predicates = _get_extra_predicates(spec, extra_predicates, vocab) for op in ops: tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx)) return tokens @@ -862,31 +830,31 @@ def _get_attr_values(spec, string_store): # extensions to the matcher introduced in #3173. 
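# NOTE (illustrative sketch, not part of this diff): the _FuzzyPredicate class
# added in the hunk just below derives its edit-distance bound from the operator
# name ("FUZZY1".."FUZZY5") and compares strings with rapidfuzz's Levenshtein
# distance. A standalone approximation of that logic, assuming rapidfuzz>=2.4.0
# as required by this patch; fuzzy_token_match is a hypothetical helper, not a
# spaCy function:
from rapidfuzz.distance import Levenshtein

def fuzzy_token_match(operator: str, pattern_value: str, token_text: str) -> bool:
    max_distance = int(operator[len("FUZZY"):])  # e.g. "FUZZY2" -> 2
    return Levenshtein.distance(pattern_value, token_text) <= max_distance

fuzzy_token_match("FUZZY2", "JavaScript", "JvvaScrpt")  # True: distance == 2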
class _FuzzyPredicate: - operators = ("FUZZY",) + operators = ("FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5") - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None): self.i = i self.attr = attr self.value = value self.predicate = predicate self.is_extension = is_extension - self.fuzzy = fuzzy self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) + self.distance = int(self.predicate[len('FUZZY'):]) # number after prefix def __call__(self, Token token): if self.is_extension: value = token._.get(self.attr) else: value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] - return bool(self.fuzzy and fuzz_cpp.ratio(self.value, value) >= self.fuzzy) + return bool(Levenshtein.distance(self.value, value) <= self.distance) class _RegexPredicate: operators = ("REGEX",) - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None): self.i = i self.attr = attr self.value = re.compile(value) @@ -907,22 +875,22 @@ class _RegexPredicate: class _SetPredicate: operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS") - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None): self.i = i self.attr = attr self.vocab = vocab + self.distance = distance if self.attr == MORPH: # normalize morph strings self.value = set(self.vocab.morphology.add(v) for v in value) else: - if fuzzy: + if self.distance: # add to string store self.value = set(self.vocab.strings.add(v) for v in value) else: self.value = set(get_string_id(v) for v in value) self.predicate = predicate self.is_extension = is_extension - self.fuzzy = fuzzy self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -946,19 +914,19 @@ class _SetPredicate: if self.predicate == "IN": if value in self.value: return True - elif self.fuzzy: + elif self.distance: for v in self.value: - if fuzz_cpp.ratio(self.vocab.strings[value], - self.vocab.strings[v]) >= self.fuzzy: + if Levenshtein.distance(self.vocab.strings[value], + self.vocab.strings[v]) <= self.distance: return True return False elif self.predicate == "NOT_IN": if value in self.value: return False - elif self.fuzzy: + elif self.distance: for v in self.value: - if fuzz_cpp.ratio(self.vocab.strings[value], - self.vocab.strings[v]) >= self.fuzzy: + if Levenshtein.distance(self.vocab.strings[value], + self.vocab.strings[v]) <= self.distance: return False return True elif self.predicate == "IS_SUBSET": @@ -975,7 +943,7 @@ class _SetPredicate: class _ComparisonPredicate: operators = ("==", "!=", ">=", "<=", ">", "<") - def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, fuzzy=None): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None, distance=None): self.i = i self.attr = attr self.value = value @@ -1004,7 +972,7 @@ class _ComparisonPredicate: return value < self.value -def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): +def 
_get_extra_predicates(spec, extra_predicates, vocab): predicate_types = { "REGEX": _RegexPredicate, "IN": _SetPredicate, @@ -1018,7 +986,11 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): "<=": _ComparisonPredicate, ">": _ComparisonPredicate, "<": _ComparisonPredicate, - "FUZZY": _FuzzyPredicate, + "FUZZY1": _FuzzyPredicate, + "FUZZY2": _FuzzyPredicate, + "FUZZY3": _FuzzyPredicate, + "FUZZY4": _FuzzyPredicate, + "FUZZY5": _FuzzyPredicate, } seen_predicates = {pred.key: pred.i for pred in extra_predicates} output = [] @@ -1037,33 +1009,30 @@ def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy, fuzzy_attrs): attr = IDS.get(attr.upper()) if isinstance(value, dict): - fuzzy_match = attr in fuzzy_attrs # fuzzy match enabled for this attr - output.extend(_get_extra_predicates_dict(attr, value, vocab, fuzzy, fuzzy_match, - predicate_types, + output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types, extra_predicates, seen_predicates)) return output -def _get_extra_predicates_dict(attr, value_dict, vocab, fuzzy, fuzzy_match, - predicate_types, extra_predicates, seen_predicates): +def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types, + extra_predicates, seen_predicates, distance=None): output = [] for type_, value in value_dict.items(): type_ = type_.upper() - if type_ == 'FUZZY': - fuzzy_match = True # explicit fuzzy match - if isinstance(value, dict): - # add predicates inside fuzzy operator - output.extend(_get_extra_predicates_dict(attr, value, vocab, fuzzy, fuzzy_match, - predicate_types, - extra_predicates, seen_predicates)) - continue cls = predicate_types.get(type_) if cls is None: warnings.warn(Warnings.W035.format(pattern=value_dict)) # ignore unrecognized predicate type continue - predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab, - fuzzy=fuzzy if fuzzy_match else 0) + elif cls == _FuzzyPredicate: + distance = int(type_[len("FUZZY"):]) # number after prefix + if isinstance(value, dict): + # add predicates inside fuzzy operator + output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types, + extra_predicates, seen_predicates, + distance=distance)) + continue + predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab, distance=distance) # Don't create a redundant predicates. # This helps with efficiency, as we're caching the results. 
if predicate.key in seen_predicates: diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 1e816ab16..3cb1ca676 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -26,7 +26,6 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] "phrase_matcher_attr": None, "validate": False, "overwrite_ents": False, - "fuzzy": 0.0, "ent_id_sep": DEFAULT_ENT_ID_SEP, "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, }, @@ -43,7 +42,6 @@ def make_entity_ruler( phrase_matcher_attr: Optional[Union[int, str]], validate: bool, overwrite_ents: bool, - fuzzy: float, ent_id_sep: str, scorer: Optional[Callable], ): @@ -53,7 +51,6 @@ def make_entity_ruler( phrase_matcher_attr=phrase_matcher_attr, validate=validate, overwrite_ents=overwrite_ents, - fuzzy=fuzzy, ent_id_sep=ent_id_sep, scorer=scorer, ) @@ -87,7 +84,6 @@ class EntityRuler(Pipe): phrase_matcher_attr: Optional[Union[int, str]] = None, validate: bool = False, overwrite_ents: bool = False, - fuzzy: float = 0, ent_id_sep: str = DEFAULT_ENT_ID_SEP, patterns: Optional[List[PatternType]] = None, scorer: Optional[Callable] = entity_ruler_score, @@ -122,8 +118,7 @@ class EntityRuler(Pipe): self.token_patterns = defaultdict(list) # type: ignore self.phrase_patterns = defaultdict(list) # type: ignore self._validate = validate - self.fuzzy = fuzzy - self.matcher = Matcher(nlp.vocab, validate=validate, fuzzy=self.fuzzy) + self.matcher = Matcher(nlp.vocab, validate=validate) self.phrase_matcher_attr = phrase_matcher_attr self.phrase_matcher = PhraseMatcher( nlp.vocab, attr=self.phrase_matcher_attr, validate=validate @@ -343,7 +338,7 @@ class EntityRuler(Pipe): self.token_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list) self._ent_ids = defaultdict(tuple) - self.matcher = Matcher(self.nlp.vocab, validate=self._validate, fuzzy=self.fuzzy) + self.matcher = Matcher(self.nlp.vocab, validate=self._validate) self.phrase_matcher = PhraseMatcher( self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate ) diff --git a/spacy/schemas.py b/spacy/schemas.py index 882815dfa..a9012d7d9 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -157,12 +157,16 @@ def validate_token_pattern(obj: list) -> List[str]: class TokenPatternString(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") - FUZZY: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy") IN: Optional[List[StrictStr]] = Field(None, alias="in") NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset") IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset") INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects") + FUZZY1: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy1") + FUZZY2: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy2") + FUZZY3: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy3") + FUZZY4: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy4") + FUZZY5: Union[StrictStr, "TokenPatternString"] = Field(None, alias="fuzzy5") class Config: extra = "forbid" @@ -177,7 +181,6 @@ class TokenPatternString(BaseModel): class TokenPatternNumber(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") - FUZZY: Optional[StrictStr] = Field(None, alias="fuzzy") IN: Optional[List[StrictInt]] = Field(None, alias="in") NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in") IS_SUBSET: Optional[List[StrictInt]] 
= Field(None, alias="is_subset") @@ -189,6 +192,11 @@ class TokenPatternNumber(BaseModel): LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=") GT: Union[StrictInt, StrictFloat] = Field(None, alias=">") LT: Union[StrictInt, StrictFloat] = Field(None, alias="<") + FUZZY1: Optional[StrictStr] = Field(None, alias="fuzzy1") + FUZZY2: Optional[StrictStr] = Field(None, alias="fuzzy2") + FUZZY3: Optional[StrictStr] = Field(None, alias="fuzzy3") + FUZZY4: Optional[StrictStr] = Field(None, alias="fuzzy4") + FUZZY5: Optional[StrictStr] = Field(None, alias="fuzzy5") class Config: extra = "forbid" diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 22eb18245..1b6dda273 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -6,16 +6,15 @@ from spacy.tokens import Doc, Token, Span from ..doc.test_underscore import clean_underscore # noqa: F401 -matcher_rules = { - "JS": [[{"ORTH": "JavaScript"}]], - "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": "java"}]], -} - @pytest.fixture def matcher(en_vocab): + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], + } matcher = Matcher(en_vocab) - for key, patterns in matcher_rules.items(): + for key, patterns in rules.items(): matcher.add(key, patterns) return matcher @@ -119,98 +118,51 @@ def test_matcher_match_multi(matcher): ] -# fuzzy matches on specific attributes - -def test_matcher_match_fuzz_all(en_vocab): - matcher = Matcher(en_vocab, fuzzy=80, fuzzy_attrs=["ORTH", "LOWER"]) - for key, patterns in matcher_rules.items(): - matcher.add(key, patterns) - - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] - doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [ - (doc.vocab.strings["GoogleNow"], 2, 4), - (doc.vocab.strings["Java"], 5, 6), - (doc.vocab.strings["JS"], 8, 9), - ] - -def test_matcher_match_fuzz_all_lower(en_vocab): - matcher = Matcher(en_vocab, fuzzy=80, fuzzy_attrs=["LOWER"]) - for key, patterns in matcher_rules.items(): - matcher.add(key, patterns) - - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] - doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [ - (doc.vocab.strings["Java"], 5, 6), - ] - -def test_matcher_match_fuzz_some(en_vocab): - matcher = Matcher(en_vocab, fuzzy=85, fuzzy_attrs=["ORTH", "LOWER"]) - for key, patterns in matcher_rules.items(): - matcher.add(key, patterns) - - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] - doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [ - (doc.vocab.strings["Java"], 5, 6), - ] - -def test_matcher_match_fuzz_none(en_vocab): - matcher = Matcher(en_vocab, fuzzy=90, fuzzy_attrs=["ORTH", "LOWER"]) - for key, patterns in matcher_rules.items(): - matcher.add(key, patterns) - - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] - doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [] - - # fuzzy matches on specific tokens -def test_matcher_match_fuzz_pred1(en_vocab): +def test_matcher_match_fuzzy1(en_vocab): rules = { "JS": [[{"ORTH": "JavaScript"}]], - "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], + "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]], "Java": [[{"LOWER": "java"}]], } - matcher = Matcher(en_vocab, fuzzy=80) + matcher = Matcher(en_vocab) for key, patterns in rules.items(): 
matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 2, 4), ] -def test_matcher_match_fuzz_pred2(en_vocab): +def test_matcher_match_fuzzy2(en_vocab): rules = { "JS": [[{"ORTH": "JavaScript"}]], "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": {"FUZZY": "java"}}]], + "Java": [[{"LOWER": {"FUZZY1": "java"}}]], } - matcher = Matcher(en_vocab, fuzzy=80) + matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["Java"], 5, 6), ] -def test_matcher_match_fuzz_preds(en_vocab): +def test_matcher_match_fuzzy3(en_vocab): rules = { - "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]], - "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]], - "Java": [[{"LOWER": {"FUZZY": "java"}}]], + "JS": [[{"ORTH": {"FUZZY2": "JavaScript"}}]], + "GoogleNow": [[{"ORTH": {"FUZZY1": "Google"}}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": {"FUZZY1": "java"}}]], } - matcher = Matcher(en_vocab, fuzzy=80) + matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns) - words = ["I", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] + words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 2, 4), @@ -218,45 +170,45 @@ def test_matcher_match_fuzz_preds(en_vocab): (doc.vocab.strings["JS"], 8, 9), ] -def test_matcher_match_fuzz_pred_in_set(en_vocab): +def test_matcher_match_fuzzy_set1(en_vocab): rules = { - "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "No"]}}, "OP": "+"}]] + "GoogleNow": [[{"ORTH": {"FUZZY2": {"IN": ["Google", "No"]}}, "OP": "+"}]] } - matcher = Matcher(en_vocab, fuzzy=80) + matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns, greedy="LONGEST") - words = ["I", "like", "Goggle", "Now"] + words = ["They", "like", "Goggle", "Now"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 2, 4), ] -def test_matcher_match_fuzz_pred_not_in_set(en_vocab): +def test_matcher_match_fuzzy_set2(en_vocab): rules = { - "GoogleNow": [[{"ORTH": {"FUZZY": {"NOT_IN": ["Google", "No"]}}, "OP": "+"}]], + "GoogleNow": [[{"ORTH": {"FUZZY2": {"NOT_IN": ["Google", "No"]}}, "OP": "+"}]], } - matcher = Matcher(en_vocab, fuzzy=80) + matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns, greedy="LONGEST") - words = ["I", "like", "Goggle", "Now"] + words = ["They", "like", "Goggle", "Now"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 0, 2), ] -def test_matcher_match_fuzz_pred_in_set_with_exclude(en_vocab): +def test_matcher_match_fuzzy_set3(en_vocab): rules = { - "GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "No"]}, + "GoogleNow": [[{"ORTH": {"FUZZY1": {"IN": ["Google", "No"]}, "NOT_IN": ["Goggle"]}, "OP": "+"}]] } - matcher = Matcher(en_vocab, fuzzy=80) + matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, patterns, 
greedy="LONGEST") - words = ["I", "like", "Goggle", "Now"] + words = ["They", "like", "Goggle", "Now"] doc = Doc(matcher.vocab, words=words) assert matcher(doc) == [ (doc.vocab.strings["GoogleNow"], 3, 4), From a8a4d86bae2001e58a32e334840d2956ad0ad6ac Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 18:28:17 +0200 Subject: [PATCH 14/15] revert changes added for fuzzy param --- setup.py | 2 -- spacy/matcher/matcher.pyx | 13 +++++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index 413c55d22..ec1bd35fa 100755 --- a/setup.py +++ b/setup.py @@ -11,7 +11,6 @@ from Cython.Build import cythonize from Cython.Compiler import Options import os import subprocess -import rapidfuzz_capi ROOT = Path(__file__).parent @@ -203,7 +202,6 @@ def setup_package(): include_dirs = [ numpy.get_include(), - rapidfuzz_capi.get_include(), get_python_inc(plat_specific=True), ] ext_modules = [] diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index cb6152ed0..56fd11365 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -354,7 +354,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e cdef PatternStateC state cdef int i, j, nr_extra_attr cdef Pool mem = Pool() - output = [] if length == 0: # avoid any processing or mem alloc if the document is empty @@ -419,7 +418,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match if states[i].pattern.nr_py >= 1: update_predicate_cache(cached_py_predicates, states[i].pattern, token, py_predicates) - action = get_action(states[i], token, extra_attrs, + action = get_action(states[i], token.c, extra_attrs, cached_py_predicates) if action == REJECT: continue @@ -456,7 +455,7 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match if states[q].pattern.nr_py != 0: update_predicate_cache(cached_py_predicates, states[q].pattern, token, py_predicates) - action = get_action(states[q], token, extra_attrs, + action = get_action(states[q], token.c, extra_attrs, cached_py_predicates) # Update alignment before the transition of current state if with_alignments != 0: @@ -568,7 +567,7 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states, cdef action_t get_action(PatternStateC state, - Token token, const attr_t* extra_attrs, + const TokenC* token, const attr_t* extra_attrs, const int8_t* predicate_matches) nogil: """We need to consider: a) Does the token match the specification? 
[Yes, No] @@ -680,7 +679,7 @@ cdef action_t get_action(PatternStateC state, cdef int8_t get_is_match(PatternStateC state, - Token token, const attr_t* extra_attrs, + const TokenC* token, const attr_t* extra_attrs, const int8_t* predicate_matches) nogil: for i in range(state.pattern.nr_py): if predicate_matches[state.pattern.py_predicates[i]] == -1: @@ -688,8 +687,7 @@ cdef int8_t get_is_match(PatternStateC state, spec = state.pattern if spec.nr_attr > 0: for attr in spec.attrs[:spec.nr_attr]: - token_attr_value = get_token_attr_for_matcher(token.c, attr.attr) - if token_attr_value != attr.value: + if get_token_attr_for_matcher(token, attr.attr) != attr.value: return 0 for i in range(spec.nr_extra_attr): if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]: @@ -1007,7 +1005,6 @@ def _get_extra_predicates(spec, extra_predicates, vocab): if attr.upper() == "TEXT": attr = "ORTH" attr = IDS.get(attr.upper()) - if isinstance(value, dict): output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types, extra_predicates, seen_predicates)) From 59021f7d25bd8fa3c5dc8e5ab594023257b0ed5a Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Mon, 29 Aug 2022 21:42:10 +0200 Subject: [PATCH 15/15] switch to polyleven (Python package) --- requirements.txt | 2 +- setup.cfg | 2 +- spacy/matcher/matcher.pyx | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index 38b4cbf0d..070ffe7a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,7 @@ tqdm>=4.38.0,<5.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 langcodes>=3.2.0,<4.0.0 -rapidfuzz>=2.4.0,<3.0.0 +polyleven>=0.7,<1.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index a149b1f7e..de58de3bc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -58,7 +58,7 @@ install_requires = requests>=2.13.0,<3.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 jinja2 - rapidfuzz>=2.4.0,<3.0.0 + polyleven>=0.7,<1.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 56fd11365..d27397f8b 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -10,7 +10,7 @@ from murmurhash.mrmr cimport hash64 import re import srsly import warnings -from rapidfuzz.distance import Levenshtein +from polyleven import levenshtein from ..typedefs cimport attr_t from ..structs cimport TokenC @@ -846,7 +846,7 @@ class _FuzzyPredicate: value = token._.get(self.attr) else: value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)] - return bool(Levenshtein.distance(self.value, value) <= self.distance) + return bool(levenshtein(self.value, value) <= self.distance) class _RegexPredicate: @@ -914,8 +914,8 @@ class _SetPredicate: return True elif self.distance: for v in self.value: - if Levenshtein.distance(self.vocab.strings[value], - self.vocab.strings[v]) <= self.distance: + if levenshtein(self.vocab.strings[value], + self.vocab.strings[v]) <= self.distance: return True return False elif self.predicate == "NOT_IN": @@ -923,8 +923,8 @@ class _SetPredicate: return False elif self.distance: for v in self.value: - if Levenshtein.distance(self.vocab.strings[value], - self.vocab.strings[v]) <= self.distance: + if levenshtein(self.vocab.strings[value], + self.vocab.strings[v]) <= self.distance: return False return True elif self.predicate == "IS_SUBSET":
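For reference, a minimal usage sketch of the FUZZYn token patterns introduced by
this series (illustrative only, not part of the diffs above; it assumes the
series is applied, that English() tokenizes the sample text one word per token,
and it mirrors the cases in spacy/tests/matcher/test_matcher_api.py):

from spacy.lang.en import English
from spacy.matcher import Matcher
from polyleven import levenshtein

nlp = English()
matcher = Matcher(nlp.vocab)
# FUZZY2 allows a Levenshtein distance of up to 2 against the ORTH value.
matcher.add("JS", [[{"ORTH": {"FUZZY2": "JavaScript"}}]])
# FUZZYn can also wrap set predicates: a token matches if it is within
# distance 2 of any member of the IN list.
matcher.add("GoogleNow",
            [[{"ORTH": {"FUZZY2": {"IN": ["Google", "No"]}}, "OP": "+"}]],
            greedy="LONGEST")

doc = nlp("They like Goggle Now and JvvaScrpt")
print([(nlp.vocab.strings[m_id], start, end) for m_id, start, end in matcher(doc)])
# roughly: [('GoogleNow', 2, 4), ('JS', 5, 6)]
print(levenshtein("JavaScript", "JvvaScrpt"))  # 2, within the FUZZY2 bound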