diff --git a/spacy/matcher/levenshtein.pyx b/spacy/matcher/levenshtein.pyx
index 8463d913d..0e8cd26da 100644
--- a/spacy/matcher/levenshtein.pyx
+++ b/spacy/matcher/levenshtein.pyx
@@ -4,6 +4,8 @@ from libc.stdint cimport int64_t
 
 from typing import Optional
 
+from ..util import registry
+
 
 cdef extern from "polyleven.c":
     int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k)
@@ -13,3 +15,18 @@ cpdef int64_t levenshtein(a: str, b: str, k: Optional[int] = None):
     if k is None:
         k = -1
     return polyleven(<PyObject*>a, <PyObject*>b, k)
+
+
+cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
+    if fuzzy >= 0:
+        max_edits = fuzzy
+    else:
+        # allow at least two edits (to allow at least one transposition) and up
+        # to 30% of the pattern string length
+        max_edits = max(2, round(0.3 * len(pattern_text)))
+    return levenshtein(input_text, pattern_text, max_edits) <= max_edits
+
+
+@registry.misc("spacy.levenshtein_compare.v1")
+def make_levenshtein_compare():
+    return levenshtein_compare
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index 2dd64c375..77ea7b7a6 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -53,5 +53,3 @@ class Matcher:
         with_alignments: bool = ...
     ) -> List[Span]: ...
     def _normalize_key(self, key: Any) -> Any: ...
-
-def fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1) -> bool: ...
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 2ca8d6322..ea1b4b66b 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -20,33 +20,17 @@ from ..tokens.token cimport Token
 from ..tokens.morphanalysis cimport MorphAnalysis
 from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
 
+from .levenshtein import levenshtein_compare
 from ..schemas import validate_token_pattern
 from ..errors import Errors, MatchPatternError, Warnings
 from ..strings import get_string_id
 from ..attrs import IDS
 from ..util import registry
-from .levenshtein import levenshtein
-
 
 DEF PADDING = 5
 
 
-cpdef bint fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
-    if fuzzy >= 0:
-        max_edits = fuzzy
-    else:
-        # allow at least two edits (to allow at least one transposition) and up
-        # to 30% of the pattern string length
-        max_edits = max(2, round(0.3 * len(pattern_text)))
-    return levenshtein(input_text, pattern_text, max_edits) <= max_edits
-
-
-@registry.misc("spacy.fuzzy_compare.v1")
-def make_fuzzy_compare():
-    return fuzzy_compare
-
-
 cdef class Matcher:
     """Match sequences of tokens, based on pattern rules.
@@ -54,7 +38,7 @@ cdef class Matcher:
     USAGE: https://spacy.io/usage/rule-based-matching
     """
 
-    def __init__(self, vocab, validate=True, *, fuzzy_compare=fuzzy_compare):
+    def __init__(self, vocab, validate=True, *, fuzzy_compare=levenshtein_compare):
         """Create the Matcher.
 
         vocab (Vocab): The vocabulary object, which must be shared with the
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 29dbf4708..9a084f5a6 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -11,7 +11,7 @@ from ..errors import Errors, Warnings
 from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
-from ..matcher.matcher import fuzzy_compare
+from ..matcher.levenshtein import levenshtein_compare
 from ..scorer import get_ner_prf
 
 
@@ -24,7 +24,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
     assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
     default_config={
         "phrase_matcher_attr": None,
-        "matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
+        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
         "validate": False,
         "overwrite_ents": False,
         "ent_id_sep": DEFAULT_ENT_ID_SEP,
@@ -85,7 +85,7 @@ class EntityRuler(Pipe):
         name: str = "entity_ruler",
         *,
         phrase_matcher_attr: Optional[Union[int, str]] = None,
-        matcher_fuzzy_compare: Callable = fuzzy_compare,
+        matcher_fuzzy_compare: Callable = levenshtein_compare,
         validate: bool = False,
         overwrite_ents: bool = False,
         ent_id_sep: str = DEFAULT_ENT_ID_SEP,
@@ -137,7 +137,6 @@ class EntityRuler(Pipe):
         if patterns is not None:
             self.add_patterns(patterns)
         self.scorer = scorer
-        self.fuzzy_compare = fuzzy_compare
 
     def __len__(self) -> int:
         """The number of all patterns added to the entity ruler."""
diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index 5ca5ee9a9..aab297d9c 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -13,7 +13,7 @@ from ..util import ensure_path, SimpleFrozenList, registry
 from ..tokens import Doc, Span
 from ..scorer import Scorer
 from ..matcher import Matcher, PhraseMatcher
-from ..matcher.matcher import fuzzy_compare
+from ..matcher.levenshtein import levenshtein_compare
 from .. import util
 
 PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
@@ -29,7 +29,7 @@ DEFAULT_SPANS_KEY = "ruler"
         "overwrite_ents": False,
         "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
         "ent_id_sep": "__unused__",
-        "matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
+        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
     },
     default_score_weights={
         "ents_f": 1.0,
@@ -76,7 +76,7 @@ def make_entity_ruler(
         "annotate_ents": False,
         "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
         "phrase_matcher_attr": None,
-        "matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
+        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
         "validate": False,
         "overwrite": True,
         "scorer": {
@@ -223,7 +223,7 @@ class SpanRuler(Pipe):
             [Iterable[Span], Iterable[Span]], Iterable[Span]
         ] = util.filter_chain_spans,
         phrase_matcher_attr: Optional[Union[int, str]] = None,
-        matcher_fuzzy_compare: Callable = fuzzy_compare,
+        matcher_fuzzy_compare: Callable = levenshtein_compare,
         validate: bool = False,
         overwrite: bool = False,
         scorer: Optional[Callable] = partial(
diff --git a/spacy/tests/matcher/test_levenshtein.py b/spacy/tests/matcher/test_levenshtein.py
index c4001bba1..5afb7e1fc 100644
--- a/spacy/tests/matcher/test_levenshtein.py
+++ b/spacy/tests/matcher/test_levenshtein.py
@@ -1,6 +1,6 @@
 import pytest
 from spacy.matcher import levenshtein
-from spacy.matcher.matcher import fuzzy_compare
+from spacy.matcher.levenshtein import levenshtein_compare
 
 
 # empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
@@ -69,5 +69,5 @@ def test_levenshtein(dist, a, b):
         ("abcdefgh", "cdefghijkl", -1, False),  # default (2)
     ],
 )
-def test_fuzzy_compare(a, b, fuzzy, expected):
-    assert fuzzy_compare(a, b, fuzzy) == expected
+def test_levenshtein_compare(a, b, fuzzy, expected):
+    assert levenshtein_compare(a, b, fuzzy) == expected
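Note (not part of the diff): a minimal usage sketch of the renamed helper and registry entry, assuming the FUZZY token-pattern syntax that ships with fuzzy matching in spaCy v3.5; the example strings and pattern names are illustrative only.

    import spacy
    from spacy.matcher import Matcher
    from spacy.matcher.levenshtein import levenshtein_compare

    nlp = spacy.blank("en")

    # The Matcher now defaults to levenshtein_compare (formerly fuzzy_compare),
    # so passing it explicitly is optional.
    matcher = Matcher(nlp.vocab, fuzzy_compare=levenshtein_compare)
    matcher.add("GOOGLE", [[{"LOWER": {"FUZZY": "google"}}]])
    doc = nlp("I searched it on Goggle.")
    print(matcher(doc))  # one fuzzy match on "Goggle" (edit distance 1)

    # Pipeline components reference the renamed registry entry
    # "spacy.levenshtein_compare.v1" instead of "spacy.fuzzy_compare.v1".
    ruler = nlp.add_pipe(
        "entity_ruler",
        config={"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}},
    )
    ruler.add_patterns([{"label": "ORG", "pattern": [{"LOWER": {"FUZZY": "google"}}]}])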