Rename to spacy.levenshtein_compare.v1, move to spacy.matcher.levenshtein

2025-09-21 11:32:38 +03:00 · 2023-01-09 13:50:27 +01:00 · 2023-01-09 13:50:27 +01:00 · e0abb55c42
commit e0abb55c42
parent 0d60744996
6 changed files with 29 additions and 31 deletions
--- a/spacy/matcher/levenshtein.pyx
+++ b/spacy/matcher/levenshtein.pyx
@ -4,6 +4,8 @@ from libc.stdint cimport int64_t

 from typing import Optional

+from ..util import registry
+

 cdef extern from "polyleven.c":
    int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k)
@ -13,3 +15,18 @@ cpdef int64_t levenshtein(a: str, b: str, k: Optional[int] = None):
    if k is None:
        k = -1
    return polyleven(<PyObject*>a, <PyObject*>b, k)
+
+
+cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
+    if fuzzy >= 0:
+        max_edits = fuzzy
+    else:
+        # allow at least two edits (to allow at least one transposition) and up
+        # to 30% of the pattern string length
+        max_edits = max(2, round(0.3 * len(pattern_text)))
+    return levenshtein(input_text, pattern_text, max_edits) <= max_edits
+
+
+@registry.misc("spacy.levenshtein_compare.v1")
+def make_levenshtein_compare():
+    return levenshtein_compare
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@ -53,5 +53,3 @@ class Matcher:
        with_alignments: bool = ...
    ) -> List[Span]: ...
    def _normalize_key(self, key: Any) -> Any: ...
-
-def fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1) -> bool: ...
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@ -20,33 +20,17 @@ from ..tokens.token cimport Token
 from ..tokens.morphanalysis cimport MorphAnalysis
 from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB

+from .levenshtein import levenshtein_compare
 from ..schemas import validate_token_pattern
 from ..errors import Errors, MatchPatternError, Warnings
 from ..strings import get_string_id
 from ..attrs import IDS
 from ..util import registry

-from .levenshtein import levenshtein
-

 DEF PADDING = 5


-cpdef bint fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
-    if fuzzy >= 0:
-        max_edits = fuzzy
-    else:
-        # allow at least two edits (to allow at least one transposition) and up
-        # to 20% of the pattern string length
-        max_edits = max(2, round(0.3 * len(pattern_text)))
-    return levenshtein(input_text, pattern_text, max_edits) <= max_edits
-
-
-@registry.misc("spacy.fuzzy_compare.v1")
-def make_fuzzy_compare():
-    return fuzzy_compare
-
-
 cdef class Matcher:
    """Match sequences of tokens, based on pattern rules.

@ -54,7 +38,7 @@ cdef class Matcher:
    USAGE: https://spacy.io/usage/rule-based-matching
    """

-    def __init__(self, vocab, validate=True, *, fuzzy_compare=fuzzy_compare):
+    def __init__(self, vocab, validate=True, *, fuzzy_compare=levenshtein_compare):
        """Create the Matcher.

        vocab (Vocab): The vocabulary object, which must be shared with the
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@ -11,7 +11,7 @@ from ..errors import Errors, Warnings
 from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
-from ..matcher.matcher import fuzzy_compare
+from ..matcher.levenshtein import levenshtein_compare
 from ..scorer import get_ner_prf


@ -24,7 +24,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
    assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
    default_config={
        "phrase_matcher_attr": None,
-        "matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
+        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
        "validate": False,
        "overwrite_ents": False,
        "ent_id_sep": DEFAULT_ENT_ID_SEP,
@ -85,7 +85,7 @@ class EntityRuler(Pipe):
        name: str = "entity_ruler",
        *,
        phrase_matcher_attr: Optional[Union[int, str]] = None,
-        matcher_fuzzy_compare: Callable = fuzzy_compare,
+        matcher_fuzzy_compare: Callable = levenshtein_compare,
        validate: bool = False,
        overwrite_ents: bool = False,
        ent_id_sep: str = DEFAULT_ENT_ID_SEP,
@ -137,7 +137,6 @@ class EntityRuler(Pipe):
        if patterns is not None:
            self.add_patterns(patterns)
        self.scorer = scorer
-        self.fuzzy_compare = fuzzy_compare

    def __len__(self) -> int:
        """The number of all patterns added to the entity ruler."""
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@ -13,7 +13,7 @@ from ..util import ensure_path, SimpleFrozenList, registry
 from ..tokens import Doc, Span
 from ..scorer import Scorer
 from ..matcher import Matcher, PhraseMatcher
-from ..matcher.matcher import fuzzy_compare
+from ..matcher.levenshtein import levenshtein_compare
 from .. import util

 PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
@ -29,7 +29,7 @@ DEFAULT_SPANS_KEY = "ruler"
        "overwrite_ents": False,
        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
        "ent_id_sep": "__unused__",
-        "matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
+        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
    },
    default_score_weights={
        "ents_f": 1.0,
@ -76,7 +76,7 @@ def make_entity_ruler(
        "annotate_ents": False,
        "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
        "phrase_matcher_attr": None,
-        "matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
+        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
        "validate": False,
        "overwrite": True,
        "scorer": {
@ -223,7 +223,7 @@ class SpanRuler(Pipe):
            [Iterable[Span], Iterable[Span]], Iterable[Span]
        ] = util.filter_chain_spans,
        phrase_matcher_attr: Optional[Union[int, str]] = None,
-        matcher_fuzzy_compare: Callable = fuzzy_compare,
+        matcher_fuzzy_compare: Callable = levenshtein_compare,
        validate: bool = False,
        overwrite: bool = False,
        scorer: Optional[Callable] = partial(
--- a/spacy/tests/matcher/test_levenshtein.py
+++ b/spacy/tests/matcher/test_levenshtein.py
@ -1,6 +1,6 @@
 import pytest
 from spacy.matcher import levenshtein
-from spacy.matcher.matcher import fuzzy_compare
+from spacy.matcher.levenshtein import levenshtein_compare


 # empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
@ -69,5 +69,5 @@ def test_levenshtein(dist, a, b):
        ("abcdefgh", "cdefghijkl", -1, False),  # default (2)
    ],
 )
-def test_fuzzy_compare(a, b, fuzzy, expected):
-    assert fuzzy_compare(a, b, fuzzy) == expected
+def test_levenshtein_compare(a, b, fuzzy, expected):
+    assert levenshtein_compare(a, b, fuzzy) == expected