Rename to spacy.levenshtein_compare.v1, move to spacy.matcher.levenshtein

This commit is contained in:
Adriane Boyd 2023-01-09 13:50:27 +01:00
parent 0d60744996
commit e0abb55c42
6 changed files with 29 additions and 31 deletions

View File

@ -4,6 +4,8 @@ from libc.stdint cimport int64_t
from typing import Optional
from ..util import registry
cdef extern from "polyleven.c":
int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k)
@ -13,3 +15,18 @@ cpdef int64_t levenshtein(a: str, b: str, k: Optional[int] = None):
if k is None:
k = -1
return polyleven(<PyObject*>a, <PyObject*>b, k)
cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
    """Check whether two strings are within an allowed number of edits.

    input_text (str): The text to compare against the pattern.
    pattern_text (str): The pattern text. Its length determines the default
        edit budget when no explicit budget is given.
    fuzzy (int): Maximum number of edits allowed. A negative value (the
        default) selects a budget based on the pattern length instead.
    RETURNS (bool): True if the value returned by `levenshtein` for the two
        strings does not exceed the edit budget.
    """
    if fuzzy >= 0:
        max_edits = fuzzy
    else:
        # allow at least two edits (to allow at least one transposition) and up
        # to 30% of the pattern string length
        max_edits = max(2, round(0.3 * len(pattern_text)))
    # the budget is also passed to levenshtein() as its `k` argument
    return levenshtein(input_text, pattern_text, max_edits) <= max_edits
@registry.misc("spacy.levenshtein_compare.v1")
def make_levenshtein_compare():
    """Return the comparison function registered in the misc registry under
    the name "spacy.levenshtein_compare.v1".

    RETURNS (Callable): The `levenshtein_compare` function itself.
    """
    return levenshtein_compare

View File

@ -53,5 +53,3 @@ class Matcher:
with_alignments: bool = ...
) -> List[Span]: ...
def _normalize_key(self, key: Any) -> Any: ...
def fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1) -> bool: ...

View File

@ -20,33 +20,17 @@ from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
from .levenshtein import levenshtein_compare
from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings
from ..strings import get_string_id
from ..attrs import IDS
from ..util import registry
from .levenshtein import levenshtein
DEF PADDING = 5
cpdef bint fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
    """Check whether two strings are within an allowed number of edits.

    input_text (str): The text to compare against the pattern.
    pattern_text (str): The pattern text. Its length determines the default
        edit budget when no explicit budget is given.
    fuzzy (int): Maximum number of edits allowed. A negative value (the
        default) selects a budget based on the pattern length instead.
    RETURNS (bool): True if the value returned by `levenshtein` for the two
        strings does not exceed the edit budget.
    """
    if fuzzy >= 0:
        max_edits = fuzzy
    else:
        # allow at least two edits (to allow at least one transposition) and up
        # to 30% of the pattern string length
        max_edits = max(2, round(0.3 * len(pattern_text)))
    # the budget is also passed to levenshtein() as its third argument
    return levenshtein(input_text, pattern_text, max_edits) <= max_edits
@registry.misc("spacy.fuzzy_compare.v1")
def make_fuzzy_compare():
    """Return the comparison function registered in the misc registry under
    the name "spacy.fuzzy_compare.v1".

    RETURNS (Callable): The `fuzzy_compare` function itself.
    """
    return fuzzy_compare
cdef class Matcher:
"""Match sequences of tokens, based on pattern rules.
@ -54,7 +38,7 @@ cdef class Matcher:
USAGE: https://spacy.io/usage/rule-based-matching
"""
def __init__(self, vocab, validate=True, *, fuzzy_compare=fuzzy_compare):
def __init__(self, vocab, validate=True, *, fuzzy_compare=levenshtein_compare):
"""Create the Matcher.
vocab (Vocab): The vocabulary object, which must be shared with the

View File

@ -11,7 +11,7 @@ from ..errors import Errors, Warnings
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher
from ..matcher.matcher import fuzzy_compare
from ..matcher.levenshtein import levenshtein_compare
from ..scorer import get_ner_prf
@ -24,7 +24,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
default_config={
"phrase_matcher_attr": None,
"matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
"validate": False,
"overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP,
@ -85,7 +85,7 @@ class EntityRuler(Pipe):
name: str = "entity_ruler",
*,
phrase_matcher_attr: Optional[Union[int, str]] = None,
matcher_fuzzy_compare: Callable = fuzzy_compare,
matcher_fuzzy_compare: Callable = levenshtein_compare,
validate: bool = False,
overwrite_ents: bool = False,
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
@ -137,7 +137,6 @@ class EntityRuler(Pipe):
if patterns is not None:
self.add_patterns(patterns)
self.scorer = scorer
self.fuzzy_compare = fuzzy_compare
def __len__(self) -> int:
"""The number of all patterns added to the entity ruler."""

View File

@ -13,7 +13,7 @@ from ..util import ensure_path, SimpleFrozenList, registry
from ..tokens import Doc, Span
from ..scorer import Scorer
from ..matcher import Matcher, PhraseMatcher
from ..matcher.matcher import fuzzy_compare
from ..matcher.levenshtein import levenshtein_compare
from .. import util
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
@ -29,7 +29,7 @@ DEFAULT_SPANS_KEY = "ruler"
"overwrite_ents": False,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
"ent_id_sep": "__unused__",
"matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
},
default_score_weights={
"ents_f": 1.0,
@ -76,7 +76,7 @@ def make_entity_ruler(
"annotate_ents": False,
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
"phrase_matcher_attr": None,
"matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
"validate": False,
"overwrite": True,
"scorer": {
@ -223,7 +223,7 @@ class SpanRuler(Pipe):
[Iterable[Span], Iterable[Span]], Iterable[Span]
] = util.filter_chain_spans,
phrase_matcher_attr: Optional[Union[int, str]] = None,
matcher_fuzzy_compare: Callable = fuzzy_compare,
matcher_fuzzy_compare: Callable = levenshtein_compare,
validate: bool = False,
overwrite: bool = False,
scorer: Optional[Callable] = partial(

View File

@ -1,6 +1,6 @@
import pytest
from spacy.matcher import levenshtein
from spacy.matcher.matcher import fuzzy_compare
from spacy.matcher.levenshtein import levenshtein_compare
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
@ -69,5 +69,5 @@ def test_levenshtein(dist, a, b):
("abcdefgh", "cdefghijkl", -1, False), # default (2)
],
)
def test_fuzzy_compare(a, b, fuzzy, expected):
assert fuzzy_compare(a, b, fuzzy) == expected
def test_levenshtein_compare(a, b, fuzzy, expected):
assert levenshtein_compare(a, b, fuzzy) == expected