mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-04 20:30:24 +03:00
Rename to spacy.levenshtein_compare.v1, move to spacy.matcher.levenshtein
This commit is contained in:
parent
0d60744996
commit
e0abb55c42
|
@ -4,6 +4,8 @@ from libc.stdint cimport int64_t
|
|||
|
||||
from typing import Optional
|
||||
|
||||
from ..util import registry
|
||||
|
||||
|
||||
cdef extern from "polyleven.c":
|
||||
int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k)
|
||||
|
@ -13,3 +15,18 @@ cpdef int64_t levenshtein(a: str, b: str, k: Optional[int] = None):
|
|||
if k is None:
|
||||
k = -1
|
||||
return polyleven(<PyObject*>a, <PyObject*>b, k)
|
||||
|
||||
|
||||
cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
|
||||
if fuzzy >= 0:
|
||||
max_edits = fuzzy
|
||||
else:
|
||||
# allow at least two edits (to allow at least one transposition) and up
|
||||
# to 20% of the pattern string length
|
||||
max_edits = max(2, round(0.3 * len(pattern_text)))
|
||||
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
|
||||
|
||||
|
||||
@registry.misc("spacy.levenshtein_compare.v1")
|
||||
def make_levenshtein_compare():
|
||||
return levenshtein_compare
|
||||
|
|
|
@ -53,5 +53,3 @@ class Matcher:
|
|||
with_alignments: bool = ...
|
||||
) -> List[Span]: ...
|
||||
def _normalize_key(self, key: Any) -> Any: ...
|
||||
|
||||
def fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1) -> bool: ...
|
||||
|
|
|
@ -20,33 +20,17 @@ from ..tokens.token cimport Token
|
|||
from ..tokens.morphanalysis cimport MorphAnalysis
|
||||
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
|
||||
|
||||
from .levenshtein import levenshtein_compare
|
||||
from ..schemas import validate_token_pattern
|
||||
from ..errors import Errors, MatchPatternError, Warnings
|
||||
from ..strings import get_string_id
|
||||
from ..attrs import IDS
|
||||
from ..util import registry
|
||||
|
||||
from .levenshtein import levenshtein
|
||||
|
||||
|
||||
DEF PADDING = 5
|
||||
|
||||
|
||||
cpdef bint fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
|
||||
if fuzzy >= 0:
|
||||
max_edits = fuzzy
|
||||
else:
|
||||
# allow at least two edits (to allow at least one transposition) and up
|
||||
# to 20% of the pattern string length
|
||||
max_edits = max(2, round(0.3 * len(pattern_text)))
|
||||
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
|
||||
|
||||
|
||||
@registry.misc("spacy.fuzzy_compare.v1")
|
||||
def make_fuzzy_compare():
|
||||
return fuzzy_compare
|
||||
|
||||
|
||||
cdef class Matcher:
|
||||
"""Match sequences of tokens, based on pattern rules.
|
||||
|
||||
|
@ -54,7 +38,7 @@ cdef class Matcher:
|
|||
USAGE: https://spacy.io/usage/rule-based-matching
|
||||
"""
|
||||
|
||||
def __init__(self, vocab, validate=True, *, fuzzy_compare=fuzzy_compare):
|
||||
def __init__(self, vocab, validate=True, *, fuzzy_compare=levenshtein_compare):
|
||||
"""Create the Matcher.
|
||||
|
||||
vocab (Vocab): The vocabulary object, which must be shared with the
|
||||
|
|
|
@ -11,7 +11,7 @@ from ..errors import Errors, Warnings
|
|||
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
|
||||
from ..tokens import Doc, Span
|
||||
from ..matcher import Matcher, PhraseMatcher
|
||||
from ..matcher.matcher import fuzzy_compare
|
||||
from ..matcher.levenshtein import levenshtein_compare
|
||||
from ..scorer import get_ner_prf
|
||||
|
||||
|
||||
|
@ -24,7 +24,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
|||
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
|
||||
default_config={
|
||||
"phrase_matcher_attr": None,
|
||||
"matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
|
||||
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||
"validate": False,
|
||||
"overwrite_ents": False,
|
||||
"ent_id_sep": DEFAULT_ENT_ID_SEP,
|
||||
|
@ -85,7 +85,7 @@ class EntityRuler(Pipe):
|
|||
name: str = "entity_ruler",
|
||||
*,
|
||||
phrase_matcher_attr: Optional[Union[int, str]] = None,
|
||||
matcher_fuzzy_compare: Callable = fuzzy_compare,
|
||||
matcher_fuzzy_compare: Callable = levenshtein_compare,
|
||||
validate: bool = False,
|
||||
overwrite_ents: bool = False,
|
||||
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
|
||||
|
@ -137,7 +137,6 @@ class EntityRuler(Pipe):
|
|||
if patterns is not None:
|
||||
self.add_patterns(patterns)
|
||||
self.scorer = scorer
|
||||
self.fuzzy_compare = fuzzy_compare
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""The number of all patterns added to the entity ruler."""
|
||||
|
|
|
@ -13,7 +13,7 @@ from ..util import ensure_path, SimpleFrozenList, registry
|
|||
from ..tokens import Doc, Span
|
||||
from ..scorer import Scorer
|
||||
from ..matcher import Matcher, PhraseMatcher
|
||||
from ..matcher.matcher import fuzzy_compare
|
||||
from ..matcher.levenshtein import levenshtein_compare
|
||||
from .. import util
|
||||
|
||||
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
||||
|
@ -29,7 +29,7 @@ DEFAULT_SPANS_KEY = "ruler"
|
|||
"overwrite_ents": False,
|
||||
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
|
||||
"ent_id_sep": "__unused__",
|
||||
"matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
|
||||
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||
},
|
||||
default_score_weights={
|
||||
"ents_f": 1.0,
|
||||
|
@ -76,7 +76,7 @@ def make_entity_ruler(
|
|||
"annotate_ents": False,
|
||||
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
|
||||
"phrase_matcher_attr": None,
|
||||
"matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
|
||||
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||
"validate": False,
|
||||
"overwrite": True,
|
||||
"scorer": {
|
||||
|
@ -223,7 +223,7 @@ class SpanRuler(Pipe):
|
|||
[Iterable[Span], Iterable[Span]], Iterable[Span]
|
||||
] = util.filter_chain_spans,
|
||||
phrase_matcher_attr: Optional[Union[int, str]] = None,
|
||||
matcher_fuzzy_compare: Callable = fuzzy_compare,
|
||||
matcher_fuzzy_compare: Callable = levenshtein_compare,
|
||||
validate: bool = False,
|
||||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = partial(
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
from spacy.matcher import levenshtein
|
||||
from spacy.matcher.matcher import fuzzy_compare
|
||||
from spacy.matcher.levenshtein import levenshtein_compare
|
||||
|
||||
|
||||
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
|
||||
|
@ -69,5 +69,5 @@ def test_levenshtein(dist, a, b):
|
|||
("abcdefgh", "cdefghijkl", -1, False), # default (2)
|
||||
],
|
||||
)
|
||||
def test_fuzzy_compare(a, b, fuzzy, expected):
|
||||
assert fuzzy_compare(a, b, fuzzy) == expected
|
||||
def test_levenshtein_compare(a, b, fuzzy, expected):
|
||||
assert levenshtein_compare(a, b, fuzzy) == expected
|
||||
|
|
Loading…
Reference in New Issue
Block a user