mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-05 04:40:20 +03:00
Rename to spacy.levenshtein_compare.v1, move to spacy.matcher.levenshtein
This commit is contained in:
parent
0d60744996
commit
e0abb55c42
|
@ -4,6 +4,8 @@ from libc.stdint cimport int64_t
|
||||||
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
from ..util import registry
|
||||||
|
|
||||||
|
|
||||||
cdef extern from "polyleven.c":
|
cdef extern from "polyleven.c":
|
||||||
int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k)
|
int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k)
|
||||||
|
@ -13,3 +15,18 @@ cpdef int64_t levenshtein(a: str, b: str, k: Optional[int] = None):
|
||||||
if k is None:
|
if k is None:
|
||||||
k = -1
|
k = -1
|
||||||
return polyleven(<PyObject*>a, <PyObject*>b, k)
|
return polyleven(<PyObject*>a, <PyObject*>b, k)
|
||||||
|
|
||||||
|
|
||||||
|
cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
|
||||||
|
if fuzzy >= 0:
|
||||||
|
max_edits = fuzzy
|
||||||
|
else:
|
||||||
|
# allow at least two edits (to allow at least one transposition) and up
|
||||||
|
# to 30% of the pattern string length
|
||||||
|
max_edits = max(2, round(0.3 * len(pattern_text)))
|
||||||
|
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
|
||||||
|
|
||||||
|
|
||||||
|
@registry.misc("spacy.levenshtein_compare.v1")
|
||||||
|
def make_levenshtein_compare():
|
||||||
|
return levenshtein_compare
|
||||||
|
|
|
@ -53,5 +53,3 @@ class Matcher:
|
||||||
with_alignments: bool = ...
|
with_alignments: bool = ...
|
||||||
) -> List[Span]: ...
|
) -> List[Span]: ...
|
||||||
def _normalize_key(self, key: Any) -> Any: ...
|
def _normalize_key(self, key: Any) -> Any: ...
|
||||||
|
|
||||||
def fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1) -> bool: ...
|
|
||||||
|
|
|
@ -20,33 +20,17 @@ from ..tokens.token cimport Token
|
||||||
from ..tokens.morphanalysis cimport MorphAnalysis
|
from ..tokens.morphanalysis cimport MorphAnalysis
|
||||||
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
|
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
|
||||||
|
|
||||||
|
from .levenshtein import levenshtein_compare
|
||||||
from ..schemas import validate_token_pattern
|
from ..schemas import validate_token_pattern
|
||||||
from ..errors import Errors, MatchPatternError, Warnings
|
from ..errors import Errors, MatchPatternError, Warnings
|
||||||
from ..strings import get_string_id
|
from ..strings import get_string_id
|
||||||
from ..attrs import IDS
|
from ..attrs import IDS
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
|
|
||||||
from .levenshtein import levenshtein
|
|
||||||
|
|
||||||
|
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
|
||||||
|
|
||||||
cpdef bint fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
|
|
||||||
if fuzzy >= 0:
|
|
||||||
max_edits = fuzzy
|
|
||||||
else:
|
|
||||||
# allow at least two edits (to allow at least one transposition) and up
|
|
||||||
# to 30% of the pattern string length
|
|
||||||
max_edits = max(2, round(0.3 * len(pattern_text)))
|
|
||||||
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
|
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("spacy.fuzzy_compare.v1")
|
|
||||||
def make_fuzzy_compare():
|
|
||||||
return fuzzy_compare
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Matcher:
|
cdef class Matcher:
|
||||||
"""Match sequences of tokens, based on pattern rules.
|
"""Match sequences of tokens, based on pattern rules.
|
||||||
|
|
||||||
|
@ -54,7 +38,7 @@ cdef class Matcher:
|
||||||
USAGE: https://spacy.io/usage/rule-based-matching
|
USAGE: https://spacy.io/usage/rule-based-matching
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, vocab, validate=True, *, fuzzy_compare=fuzzy_compare):
|
def __init__(self, vocab, validate=True, *, fuzzy_compare=levenshtein_compare):
|
||||||
"""Create the Matcher.
|
"""Create the Matcher.
|
||||||
|
|
||||||
vocab (Vocab): The vocabulary object, which must be shared with the
|
vocab (Vocab): The vocabulary object, which must be shared with the
|
||||||
|
|
|
@ -11,7 +11,7 @@ from ..errors import Errors, Warnings
|
||||||
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
|
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
from ..matcher import Matcher, PhraseMatcher
|
from ..matcher import Matcher, PhraseMatcher
|
||||||
from ..matcher.matcher import fuzzy_compare
|
from ..matcher.levenshtein import levenshtein_compare
|
||||||
from ..scorer import get_ner_prf
|
from ..scorer import get_ner_prf
|
||||||
|
|
||||||
|
|
||||||
|
@ -24,7 +24,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
||||||
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
|
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
|
||||||
default_config={
|
default_config={
|
||||||
"phrase_matcher_attr": None,
|
"phrase_matcher_attr": None,
|
||||||
"matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
|
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||||
"validate": False,
|
"validate": False,
|
||||||
"overwrite_ents": False,
|
"overwrite_ents": False,
|
||||||
"ent_id_sep": DEFAULT_ENT_ID_SEP,
|
"ent_id_sep": DEFAULT_ENT_ID_SEP,
|
||||||
|
@ -85,7 +85,7 @@ class EntityRuler(Pipe):
|
||||||
name: str = "entity_ruler",
|
name: str = "entity_ruler",
|
||||||
*,
|
*,
|
||||||
phrase_matcher_attr: Optional[Union[int, str]] = None,
|
phrase_matcher_attr: Optional[Union[int, str]] = None,
|
||||||
matcher_fuzzy_compare: Callable = fuzzy_compare,
|
matcher_fuzzy_compare: Callable = levenshtein_compare,
|
||||||
validate: bool = False,
|
validate: bool = False,
|
||||||
overwrite_ents: bool = False,
|
overwrite_ents: bool = False,
|
||||||
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
|
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
|
||||||
|
@ -137,7 +137,6 @@ class EntityRuler(Pipe):
|
||||||
if patterns is not None:
|
if patterns is not None:
|
||||||
self.add_patterns(patterns)
|
self.add_patterns(patterns)
|
||||||
self.scorer = scorer
|
self.scorer = scorer
|
||||||
self.fuzzy_compare = fuzzy_compare
|
|
||||||
|
|
||||||
def __len__(self) -> int:
|
def __len__(self) -> int:
|
||||||
"""The number of all patterns added to the entity ruler."""
|
"""The number of all patterns added to the entity ruler."""
|
||||||
|
|
|
@ -13,7 +13,7 @@ from ..util import ensure_path, SimpleFrozenList, registry
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..matcher import Matcher, PhraseMatcher
|
from ..matcher import Matcher, PhraseMatcher
|
||||||
from ..matcher.matcher import fuzzy_compare
|
from ..matcher.levenshtein import levenshtein_compare
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
||||||
|
@ -29,7 +29,7 @@ DEFAULT_SPANS_KEY = "ruler"
|
||||||
"overwrite_ents": False,
|
"overwrite_ents": False,
|
||||||
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
|
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
|
||||||
"ent_id_sep": "__unused__",
|
"ent_id_sep": "__unused__",
|
||||||
"matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
|
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||||
},
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
"ents_f": 1.0,
|
"ents_f": 1.0,
|
||||||
|
@ -76,7 +76,7 @@ def make_entity_ruler(
|
||||||
"annotate_ents": False,
|
"annotate_ents": False,
|
||||||
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
|
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
|
||||||
"phrase_matcher_attr": None,
|
"phrase_matcher_attr": None,
|
||||||
"matcher_fuzzy_compare": {"@misc": "spacy.fuzzy_compare.v1"},
|
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||||
"validate": False,
|
"validate": False,
|
||||||
"overwrite": True,
|
"overwrite": True,
|
||||||
"scorer": {
|
"scorer": {
|
||||||
|
@ -223,7 +223,7 @@ class SpanRuler(Pipe):
|
||||||
[Iterable[Span], Iterable[Span]], Iterable[Span]
|
[Iterable[Span], Iterable[Span]], Iterable[Span]
|
||||||
] = util.filter_chain_spans,
|
] = util.filter_chain_spans,
|
||||||
phrase_matcher_attr: Optional[Union[int, str]] = None,
|
phrase_matcher_attr: Optional[Union[int, str]] = None,
|
||||||
matcher_fuzzy_compare: Callable = fuzzy_compare,
|
matcher_fuzzy_compare: Callable = levenshtein_compare,
|
||||||
validate: bool = False,
|
validate: bool = False,
|
||||||
overwrite: bool = False,
|
overwrite: bool = False,
|
||||||
scorer: Optional[Callable] = partial(
|
scorer: Optional[Callable] = partial(
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.matcher import levenshtein
|
from spacy.matcher import levenshtein
|
||||||
from spacy.matcher.matcher import fuzzy_compare
|
from spacy.matcher.levenshtein import levenshtein_compare
|
||||||
|
|
||||||
|
|
||||||
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
|
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
|
||||||
|
@ -69,5 +69,5 @@ def test_levenshtein(dist, a, b):
|
||||||
("abcdefgh", "cdefghijkl", -1, False), # default (2)
|
("abcdefgh", "cdefghijkl", -1, False), # default (2)
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_fuzzy_compare(a, b, fuzzy, expected):
|
def test_levenshtein_compare(a, b, fuzzy, expected):
|
||||||
assert fuzzy_compare(a, b, fuzzy) == expected
|
assert levenshtein_compare(a, b, fuzzy) == expected
|
||||||
|
|
Loading…
Reference in New Issue
Block a user