mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-05 12:50:20 +03:00
Move fuzzy_match to a standalone method
This commit is contained in:
parent
7e25c7f10f
commit
6ae4c99365
|
@ -205,17 +205,6 @@ cdef class Matcher:
|
|||
else:
|
||||
yield doc
|
||||
|
||||
@staticmethod
def fuzzy_match(input_string: str, rule_string: str, fuzzy: int=-1) -> bool:
    """Decide whether two strings are close enough to count as a fuzzy match.

    input_string (str): The string taken from the text being matched.
    rule_string (str): The string given in the pattern.
    fuzzy (int): Upper bound on the allowed distance. The value -1 means
        no bound was specified; a default bound of 5 is used and the
        length-based allowance is reduced by one extra step.
    RETURNS (bool): True if the value returned by `levenshtein` for the two
        strings does not exceed the computed allowance.
    """
    shorter_len = min(len(input_string), len(rule_string))
    # One less than the shorter string, so that completely different
    # tokens are not accepted.
    allowance = shorter_len - 1
    if fuzzy == -1:
        # FUZZY operator with unspecified fuzzy: apply the default maximum
        # and be more restrictive on the length-based allowance.
        fuzzy = 5
        allowance -= 1
    # The length-based allowance is floored at 1 before being capped by
    # the requested maximum.
    allowance = min(fuzzy, max(allowance, 1))
    return levenshtein(input_string, rule_string, allowance) <= allowance
|
||||
|
||||
|
||||
def __call__(self, object doclike, *, as_spans=False, allow_missing=False, with_alignments=False):
|
||||
"""Find all token sequences matching the supplied pattern.
|
||||
|
||||
|
@ -862,7 +851,7 @@ class _FuzzyPredicate:
|
|||
value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
|
||||
if self.value == value:
|
||||
return True
|
||||
return Matcher.fuzzy_match(value, self.value, self.fuzzy)
|
||||
return fuzzy_match(value, self.value, self.fuzzy)
|
||||
|
||||
|
||||
class _RegexPredicate:
|
||||
|
@ -945,7 +934,7 @@ class _SetPredicate:
|
|||
return True
|
||||
elif self.fuzzy is not None:
|
||||
value = self.vocab.strings[value]
|
||||
return any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
|
||||
return any(fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
|
||||
for v in self.value)
|
||||
else:
|
||||
return False
|
||||
|
@ -957,7 +946,7 @@ class _SetPredicate:
|
|||
return False
|
||||
elif self.fuzzy is not None:
|
||||
value = self.vocab.strings[value]
|
||||
return not any(Matcher.fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
|
||||
return not any(fuzzy_match(value, self.vocab.strings[v], self.fuzzy)
|
||||
for v in self.value)
|
||||
else:
|
||||
return True
|
||||
|
@ -1156,3 +1145,13 @@ def _get_extensions(spec, string_store, name2index):
|
|||
name2index[name] = len(name2index)
|
||||
attr_values.append((name2index[name], value))
|
||||
return attr_values
|
||||
|
||||
|
||||
def fuzzy_match(input_string: str, rule_string: str, fuzzy: int=-1) -> bool:
    """Check whether input_string fuzzily matches rule_string.

    The permitted distance is derived from the length of the shorter string
    and then capped by `fuzzy`.

    input_string (str): The string taken from the text being matched.
    rule_string (str): The string given in the pattern.
    fuzzy (int): Maximum permitted distance. -1 means "unspecified": the cap
        defaults to 5 and the length-based limit is tightened by one more.
    RETURNS (bool): Whether `levenshtein(input_string, rule_string, limit)`
        is at most the computed limit.
    """
    # Start from (shorter length - 1): don't allow completely different tokens.
    limit = min(len(input_string), len(rule_string)) - 1
    if fuzzy == -1:  # FUZZY operator with unspecified fuzzy
        fuzzy = 5  # default max fuzzy
        limit -= 1  # be more restrictive
    # Never let the length-based limit drop below 1, then apply the cap.
    if limit <= 0:
        limit = 1
    limit = min(fuzzy, limit)
    return levenshtein(input_string, rule_string, limit) <= limit
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
from spacy.matcher import levenshtein
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.matcher.matcher import fuzzy_match
|
||||
|
||||
|
||||
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
|
||||
|
@ -70,4 +70,4 @@ def test_levenshtein(dist, a, b):
|
|||
],
|
||||
)
|
||||
def test_fuzzy_match(a, b, fuzzy, expected):
|
||||
assert Matcher.fuzzy_match(a, b, fuzzy) == expected
|
||||
assert fuzzy_match(a, b, fuzzy) == expected
|
||||
|
|
Loading…
Reference in New Issue
Block a user