mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-05 04:40:20 +03:00
Use simpler fuzzy_compare algorithm
This commit is contained in:
parent
3c6dc10d60
commit
9fc37d4ab4
|
@ -7,6 +7,7 @@ from libc.string cimport memset, memcmp
|
|||
from cymem.cymem cimport Pool
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from math import ceil
|
||||
import re
|
||||
import srsly
|
||||
import warnings
|
||||
|
@ -32,14 +33,13 @@ from .levenshtein import levenshtein
|
|||
DEF PADDING = 5
|
||||
|
||||
|
||||
# Earlier fuzzy_compare shown by this diff (commit title: "Use simpler
# fuzzy_compare algorithm"); a second definition with the same name follows it.
# The edit budget starts at the length of the shorter string minus one, is
# floored at 1 and then capped at `fuzzy`; the result is whether
# levenshtein(s1, s2, distance) is <= that budget.
# NOTE(review): indentation is lost in this rendering. The `fuzzy = 5` and
# "be more restrictive" decrement lines presumably both belong to the
# `if fuzzy == -1` body - confirm against the repository before relying on it.
cpdef bint fuzzy_compare(s1: str, s2: str, fuzzy: int = -1):
|
||||
distance = min(len(s1), len(s2))
|
||||
distance -= 1 # don't allow completely different tokens
|
||||
if fuzzy == -1: # FUZZY operator with unspecified fuzzy
|
||||
fuzzy = 5 # default max fuzzy
|
||||
distance -= 1 # be more restrictive
|
||||
distance = min(fuzzy, distance if distance > 0 else 1)
|
||||
return levenshtein(s1, s2, distance) <= distance
|
||||
cpdef bint fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
    """Check whether input_text matches pattern_text within an edit budget.

    input_text: the text being tested.
    pattern_text: the text to match against; only its length is used when
        deriving the default budget.
    fuzzy: maximum number of edits to accept. Any negative value (the
        default is -1) selects the length-based default instead.

    Returns True when levenshtein(input_text, pattern_text, budget) does not
    exceed the budget.
    """
    # An explicit non-negative fuzzy is used unchanged. Otherwise the budget
    # is 20% of the pattern length rounded up, which is at least 1 for any
    # non-empty pattern (and 0 for an empty one).
    max_edits = fuzzy if fuzzy >= 0 else ceil(0.2 * len(pattern_text))
    return levenshtein(input_text, pattern_text, max_edits) <= max_edits
|
||||
|
||||
|
||||
@registry.misc("spacy.fuzzy_compare.v1")
|
||||
|
|
|
@ -56,19 +56,18 @@ def test_levenshtein(dist, a, b):
|
|||
("a", "ab", -1, True),
|
||||
("ab", "ac", 1, True),
|
||||
("ab", "ac", -1, True),
|
||||
("abc", "cde", 4, False), # 4 reduced because of token length
|
||||
("abc", "cde", 4, True),
|
||||
("abc", "cde", -1, False),
|
||||
("abcdef", "cdefgh", 4, True), # 4 not reduced because of token length
|
||||
("abcdef", "cdefgh", 4, True),
|
||||
("abcdef", "cdefgh", 3, False),
|
||||
("abcdef", "cdefgh", -1, True), # default equivalent to 4
|
||||
("abcdef", "cdefgh", -1, False), # default (2 for length 6)
|
||||
("abcdefgh", "cdefghijk", 5, True),
|
||||
("abcdefgh", "cdefghijk", 4, False),
|
||||
("abcdefgh", "cdefghijk", -1, True), # default equivalent to 5
|
||||
("abcdefgh", "cdefghijk", -1, False), # default (2)
|
||||
("abcdefgh", "cdefghijkl", 6, True),
|
||||
("abcdefgh", "cdefghijkl", 5, False),
|
||||
("abcdefgh", "cdefghijkl", -1, False), # default equivalent to 5 (max)
|
||||
("abcdefgh", "cdefghijkl", -1, False), # default (2)
|
||||
],
|
||||
)
|
||||
def test_fuzzy_compare(a, b, fuzzy, expected):
|
||||
assert fuzzy_compare(a, b, fuzzy) == expected
|
||||
assert fuzzy_compare(b, a, fuzzy) == expected
|
||||
|
|
Loading…
Reference in New Issue
Block a user