Use simpler fuzzy_compare algorithm

This commit is contained in:
Adriane Boyd 2022-11-29 16:17:56 +01:00
parent 3c6dc10d60
commit 9fc37d4ab4
2 changed files with 13 additions and 14 deletions

View File

@ -7,6 +7,7 @@ from libc.string cimport memset, memcmp
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from math import ceil
import re
import srsly
import warnings
@ -32,14 +33,13 @@ from .levenshtein import levenshtein
DEF PADDING = 5
# Appears to be the pre-change version of fuzzy_compare shown by this diff hunk
# (a second definition with the same name follows directly after it).
# Derives an edit limit from the length of the shorter string, caps it by
# `fuzzy` (5 when fuzzy is -1), and returns whether levenshtein(s1, s2, limit)
# does not exceed that limit.
# NOTE(review): indentation was lost in this view; the second `distance -= 1`
# presumably belongs to the `if fuzzy == -1` branch -- confirm against the file.
cpdef bint fuzzy_compare(s1: str, s2: str, fuzzy: int = -1):
distance = min(len(s1), len(s2))
distance -= 1 # don't allow completely different tokens
if fuzzy == -1: # FUZZY operator with unspecified fuzzy
fuzzy = 5 # default max fuzzy
distance -= 1 # be more restrictive
# never drop the length-based limit below 1, then cap it by `fuzzy`
distance = min(fuzzy, distance if distance > 0 else 1)
return levenshtein(s1, s2, distance) <= distance
cpdef bint fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
    """Check whether input_text is close enough to pattern_text.

    input_text: the string being tested.
    pattern_text: the string it is compared against; its length determines
        the default edit limit.
    fuzzy: edit limit to use when it is >= 0. Any negative value (the default
        is -1) selects a limit of ceil(0.2 * len(pattern_text)), which is 1
        for patterns of 1-5 characters and 0 for an empty pattern.

    Returns True when levenshtein(input_text, pattern_text, limit) is not
    greater than the limit.
    """
    # An explicit non-negative fuzzy wins; otherwise scale the limit with the
    # pattern length (20%, rounded up).
    edit_limit = fuzzy if fuzzy >= 0 else ceil(0.2 * len(pattern_text))
    return levenshtein(input_text, pattern_text, edit_limit) <= edit_limit
@registry.misc("spacy.fuzzy_compare.v1")

View File

@ -56,19 +56,18 @@ def test_levenshtein(dist, a, b):
("a", "ab", -1, True),
("ab", "ac", 1, True),
("ab", "ac", -1, True),
("abc", "cde", 4, False), # 4 reduced because of token length
("abc", "cde", 4, True),
("abc", "cde", -1, False),
("abcdef", "cdefgh", 4, True), # 4 not reduced because of token length
("abcdef", "cdefgh", 4, True),
("abcdef", "cdefgh", 3, False),
("abcdef", "cdefgh", -1, True), # default equivalent to 4
("abcdef", "cdefgh", -1, False), # default (2 for length 6)
("abcdefgh", "cdefghijk", 5, True),
("abcdefgh", "cdefghijk", 4, False),
("abcdefgh", "cdefghijk", -1, True), # default equivalent to 5
("abcdefgh", "cdefghijk", -1, False), # default (2)
("abcdefgh", "cdefghijkl", 6, True),
("abcdefgh", "cdefghijkl", 5, False),
("abcdefgh", "cdefghijkl", -1, False), # default equivalent to 5 (max)
("abcdefgh", "cdefghijkl", -1, False), # default (2)
],
)
def test_fuzzy_compare(a, b, fuzzy, expected):
    # Every case has to give the same verdict with the two strings swapped,
    # so run the comparison once per argument order.
    for left, right in ((a, b), (b, a)):
        assert fuzzy_compare(left, right, fuzzy) == expected