Use simpler fuzzy_compare algorithm

This commit is contained in:
Adriane Boyd 2022-11-29 16:17:56 +01:00
parent 3c6dc10d60
commit 9fc37d4ab4
2 changed files with 13 additions and 14 deletions

View File

@ -7,6 +7,7 @@ from libc.string cimport memset, memcmp
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from math import ceil
import re import re
import srsly import srsly
import warnings import warnings
@ -32,14 +33,13 @@ from .levenshtein import levenshtein
DEF PADDING = 5 DEF PADDING = 5
cpdef bint fuzzy_compare(s1: str, s2: str, fuzzy: int = -1): cpdef bint fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
distance = min(len(s1), len(s2)) if fuzzy >= 0:
distance -= 1 # don't allow completely different tokens max_edits = fuzzy
if fuzzy == -1: # FUZZY operator with unspecified fuzzy else:
fuzzy = 5 # default max fuzzy # allow at least one edit and up to 20% of the pattern string length
distance -= 1 # be more restrictive max_edits = ceil(0.2 * len(pattern_text))
distance = min(fuzzy, distance if distance > 0 else 1) return levenshtein(input_text, pattern_text, max_edits) <= max_edits
return levenshtein(s1, s2, distance) <= distance
@registry.misc("spacy.fuzzy_compare.v1") @registry.misc("spacy.fuzzy_compare.v1")

View File

@ -56,19 +56,18 @@ def test_levenshtein(dist, a, b):
("a", "ab", -1, True), ("a", "ab", -1, True),
("ab", "ac", 1, True), ("ab", "ac", 1, True),
("ab", "ac", -1, True), ("ab", "ac", -1, True),
("abc", "cde", 4, False), # 4 reduced because of token length ("abc", "cde", 4, True),
("abc", "cde", -1, False), ("abc", "cde", -1, False),
("abcdef", "cdefgh", 4, True), # 4 not reduced because of token length ("abcdef", "cdefgh", 4, True),
("abcdef", "cdefgh", 3, False), ("abcdef", "cdefgh", 3, False),
("abcdef", "cdefgh", -1, True), # default equivalent to 4 ("abcdef", "cdefgh", -1, False), # default (2 for length 6)
("abcdefgh", "cdefghijk", 5, True), ("abcdefgh", "cdefghijk", 5, True),
("abcdefgh", "cdefghijk", 4, False), ("abcdefgh", "cdefghijk", 4, False),
("abcdefgh", "cdefghijk", -1, True), # default equivalent to 5 ("abcdefgh", "cdefghijk", -1, False), # default (2)
("abcdefgh", "cdefghijkl", 6, True), ("abcdefgh", "cdefghijkl", 6, True),
("abcdefgh", "cdefghijkl", 5, False), ("abcdefgh", "cdefghijkl", 5, False),
("abcdefgh", "cdefghijkl", -1, False), # default equivalent to 5 (max) ("abcdefgh", "cdefghijkl", -1, False), # default (2)
], ],
) )
def test_fuzzy_compare(a, b, fuzzy, expected): def test_fuzzy_compare(a, b, fuzzy, expected):
assert fuzzy_compare(a, b, fuzzy) == expected assert fuzzy_compare(a, b, fuzzy) == expected
assert fuzzy_compare(b, a, fuzzy) == expected