mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-05 04:40:20 +03:00
Use simpler fuzzy_compare algorithm
This commit is contained in:
parent
3c6dc10d60
commit
9fc37d4ab4
|
@ -7,6 +7,7 @@ from libc.string cimport memset, memcmp
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
|
from math import ceil
|
||||||
import re
|
import re
|
||||||
import srsly
|
import srsly
|
||||||
import warnings
|
import warnings
|
||||||
|
@ -32,14 +33,13 @@ from .levenshtein import levenshtein
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
|
||||||
|
|
||||||
cpdef bint fuzzy_compare(s1: str, s2: str, fuzzy: int = -1):
|
cpdef bint fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
|
||||||
distance = min(len(s1), len(s2))
|
if fuzzy >= 0:
|
||||||
distance -= 1 # don't allow completely different tokens
|
max_edits = fuzzy
|
||||||
if fuzzy == -1: # FUZZY operator with unspecified fuzzy
|
else:
|
||||||
fuzzy = 5 # default max fuzzy
|
# allow 20% of the pattern string length, rounded up (so at least one edit for any non-empty pattern)
|
||||||
distance -= 1 # be more restrictive
|
max_edits = ceil(0.2 * len(pattern_text))
|
||||||
distance = min(fuzzy, distance if distance > 0 else 1)
|
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
|
||||||
return levenshtein(s1, s2, distance) <= distance
|
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("spacy.fuzzy_compare.v1")
|
@registry.misc("spacy.fuzzy_compare.v1")
|
||||||
|
|
|
@ -56,19 +56,18 @@ def test_levenshtein(dist, a, b):
|
||||||
("a", "ab", -1, True),
|
("a", "ab", -1, True),
|
||||||
("ab", "ac", 1, True),
|
("ab", "ac", 1, True),
|
||||||
("ab", "ac", -1, True),
|
("ab", "ac", -1, True),
|
||||||
("abc", "cde", 4, False), # 4 reduced because of token length
|
("abc", "cde", 4, True),
|
||||||
("abc", "cde", -1, False),
|
("abc", "cde", -1, False),
|
||||||
("abcdef", "cdefgh", 4, True), # 4 not reduced because of token length
|
("abcdef", "cdefgh", 4, True),
|
||||||
("abcdef", "cdefgh", 3, False),
|
("abcdef", "cdefgh", 3, False),
|
||||||
("abcdef", "cdefgh", -1, True), # default equivalent to 4
|
("abcdef", "cdefgh", -1, False), # default (2 for length 6)
|
||||||
("abcdefgh", "cdefghijk", 5, True),
|
("abcdefgh", "cdefghijk", 5, True),
|
||||||
("abcdefgh", "cdefghijk", 4, False),
|
("abcdefgh", "cdefghijk", 4, False),
|
||||||
("abcdefgh", "cdefghijk", -1, True), # default equivalent to 5
|
("abcdefgh", "cdefghijk", -1, False), # default (2)
|
||||||
("abcdefgh", "cdefghijkl", 6, True),
|
("abcdefgh", "cdefghijkl", 6, True),
|
||||||
("abcdefgh", "cdefghijkl", 5, False),
|
("abcdefgh", "cdefghijkl", 5, False),
|
||||||
("abcdefgh", "cdefghijkl", -1, False), # default equivalent to 5 (max)
|
("abcdefgh", "cdefghijkl", -1, False), # default (2)
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_fuzzy_compare(a, b, fuzzy, expected):
|
def test_fuzzy_compare(a, b, fuzzy, expected):
|
||||||
assert fuzzy_compare(a, b, fuzzy) == expected
|
assert fuzzy_compare(a, b, fuzzy) == expected
|
||||||
assert fuzzy_compare(b, a, fuzzy) == expected
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user