Fuzzy matching: derive the default edit limit from the pattern length.

spacy/matcher/matcher.pyx: fuzzy_compare() now takes (input_text,
pattern_text, fuzzy). A non-negative `fuzzy` is used directly as the maximum
number of edits; otherwise the limit is ceil(0.2 * len(pattern_text)).

spacy/tests/matcher/test_levenshtein.py: expected results are updated to the
new limits. The swapped-argument assertion is dropped because the default
limit depends only on pattern_text, so the comparison is no longer symmetric.

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 7e04f4609..e57f79186 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -7,6 +7,7 @@ from libc.string cimport memset, memcmp
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 
+from math import ceil
 import re
 import srsly
 import warnings
@@ -32,14 +33,13 @@ from .levenshtein import levenshtein
 DEF PADDING = 5
 
 
-cpdef bint fuzzy_compare(s1: str, s2: str, fuzzy: int = -1):
-    distance = min(len(s1), len(s2))
-    distance -= 1 # don't allow completely different tokens
-    if fuzzy == -1: # FUZZY operator with unspecified fuzzy
-        fuzzy = 5 # default max fuzzy
-        distance -= 1 # be more restrictive
-    distance = min(fuzzy, distance if distance > 0 else 1)
-    return levenshtein(s1, s2, distance) <= distance
+cpdef bint fuzzy_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
+    if fuzzy >= 0:
+        max_edits = fuzzy
+    else:
+        # allow at least one edit and up to 20% of the pattern string length
+        max_edits = ceil(0.2 * len(pattern_text))
+    return levenshtein(input_text, pattern_text, max_edits) <= max_edits
 
 
 @registry.misc("spacy.fuzzy_compare.v1")
diff --git a/spacy/tests/matcher/test_levenshtein.py b/spacy/tests/matcher/test_levenshtein.py
index 75e2eda13..c4001bba1 100644
--- a/spacy/tests/matcher/test_levenshtein.py
+++ b/spacy/tests/matcher/test_levenshtein.py
@@ -56,19 +56,18 @@ def test_levenshtein(dist, a, b):
         ("a", "ab", -1, True),
         ("ab", "ac", 1, True),
         ("ab", "ac", -1, True),
-        ("abc", "cde", 4, False),  # 4 reduced because of token length
+        ("abc", "cde", 4, True),
         ("abc", "cde", -1, False),
-        ("abcdef", "cdefgh", 4, True),  # 4 not reduced because of token length
+        ("abcdef", "cdefgh", 4, True),
         ("abcdef", "cdefgh", 3, False),
-        ("abcdef", "cdefgh", -1, True),  # default equivalent to 4
+        ("abcdef", "cdefgh", -1, False),  # default (2 for length 6)
         ("abcdefgh", "cdefghijk", 5, True),
         ("abcdefgh", "cdefghijk", 4, False),
-        ("abcdefgh", "cdefghijk", -1, True),  # default equivalent to 5
+        ("abcdefgh", "cdefghijk", -1, False),  # default (2)
         ("abcdefgh", "cdefghijkl", 6, True),
         ("abcdefgh", "cdefghijkl", 5, False),
-        ("abcdefgh", "cdefghijkl", -1, False),  # default equivalent to 5 (max)
+        ("abcdefgh", "cdefghijkl", -1, False),  # default (2)
     ],
 )
 def test_fuzzy_compare(a, b, fuzzy, expected):
     assert fuzzy_compare(a, b, fuzzy) == expected
-    assert fuzzy_compare(b, a, fuzzy) == expected