mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-24 16:24:16 +03:00
19650ebb52
* enable fuzzy matching * add fuzzy param to EntityMatcher * include rapidfuzz_capi not yet used * fix type * add FUZZY predicate * add fuzzy attribute list * fix type properly * tidying * remove unnecessary dependency * handle fuzzy sets * simplify fuzzy sets * case fix * switch to FUZZYn predicates use Levenshtein distance. remove fuzzy param. remove rapidfuzz_capi. * revert changes added for fuzzy param * switch to polyleven (Python package) * enable fuzzy matching * add fuzzy param to EntityMatcher * include rapidfuzz_capi not yet used * fix type * add FUZZY predicate * add fuzzy attribute list * fix type properly * tidying * remove unnecessary dependency * handle fuzzy sets * simplify fuzzy sets * case fix * switch to FUZZYn predicates use Levenshtein distance. remove fuzzy param. remove rapidfuzz_capi. * revert changes added for fuzzy param * switch to polyleven (Python package) * fuzzy match only on oov tokens * remove polyleven * exclude whitespace tokens * don't allow more edits than characters * fix min distance * reinstate FUZZY operator with length-based distance function * handle sets inside regex operator * remove is_oov check * attempt build fix no mypy failure locally * re-attempt build fix * don't overwrite fuzzy param value * move fuzzy_match to its own Python module to allow patching * move fuzzy_match back inside Matcher simplify logic and add tests * Format tests * Parametrize fuzzyn tests * Parametrize and merge fuzzy+set tests * Format * Move fuzzy_match to a standalone method * Change regex kwarg type to bool * Add types for fuzzy_match - Refactor variable names - Add test for symmetrical behavior * Parametrize fuzzyn+set tests * Minor refactoring for fuzz/fuzzy * Make fuzzy_match a Matcher kwarg * Update type for _default_fuzzy_match * don't overwrite function param * Rename to fuzzy_compare * Update fuzzy_compare default argument declarations * allow fuzzy_compare override from EntityRuler * define new Matcher keyword arg * fix type 
definition * Implement fuzzy_compare config option for EntityRuler and SpanRuler * Rename _default_fuzzy_compare to fuzzy_compare, remove from reexported objects * Use simpler fuzzy_compare algorithm * Update types * Increase minimum to 2 in fuzzy_compare to allow one transposition * Fix predicate keys and matching for SetPredicate with FUZZY and REGEX * Add FUZZY6..9 * Add initial docs * Increase default fuzzy to rounded 30% of pattern length * Update docs for fuzzy_compare in components * Update EntityRuler and SpanRuler API docs * Rename EntityRuler and SpanRuler setting to matcher_fuzzy_compare To have naming similar to `phrase_matcher_attr`, rename `fuzzy_compare` setting for `EntityRuler` and `SpanRuler` to `matcher_fuzzy_compare`. Organize next to `phrase_matcher_attr` in docs. * Fix schema aliases Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix typo Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Add FUZZY6-9 operators and update tests * Parameterize test over greedy Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix type for fuzzy_compare to remove Optional * Rename to spacy.levenshtein_compare.v1, move to spacy.matcher.levenshtein * Update docs following levenshtein_compare renaming Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
74 lines
2.7 KiB
Python
74 lines
2.7 KiB
Python
import pytest
|
|
from spacy.matcher import levenshtein
|
|
from spacy.matcher.levenshtein import levenshtein_compare
|
|
|
|
|
|
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
|
|
# from polyleven
|
|
@pytest.mark.parametrize(
    "dist,a,b",
    [
        # Degenerate case: two empty strings.
        (0, "", ""),
        # Short ASCII pairs.
        (4, "bbcb", "caba"),
        (3, "abcb", "cacc"),
        (3, "aa", "ccc"),
        (1, "cca", "ccac"),
        (1, "aba", "aa"),
        (4, "bcbb", "abac"),
        (3, "acbc", "bba"),
        (3, "cbba", "a"),
        (2, "bcc", "ba"),
        (4, "aaa", "ccbb"),
        # Short non-ASCII (unicode) pairs.
        (3, "うあい", "いいうい"),
        (2, "あううい", "うあい"),
        (3, "いういい", "うううあ"),
        (2, "うい", "あいあ"),
        (2, "いあい", "いう"),
        (1, "いい", "あいい"),
        (3, "あうあ", "いいああ"),
        (4, "いあうう", "ううああ"),
        (3, "いあいい", "ういああ"),
        (3, "いいああ", "ううあう"),
        # Long pairs.
        (
            166,
            "TCTGGGCACGGATTCGTCAGATTCCATGTCCATATTTGAGGCTCTTGCAGGCAAAATTTGGGCATGTGAACTCCTTATAGTCCCCGTGC",
            "ATATGGATTGGGGGCATTCAAAGATACGGTTTCCCTTTCTTCAGTTTCGCGCGGCGCACGTCCGGGTGCGAGCCAGTTCGTCTTACTCACATTGTCGACTTCACGAATCGCGCATGATGTGCTTAGCCTGTACTTACGAACGAACTTTCGGTCCAAATACATTCTATCAACACCGAGGTATCCGTGCCACACGCCGAAGCTCGACCGTGTTCGTTGAGAGGTGGAAATGGTAAAAGATGAACATAGTC",
        ),
        (
            111,
            "GGTTCGGCCGAATTCATAGAGCGTGGTAGTCGACGGTATCCCGCCTGGTAGGGGCCCCTTCTACCTAGCGGAAGTTTGTCAGTACTCTATAACACGAGGGCCTCTCACACCCTAGATCGTCCAGCCACTCGAAGATCGCAGCACCCTTACAGAAAGGCATTAATGTTTCTCCTAGCACTTGTGCAATGGTGAAGGAGTGATG",
            "CGTAACACTTCGCGCTACTGGGCTGCAACGTCTTGGGCATACATGCAAGATTATCTAATGCAAGCTTGAGCCCCGCTTGCGGAATTTCCCTAATCGGGGTCCCTTCCTGTTACGATAAGGACGCGTGCACT",
        ),
    ],
)
def test_levenshtein(dist, a, b):
    """Check that ``levenshtein(a, b)`` equals the expected distance ``dist``.

    Each parametrized case supplies the expected distance followed by the two
    strings to compare.
    """
    computed = levenshtein(a, b)
    assert computed == dist
|
|
|
|
|
|
@pytest.mark.parametrize(
    "a,b,fuzzy,expected",
    [
        # Identical single-character strings.
        ("a", "a", 1, True),
        ("a", "a", 0, True),
        ("a", "a", -1, True),
        # One extra trailing character.
        ("a", "ab", 1, True),
        ("a", "ab", 0, False),
        ("a", "ab", -1, True),
        # One differing character.
        ("ab", "ac", 1, True),
        ("ab", "ac", -1, True),
        # Longer strings with larger differences.
        ("abc", "cde", 4, True),
        ("abc", "cde", -1, False),
        ("abcdef", "cdefgh", 4, True),
        ("abcdef", "cdefgh", 3, False),
        ("abcdef", "cdefgh", -1, False),  # default (2 for length 6)
        ("abcdefgh", "cdefghijk", 5, True),
        ("abcdefgh", "cdefghijk", 4, False),
        ("abcdefgh", "cdefghijk", -1, False),  # default (2)
        ("abcdefgh", "cdefghijkl", 6, True),
        ("abcdefgh", "cdefghijkl", 5, False),
        ("abcdefgh", "cdefghijkl", -1, False),  # default (2)
    ],
)
def test_levenshtein_compare(a, b, fuzzy, expected):
    """Check the boolean result of ``levenshtein_compare(a, b, fuzzy)``.

    Each parametrized case supplies the two strings, the ``fuzzy`` argument and
    the expected outcome. Per the inline case comments, ``fuzzy=-1`` exercises
    the function's default threshold.
    """
    outcome = levenshtein_compare(a, b, fuzzy)
    assert outcome == expected
|