From d6773351992e0a51e297cd288ad707bb8f400ff5 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 11 Nov 2022 13:32:16 +0100
Subject: [PATCH] Parametrize and merge fuzzy+set tests

---
 spacy/tests/matcher/test_matcher_api.py | 122 +++++++-----------------
 1 file changed, 35 insertions(+), 87 deletions(-)

diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index f9fea559b..9681c5713 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -118,91 +118,56 @@ def test_matcher_match_multi(matcher):
     ]
 
 
-# fuzzy matches on specific tokens
+@pytest.mark.parametrize(
+    "rules,match_locs",
+    [
+        (
+            {
+                "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
+            },
+            [(2, 4)],
+        ),
+        (
+            {
+                "Java": [[{"LOWER": {"FUZZY": "java"}}]],
+            },
+            [(5, 6)],
+        ),
+        (
+            {
+                "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]],
+                "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
+                "Java": [[{"LOWER": {"FUZZY": "java"}}]],
+            },
+            [(2, 4), (5, 6), (8, 9)],
+        ),
+    ],
+)
+def test_matcher_match_fuzzy(en_vocab, rules, match_locs):
+    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
+    doc = Doc(en_vocab, words=words)
 
-
-def test_matcher_match_fuzzy1(en_vocab):
-    rules = {
-        "JS": [[{"ORTH": "JavaScript"}]],
-        "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
-        "Java": [[{"LOWER": "java"}]],
-    }
     matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns)
-
-    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["GoogleNow"], 2, 4),
-    ]
+    assert match_locs == [(start, end) for m_id, start, end in matcher(doc)]
 
 
-def test_matcher_match_fuzzy2(en_vocab):
+@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"])
+def test_matcher_match_fuzzy_set_op_longest(en_vocab, set_op):
     rules = {
-        "JS": [[{"ORTH": "JavaScript"}]],
-        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
-        "Java": [[{"LOWER": {"FUZZY": "java"}}]],
-    }
-    matcher = Matcher(en_vocab)
-    for key, patterns in rules.items():
-        matcher.add(key, patterns)
-
-    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["Java"], 5, 6),
-    ]
-
-
-def test_matcher_match_fuzzy3(en_vocab):
-    rules = {
-        "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]],
-        "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
-        "Java": [[{"LOWER": {"FUZZY": "java"}}]],
-    }
-    matcher = Matcher(en_vocab)
-    for key, patterns in rules.items():
-        matcher.add(key, patterns)
-
-    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["GoogleNow"], 2, 4),
-        (doc.vocab.strings["Java"], 5, 6),
-        (doc.vocab.strings["JS"], 8, 9),
-    ]
-
-
-def test_matcher_match_fuzzy_set1(en_vocab):
-    rules = {"GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}}, "OP": "+"}]]}
-    matcher = Matcher(en_vocab)
-    for key, patterns in rules.items():
-        matcher.add(key, patterns, greedy="LONGEST")
-
-    words = ["They", "like", "Goggle", "Noo"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["GoogleNow"], 2, 4),
-    ]
-
-
-def test_matcher_match_fuzzy_set2(en_vocab):
-    rules = {
-        "GoogleNow": [[{"ORTH": {"FUZZY": {"NOT_IN": ["Google", "Now"]}}, "OP": "+"}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY": {set_op: ["Google", "Now"]}}, "OP": "+"}]]
     }
     matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns, greedy="LONGEST")
 
     words = ["They", "like", "Goggle", "Noo"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["GoogleNow"], 0, 2),
-    ]
+    doc = Doc(en_vocab, words=words)
+    assert len(matcher(doc)) == 1
 
 
-def test_matcher_match_fuzzy_set3(en_vocab):
+def test_matcher_match_fuzzy_set_multiple(en_vocab):
     rules = {
         "GoogleNow": [
             [
@@ -224,23 +189,6 @@
     ]
 
 
-def test_matcher_match_fuzzy_set4(en_vocab):
-    rules = {
-        "QUESTION": [
-            [{"ORTH": {"FUZZY": {"IN": ["what"]}, "NOT_IN": ["that"]}}, {"ORTH": "do"}]
-        ]
-    }
-    matcher = Matcher(en_vocab)
-    for key, patterns in rules.items():
-        matcher.add(key, patterns, greedy="LONGEST")
-
-    words = ["what", "do", "you", "want"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["QUESTION"], 0, 2),
-    ]
-
-
 @pytest.mark.parametrize("fuzzyn", range(1, 6))
 def test_matcher_match_fuzzyn(en_vocab, fuzzyn):
     matcher = Matcher(en_vocab)