mirror of https://github.com/explosion/spaCy.git
synced 2025-08-06 05:10:21 +03:00

Parametrize and merge fuzzy+set tests

This commit is contained in:
parent 070cbf2be0
commit d677335199
@@ -118,91 +118,56 @@ def test_matcher_match_multi(matcher):
     ]
 
 
-def test_matcher_match_fuzzy1(en_vocab):
-    rules = {
-        "JS": [[{"ORTH": "JavaScript"}]],
-        "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
-        "Java": [[{"LOWER": "java"}]],
-    }
+# fuzzy matches on specific tokens
+@pytest.mark.parametrize(
+    "rules,match_locs",
+    [
+        (
+            {
+                "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
+            },
+            [(2, 4)],
+        ),
+        (
+            {
+                "Java": [[{"LOWER": {"FUZZY": "java"}}]],
+            },
+            [(5, 6)],
+        ),
+        (
+            {
+                "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]],
+                "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
+                "Java": [[{"LOWER": {"FUZZY": "java"}}]],
+            },
+            [(2, 4), (5, 6), (8, 9)],
+        ),
+    ],
+)
+def test_matcher_match_fuzzy(en_vocab, rules, match_locs):
+    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
+    doc = Doc(en_vocab, words=words)
+
     matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns)
-
-    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["GoogleNow"], 2, 4),
-    ]
+    assert match_locs == [(start, end) for m_id, start, end in matcher(doc)]
 
 
-def test_matcher_match_fuzzy2(en_vocab):
+@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"])
+def test_matcher_match_fuzzy_set_op_longest(en_vocab, set_op):
     rules = {
-        "JS": [[{"ORTH": "JavaScript"}]],
-        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
-        "Java": [[{"LOWER": {"FUZZY": "java"}}]],
-    }
-    matcher = Matcher(en_vocab)
-    for key, patterns in rules.items():
-        matcher.add(key, patterns)
-
-    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["Java"], 5, 6),
-    ]
-
-
-def test_matcher_match_fuzzy3(en_vocab):
-    rules = {
-        "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]],
-        "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
-        "Java": [[{"LOWER": {"FUZZY": "java"}}]],
-    }
-    matcher = Matcher(en_vocab)
-    for key, patterns in rules.items():
-        matcher.add(key, patterns)
-
-    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["GoogleNow"], 2, 4),
-        (doc.vocab.strings["Java"], 5, 6),
-        (doc.vocab.strings["JS"], 8, 9),
-    ]
-
-
-def test_matcher_match_fuzzy_set1(en_vocab):
-    rules = {"GoogleNow": [[{"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}}, "OP": "+"}]]}
-    matcher = Matcher(en_vocab)
-    for key, patterns in rules.items():
-        matcher.add(key, patterns, greedy="LONGEST")
-
-    words = ["They", "like", "Goggle", "Noo"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["GoogleNow"], 2, 4),
-    ]
-
-
-def test_matcher_match_fuzzy_set2(en_vocab):
-    rules = {
-        "GoogleNow": [[{"ORTH": {"FUZZY": {"NOT_IN": ["Google", "Now"]}}, "OP": "+"}]],
+        "GoogleNow": [[{"ORTH": {"FUZZY": {set_op: ["Google", "Now"]}}, "OP": "+"}]]
     }
     matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, patterns, greedy="LONGEST")
 
     words = ["They", "like", "Goggle", "Noo"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["GoogleNow"], 0, 2),
-    ]
+    doc = Doc(en_vocab, words=words)
+    assert len(matcher(doc)) == 1
 
 
-def test_matcher_match_fuzzy_set3(en_vocab):
+def test_matcher_match_fuzzy_set_multiple(en_vocab):
     rules = {
         "GoogleNow": [
             [
@@ -224,23 +189,6 @@ def test_matcher_match_fuzzy_set3(en_vocab):
         ]
 
 
-def test_matcher_match_fuzzy_set4(en_vocab):
-    rules = {
-        "QUESTION": [
-            [{"ORTH": {"FUZZY": {"IN": ["what"]}, "NOT_IN": ["that"]}}, {"ORTH": "do"}]
-        ]
-    }
-    matcher = Matcher(en_vocab)
-    for key, patterns in rules.items():
-        matcher.add(key, patterns, greedy="LONGEST")
-
-    words = ["what", "do", "you", "want"]
-    doc = Doc(matcher.vocab, words=words)
-    assert matcher(doc) == [
-        (doc.vocab.strings["QUESTION"], 0, 2),
-    ]
-
-
 @pytest.mark.parametrize("fuzzyn", range(1, 6))
 def test_matcher_match_fuzzyn(en_vocab, fuzzyn):
     matcher = Matcher(en_vocab)
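
For readers skimming the diff: the merged tests exercise spaCy's fuzzy token matching, which was added to the Matcher in spaCy v3.5. Below is a minimal standalone sketch of the behavior under test; the rule names "GoogleNow" and "GoogleNowSet" and the example words mirror the tests, but the snippet itself is an illustration, not part of the commit.

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# FUZZY matches a token whose text is within a small Levenshtein distance of
# the pattern value, so the misspelled "Goggle" still triggers the rule.
matcher.add("GoogleNow", [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]])
# FUZZY also composes with the IN/NOT_IN set operators; greedy="LONGEST"
# keeps only the longest of the overlapping matches produced by "OP": "+".
matcher.add(
    "GoogleNowSet",
    [[{"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}}, "OP": "+"}]],
    greedy="LONGEST",
)

doc = Doc(nlp.vocab, words=["They", "like", "Goggle", "Now"])
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
# Both rules match the "Goggle Now" span (tokens 2-4).

The greedy="LONGEST" argument matters for the set-op patterns in the diff: without it, every overlapping sub-span of the "+" match would be returned separately, which is why the parametrized test only asserts on the number of matches.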
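The tail of the diff keeps the parametrized test_matcher_match_fuzzyn, which covers the FUZZY1-FUZZY9 variants that bound the allowed edit distance explicitly rather than using the default. A rough sketch of that behavior, again assuming spaCy >= 3.5 (the "JS1"/"JS2" labels are illustrative only):

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# "JvvaScrpt" is two edits from "JavaScript" (one substitution plus one
# deletion), so FUZZY1 is too strict while FUZZY2 and above match.
matcher.add("JS1", [[{"ORTH": {"FUZZY1": "JavaScript"}}]])
matcher.add("JS2", [[{"ORTH": {"FUZZY2": "JavaScript"}}]])

doc = Doc(nlp.vocab, words=["JvvaScrpt"])
print([nlp.vocab.strings[m_id] for m_id, start, end in matcher(doc)])
# Expected output: ['JS2']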