Added test cases to test_matcher_api.py and test_matcher_logic.py

This commit is contained in:
Source-Shen 2022-07-02 01:22:55 +08:00
parent 616bc51743
commit f5fa0d0bad
2 changed files with 85 additions and 2 deletions

View File

@ -715,3 +715,15 @@ def test_matcher_min_max_operator(en_vocab):
matcher.add("TEST", [pattern])
matches4 = [doc[start:end].text for _, start, end in matcher(doc)]
assert len(matches4) == 4
def test_matcher_non_greedy_operator(en_vocab):
doc = Doc(
en_vocab, words=["a", "a", "a", "B", "B", "a", "B"]
)
matcher = Matcher(en_vocab)
pattern = [{"ORTH": "a", "OP": "*?"}, {"ORTH": "B", "OP": "+?"}]
matcher.add("Non-greedy", [pattern])
matches = [doc[start:end].text for _, start, end in matcher(doc)]
assert len(matches) == 7

View File

@ -703,13 +703,19 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
("aaab", "a{,3} b", [0, 0, 0, 1]),
("aaab", "a{2} b", [0, 0, 1]),
("aaab", "a{2,3} b", [0, 0, 0, 1]),
("aaab", "a+? b", [0, 0, 0, 1]),
("aaab", "a*? b", [0, 0, 0, 1]),
]
for string, pattern_str, result in cases:
matcher = Matcher(en_vocab)
doc = Doc(matcher.vocab, words=list(string))
pattern = []
for part in pattern_str.split():
if part.endswith("+"):
if part[-2:] == "+?":
pattern.append({"ORTH": part[0], "OP": "+?"})
elif part[-2:] == "*?":
pattern.append({"ORTH": part[0], "OP": "*?"})
elif part.endswith("+"):
pattern.append({"ORTH": part[0], "OP": "+"})
elif part.endswith("*"):
pattern.append({"ORTH": part[0], "OP": "*"})
@ -762,13 +768,19 @@ def test_matcher_with_alignments_non_greedy(en_vocab):
(19, "aaab", "a{3} b", [[0, 0, 0, 1]]),
(20, "aaab", "a{2} b", [[0, 0, 1]]),
(21, "aaab", "a{2,3} b", [[0, 0, 1], [0, 0, 0, 1]]),
(22, "aaab", "a+? b", [[0, 0, 0, 1], [0, 0, 1], [0, 1]]),
(23, "aaab", "a*? b", [[0, 0, 0, 1], [0, 0, 1], [0, 1], [1]]),
]
for case_id, string, pattern_str, results in cases:
matcher = Matcher(en_vocab)
doc = Doc(matcher.vocab, words=list(string))
pattern = []
for part in pattern_str.split():
if part.endswith("+"):
if part[-2:] == "+?":
pattern.append({"ORTH": part[0], "OP": "+?"})
elif part[-2:] == "*?":
pattern.append({"ORTH": part[0], "OP": "*?"})
elif part.endswith("+"):
pattern.append({"ORTH": part[0], "OP": "+"})
elif part.endswith("*"):
pattern.append({"ORTH": part[0], "OP": "*"})
@ -786,3 +798,62 @@ def test_matcher_with_alignments_non_greedy(en_vocab):
for _, s, e, expected in matches:
assert expected in results, (case_id, string, pattern_str, s, e, n_matches)
assert len(expected) == e - s
def test_matcher_non_greedy_operator(en_vocab):
cases = [
(0, "aabbab", "a*? b", ["a a b", "a b", "b", "b", "a b", "b"]),
(1, "aabbab", "a+? b", ["a b", "a a b", "a b"]),
(2, "aabbab", "a*? b+?", ["a a b", "a b", "b", "b", "a b", "b"]),
(3, "aabbab", "a*? b*?", []),
(4, "aabbab", "b*? b*?", []),
(5, "aabbab", "b+? b+?", ["b b"]),
(6, "aabbab", "a* b*?", ["a", "a a", "a", "a"]),
(7, "aabbab", "a*? b*", ["a a b", "a b", "b", "a a b b", "a b b", "b b", "b", "a b", "b"]),
(8, "aabbc", "a* b*? c*?", ["a", "a a", 'a']),
(9, "aabbc", "a* b*? c", ["a a b b c", "a b b c", "b b c", "b c", "c"]),
(10, "abc", "a* b*? c*", ["a", "a b c", "b c", "c"]),
# in spaCy, quantifier "*" returns __all__possible__ matches which is different from regex
# in spaCy, quantifier "*?" is designed to return only the non-greedy results from all possible matches
# Result 1: a
# Result 2: a b c
# Result 3: c
# Among the 3 results, Result 2 might be contentious to some, but we argue that this should be the correct
# behaviour since 'a' and 'c' are matches thus the longest, first possible string "a b c"
# should be one of the results
(11, "aabbc", "a+? b*? c", ["a b b c", "a a b b c"]),
(12, "aabbc", "a+? b+? c", ["a b b c", "a a b b c"]),
(13, "abbc", "a* b*? c?", ["a", "a b b c", "b b c", "b c", "c"]),
(14, "aabbc", "d! b*? c", ["b c", "b b c", "a b b c"]),
(15, "abbxb", "a*? b+? c*", ["a b", "b", "b", "b"]),
(16, "abbcbc", "a*? b+? c*", ["a b", "b", "b", "b c", "a b b c", "b b c", "b", "b c"]),
(17, "abbcbc", "a*? b+? c", ["b c", "a b b c", "b b c", "b c"]),
]
for case_id, string, pattern_str, results in cases:
matcher = Matcher(en_vocab)
doc = Doc(matcher.vocab, words=list(string))
pattern = []
for part in pattern_str.split():
if part[-2:] == "+?":
pattern.append({"ORTH": part[0], "OP": "+?"})
elif part[-2:] == "*?":
pattern.append({"ORTH": part[0], "OP": "*?"})
elif part.endswith("+"):
pattern.append({"ORTH": part[0], "OP": "+"})
elif part.endswith("*"):
pattern.append({"ORTH": part[0], "OP": "*"})
elif part.endswith("?"):
pattern.append({"ORTH": part[0], "OP": "?"})
elif part.endswith("!"):
pattern.append({"ORTH": part[0], "OP": "!"})
else:
pattern.append({"ORTH": part})
matcher.add("PATTERN", [pattern])
matches = matcher(doc, as_spans=True)
assert len(matches) == len(results)
for i in range(len(matches)):
print(i)
assert matches[i].text == results[i]