mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-07 21:54:54 +03:00
Added test cases to test_matcher_api.py and test_matcher_logic.py
This commit is contained in:
parent
616bc51743
commit
f5fa0d0bad
|
@ -715,3 +715,15 @@ def test_matcher_min_max_operator(en_vocab):
|
|||
matcher.add("TEST", [pattern])
|
||||
matches4 = [doc[start:end].text for _, start, end in matcher(doc)]
|
||||
assert len(matches4) == 4
|
||||
|
||||
|
||||
def test_matcher_non_greedy_operator(en_vocab):
|
||||
|
||||
doc = Doc(
|
||||
en_vocab, words=["a", "a", "a", "B", "B", "a", "B"]
|
||||
)
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"ORTH": "a", "OP": "*?"}, {"ORTH": "B", "OP": "+?"}]
|
||||
matcher.add("Non-greedy", [pattern])
|
||||
matches = [doc[start:end].text for _, start, end in matcher(doc)]
|
||||
assert len(matches) == 7
|
||||
|
|
|
@ -703,13 +703,19 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
|
|||
("aaab", "a{,3} b", [0, 0, 0, 1]),
|
||||
("aaab", "a{2} b", [0, 0, 1]),
|
||||
("aaab", "a{2,3} b", [0, 0, 0, 1]),
|
||||
("aaab", "a+? b", [0, 0, 0, 1]),
|
||||
("aaab", "a*? b", [0, 0, 0, 1]),
|
||||
]
|
||||
for string, pattern_str, result in cases:
|
||||
matcher = Matcher(en_vocab)
|
||||
doc = Doc(matcher.vocab, words=list(string))
|
||||
pattern = []
|
||||
for part in pattern_str.split():
|
||||
if part.endswith("+"):
|
||||
if part[-2:] == "+?":
|
||||
pattern.append({"ORTH": part[0], "OP": "+?"})
|
||||
elif part[-2:] == "*?":
|
||||
pattern.append({"ORTH": part[0], "OP": "*?"})
|
||||
elif part.endswith("+"):
|
||||
pattern.append({"ORTH": part[0], "OP": "+"})
|
||||
elif part.endswith("*"):
|
||||
pattern.append({"ORTH": part[0], "OP": "*"})
|
||||
|
@ -762,13 +768,19 @@ def test_matcher_with_alignments_non_greedy(en_vocab):
|
|||
(19, "aaab", "a{3} b", [[0, 0, 0, 1]]),
|
||||
(20, "aaab", "a{2} b", [[0, 0, 1]]),
|
||||
(21, "aaab", "a{2,3} b", [[0, 0, 1], [0, 0, 0, 1]]),
|
||||
(22, "aaab", "a+? b", [[0, 0, 0, 1], [0, 0, 1], [0, 1]]),
|
||||
(23, "aaab", "a*? b", [[0, 0, 0, 1], [0, 0, 1], [0, 1], [1]]),
|
||||
]
|
||||
for case_id, string, pattern_str, results in cases:
|
||||
matcher = Matcher(en_vocab)
|
||||
doc = Doc(matcher.vocab, words=list(string))
|
||||
pattern = []
|
||||
for part in pattern_str.split():
|
||||
if part.endswith("+"):
|
||||
if part[-2:] == "+?":
|
||||
pattern.append({"ORTH": part[0], "OP": "+?"})
|
||||
elif part[-2:] == "*?":
|
||||
pattern.append({"ORTH": part[0], "OP": "*?"})
|
||||
elif part.endswith("+"):
|
||||
pattern.append({"ORTH": part[0], "OP": "+"})
|
||||
elif part.endswith("*"):
|
||||
pattern.append({"ORTH": part[0], "OP": "*"})
|
||||
|
@ -786,3 +798,62 @@ def test_matcher_with_alignments_non_greedy(en_vocab):
|
|||
for _, s, e, expected in matches:
|
||||
assert expected in results, (case_id, string, pattern_str, s, e, n_matches)
|
||||
assert len(expected) == e - s
|
||||
|
||||
|
||||
def test_matcher_non_greedy_operator(en_vocab):
|
||||
cases = [
|
||||
(0, "aabbab", "a*? b", ["a a b", "a b", "b", "b", "a b", "b"]),
|
||||
(1, "aabbab", "a+? b", ["a b", "a a b", "a b"]),
|
||||
(2, "aabbab", "a*? b+?", ["a a b", "a b", "b", "b", "a b", "b"]),
|
||||
(3, "aabbab", "a*? b*?", []),
|
||||
(4, "aabbab", "b*? b*?", []),
|
||||
(5, "aabbab", "b+? b+?", ["b b"]),
|
||||
(6, "aabbab", "a* b*?", ["a", "a a", "a", "a"]),
|
||||
(7, "aabbab", "a*? b*", ["a a b", "a b", "b", "a a b b", "a b b", "b b", "b", "a b", "b"]),
|
||||
(8, "aabbc", "a* b*? c*?", ["a", "a a", 'a']),
|
||||
(9, "aabbc", "a* b*? c", ["a a b b c", "a b b c", "b b c", "b c", "c"]),
|
||||
|
||||
(10, "abc", "a* b*? c*", ["a", "a b c", "b c", "c"]),
|
||||
# in spaCy, quantifier "*" returns __all__possible__ matches which is different from regex
|
||||
# in spaCy, quantifier "*?" is designed to return only the non-greedy results from all possible matches
|
||||
# Result 1: a
|
||||
# Result 2: a b c
|
||||
# Result 3: c
|
||||
# Among the 3 results, Result 2 might be contentious to some, but we argue that this should be the correct
|
||||
# behaviour since 'a' and 'c' are matches thus the longest, first possible string "a b c"
|
||||
# should be one of the results
|
||||
|
||||
(11, "aabbc", "a+? b*? c", ["a b b c", "a a b b c"]),
|
||||
(12, "aabbc", "a+? b+? c", ["a b b c", "a a b b c"]),
|
||||
(13, "abbc", "a* b*? c?", ["a", "a b b c", "b b c", "b c", "c"]),
|
||||
(14, "aabbc", "d! b*? c", ["b c", "b b c", "a b b c"]),
|
||||
(15, "abbxb", "a*? b+? c*", ["a b", "b", "b", "b"]),
|
||||
(16, "abbcbc", "a*? b+? c*", ["a b", "b", "b", "b c", "a b b c", "b b c", "b", "b c"]),
|
||||
(17, "abbcbc", "a*? b+? c", ["b c", "a b b c", "b b c", "b c"]),
|
||||
|
||||
]
|
||||
for case_id, string, pattern_str, results in cases:
|
||||
matcher = Matcher(en_vocab)
|
||||
doc = Doc(matcher.vocab, words=list(string))
|
||||
pattern = []
|
||||
for part in pattern_str.split():
|
||||
if part[-2:] == "+?":
|
||||
pattern.append({"ORTH": part[0], "OP": "+?"})
|
||||
elif part[-2:] == "*?":
|
||||
pattern.append({"ORTH": part[0], "OP": "*?"})
|
||||
elif part.endswith("+"):
|
||||
pattern.append({"ORTH": part[0], "OP": "+"})
|
||||
elif part.endswith("*"):
|
||||
pattern.append({"ORTH": part[0], "OP": "*"})
|
||||
elif part.endswith("?"):
|
||||
pattern.append({"ORTH": part[0], "OP": "?"})
|
||||
elif part.endswith("!"):
|
||||
pattern.append({"ORTH": part[0], "OP": "!"})
|
||||
else:
|
||||
pattern.append({"ORTH": part})
|
||||
matcher.add("PATTERN", [pattern])
|
||||
matches = matcher(doc, as_spans=True)
|
||||
assert len(matches) == len(results)
|
||||
for i in range(len(matches)):
|
||||
print(i)
|
||||
assert matches[i].text == results[i]
|
||||
|
|
Loading…
Reference in New Issue
Block a user