Added test cases to test_matcher_api.py and test_matcher_logic.py

2025-09-17 09:32:42 +03:00 · 2022-07-02 01:22:55 +08:00 · 2022-07-02 01:22:55 +08:00 · f5fa0d0bad
commit f5fa0d0bad
parent 616bc51743
2 changed files with 85 additions and 2 deletions
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@ -715,3 +715,15 @@ def test_matcher_min_max_operator(en_vocab):
    matcher.add("TEST", [pattern])
    matches4 = [doc[start:end].text for _, start, end in matcher(doc)]
    assert len(matches4) == 4
+
+
+def test_matcher_non_greedy_operator(en_vocab):
+
+    doc = Doc(
+        en_vocab, words=["a", "a", "a", "B", "B", "a", "B"]
+    )
+    matcher = Matcher(en_vocab)
+    pattern = [{"ORTH": "a", "OP": "*?"}, {"ORTH": "B", "OP": "+?"}]
+    matcher.add("Non-greedy", [pattern])
+    matches = [doc[start:end].text for _, start, end in matcher(doc)]
+    assert len(matches) == 7
--- a/spacy/tests/matcher/test_matcher_logic.py
+++ b/spacy/tests/matcher/test_matcher_logic.py
@ -703,13 +703,19 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
        ("aaab", "a{,3} b", [0, 0, 0, 1]),
        ("aaab", "a{2} b", [0, 0, 1]),
        ("aaab", "a{2,3} b", [0, 0, 0, 1]),
+        ("aaab", "a+? b", [0, 0, 0, 1]),
+        ("aaab", "a*? b", [0, 0, 0, 1]),
    ]
    for string, pattern_str, result in cases:
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(string))
        pattern = []
        for part in pattern_str.split():
-            if part.endswith("+"):
+            if part[-2:] == "+?":
+                pattern.append({"ORTH": part[0], "OP": "+?"})
+            elif part[-2:] == "*?":
+                pattern.append({"ORTH": part[0], "OP": "*?"})
+            elif part.endswith("+"):
                pattern.append({"ORTH": part[0], "OP": "+"})
            elif part.endswith("*"):
                pattern.append({"ORTH": part[0], "OP": "*"})
@ -762,13 +768,19 @@ def test_matcher_with_alignments_non_greedy(en_vocab):
        (19, "aaab", "a{3} b", [[0, 0, 0, 1]]),
        (20, "aaab", "a{2} b", [[0, 0, 1]]),
        (21, "aaab", "a{2,3} b", [[0, 0, 1], [0, 0, 0, 1]]),
+        (22, "aaab", "a+? b", [[0, 0, 0, 1], [0, 0, 1], [0, 1]]),
+        (23, "aaab", "a*? b", [[0, 0, 0, 1], [0, 0, 1], [0, 1], [1]]),
    ]
    for case_id, string, pattern_str, results in cases:
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(string))
        pattern = []
        for part in pattern_str.split():
-            if part.endswith("+"):
+            if part[-2:] == "+?":
+                pattern.append({"ORTH": part[0], "OP": "+?"})
+            elif part[-2:] == "*?":
+                pattern.append({"ORTH": part[0], "OP": "*?"})
+            elif part.endswith("+"):
                pattern.append({"ORTH": part[0], "OP": "+"})
            elif part.endswith("*"):
                pattern.append({"ORTH": part[0], "OP": "*"})
@ -786,3 +798,62 @@ def test_matcher_with_alignments_non_greedy(en_vocab):
        for _, s, e, expected in matches:
            assert expected in results, (case_id, string, pattern_str, s, e, n_matches)
            assert len(expected) == e - s
+
+
+def test_matcher_non_greedy_operator(en_vocab):
+    cases = [
+        (0, "aabbab", "a*? b", ["a a b", "a b", "b", "b", "a b", "b"]),
+        (1, "aabbab", "a+? b", ["a b", "a a b", "a b"]),
+        (2, "aabbab", "a*? b+?", ["a a b", "a b", "b", "b", "a b", "b"]),
+        (3, "aabbab", "a*? b*?", []),
+        (4, "aabbab", "b*? b*?", []),
+        (5, "aabbab", "b+? b+?", ["b b"]),
+        (6, "aabbab", "a* b*?", ["a", "a a", "a", "a"]),
+        (7, "aabbab", "a*? b*", ["a a b", "a b", "b", "a a b b", "a b b", "b b", "b", "a b", "b"]),
+        (8, "aabbc", "a* b*? c*?", ["a", "a a", 'a']),
+        (9, "aabbc", "a* b*? c", ["a a b b c", "a b b c", "b b c", "b c", "c"]),
+
+        (10, "abc", "a* b*? c*", ["a", "a b c", "b c", "c"]),
+        # in spaCy, quantifier "*" returns __all__possible__ matches which is different from regex
+        # in spaCy, quantifier "*?" is designed to return only the non-greedy results from all possible matches
+        # Result 1: a
+        # Result 2: a b c
+        # Result 3: c
+        # Among the 3 results, Result 2 might be contentious to some, but we argue that this should be the correct
+        # behaviour since 'a' and 'c' are matches thus the longest, first possible string "a b c"
+        # should be one of the results
+
+        (11, "aabbc", "a+? b*? c", ["a b b c", "a a b b c"]),
+        (12, "aabbc", "a+? b+? c", ["a b b c", "a a b b c"]),
+        (13, "abbc", "a* b*? c?", ["a", "a b b c", "b b c", "b c", "c"]),
+        (14, "aabbc", "d! b*? c", ["b c", "b b c", "a b b c"]),
+        (15, "abbxb", "a*? b+? c*", ["a b", "b", "b", "b"]),
+        (16, "abbcbc", "a*? b+? c*", ["a b", "b", "b", "b c", "a b b c",  "b b c", "b", "b c"]),
+        (17, "abbcbc", "a*? b+? c", ["b c", "a b b c", "b b c", "b c"]),
+
+    ]
+    for case_id, string, pattern_str, results in cases:
+        matcher = Matcher(en_vocab)
+        doc = Doc(matcher.vocab, words=list(string))
+        pattern = []
+        for part in pattern_str.split():
+            if part[-2:] == "+?":
+                pattern.append({"ORTH": part[0], "OP": "+?"})
+            elif part[-2:] == "*?":
+                pattern.append({"ORTH": part[0], "OP": "*?"})
+            elif part.endswith("+"):
+                pattern.append({"ORTH": part[0], "OP": "+"})
+            elif part.endswith("*"):
+                pattern.append({"ORTH": part[0], "OP": "*"})
+            elif part.endswith("?"):
+                pattern.append({"ORTH": part[0], "OP": "?"})
+            elif part.endswith("!"):
+                pattern.append({"ORTH": part[0], "OP": "!"})
+            else:
+                pattern.append({"ORTH": part})
+        matcher.add("PATTERN", [pattern])
+        matches = matcher(doc, as_spans=True)
+        assert len(matches) == len(results)
+        for i in range(len(matches)):
+            print(i)
+            assert matches[i].text == results[i]