diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py
index ce6f2d91e..ad6192c8f 100644
--- a/spacy/tests/test_matcher.py
+++ b/spacy/tests/test_matcher.py
@@ -107,6 +107,7 @@ def test_matcher_empty_dict(en_vocab):
     matches = matcher(doc)
     assert len(matches) == 1
     assert matches[0][1:] == (0, 3)
+    matcher = Matcher(en_vocab)
     matcher.add('A.', None, [{'ORTH': 'a'}, {}])
     matches = matcher(doc)
     assert matches[0][1:] == (0, 2)
@@ -175,3 +176,41 @@ def test_matcher_match_one_plus(matcher):
                                          {'ORTH': 'Philippe', 'OP': '+'}])
     m = matcher(doc)
     assert len(m) == 1
+
+
+def test_operator_combos(matcher):
+    """Check that '+' patterns combine with plain tokens as expected.
+
+    Each case is ``(text, pattern, expected)``. Every character of the text
+    becomes one token; each whitespace-separated pattern part matches that
+    character by ORTH, and a trailing '+' adds the '+' operator to the part.
+    """
+    cases = [
+        ('aaab', 'a a a b', True),
+        ('aaab', 'a+ b', True),
+        ('aaab', 'a+ a+ b', True),
+        ('aaab', 'a+ a+ a b', True),
+        ('aaab', 'a+ a+ a+ b', True),
+        ('aaab', 'a+ a a b', True),
+        ('aaab', 'a+ a a', True),
+        ('aaab', 'a+', True),
+        ('aaa', 'a+ b', False),
+        ('aaa', 'a+ a+ b', False),
+        ('aaa', 'a+ a+ a+ b', False),
+        ('aaa', 'a+ a b', False),
+        ('aaa', 'a+ a a b', False),
+        ('aaab', 'a+ a b', False),  # <-- This is the weird semantics
+    ]
+    vocab = matcher.vocab
+    for string, pattern_str, result in cases:
+        # Fresh Matcher per case, so only this case's pattern is registered.
+        case_matcher = Matcher(vocab)
+        doc = get_doc(vocab, words=list(string))
+        pattern = [
+            {'ORTH': part[0], 'OP': '+'} if part.endswith('+')
+            else {'ORTH': part}
+            for part in pattern_str.split()
+        ]
+        case_matcher.add('PATTERN', None, pattern)
+        matches = case_matcher(doc)
+        assert bool(matches) == result, (string, pattern_str)