import re import pytest from spacy.attrs import IS_PUNCT, LOWER, ORTH from spacy.errors import MatchPatternError from spacy.lang.en import English from spacy.lang.lex_attrs import LEX_ATTRS from spacy.matcher import Matcher from spacy.tokens import Doc, Span, Token from spacy.vocab import Vocab pattern1 = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}] pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A"}] pattern3 = [{"ORTH": "A"}, {"ORTH": "A"}] pattern4 = [{"ORTH": "B"}, {"ORTH": "A", "OP": "*"}, {"ORTH": "B"}] pattern5 = [{"ORTH": "B", "OP": "*"}, {"ORTH": "A", "OP": "*"}, {"ORTH": "B"}] re_pattern1 = "AA*" re_pattern2 = "A*A" re_pattern3 = "AA" re_pattern4 = "BA*B" re_pattern5 = "B*A*B" longest1 = "A A A A A" longest2 = "A A A A A" longest3 = "A A" longest4 = "B A A A A A B" # "FIRST" would be "B B" longest5 = "B B A A A A A B" @pytest.fixture def text(): return "(BBAAAAAB)." @pytest.fixture def doc(en_tokenizer, text): doc = en_tokenizer(" ".join(text)) return doc @pytest.mark.issue(118) @pytest.mark.parametrize( "patterns", [ [[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]], ], ) def test_issue118(en_tokenizer, patterns): """Test a bug that arose from having overlapping matches""" text = ( "how many points did lebron james score against the boston celtics last night" ) doc = en_tokenizer(text) ORG = doc.vocab.strings["ORG"] matcher = Matcher(doc.vocab) matcher.add("BostonCeltics", patterns) assert len(list(doc.ents)) == 0 matches = [(ORG, start, end) for _, start, end in matcher(doc)] assert matches == [(ORG, 9, 11), (ORG, 10, 11)] doc.ents = matches[:1] ents = list(doc.ents) assert len(ents) == 1 assert ents[0].label == ORG assert ents[0].start == 9 assert ents[0].end == 11 @pytest.mark.issue(118) @pytest.mark.parametrize( "patterns", [ [[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]], ], ) def test_issue118_prefix_reorder(en_tokenizer, patterns): """Test a bug that arose from having overlapping matches""" text = ( "how many points did lebron james score against the boston celtics last night" ) doc = en_tokenizer(text) ORG = doc.vocab.strings["ORG"] matcher = Matcher(doc.vocab) matcher.add("BostonCeltics", patterns) assert len(list(doc.ents)) == 0 matches = [(ORG, start, end) for _, start, end in matcher(doc)] doc.ents += tuple(matches)[1:] assert matches == [(ORG, 9, 10), (ORG, 9, 11)] ents = doc.ents assert len(ents) == 1 assert ents[0].label == ORG assert ents[0].start == 9 assert ents[0].end == 11 @pytest.mark.issue(242) def test_issue242(en_tokenizer): """Test overlapping multi-word phrases.""" text = "There are different food safety standards in different countries." patterns = [ [{"LOWER": "food"}, {"LOWER": "safety"}], [{"LOWER": "safety"}, {"LOWER": "standards"}], ] doc = en_tokenizer(text) matcher = Matcher(doc.vocab) matcher.add("FOOD", patterns) matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)] match1, match2 = matches assert match1[1] == 3 assert match1[2] == 5 assert match2[1] == 4 assert match2[2] == 6 with pytest.raises(ValueError): # One token can only be part of one entity, so test that the matches # can't be added as entities doc.ents += tuple(matches) @pytest.mark.issue(587) def test_issue587(en_tokenizer): """Test that Matcher doesn't segfault on particular input""" doc = en_tokenizer("a b; c") matcher = Matcher(doc.vocab) matcher.add("TEST1", [[{ORTH: "a"}, {ORTH: "b"}]]) matches = matcher(doc) assert len(matches) == 1 matcher.add("TEST2", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]]) matches = matcher(doc) assert len(matches) == 2 matcher.add("TEST3", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]]) matches = matcher(doc) assert len(matches) == 2 @pytest.mark.issue(588) def test_issue588(en_vocab): """Test if empty specs still cause an error when adding patterns""" matcher = Matcher(en_vocab) with pytest.raises(ValueError): matcher.add("TEST", [[]]) @pytest.mark.issue(590) def test_issue590(en_vocab): """Test overlapping matches""" doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"]) matcher = Matcher(en_vocab) matcher.add( "ab", [[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}]] ) matcher.add("ab", [[{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]]) matches = matcher(doc) assert len(matches) == 2 @pytest.mark.issue(615) def test_issue615(en_tokenizer): def merge_phrases(matcher, doc, i, matches): """Merge a phrase. We have to be careful here because we'll change the token indices. To avoid problems, merge all the phrases once we're called on the last match.""" if i != len(matches) - 1: return None spans = [Span(doc, start, end, label=label) for label, start, end in matches] with doc.retokenize() as retokenizer: for span in spans: tag = "NNP" if span.label_ else span.root.tag_ attrs = {"tag": tag, "lemma": span.text} retokenizer.merge(span, attrs=attrs) doc.ents = doc.ents + (span,) text = "The golf club is broken" pattern = [{"ORTH": "golf"}, {"ORTH": "club"}] label = "Sport_Equipment" doc = en_tokenizer(text) matcher = Matcher(doc.vocab) matcher.add(label, [pattern], on_match=merge_phrases) matcher(doc) entities = list(doc.ents) assert entities != [] assert entities[0].label != 0 @pytest.mark.issue(850) def test_issue850(): """The variable-length pattern matches the succeeding token. Check we handle the ambiguity correctly.""" vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) matcher = Matcher(vocab) pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}] matcher.add("FarAway", [pattern]) doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) match = matcher(doc) assert len(match) == 1 ent_id, start, end = match[0] assert start == 0 assert end == 4 @pytest.mark.issue(850) def test_issue850_basic(): """Test Matcher matches with '*' operator and Boolean flag""" vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) matcher = Matcher(vocab) pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}] matcher.add("FarAway", [pattern]) doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) match = matcher(doc) assert len(match) == 1 ent_id, start, end = match[0] assert start == 0 assert end == 4 @pytest.mark.issue(1434) def test_issue1434(): """Test matches occur when optional element at end of short doc.""" pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}] vocab = Vocab(lex_attr_getters=LEX_ATTRS) hello_world = Doc(vocab, words=["Hello", "World"]) hello = Doc(vocab, words=["Hello"]) matcher = Matcher(vocab) matcher.add("MyMatcher", [pattern]) matches = matcher(hello_world) assert matches matches = matcher(hello) assert matches @pytest.mark.parametrize( "string,start,end", [ ("a", 0, 1), ("a b", 0, 2), ("a c", 0, 1), ("a b c", 0, 2), ("a b b c", 0, 3), ("a b b", 0, 3), ], ) @pytest.mark.issue(1450) def test_issue1450(string, start, end): """Test matcher works when patterns end with * operator.""" pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] matcher = Matcher(Vocab()) matcher.add("TSTEND", [pattern]) doc = Doc(Vocab(), words=string.split()) matches = matcher(doc) if start is None or end is None: assert matches == [] assert matches[-1][1] == start assert matches[-1][2] == end @pytest.mark.issue(1945) def test_issue1945(): """Test regression in Matcher introduced in v2.0.6.""" matcher = Matcher(Vocab()) matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]]) doc = Doc(matcher.vocab, words=["a", "a", "a"]) matches = matcher(doc) # we should see two overlapping matches here assert len(matches) == 2 assert matches[0][1:] == (0, 2) assert matches[1][1:] == (1, 3) @pytest.mark.issue(1971) def test_issue1971(en_vocab): # Possibly related to #2675 and #2671? matcher = Matcher(en_vocab) pattern = [ {"ORTH": "Doe"}, {"ORTH": "!", "OP": "?"}, {"_": {"optional": True}, "OP": "?"}, {"ORTH": "!", "OP": "?"}, ] Token.set_extension("optional", default=False) matcher.add("TEST", [pattern]) doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"]) # We could also assert length 1 here, but this is more conclusive, because # the real problem here is that it returns a duplicate match for a match_id # that's not actually in the vocab! matches = matcher(doc) assert all([match_id in en_vocab.strings for match_id, start, end in matches]) @pytest.mark.issue(1971) def test_issue_1971_2(en_vocab): matcher = Matcher(en_vocab) pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}] pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}] doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"]) matcher.add("TEST1", [pattern1, pattern2]) matches = matcher(doc) assert len(matches) == 2 @pytest.mark.issue(1971) def test_issue_1971_3(en_vocab): """Test that pattern matches correctly for multiple extension attributes.""" Token.set_extension("a", default=1, force=True) Token.set_extension("b", default=2, force=True) doc = Doc(en_vocab, words=["hello", "world"]) matcher = Matcher(en_vocab) matcher.add("A", [[{"_": {"a": 1}}]]) matcher.add("B", [[{"_": {"b": 2}}]]) matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc)) assert len(matches) == 4 assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)]) @pytest.mark.issue(1971) def test_issue_1971_4(en_vocab): """Test that pattern matches correctly with multiple extension attribute values on a single token. """ Token.set_extension("ext_a", default="str_a", force=True) Token.set_extension("ext_b", default="str_b", force=True) matcher = Matcher(en_vocab) doc = Doc(en_vocab, words=["this", "is", "text"]) pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3 matcher.add("TEST", [pattern]) matches = matcher(doc) # Uncommenting this caused a segmentation fault assert len(matches) == 1 assert matches[0] == (en_vocab.strings["TEST"], 0, 3) @pytest.mark.issue(2464) def test_issue2464(en_vocab): """Test problem with successive ?. This is the same bug, so putting it here.""" matcher = Matcher(en_vocab) doc = Doc(en_vocab, words=["a", "b"]) matcher.add("4", [[{"OP": "?"}, {"OP": "?"}]]) matches = matcher(doc) assert len(matches) == 3 @pytest.mark.issue(2569) def test_issue2569(en_tokenizer): """Test that operator + is greedy.""" doc = en_tokenizer("It is May 15, 1993.") doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])] matcher = Matcher(doc.vocab) matcher.add("RULE", [[{"ENT_TYPE": "DATE", "OP": "+"}]]) matched = [doc[start:end] for _, start, end in matcher(doc)] matched = sorted(matched, key=len, reverse=True) assert len(matched) == 10 assert len(matched[0]) == 4 assert matched[0].text == "May 15, 1993" @pytest.mark.issue(2671) def test_issue2671(): """Ensure the correct entity ID is returned for matches with quantifiers. See also #2675 """ nlp = English() matcher = Matcher(nlp.vocab) pattern_id = "test_pattern" pattern = [ {"LOWER": "high"}, {"IS_PUNCT": True, "OP": "?"}, {"LOWER": "adrenaline"}, ] matcher.add(pattern_id, [pattern]) doc1 = nlp("This is a high-adrenaline situation.") doc2 = nlp("This is a high adrenaline situation.") matches1 = matcher(doc1) for match_id, start, end in matches1: assert nlp.vocab.strings[match_id] == pattern_id matches2 = matcher(doc2) for match_id, start, end in matches2: assert nlp.vocab.strings[match_id] == pattern_id @pytest.mark.issue(3009) def test_issue3009(en_vocab): """Test problem with matcher quantifiers""" patterns = [ [{"ORTH": "has"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}], [ {"ORTH": "has"}, {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}, ], [ {"ORTH": "has"}, {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}, ], ] words = ["also", "has", "to", "do", "with"] tags = ["RB", "VBZ", "TO", "VB", "IN"] pos = ["ADV", "VERB", "ADP", "VERB", "ADP"] doc = Doc(en_vocab, words=words, tags=tags, pos=pos) matcher = Matcher(en_vocab) for i, pattern in enumerate(patterns): matcher.add(str(i), [pattern]) matches = matcher(doc) assert matches @pytest.mark.issue(3328) def test_issue3328(en_vocab): doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) matcher = Matcher(en_vocab) patterns = [ [{"LOWER": {"IN": ["hello", "how"]}}], [{"LOWER": {"IN": ["you", "doing"]}}], ] matcher.add("TEST", patterns) matches = matcher(doc) assert len(matches) == 4 matched_texts = [doc[start:end].text for _, start, end in matches] assert matched_texts == ["Hello", "how", "you", "doing"] @pytest.mark.issue(3549) def test_issue3549(en_vocab): """Test that match pattern validation doesn't raise on empty errors.""" matcher = Matcher(en_vocab, validate=True) pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] matcher.add("GOOD", [pattern]) with pytest.raises(MatchPatternError): matcher.add("BAD", [[{"X": "Y"}]]) @pytest.mark.skip("Matching currently only works on strings and integers") @pytest.mark.issue(3555) def test_issue3555(en_vocab): """Test that custom extensions with default None don't break matcher.""" Token.set_extension("issue3555", default=None) matcher = Matcher(en_vocab) pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}] matcher.add("TEST", [pattern]) doc = Doc(en_vocab, words=["have", "apple"]) matcher(doc) @pytest.mark.issue(3839) def test_issue3839(en_vocab): """Test that match IDs returned by the matcher are correct, are in the string""" doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) matcher = Matcher(en_vocab) match_id = "PATTERN" pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] matcher.add(match_id, [pattern1]) matches = matcher(doc) assert matches[0][0] == en_vocab.strings[match_id] matcher = Matcher(en_vocab) matcher.add(match_id, [pattern2]) matches = matcher(doc) assert matches[0][0] == en_vocab.strings[match_id] @pytest.mark.issue(3879) def test_issue3879(en_vocab): doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) assert len(doc) == 5 pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] matcher = Matcher(en_vocab) matcher.add("TEST", [pattern]) assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' @pytest.mark.issue(3951) def test_issue3951(en_vocab): """Test that combinations of optional rules are matched correctly.""" matcher = Matcher(en_vocab) pattern = [ {"LOWER": "hello"}, {"LOWER": "this", "OP": "?"}, {"OP": "?"}, {"LOWER": "world"}, ] matcher.add("TEST", [pattern]) doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) matches = matcher(doc) assert len(matches) == 0 @pytest.mark.issue(4120) def test_issue4120(en_vocab): """Test that matches without a final {OP: ?} token are returned.""" matcher = Matcher(en_vocab) matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) doc1 = Doc(en_vocab, words=["a"]) assert len(matcher(doc1)) == 1 # works doc2 = Doc(en_vocab, words=["a", "b", "c"]) assert len(matcher(doc2)) == 2 # fixed matcher = Matcher(en_vocab) matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) assert len(matcher(doc3)) == 2 # works matcher = Matcher(en_vocab) matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) assert len(matcher(doc4)) == 3 # fixed @pytest.mark.parametrize( "pattern,re_pattern", [ (pattern1, re_pattern1), (pattern2, re_pattern2), (pattern3, re_pattern3), (pattern4, re_pattern4), (pattern5, re_pattern5), ], ) def test_greedy_matching_first(doc, text, pattern, re_pattern): """Test that the greedy matching behavior "FIRST" is consistent with other re implementations.""" matcher = Matcher(doc.vocab) matcher.add(re_pattern, [pattern], greedy="FIRST") matches = matcher(doc) re_matches = [m.span() for m in re.finditer(re_pattern, text)] for (key, m_s, m_e), (re_s, re_e) in zip(matches, re_matches): # matching the string, not the exact position assert doc[m_s:m_e].text == doc[re_s:re_e].text @pytest.mark.parametrize( "pattern,longest", [ (pattern1, longest1), (pattern2, longest2), (pattern3, longest3), (pattern4, longest4), (pattern5, longest5), ], ) def test_greedy_matching_longest(doc, text, pattern, longest): """Test the "LONGEST" greedy matching behavior""" matcher = Matcher(doc.vocab) matcher.add("RULE", [pattern], greedy="LONGEST") matches = matcher(doc) for (key, s, e) in matches: assert doc[s:e].text == longest def test_greedy_matching_longest_first(en_tokenizer): """Test that "LONGEST" matching prefers the first of two equally long matches""" doc = en_tokenizer(" ".join("CCC")) matcher = Matcher(doc.vocab) pattern = [{"ORTH": "C"}, {"ORTH": "C"}] matcher.add("RULE", [pattern], greedy="LONGEST") matches = matcher(doc) # out of 0-2 and 1-3, the first should be picked assert len(matches) == 1 assert matches[0][1] == 0 assert matches[0][2] == 2 def test_invalid_greediness(doc, text): matcher = Matcher(doc.vocab) with pytest.raises(ValueError): matcher.add("RULE", [pattern1], greedy="GREEDY") @pytest.mark.parametrize( "pattern,re_pattern", [ (pattern1, re_pattern1), (pattern2, re_pattern2), (pattern3, re_pattern3), (pattern4, re_pattern4), (pattern5, re_pattern5), ], ) def test_match_consuming(doc, text, pattern, re_pattern): """Test that matcher.__call__ consumes tokens on a match similar to re.findall.""" matcher = Matcher(doc.vocab) matcher.add(re_pattern, [pattern], greedy="FIRST") matches = matcher(doc) re_matches = [m.span() for m in re.finditer(re_pattern, text)] assert len(matches) == len(re_matches) def test_operator_combos(en_vocab): cases = [ ("aaab", "a a a b", True), ("aaab", "a+ b", True), ("aaab", "a+ a+ b", True), ("aaab", "a+ a+ a b", True), ("aaab", "a+ a+ a+ b", True), ("aaab", "a+ a a b", True), ("aaab", "a+ a a", True), ("aaab", "a+", True), ("aaa", "a+ b", False), ("aaa", "a+ a+ b", False), ("aaa", "a+ a+ a+ b", False), ("aaa", "a+ a b", False), ("aaa", "a+ a a b", False), ("aaab", "a+ a a", True), ("aaab", "a+", True), ("aaab", "a+ a b", True), ] for string, pattern_str, result in cases: matcher = Matcher(en_vocab) doc = Doc(matcher.vocab, words=list(string)) pattern = [] for part in pattern_str.split(): if part.endswith("+"): pattern.append({"ORTH": part[0], "OP": "+"}) else: pattern.append({"ORTH": part}) matcher.add("PATTERN", [pattern]) matches = matcher(doc) if result: assert matches, (string, pattern_str) else: assert not matches, (string, pattern_str) @pytest.mark.issue(1450) def test_matcher_end_zero_plus(en_vocab): """Test matcher works when patterns end with * operator. (issue 1450)""" matcher = Matcher(en_vocab) pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] matcher.add("TSTEND", [pattern]) nlp = lambda string: Doc(matcher.vocab, words=string.split()) assert len(matcher(nlp("a"))) == 1 assert len(matcher(nlp("a b"))) == 2 assert len(matcher(nlp("a c"))) == 1 assert len(matcher(nlp("a b c"))) == 2 assert len(matcher(nlp("a b b c"))) == 3 assert len(matcher(nlp("a b b"))) == 3 def test_matcher_sets_return_correct_tokens(en_vocab): matcher = Matcher(en_vocab) patterns = [ [{"LOWER": {"IN": ["zero"]}}], [{"LOWER": {"IN": ["one"]}}], [{"LOWER": {"IN": ["two"]}}], ] matcher.add("TEST", patterns) doc = Doc(en_vocab, words="zero one two three".split()) matches = matcher(doc) texts = [Span(doc, s, e, label=L).text for L, s, e in matches] assert texts == ["zero", "one", "two"] @pytest.mark.filterwarnings("ignore:\\[W036") def test_matcher_remove(): nlp = English() matcher = Matcher(nlp.vocab) text = "This is a test case." pattern = [{"ORTH": "test"}, {"OP": "?"}] assert len(matcher) == 0 matcher.add("Rule", [pattern]) assert "Rule" in matcher # should give two matches results1 = matcher(nlp(text)) assert len(results1) == 2 # removing once should work matcher.remove("Rule") # should not return any maches anymore results2 = matcher(nlp(text)) assert len(results2) == 0 # removing again should throw an error with pytest.raises(ValueError): matcher.remove("Rule") def test_matcher_with_alignments_greedy_longest(en_vocab): cases = [ ("aaab", "a* b", [0, 0, 0, 1]), ("baab", "b a* b", [0, 1, 1, 2]), ("aaab", "a a a b", [0, 1, 2, 3]), ("aaab", "a+ b", [0, 0, 0, 1]), ("aaba", "a+ b a+", [0, 0, 1, 2]), ("aabaa", "a+ b a+", [0, 0, 1, 2, 2]), ("aaba", "a+ b a*", [0, 0, 1, 2]), ("aaaa", "a*", [0, 0, 0, 0]), ("baab", "b a* b b*", [0, 1, 1, 2]), ("aabb", "a* b* a*", [0, 0, 1, 1]), ("aaab", "a+ a+ a b", [0, 1, 2, 3]), ("aaab", "a+ a+ a+ b", [0, 1, 2, 3]), ("aaab", "a+ a a b", [0, 1, 2, 3]), ("aaab", "a+ a a", [0, 1, 2]), ("aaab", "a+ a a?", [0, 1, 2]), ("aaaa", "a a a a a?", [0, 1, 2, 3]), ("aaab", "a+ a b", [0, 0, 1, 2]), ("aaab", "a+ a+ b", [0, 0, 1, 2]), ("aaab", "a{2,} b", [0, 0, 0, 1]), ("aaab", "a{,3} b", [0, 0, 0, 1]), ("aaab", "a{2} b", [0, 0, 1]), ("aaab", "a{2,3} b", [0, 0, 0, 1]), ] for string, pattern_str, result in cases: matcher = Matcher(en_vocab) doc = Doc(matcher.vocab, words=list(string)) pattern = [] for part in pattern_str.split(): if part.endswith("+"): pattern.append({"ORTH": part[0], "OP": "+"}) elif part.endswith("*"): pattern.append({"ORTH": part[0], "OP": "*"}) elif part.endswith("?"): pattern.append({"ORTH": part[0], "OP": "?"}) elif part.endswith("}"): pattern.append({"ORTH": part[0], "OP": part[1:]}) else: pattern.append({"ORTH": part}) matcher.add("PATTERN", [pattern], greedy="LONGEST") matches = matcher(doc, with_alignments=True) n_matches = len(matches) _, s, e, expected = matches[0] assert expected == result, (string, pattern_str, s, e, n_matches) def test_matcher_with_alignments_non_greedy(en_vocab): cases = [ (0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]), (1, "baab", "b a* b", [[0, 1, 1, 2]]), (2, "aaab", "a a a b", [[0, 1, 2, 3]]), (3, "aaab", "a+ b", [[0, 1], [0, 0, 1], [0, 0, 0, 1]]), (4, "aaba", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2]]), ( 5, "aabaa", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2], [0, 0, 1, 2, 2], [0, 1, 2, 2]], ), (6, "aaba", "a+ b a*", [[0, 1], [0, 0, 1], [0, 0, 1, 2], [0, 1, 2]]), (7, "aaaa", "a*", [[0], [0, 0], [0, 0, 0], [0, 0, 0, 0]]), (8, "baab", "b a* b b*", [[0, 1, 1, 2]]), ( 9, "aabb", "a* b* a*", [[1], [2], [2, 2], [0, 1], [0, 0, 1], [0, 0, 1, 1], [0, 1, 1], [1, 1]], ), (10, "aaab", "a+ a+ a b", [[0, 1, 2, 3]]), (11, "aaab", "a+ a+ a+ b", [[0, 1, 2, 3]]), (12, "aaab", "a+ a a b", [[0, 1, 2, 3]]), (13, "aaab", "a+ a a", [[0, 1, 2]]), (14, "aaab", "a+ a a?", [[0, 1], [0, 1, 2]]), (15, "aaaa", "a a a a a?", [[0, 1, 2, 3]]), (16, "aaab", "a+ a b", [[0, 1, 2], [0, 0, 1, 2]]), (17, "aaab", "a+ a+ b", [[0, 1, 2], [0, 0, 1, 2]]), (18, "aaab", "a{2,} b", [[0, 0, 1], [0, 0, 0, 1]]), (19, "aaab", "a{3} b", [[0, 0, 0, 1]]), (20, "aaab", "a{2} b", [[0, 0, 1]]), (21, "aaab", "a{2,3} b", [[0, 0, 1], [0, 0, 0, 1]]), ] for case_id, string, pattern_str, results in cases: matcher = Matcher(en_vocab) doc = Doc(matcher.vocab, words=list(string)) pattern = [] for part in pattern_str.split(): if part.endswith("+"): pattern.append({"ORTH": part[0], "OP": "+"}) elif part.endswith("*"): pattern.append({"ORTH": part[0], "OP": "*"}) elif part.endswith("?"): pattern.append({"ORTH": part[0], "OP": "?"}) elif part.endswith("}"): pattern.append({"ORTH": part[0], "OP": part[1:]}) else: pattern.append({"ORTH": part}) matcher.add("PATTERN", [pattern]) matches = matcher(doc, with_alignments=True) n_matches = len(matches) for _, s, e, expected in matches: assert expected in results, (case_id, string, pattern_str, s, e, n_matches) assert len(expected) == e - s