Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-30 15:37:29 +03:00
Support match alignments

* Change naming from match_alignments to with_alignments, add conditional flow if with_alignments is given, validate with_alignments, add related test case
* Remove added errors, utilize bint type, clean up whitespace
* Fix missing newline at end of file
* Minor formatting
* Skip alignments processing if as_spans is set
* Add with_alignments to Matcher API docs
* Update website/docs/api/matcher.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
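As context for the change above, here is a minimal usage sketch of the with_alignments flag (the rule name "RULE" and the example text are illustrative; the call signature and alignment output follow the test cases in this file). With with_alignments=True, each match is returned as a 4-tuple whose last element maps every matched token to the index of the pattern token that matched it:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# one or more "a" tokens followed by a "b" token
matcher.add("RULE", [[{"ORTH": "a", "OP": "+"}, {"ORTH": "b"}]], greedy="LONGEST")

doc = nlp("a a a b")
for match_id, start, end, alignments in matcher(doc, with_alignments=True):
    # alignments == [0, 0, 0, 1]: the three "a" tokens were matched by
    # pattern token 0 and the "b" token by pattern token 1
    print(doc[start:end].text, alignments)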
		
			
				
	
	
		
294 lines · 9.7 KiB · Python
import pytest
import re

from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span


pattern1 = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}]
pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A"}]
pattern3 = [{"ORTH": "A"}, {"ORTH": "A"}]
pattern4 = [{"ORTH": "B"}, {"ORTH": "A", "OP": "*"}, {"ORTH": "B"}]
pattern5 = [{"ORTH": "B", "OP": "*"}, {"ORTH": "A", "OP": "*"}, {"ORTH": "B"}]

re_pattern1 = "AA*"
re_pattern2 = "A*A"
re_pattern3 = "AA"
re_pattern4 = "BA*B"
re_pattern5 = "B*A*B"

longest1 = "A A A A A"
longest2 = "A A A A A"
longest3 = "A A"
longest4 = "B A A A A A B"  # "FIRST" would be "B B"
longest5 = "B B A A A A A B"


@pytest.fixture
def text():
    return "(BBAAAAAB)."


@pytest.fixture
def doc(en_tokenizer, text):
    doc = en_tokenizer(" ".join(text))
    return doc


@pytest.mark.parametrize(
    "pattern,re_pattern",
    [
        (pattern1, re_pattern1),
        (pattern2, re_pattern2),
        (pattern3, re_pattern3),
        (pattern4, re_pattern4),
        (pattern5, re_pattern5),
    ],
)
def test_greedy_matching_first(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior "FIRST" is consistent with
    other re implementations."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, [pattern], greedy="FIRST")
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    for (key, m_s, m_e), (re_s, re_e) in zip(matches, re_matches):
        # matching the string, not the exact position
        assert doc[m_s:m_e].text == doc[re_s:re_e].text


@pytest.mark.parametrize(
    "pattern,longest",
    [
        (pattern1, longest1),
        (pattern2, longest2),
        (pattern3, longest3),
        (pattern4, longest4),
        (pattern5, longest5),
    ],
)
def test_greedy_matching_longest(doc, text, pattern, longest):
    """Test the "LONGEST" greedy matching behavior."""
    matcher = Matcher(doc.vocab)
    matcher.add("RULE", [pattern], greedy="LONGEST")
    matches = matcher(doc)
    for (key, s, e) in matches:
        assert doc[s:e].text == longest


def test_greedy_matching_longest_first(en_tokenizer):
    """Test that "LONGEST" matching prefers the first of two equally long matches."""
    doc = en_tokenizer(" ".join("CCC"))
    matcher = Matcher(doc.vocab)
    pattern = [{"ORTH": "C"}, {"ORTH": "C"}]
    matcher.add("RULE", [pattern], greedy="LONGEST")
    matches = matcher(doc)
    # out of 0-2 and 1-3, the first should be picked
    assert len(matches) == 1
    assert matches[0][1] == 0
    assert matches[0][2] == 2


def test_invalid_greediness(doc, text):
    """Test that an invalid greedy value raises a ValueError."""
    matcher = Matcher(doc.vocab)
    with pytest.raises(ValueError):
        matcher.add("RULE", [pattern1], greedy="GREEDY")


@pytest.mark.parametrize(
    "pattern,re_pattern",
    [
        (pattern1, re_pattern1),
        (pattern2, re_pattern2),
        (pattern3, re_pattern3),
        (pattern4, re_pattern4),
        (pattern5, re_pattern5),
    ],
)
def test_match_consuming(doc, text, pattern, re_pattern):
    """Test that matcher.__call__ consumes tokens on a match similar to
    re.findall."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, [pattern], greedy="FIRST")
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    assert len(matches) == len(re_matches)


def test_operator_combos(en_vocab):
    """Test matching with combinations of required and one-or-more (+) tokens."""
    cases = [
        ("aaab", "a a a b", True),
        ("aaab", "a+ b", True),
        ("aaab", "a+ a+ b", True),
        ("aaab", "a+ a+ a b", True),
        ("aaab", "a+ a+ a+ b", True),
        ("aaab", "a+ a a b", True),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaa", "a+ b", False),
        ("aaa", "a+ a+ b", False),
        ("aaa", "a+ a+ a+ b", False),
        ("aaa", "a+ a b", False),
        ("aaa", "a+ a a b", False),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaab", "a+ a b", True),
    ]
    for string, pattern_str, result in cases:
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(string))
        pattern = []
        for part in pattern_str.split():
            if part.endswith("+"):
                pattern.append({"ORTH": part[0], "OP": "+"})
            else:
                pattern.append({"ORTH": part})
        matcher.add("PATTERN", [pattern])
        matches = matcher(doc)
        if result:
            assert matches, (string, pattern_str)
        else:
            assert not matches, (string, pattern_str)


def test_matcher_end_zero_plus(en_vocab):
    """Test matcher works when patterns end with * operator. (issue 1450)"""
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
    matcher.add("TSTEND", [pattern])
    nlp = lambda string: Doc(matcher.vocab, words=string.split())
    assert len(matcher(nlp("a"))) == 1
    assert len(matcher(nlp("a b"))) == 2
    assert len(matcher(nlp("a c"))) == 1
    assert len(matcher(nlp("a b c"))) == 2
    assert len(matcher(nlp("a b b c"))) == 3
    assert len(matcher(nlp("a b b"))) == 3


def test_matcher_sets_return_correct_tokens(en_vocab):
    matcher = Matcher(en_vocab)
    patterns = [
        [{"LOWER": {"IN": ["zero"]}}],
        [{"LOWER": {"IN": ["one"]}}],
        [{"LOWER": {"IN": ["two"]}}],
    ]
    matcher.add("TEST", patterns)
    doc = Doc(en_vocab, words="zero one two three".split())
    matches = matcher(doc)
    texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
    assert texts == ["zero", "one", "two"]


def test_matcher_remove():
    nlp = English()
    matcher = Matcher(nlp.vocab)
    text = "This is a test case."

    pattern = [{"ORTH": "test"}, {"OP": "?"}]
    assert len(matcher) == 0
    matcher.add("Rule", [pattern])
    assert "Rule" in matcher

    # should give two matches
    results1 = matcher(nlp(text))
    assert len(results1) == 2

    # removing once should work
    matcher.remove("Rule")

    # should not return any matches anymore
    results2 = matcher(nlp(text))
    assert len(results2) == 0

    # removing again should raise an error
    with pytest.raises(ValueError):
        matcher.remove("Rule")


def test_matcher_with_alignments_greedy_longest(en_vocab):
    """Test that with_alignments returns the expected alignment for the
    single "LONGEST" match of each pattern."""
    cases = [
        ("aaab", "a* b", [0, 0, 0, 1]),
        ("baab", "b a* b", [0, 1, 1, 2]),
        ("aaab", "a a a b", [0, 1, 2, 3]),
        ("aaab", "a+ b", [0, 0, 0, 1]),
        ("aaba", "a+ b a+", [0, 0, 1, 2]),
        ("aabaa", "a+ b a+", [0, 0, 1, 2, 2]),
        ("aaba", "a+ b a*", [0, 0, 1, 2]),
        ("aaaa", "a*", [0, 0, 0, 0]),
        ("baab", "b a* b b*", [0, 1, 1, 2]),
        ("aabb", "a* b* a*", [0, 0, 1, 1]),
        ("aaab", "a+ a+ a b", [0, 1, 2, 3]),
        ("aaab", "a+ a+ a+ b", [0, 1, 2, 3]),
        ("aaab", "a+ a a b", [0, 1, 2, 3]),
        ("aaab", "a+ a a", [0, 1, 2]),
        ("aaab", "a+ a a?", [0, 1, 2]),
        ("aaaa", "a a a a a?", [0, 1, 2, 3]),
        ("aaab", "a+ a b", [0, 0, 1, 2]),
        ("aaab", "a+ a+ b", [0, 0, 1, 2]),
    ]
    for string, pattern_str, result in cases:
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(string))
        pattern = []
        for part in pattern_str.split():
            if part.endswith("+"):
                pattern.append({"ORTH": part[0], "OP": "+"})
            elif part.endswith("*"):
                pattern.append({"ORTH": part[0], "OP": "*"})
            elif part.endswith("?"):
                pattern.append({"ORTH": part[0], "OP": "?"})
            else:
                pattern.append({"ORTH": part})
        matcher.add("PATTERN", [pattern], greedy="LONGEST")
        matches = matcher(doc, with_alignments=True)
        n_matches = len(matches)

        _, s, e, expected = matches[0]

        assert expected == result, (string, pattern_str, s, e, n_matches)


def test_matcher_with_alignments_nongreedy(en_vocab):
    """Test that with_alignments returns one of the expected alignments for
    every (non-greedy) match."""
    cases = [
        (0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]),
        (1, "baab", "b a* b", [[0, 1, 1, 2]]),
        (2, "aaab", "a a a b", [[0, 1, 2, 3]]),
        (3, "aaab", "a+ b", [[0, 1], [0, 0, 1], [0, 0, 0, 1]]),
        (4, "aaba", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2]]),
        (5, "aabaa", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2], [0, 0, 1, 2, 2], [0, 1, 2, 2]]),
        (6, "aaba", "a+ b a*", [[0, 1], [0, 0, 1], [0, 0, 1, 2], [0, 1, 2]]),
        (7, "aaaa", "a*", [[0], [0, 0], [0, 0, 0], [0, 0, 0, 0]]),
        (8, "baab", "b a* b b*", [[0, 1, 1, 2]]),
        (9, "aabb", "a* b* a*", [[1], [2], [2, 2], [0, 1], [0, 0, 1], [0, 0, 1, 1], [0, 1, 1], [1, 1]]),
        (10, "aaab", "a+ a+ a b", [[0, 1, 2, 3]]),
        (11, "aaab", "a+ a+ a+ b", [[0, 1, 2, 3]]),
        (12, "aaab", "a+ a a b", [[0, 1, 2, 3]]),
        (13, "aaab", "a+ a a", [[0, 1, 2]]),
        (14, "aaab", "a+ a a?", [[0, 1], [0, 1, 2]]),
        (15, "aaaa", "a a a a a?", [[0, 1, 2, 3]]),
        (16, "aaab", "a+ a b", [[0, 1, 2], [0, 0, 1, 2]]),
        (17, "aaab", "a+ a+ b", [[0, 1, 2], [0, 0, 1, 2]]),
    ]
    for case_id, string, pattern_str, results in cases:
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(string))
        pattern = []
        for part in pattern_str.split():
            if part.endswith("+"):
                pattern.append({"ORTH": part[0], "OP": "+"})
            elif part.endswith("*"):
                pattern.append({"ORTH": part[0], "OP": "*"})
            elif part.endswith("?"):
                pattern.append({"ORTH": part[0], "OP": "?"})
            else:
                pattern.append({"ORTH": part})

        matcher.add("PATTERN", [pattern])
        matches = matcher(doc, with_alignments=True)
        n_matches = len(matches)

        for _, s, e, expected in matches:
            assert expected in results, (case_id, string, pattern_str, s, e, n_matches)
            assert len(expected) == e - s