mirror of https://github.com/explosion/spaCy.git, synced 2025-10-31 07:57:35 +03:00
# coding: utf-8
from __future__ import unicode_literals

import pytest
import re

from spacy.matcher import Matcher
from spacy.tokens import Doc, Span


# Each Matcher pattern below has a regex counterpart (re_pattern1-5) that the
# tests use as a reference implementation.
pattern1 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "*"}]
pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A", "OP": "1"}]
pattern3 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "1"}]
pattern4 = [
    {"ORTH": "B", "OP": "1"},
    {"ORTH": "A", "OP": "*"},
    {"ORTH": "B", "OP": "1"},
]
pattern5 = [
    {"ORTH": "B", "OP": "*"},
    {"ORTH": "A", "OP": "*"},
    {"ORTH": "B", "OP": "1"},
]

re_pattern1 = "AA*"
re_pattern2 = "A*A"
re_pattern3 = "AA"
re_pattern4 = "BA*B"
re_pattern5 = "B*A*B"


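# Added sketch, not part of the original suite: a stdlib-only sanity check of
# two of the regex references against the sample text used by the fixtures
# below. The function name is ours, not spaCy's.
def test_re_reference_sanity():
    sample = "(ABBAAAAAB)."
    # Greedy "A*" extends each match as far as possible.
    assert [m.group() for m in re.finditer(re_pattern1, sample)] == ["A", "AAAAA"]
    # "B*A*B" picks up "AB" first, then the long "BAAAAAB" run.
    assert [m.group() for m in re.finditer(re_pattern5, sample)] == ["AB", "BAAAAAB"]

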
@pytest.fixture
def text():
    return "(ABBAAAAAB)."


@pytest.fixture
def doc(en_tokenizer, text):
    # Space out the characters so each one becomes its own token; token i of
    # the doc then lines up with character i of the raw text.
    doc = en_tokenizer(" ".join(text))
    return doc


@pytest.mark.parametrize(
    "pattern,re_pattern",
    [
        pytest.param(pattern1, re_pattern1, marks=pytest.mark.xfail()),
        pytest.param(pattern2, re_pattern2, marks=pytest.mark.xfail()),
        pytest.param(pattern3, re_pattern3, marks=pytest.mark.xfail()),
        (pattern4, re_pattern4),
        pytest.param(pattern5, re_pattern5, marks=pytest.mark.xfail()),
    ],
)
def test_greedy_matching(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior of the * op is consistent with
    other re implementations."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    for match, re_match in zip(matches, re_matches):
        assert match[1:] == re_match


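# Added sketch, not part of the original file: why pattern4 is the one case
# above that is not xfail-ed. For "BA*B", re's leftmost-first scan matches the
# short "BB" at positions 2-4 and never the longer "BAAAAAB" starting at 3,
# and the Matcher's first reported span happens to agree. The function name is
# ours, not spaCy's.
def test_re_pattern4_leftmost_span():
    sample = "(ABBAAAAAB)."
    assert [m.span() for m in re.finditer(re_pattern4, sample)] == [(2, 4)]

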
@pytest.mark.xfail
@pytest.mark.parametrize(
    "pattern,re_pattern",
    [
        (pattern1, re_pattern1),
        (pattern2, re_pattern2),
        (pattern3, re_pattern3),
        (pattern4, re_pattern4),
        (pattern5, re_pattern5),
    ],
)
def test_match_consuming(doc, text, pattern, re_pattern):
    """Test that matcher.__call__ consumes tokens on a match similarly to
    re.findall."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    assert len(matches) == len(re_matches)


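# Added sketch, not part of the original file: what "consuming" means for the
# re reference. re.findall never returns overlapping matches, so "AA" occurs
# twice in "AAAAA"; the Matcher reports every candidate span instead (four
# two-token spans over five tokens), which is why the test above is xfail-ed.
# The function name and "AA" match key are ours, not spaCy's.
def test_re_consumes_overlaps(en_vocab):
    assert len(re.findall("AA", "AAAAA")) == 2
    matcher = Matcher(en_vocab)
    matcher.add("AA", None, [{"ORTH": "A"}, {"ORTH": "A"}])
    doc = Doc(en_vocab, words=list("AAAAA"))
    assert len(matcher(doc)) == 4

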
def test_operator_combos(en_vocab):
    # Each case is (token string, pattern shorthand, should it match?); "x+"
    # in the shorthand becomes an {"ORTH": "x", "OP": "+"} token pattern, as
    # spelled out in the sketch after this function.
    cases = [
        ("aaab", "a a a b", True),
        ("aaab", "a+ b", True),
        ("aaab", "a+ a+ b", True),
        ("aaab", "a+ a+ a b", True),
        ("aaab", "a+ a+ a+ b", True),
        ("aaab", "a+ a a b", True),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaa", "a+ b", False),
        ("aaa", "a+ a+ b", False),
        ("aaa", "a+ a+ a+ b", False),
        ("aaa", "a+ a b", False),
        ("aaa", "a+ a a b", False),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaab", "a+ a b", True),
    ]
    for string, pattern_str, result in cases:
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(string))
        pattern = []
        for part in pattern_str.split():
            if part.endswith("+"):
                pattern.append({"ORTH": part[0], "OP": "+"})
            else:
                pattern.append({"ORTH": part})
        matcher.add("PATTERN", None, pattern)
        matches = matcher(doc)
        if result:
            assert matches, (string, pattern_str)
        else:
            assert not matches, (string, pattern_str)


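# Added sketch, not part of the original file: the shorthand compilation from
# test_operator_combos, spelled out on one input. The function name is ours,
# not spaCy's.
def test_operator_combo_shorthand():
    pattern = []
    for part in "a+ b".split():
        if part.endswith("+"):
            pattern.append({"ORTH": part[0], "OP": "+"})
        else:
            pattern.append({"ORTH": part})
    assert pattern == [{"ORTH": "a", "OP": "+"}, {"ORTH": "b"}]

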
def test_matcher_end_zero_plus(en_vocab):
    """Test that the matcher works when patterns end with the * operator
    (issue 1450)."""
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
    matcher.add("TSTEND", None, pattern)
    nlp = lambda string: Doc(matcher.vocab, words=string.split())
    assert len(matcher(nlp("a"))) == 1
    assert len(matcher(nlp("a b"))) == 2
    assert len(matcher(nlp("a c"))) == 1
    assert len(matcher(nlp("a b c"))) == 2
    assert len(matcher(nlp("a b b c"))) == 3
    assert len(matcher(nlp("a b b"))) == 3


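# Added sketch, not part of the original file: the spans behind the final
# count above. An "a" followed by "b*" yields one match per expansion, i.e.
# the nested spans "a", "a b" and "a b b". The function name and "TSTEND2"
# key are ours, not spaCy's.
def test_matcher_end_zero_plus_spans(en_vocab):
    matcher = Matcher(en_vocab)
    matcher.add("TSTEND2", None, [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}])
    doc = Doc(en_vocab, words="a b b".split())
    assert sorted((start, end) for _, start, end in matcher(doc)) == [(0, 1), (0, 2), (0, 3)]

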
def test_matcher_sets_return_correct_tokens(en_vocab):
    matcher = Matcher(en_vocab)
    patterns = [
        [{"LOWER": {"IN": ["zero"]}}],
        [{"LOWER": {"IN": ["one"]}}],
        [{"LOWER": {"IN": ["two"]}}],
    ]
    matcher.add("TEST", None, *patterns)
    doc = Doc(en_vocab, words="zero one two three".split())
    matches = matcher(doc)
    texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
    assert texts == ["zero", "one", "two"]
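

# Added sketch, not part of the original file: the three one-word "IN" sets
# above can be collapsed into a single set-membership pattern with the same
# effect, one match per listed word. The function name and "TEST_SET" key are
# ours, not spaCy's.
def test_matcher_single_set_pattern(en_vocab):
    matcher = Matcher(en_vocab)
    matcher.add("TEST_SET", None, [{"LOWER": {"IN": ["zero", "one", "two"]}}])
    doc = Doc(en_vocab, words="zero one two three".split())
    assert len(matcher(doc)) == 3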