# coding: utf-8
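"""Test greedy matching and token consumption of the Matcher's '*' operator
against Python's re module."""
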
from __future__ import unicode_literals

import re

import pytest

from ...matcher import Matcher


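# Each token pattern below mirrors the regex defined after it: in this file
# an 'OP' of '1' means exactly one occurrence of the token, and '*' means
# zero or more.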
pattern1 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}]
pattern2 = [{'ORTH': 'A', 'OP': '*'}, {'ORTH': 'A', 'OP': '1'}]
pattern3 = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '1'}]
pattern4 = [{'ORTH': 'B', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]
pattern5 = [{'ORTH': 'B', 'OP': '*'}, {'ORTH': 'A', 'OP': '*'}, {'ORTH': 'B', 'OP': '1'}]

re_pattern1 = 'AA*'
re_pattern2 = 'A*A'
re_pattern3 = 'AA'
re_pattern4 = 'BA*B'
re_pattern5 = 'B*A*B'


@pytest.fixture
def text():
    return "(ABBAAAAAB)."


@pytest.fixture
def doc(en_tokenizer, text):
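    # Space-join the characters so the tokenizer emits one token per
    # character, aligning token indices with character offsets in `text`.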
    doc = en_tokenizer(' '.join(text))
    return doc


@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern', [
    (pattern1, re_pattern1),
    (pattern2, re_pattern2),
    (pattern3, re_pattern3),
    (pattern4, re_pattern4),
    (pattern5, re_pattern5)])
def test_greedy_matching(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior of the * op is consistent
    with other re implementations."""
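    # Each match is a (match_id, start, end) tuple, so match[1:] is the
    # (start, end) token span; with one token per character it should equal
    # the character span reported by re.finditer.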
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    for match, re_match in zip(matches, re_matches):
        assert match[1:] == re_match


@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern', [
    (pattern1, re_pattern1),
    (pattern2, re_pattern2),
    (pattern3, re_pattern3),
    (pattern4, re_pattern4),
    (pattern5, re_pattern5)])
def test_match_consuming(doc, text, pattern, re_pattern):
    """Test that matcher.__call__ consumes tokens on a match,
    similarly to re.findall."""
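    # Matching should consume tokens left to right without overlaps, so the
    # match count should equal re.finditer's over the raw string.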
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    assert len(matches) == len(re_matches)