mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			53 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			53 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| '''
 | |
| Test Matcher matches with '*' operator and Boolean flag
 | |
| '''
 | |
| from __future__ import unicode_literals
 | |
| from __future__ import print_function
 | |
| import pytest
 | |
| 
 | |
| from ...matcher import Matcher
 | |
| from ...vocab import Vocab
 | |
| from ...attrs import LOWER
 | |
| from ...tokens import Doc
 | |
| 
 | |
| 
 | |
def test_basic_case():
    '''A '*' operator on a LOWER-constrained token should let a single match
    cover the whole four-word document.'''
    def lowercase(string):
        return string.lower()

    matcher = Matcher(Vocab(lex_attr_getters={LOWER: lowercase}))
    # An always-true flag is registered on the vocab here too, although the
    # pattern below never refers to it.
    matcher.vocab.add_flag(lambda x: True)
    pattern = [
        {LOWER: "bob"},
        {'OP': '*', LOWER: 'and'},
        {LOWER: 'frank'},
    ]
    matcher.add_pattern("FarAway", pattern)
    doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
    matches = matcher(doc)
    assert len(matches) == 1
    # Each match unpacks into four fields; only the span boundaries are checked.
    _ent_id, _label, start, end = matches[0]
    assert (start, end) == (0, 4)
 | |
| 
 | |
| 
 | |
@pytest.mark.xfail
def test_issue850():
    '''Expected failure: the variable-length part of the pattern also matches
    the token that follows it, and that ambiguity is not resolved correctly.'''
    def lowercase(string):
        return string.lower()

    matcher = Matcher(Vocab(lex_attr_getters={LOWER: lowercase}))
    # Flag whose getter returns True for every input, used as the wildcard
    # token in the middle of the pattern.
    any_token_flag = matcher.vocab.add_flag(lambda x: True)
    pattern = [
        {LOWER: "bob"},
        {'OP': '*', any_token_flag: True},
        {LOWER: 'frank'},
    ]
    matcher.add_pattern("FarAway", pattern)
    doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
    matches = matcher(doc)
    assert len(matches) == 1
    # Each match unpacks into four fields; only the span boundaries are checked.
    _ent_id, _label, start, end = matches[0]
    assert (start, end) == (0, 4)
 |