mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Don't add duplicate patterns (fix #8216) * Refactor EntityRuler init This simplifies the EntityRuler init code. This is helpful as prep for allowing the EntityRuler to reset itself. * Make EntityRuler.clear reset matchers Includes a new test for this. * Tidy PhraseMatcher instantiation Since the attr can be None safely now, the guard if is no longer required here. Also renamed the `_validate` attr. Maybe it's not needed? * Fix NER test * Add test to make sure patterns aren't increasing * Move test to regression tests
		
			
				
	
	
		
			35 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			35 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import pytest
 | |
| 
 | |
| from spacy import registry
 | |
| from spacy.language import Language
 | |
| from spacy.pipeline import EntityRuler
 | |
| 
 | |
| 
 | |
@pytest.fixture
def nlp():
    """Provide a newly constructed ``Language`` object to the requesting test."""
    pipeline = Language()
    return pipeline
 | |
| 
 | |
| 
 | |
@pytest.fixture
@registry.misc("entity_ruler_patterns")
def patterns():
    """Return the sample entity-ruler patterns used by the tests.

    The list mixes string patterns and token-attribute patterns; the last
    two entries additionally carry an ``id``. The function is also
    registered under ``entity_ruler_patterns`` in ``registry.misc``.
    """
    # String patterns without an id.
    hello_phrase = {"label": "HELLO", "pattern": "hello world"}
    # Patterns expressed as lists of token-attribute dicts.
    bye_tokens = {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}
    hello_tokens = {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}
    complex_tokens = {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}
    # String patterns that carry an explicit id.
    apple = {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}
    microsoft = {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"}

    return [
        hello_phrase,
        bye_tokens,
        hello_tokens,
        complex_tokens,
        apple,
        microsoft,
    ]
 | |
| 
 | |
| 
 | |
def test_entity_ruler_fix8216(nlp, patterns):
    """Check that adding an empty pattern list does not change the pattern count.

    The count is taken over the values of ``ruler.matcher._patterns`` after
    the fixture patterns are added, and again after ``add_patterns([])``.
    """
    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})

    def stored_pattern_total():
        # Sum the lengths of every value held in the matcher's pattern table.
        total = 0
        for stored in ruler.matcher._patterns.values():
            total += len(stored)
        return total

    ruler.add_patterns(patterns)
    pattern_count = stored_pattern_total()
    assert pattern_count > 0

    # Adding nothing must leave the stored patterns untouched.
    ruler.add_patterns([])
    after_count = stored_pattern_total()
    assert after_count == pattern_count
 |