mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			89 lines
		
	
	
		
			3.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			89 lines
		
	
	
		
			3.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf8
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| import pytest
 | |
| from spacy.tokens import Span
 | |
| from spacy.language import Language
 | |
| from spacy.pipeline import EntityRuler
 | |
| from spacy import load
 | |
| import srsly
 | |
| 
 | |
| from ..util import make_tempdir
 | |
| 
 | |
| 
 | |
| @pytest.fixture
 | |
| def patterns():
 | |
|     return [
 | |
|         {"label": "HELLO", "pattern": "hello world"},
 | |
|         {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
 | |
|         {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
 | |
|         {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
 | |
|         {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
 | |
|     ]
 | |
| 
 | |
| 
 | |
| @pytest.fixture
 | |
| def add_ent():
 | |
|     def add_ent_component(doc):
 | |
|         doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
 | |
|         return doc
 | |
| 
 | |
|     return add_ent_component
 | |
| 
 | |
| 
 | |
| def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab):
 | |
|     nlp = Language(vocab=en_vocab)
 | |
|     ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
 | |
|     ruler_bytes = ruler.to_bytes()
 | |
|     assert len(ruler) == len(patterns)
 | |
|     assert len(ruler.labels) == 4
 | |
|     assert ruler.overwrite
 | |
|     new_ruler = EntityRuler(nlp)
 | |
|     new_ruler = new_ruler.from_bytes(ruler_bytes)
 | |
|     assert len(new_ruler) == len(ruler)
 | |
|     assert len(new_ruler.labels) == 4
 | |
|     assert new_ruler.overwrite == ruler.overwrite
 | |
|     assert new_ruler.ent_id_sep == ruler.ent_id_sep
 | |
| 
 | |
| 
 | |
| def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
 | |
|     nlp = Language(vocab=en_vocab)
 | |
|     ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
 | |
|     bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
 | |
|     new_ruler = EntityRuler(nlp)
 | |
|     new_ruler = new_ruler.from_bytes(bytes_old_style)
 | |
|     assert len(new_ruler) == len(ruler)
 | |
|     for pattern in ruler.patterns:
 | |
|         assert pattern in new_ruler.patterns
 | |
|     assert new_ruler.overwrite is not ruler.overwrite
 | |
| 
 | |
| 
 | |
| def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
 | |
|     nlp = Language(vocab=en_vocab)
 | |
|     ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
 | |
|     with make_tempdir() as tmpdir:
 | |
|         out_file = tmpdir / "entity_ruler"
 | |
|         srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
 | |
|         new_ruler = EntityRuler(nlp).from_disk(out_file)
 | |
|         for pattern in ruler.patterns:
 | |
|             assert pattern in new_ruler.patterns
 | |
|         assert len(new_ruler) == len(ruler)
 | |
|         assert new_ruler.overwrite is not ruler.overwrite
 | |
| 
 | |
| 
 | |
| def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
 | |
|     nlp = Language(vocab=en_vocab)
 | |
|     ruler = EntityRuler(nlp, overwrite_ents=True)
 | |
| 
 | |
|     ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
 | |
|     nlp.add_pipe(ruler)
 | |
|     with make_tempdir() as tmpdir:
 | |
|         nlp.to_disk(tmpdir)
 | |
|         ruler = nlp.get_pipe("entity_ruler")
 | |
|         assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
 | |
|         assert ruler.overwrite is True
 | |
|         nlp2 = load(tmpdir)
 | |
|         new_ruler = nlp2.get_pipe("entity_ruler")
 | |
|         assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
 | |
|         assert new_ruler.overwrite is True
 |