mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	* Add scorer option to components Add an optional `scorer` parameter to all pipeline components. If a scoring function is provided, it overrides the default scoring method for that component. * Add registered scorers for all components * Add `scorers` registry * Move all scoring methods outside of components as independent functions and register * Use the registered scoring methods as defaults in configs and inits Additional: * The scoring methods no longer have access to the full component, so use settings from `cfg` as default scorer options to handle settings such as `labels`, `threshold`, and `positive_label` * The `attribute_ruler` scoring method no longer has access to the patterns, so all scoring methods are called * Bug fix: `spancat` scoring method is updated to set `allow_overlap` to score overlapping spans correctly * Update Russian lemmatizer to use direct score method * Check type of cfg in Pipe.score * Fix check * Update spacy/pipeline/sentencizer.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Remove validate_examples from scoring functions * Use Pipe.labels instead of Pipe.cfg["labels"] Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
		
			
				
	
	
		
			294 lines
		
	
	
		
			9.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			294 lines
		
	
	
		
			9.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import pytest
 | |
| import numpy
 | |
| from spacy.training import Example
 | |
| from spacy.lang.en import English
 | |
| from spacy.pipeline import AttributeRuler
 | |
| from spacy import util, registry
 | |
| from spacy.tokens import Doc
 | |
| 
 | |
| from ..util import make_tempdir
 | |
| 
 | |
| 
 | |
@pytest.fixture
def nlp():
    """Provide a freshly constructed ``English`` pipeline for each test."""
    pipeline = English()
    return pipeline
 | |
| 
 | |
| 
 | |
@pytest.fixture
def pattern_dicts():
    """Provide the three attribute ruler pattern entries shared by the tests."""
    # matches "a" (or "irrelevant") and sets both lemma and morphology
    lemma_and_morph = {
        "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
        "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
    }
    # one pattern sets the lemma
    lemma_only = {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}}
    # another pattern sets the morphology
    morph_only = {
        "patterns": [[{"ORTH": "test"}]],
        "attrs": {"MORPH": "Case=Nom|Number=Sing"},
        "index": 0,
    }
    return [lemma_and_morph, lemma_only, morph_only]
 | |
| 
 | |
| 
 | |
@pytest.fixture
def tag_map():
    """Provide a tag map with entries for the "." and "," tags."""
    punct_types = ((".", "peri"), (",", "comm"))
    return {
        tag: {"POS": "PUNCT", "PunctType": punct_type}
        for tag, punct_type in punct_types
    }
 | |
| 
 | |
| 
 | |
@pytest.fixture
def morph_rules():
    """Provide morph rules with a single entry for "the" under the DT tag."""
    the_attrs = {"POS": "DET", "LEMMA": "a", "Case": "Nom"}
    return {"DT": {"the": the_attrs}}
 | |
| 
 | |
| 
 | |
def check_tag_map(ruler):
    """Apply ``ruler`` to a tagged doc and assert only the final "." is annotated.

    ruler: The attribute ruler under test; its vocab is used to build the doc.
    """
    words = ["This", "is", "a", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    doc = ruler(Doc(ruler.vocab, words=words, tags=tags))
    for position, token in enumerate(doc):
        # Only the token at index 4 (the period) is expected to be annotated.
        if position == 4:
            expected_pos, expected_morph = "PUNCT", "PunctType=peri"
        else:
            expected_pos, expected_morph = "", ""
        assert token.pos_ == expected_pos
        assert str(token.morph) == expected_morph
 | |
| 
 | |
| 
 | |
def check_morph_rules(ruler):
    """Apply ``ruler`` to a tagged doc and assert only the token "the" is annotated.

    ruler: The attribute ruler under test; its vocab is used to build the doc.
    """
    words = ["This", "is", "the", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    doc = ruler(Doc(ruler.vocab, words=words, tags=tags))
    for position, token in enumerate(doc):
        if position == 2:
            # "the" is the only token expected to receive attributes.
            assert token.pos_ == "DET"
            assert token.lemma_ == "a"
            assert str(token.morph) == "Case=Nom"
            continue
        assert token.pos_ == ""
        assert str(token.morph) == ""
 | |
| 
 | |
| 
 | |
def test_attributeruler_init(nlp, pattern_dicts):
    """Patterns added one by one through ``add`` set lemma and morph on matches."""
    ruler = nlp.add_pipe("attribute_ruler")
    for entry in pattern_dicts:
        ruler.add(**entry)
    doc = nlp("This is a test.")
    article, noun = doc[2], doc[3]
    assert article.lemma_ == "the"
    assert str(article.morph) == "Case=Nom|Number=Plur"
    assert noun.lemma_ == "cat"
    assert str(noun.morph) == "Case=Nom|Number=Sing"
    for attr in ("LEMMA", "MORPH"):
        assert doc.has_annotation(attr)
 | |
| 
 | |
| 
 | |
def test_attributeruler_init_patterns(nlp, pattern_dicts):
    """Patterns can be given to ``initialize`` directly or via the misc registry."""

    def assert_expected_annotations(doc):
        # Token 2 is "a" and token 3 is "test" in the sentence used below.
        assert doc[2].lemma_ == "the"
        assert str(doc[2].morph) == "Case=Nom|Number=Plur"
        assert doc[3].lemma_ == "cat"
        assert str(doc[3].morph) == "Case=Nom|Number=Sing"
        assert doc.has_annotation("LEMMA")
        assert doc.has_annotation("MORPH")

    # initialize with patterns
    ruler = nlp.add_pipe("attribute_ruler")
    ruler.initialize(lambda: [], patterns=pattern_dicts)
    assert_expected_annotations(nlp("This is a test."))
    nlp.remove_pipe("attribute_ruler")

    # initialize with patterns from misc registry
    @registry.misc("attribute_ruler_patterns")
    def attribute_ruler_patterns():
        lemma_and_morph = {
            "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
            "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
        }
        # one pattern sets the lemma
        lemma_only = {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}}
        # another pattern sets the morphology
        morph_only = {
            "patterns": [[{"ORTH": "test"}]],
            "attrs": {"MORPH": "Case=Nom|Number=Sing"},
            "index": 0,
        }
        return [lemma_and_morph, lemma_only, morph_only]

    init_settings = {"patterns": {"@misc": "attribute_ruler_patterns"}}
    nlp.config["initialize"]["components"]["attribute_ruler"] = init_settings
    nlp.add_pipe("attribute_ruler")
    nlp.initialize()
    assert_expected_annotations(nlp("This is a test."))
 | |
| 
 | |
| 
 | |
def test_attributeruler_init_clear(nlp, pattern_dicts):
    """Test that initialization clears patterns."""
    ruler = nlp.add_pipe("attribute_ruler")
    # a new component starts out with an empty matcher
    assert len(ruler.matcher) == 0
    ruler.add_patterns(pattern_dicts)
    assert len(ruler.matcher) != 0
    # initializing without patterns leaves the matcher empty again
    ruler.initialize(lambda: [])
    assert len(ruler.matcher) == 0
 | |
| 
 | |
| 
 | |
def test_attributeruler_score(nlp, pattern_dicts):
    """Check evaluation scores with the default scorer and with a custom one."""
    text = "This is a test."

    # initialize with patterns
    ruler = nlp.add_pipe("attribute_ruler")
    ruler.initialize(lambda: [], patterns=pattern_dicts)
    predicted = nlp(text)
    article, noun = predicted[2], predicted[3]
    assert article.lemma_ == "the"
    assert str(article.morph) == "Case=Nom|Number=Plur"
    assert noun.lemma_ == "cat"
    assert str(noun.morph) == "Case=Nom|Number=Sing"

    reference = {"lemmas": ["this", "is", "a", "cat", "."]}
    dev_examples = [Example.from_dict(nlp.make_doc(text), reference)]
    default_scores = nlp.evaluate(dev_examples)
    # "cat" is the only correct lemma
    assert default_scores["lemma_acc"] == pytest.approx(0.2)
    # the reference only carries lemmas, so no morph score is reported
    assert default_scores["morph_acc"] is None
    nlp.remove_pipe("attribute_ruler")

    # test with custom scorer
    @registry.misc("weird_scorer.v1")
    def make_weird_scorer():
        def weird_scorer(examples, weird_score, **kwargs):
            # simply echo back the value supplied through scorer_cfg
            return dict(weird_score=weird_score)

        return weird_scorer

    component_config = {"scorer": {"@misc": "weird_scorer.v1"}}
    ruler = nlp.add_pipe("attribute_ruler", config=component_config)
    ruler.initialize(lambda: [], patterns=pattern_dicts)

    first_scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
    assert first_scores["weird_score"] == 0.12345
    assert "token_acc" in first_scores
    assert "lemma_acc" not in first_scores

    second_scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
    assert second_scores["weird_score"] == 0.23456
 | |
| 
 | |
| 
 | |
def test_attributeruler_rule_order(nlp):
    """With two patterns on the same tag, the one added last determines POS."""
    vbz_pattern = [[{"TAG": "VBZ"}]]
    ruler = AttributeRuler(nlp.vocab)
    ruler.add_patterns(
        [
            {"patterns": vbz_pattern, "attrs": {"POS": "VERB"}},
            {"patterns": vbz_pattern, "attrs": {"POS": "NOUN"}},
        ]
    )
    words = ["This", "is", "a", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    doc = ruler(Doc(nlp.vocab, words=words, tags=tags))
    # "is" carries the VBZ tag that both patterns match
    assert doc[1].pos_ == "NOUN"
 | |
| 
 | |
| 
 | |
def test_attributeruler_tag_map(nlp, tag_map):
    """A tag map loaded via ``load_from_tag_map`` passes ``check_tag_map``."""
    standalone_ruler = AttributeRuler(nlp.vocab)
    standalone_ruler.load_from_tag_map(tag_map)
    check_tag_map(standalone_ruler)
 | |
| 
 | |
| 
 | |
def test_attributeruler_tag_map_initialize(nlp, tag_map):
    """A tag map passed to ``initialize`` passes ``check_tag_map``."""
    pipe = nlp.add_pipe("attribute_ruler")
    pipe.initialize(lambda: [], tag_map=tag_map)
    check_tag_map(pipe)
 | |
| 
 | |
| 
 | |
def test_attributeruler_morph_rules(nlp, morph_rules):
    """Morph rules loaded via ``load_from_morph_rules`` pass ``check_morph_rules``."""
    standalone_ruler = AttributeRuler(nlp.vocab)
    standalone_ruler.load_from_morph_rules(morph_rules)
    check_morph_rules(standalone_ruler)
 | |
| 
 | |
| 
 | |
def test_attributeruler_morph_rules_initialize(nlp, morph_rules):
    """Morph rules passed to ``initialize`` pass ``check_morph_rules``."""
    pipe = nlp.add_pipe("attribute_ruler")
    pipe.initialize(lambda: [], morph_rules=morph_rules)
    check_morph_rules(pipe)
 | |
| 
 | |
| 
 | |
def test_attributeruler_indices(nlp):
    """The ``index`` argument picks which matched token receives the attrs."""
    a_test = [[{"ORTH": "a"}, {"ORTH": "test"}]]
    this_is = [[{"ORTH": "This"}, {"ORTH": "is"}]]

    ruler = nlp.add_pipe("attribute_ruler")
    ruler.add(a_test, {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, index=0)
    ruler.add(this_is, {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"}, index=1)
    ruler.add(a_test, {"LEMMA": "cat"}, index=-1)

    text = "This is a test."
    doc = nlp(text)
    for position, token in enumerate(doc):
        if position == 1:
            assert token.lemma_ == "was"
            assert str(token.morph) == "Case=Nom|Number=Sing"
        elif position == 2:
            assert token.lemma_ == "the"
            assert str(token.morph) == "Case=Nom|Number=Plur"
        elif position == 3:
            assert token.lemma_ == "cat"
        else:
            assert str(token.morph) == ""

    # raises an error when trying to modify a token outside of the match
    ruler.add(a_test, {"LEMMA": "cat"}, index=2)
    with pytest.raises(ValueError):
        nlp(text)

    # raises an error when trying to modify a token outside of the match
    # NOTE(review): the index=2 pattern added above is still registered here,
    # so this check may not exercise index=10 on its own -- confirm.
    ruler.add(a_test, {"LEMMA": "cat"}, index=10)
    with pytest.raises(ValueError):
        nlp(text)
 | |
| 
 | |
| 
 | |
def test_attributeruler_patterns_prop(nlp, pattern_dicts):
    """The ``patterns`` property returns the entries added via ``add_patterns``."""
    a = nlp.add_pipe("attribute_ruler")
    a.add_patterns(pattern_dicts)
    # zip() stops at the shorter input, so a missing or extra entry would go
    # unnoticed without an explicit length comparison.
    assert len(a.patterns) == len(pattern_dicts)
    for p1, p2 in zip(pattern_dicts, a.patterns):
        assert p1["patterns"] == p2["patterns"]
        assert p1["attrs"] == p2["attrs"]
        # Test for key presence rather than truthiness: the only explicit index
        # in the fixture is 0, which a truthiness check would silently skip.
        if "index" in p1:
            assert p1["index"] == p2["index"]
 | |
| 
 | |
| 
 | |
def test_attributeruler_serialize(nlp, pattern_dicts):
    """The ruler gives the same patterns and annotations after bytes/disk roundtrips."""
    a = nlp.add_pipe("attribute_ruler")
    a.add_patterns(pattern_dicts)
    text = "This is a test."
    attrs = ["ORTH", "LEMMA", "MORPH"]
    doc = nlp(text)
    expected = doc.to_array(attrs)

    # bytes roundtrip
    a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())
    assert a.to_bytes() == a_reloaded.to_bytes()
    doc1 = a_reloaded(nlp.make_doc(text))
    # The comparison result must be asserted; a bare call discards it and the
    # check would pass no matter what the reloaded ruler produced.
    assert numpy.array_equal(expected, doc1.to_array(attrs))
    assert a.patterns == a_reloaded.patterns

    # disk roundtrip
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(text)
        reloaded_ruler = nlp2.get_pipe("attribute_ruler")
        assert reloaded_ruler.to_bytes() == a.to_bytes()
        assert numpy.array_equal(expected, doc2.to_array(attrs))
        assert a.patterns == reloaded_ruler.patterns
 |