Switch tests to separate scorer keys rather than merged dicts
parent 3ca590020b
commit 3c6ad16f71
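In sketch form, the separation this commit's tests pin down: a textcat component can register a custom scorer that prefixes its keys, so each component reports under its own key family instead of one merged cats_* dict. The following is a minimal sketch adapted from the new test body below; the registry name `demo_textcat_scorer` and the label sets are illustrative, not part of the commit:

    import spacy
    from spacy.language import Language
    from spacy.scorer import Scorer
    from spacy.training import Example
    from spacy.vocab import Vocab

    def custom_textcat_score(examples, **kwargs):
        # Delegate to the default cats scorer, then prefix every key so this
        # component's metrics cannot merge with another pipe's cats_* keys.
        scores = Scorer.score_cats(examples, "cats", multi_label=False, **kwargs)
        return {f"custom_{k}": v for k, v in scores.items()}

    @spacy.registry.scorers("demo_textcat_scorer")  # hypothetical registry name
    def make_custom_textcat_scorer():
        return custom_textcat_score

    nlp = Language(Vocab())
    textcat = nlp.add_pipe(
        "textcat", config={"scorer": {"@scorers": "demo_textcat_scorer"}}
    )
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
        textcat_multilabel.add_label(label)
    nlp.initialize()

    example = Example.from_dict(
        nlp.make_doc("hello world"),
        {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0, "FEATURE": 1.0, "QUESTION": 1.0}},
    )
    scores = nlp.evaluate([example])
    # The exclusive textcat reports under custom_cats_*, the multilabel
    # component under the default cats_* keys:
    assert set(scores["custom_cats_f_per_type"]) == set(nlp.get_pipe("textcat").labels)
    assert set(scores["cats_f_per_type"]) == set(nlp.get_pipe("textcat_multilabel").labels)

The same pattern appears verbatim in test_evaluate_multiple_textcat_separate in the diff below.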
@@ -3,6 +3,7 @@ import logging
 from unittest import mock
 import pytest
 from spacy.language import Language
+from spacy.scorer import Scorer
 from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 from spacy.training import Example
@@ -58,29 +59,6 @@ def nlp():
     return nlp
 
 
-@pytest.fixture
-def nlp_multi():
-    nlp = Language(Vocab())
-    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
-    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
-        textcat_multilabel.add_label(label)
-    nlp.initialize()
-    return nlp
-
-
-@pytest.fixture
-def nlp_both():
-    nlp = Language(Vocab())
-    textcat = nlp.add_pipe("textcat")
-    for label in ("POSITIVE", "NEGATIVE"):
-        textcat.add_label(label)
-    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
-    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
-        textcat_multilabel.add_label(label)
-    nlp.initialize()
-    return nlp
-
-
 def test_language_update(nlp):
     text = "hello world"
     annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
@@ -114,9 +92,6 @@ def test_language_evaluate(nlp):
     example = Example.from_dict(doc, annots)
     scores = nlp.evaluate([example])
     assert scores["speed"] > 0
-    assert scores["cats_f_per_type"].get("POSITIVE") is not None
-    assert scores["cats_f_per_type"].get("NEGATIVE") is not None
-    assert scores["cats_f_per_type"].get("BUG") is None
 
     # test with generator
     scores = nlp.evaluate(eg for eg in [example])
@@ -152,33 +127,110 @@ def test_evaluate_no_pipe(nlp):
     nlp.evaluate([Example.from_dict(doc, annots)])
 
 
-def test_evaluate_textcat(nlp_multi):
+def test_evaluate_textcat_multilabel(en_vocab):
     """Test that evaluate works with a multilabel textcat pipe."""
-    text = "hello world"
-    annots = {"doc_annotation": {"cats": {"FEATURE": 1.0, "QUESTION": 1.0}}}
-    doc = Doc(nlp_multi.vocab, words=text.split(" "))
+    nlp = Language(en_vocab)
+    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
+    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
+        textcat_multilabel.add_label(label)
+    nlp.initialize()
+
+    annots = {"cats": {"FEATURE": 1.0, "QUESTION": 1.0}}
+    doc = nlp.make_doc("hello world")
     example = Example.from_dict(doc, annots)
-    scores = nlp_multi.evaluate([example])
-    assert scores["cats_f_per_type"].get("FEATURE") is not None
-    assert scores["cats_f_per_type"].get("QUESTION") is not None
-    assert scores["cats_f_per_type"].get("REQUEST") is not None
-    assert scores["cats_f_per_type"].get("BUG") is not None
-    assert scores["cats_f_per_type"].get("POSITIVE") is None
-    assert scores["cats_f_per_type"].get("NEGATIVE") is None
+    scores = nlp.evaluate([example])
+    labels = nlp.get_pipe("textcat_multilabel").labels
+    for label in labels:
+        assert scores["cats_f_per_type"].get(label) is not None
+    for key in example.reference.cats.keys():
+        if key not in labels:
+            assert scores["cats_f_per_type"].get(key) is None
 
 
-def test_evaluate_both(nlp_both):
-    """Test that evaluate works with two textcat pipes."""
-    text = "hello world"
-    annots = {"doc_annotation": {"cats": {"FEATURE": 1.0, "QUESTION": 1.0, "POSITIVE": 1.0, "NEGATIVE": 0.0}}}
-    doc = Doc(nlp_both.vocab, words=text.split(" "))
+def test_evaluate_multiple_textcat_final(en_vocab):
+    """Test that evaluate evaluates the final textcat component in a pipeline
+    with more than one textcat or textcat_multilabel."""
+    nlp = Language(en_vocab)
+    textcat = nlp.add_pipe("textcat")
+    for label in ("POSITIVE", "NEGATIVE"):
+        textcat.add_label(label)
+    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
+    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
+        textcat_multilabel.add_label(label)
+    nlp.initialize()
+
+    annots = {
+        "cats": {
+            "POSITIVE": 1.0,
+            "NEGATIVE": 0.0,
+            "FEATURE": 1.0,
+            "QUESTION": 1.0,
+            "POSITIVE": 1.0,
+            "NEGATIVE": 0.0,
+        }
+    }
+    doc = nlp.make_doc("hello world")
     example = Example.from_dict(doc, annots)
-    scores = nlp_both.evaluate([example])
-    assert scores["cats_f_per_type"].get("FEATURE") is not None
-    assert scores["cats_f_per_type"].get("QUESTION") is not None
-    assert scores["cats_f_per_type"].get("BUG") is not None
-    assert scores["cats_f_per_type"].get("POSITIVE") is not None
-    assert scores["cats_f_per_type"].get("NEGATIVE") is not None
+    scores = nlp.evaluate([example])
+    # get the labels from the final pipe
+    labels = nlp.get_pipe(nlp.pipe_names[-1]).labels
+    for label in labels:
+        assert scores["cats_f_per_type"].get(label) is not None
+    for key in example.reference.cats.keys():
+        if key not in labels:
+            assert scores["cats_f_per_type"].get(key) is None
 
 
+def test_evaluate_multiple_textcat_separate(en_vocab):
+    """Test that evaluate can evaluate multiple textcat components separately
+    with custom scorers."""
+
+    def custom_textcat_score(examples, **kwargs):
+        scores = Scorer.score_cats(
+            examples,
+            "cats",
+            multi_label=False,
+            **kwargs,
+        )
+        return {f"custom_{k}": v for k, v in scores.items()}
+
+    @spacy.registry.scorers("test_custom_textcat_scorer")
+    def make_custom_textcat_scorer():
+        return custom_textcat_score
+
+    nlp = Language(en_vocab)
+    textcat = nlp.add_pipe(
+        "textcat",
+        config={"scorer": {"@scorers": "test_custom_textcat_scorer"}},
+    )
+    for label in ("POSITIVE", "NEGATIVE"):
+        textcat.add_label(label)
+    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
+    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
+        textcat_multilabel.add_label(label)
+    nlp.initialize()
+
+    annots = {
+        "cats": {
+            "POSITIVE": 1.0,
+            "NEGATIVE": 0.0,
+            "FEATURE": 1.0,
+            "QUESTION": 1.0,
+            "POSITIVE": 1.0,
+            "NEGATIVE": 0.0,
+        }
+    }
+    doc = nlp.make_doc("hello world")
+    example = Example.from_dict(doc, annots)
+    scores = nlp.evaluate([example])
+    # check custom scores for the textcat pipe
+    assert "custom_cats_f_per_type" in scores
+    labels = nlp.get_pipe("textcat").labels
+    assert set(scores["custom_cats_f_per_type"].keys()) == set(labels)
+    # check default scores for the textcat_multilabel pipe
+    assert "cats_f_per_type" in scores
+    labels = nlp.get_pipe("textcat_multilabel").labels
+    assert set(scores["cats_f_per_type"].keys()) == set(labels)
+
+
 def vector_modification_pipe(doc):
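Usage note: when two textcat-style components share the default scorer, nlp.evaluate() ends up reporting the cats_* keys of the last component in the pipeline, which is what test_evaluate_multiple_textcat_final above asserts. A standalone sketch of that behavior (labels and text mirror the test; the assertions are illustrative):

    from spacy.language import Language
    from spacy.training import Example
    from spacy.vocab import Vocab

    nlp = Language(Vocab())
    textcat = nlp.add_pipe("textcat")
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
        textcat_multilabel.add_label(label)
    nlp.initialize()

    example = Example.from_dict(
        nlp.make_doc("hello world"),
        {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0, "FEATURE": 1.0, "QUESTION": 1.0}},
    )
    scores = nlp.evaluate([example])
    # cats_f_per_type covers the labels of the final pipe (textcat_multilabel)...
    final_labels = nlp.get_pipe(nlp.pipe_names[-1]).labels
    for label in final_labels:
        assert scores["cats_f_per_type"].get(label) is not None
    # ...and not the earlier textcat's labels:
    assert scores["cats_f_per_type"].get("POSITIVE") is None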