Support store_activations in spancat and morphologizer

Daniël de Kok 2022-06-22 16:29:52 +02:00
parent 789a44755e
commit acf47e8e48
4 changed files with 92 additions and 6 deletions
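The new setting is exposed through each component's factory config. A minimal sketch of enabling it at pipeline-construction time (the blank pipeline and the add_pipe calls are illustrative assumptions, not part of this commit; the components still have to be initialized or trained before they can be run):

import spacy

nlp = spacy.blank("en")
# "store_activations" accepts a bool (store all activations) or a list of activation names.
nlp.add_pipe("morphologizer", config={"store_activations": True})
nlp.add_pipe("spancat", config={"store_activations": ["scores"]})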

spacy/pipeline/morphologizer.pyx

@@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Optional, Union, Dict, Callable
+from typing import Callable, Dict, List, Optional, Union
 import srsly
 from thinc.api import SequenceCategoricalCrossentropy, Model, Config
 from itertools import islice
@@ -52,7 +52,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "morphologizer",
     assigns=["token.morph", "token.pos"],
-    default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
+    default_config={
+        "model": DEFAULT_MORPH_MODEL,
+        "overwrite": True,
+        "extend": False,
+        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
+        "store_activations": False
+    },
     default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
 )
 def make_morphologizer(
@@ -62,8 +68,10 @@ def make_morphologizer(
     overwrite: bool,
     extend: bool,
     scorer: Optional[Callable],
+    store_activations: Union[bool, List[str]],
 ):
-    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
+    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
+                         store_activations=store_activations)


 def morphologizer_score(examples, **kwargs):
@@ -95,6 +103,7 @@ class Morphologizer(Tagger):
         overwrite: bool = BACKWARD_OVERWRITE,
         extend: bool = BACKWARD_EXTEND,
         scorer: Optional[Callable] = morphologizer_score,
+        store_activations=False,
     ):
         """Initialize a morphologizer.

@@ -124,6 +133,7 @@ class Morphologizer(Tagger):
         }
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
+        self.store_activations = store_activations

     @property
     def labels(self):
@@ -234,6 +244,9 @@ class Morphologizer(Tagger):
         cdef bint extend = self.cfg["extend"]
         labels = self.labels
         for i, doc in enumerate(docs):
+            doc.activations[self.name] = {}
+            for activation in self.store_activations:
+                doc.activations[self.name][activation] = activations[activation][i]
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
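Based on the morphologizer test added further down, reading the stored activations would look roughly like this (a sketch assuming an already initialized pipeline `nlp`; the activation names "probs" and "guesses" are taken from that test):

# Toggle activation storage on an existing, initialized pipeline.
morphologizer = nlp.get_pipe("morphologizer")
morphologizer.store_activations = True  # or a subset, e.g. ["probs"]
doc = nlp("This is a test.")
probs = doc.activations["morphologizer"]["probs"]      # per-token scores over the morph labels
guesses = doc.activations["morphologizer"]["guesses"]  # per-token predicted label indices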

spacy/pipeline/spancat.py

@@ -1,4 +1,5 @@
 from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
+from typing import Union
 from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
 from thinc.api import Optimizer
 from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
@@ -106,6 +107,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
         "model": DEFAULT_SPANCAT_MODEL,
         "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
         "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
+        "store_activations": False,
     },
     default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
 )
@@ -118,6 +120,7 @@ def make_spancat(
     scorer: Optional[Callable],
     threshold: float,
     max_positive: Optional[int],
+    store_activations: Union[bool, List[str]],
 ) -> "SpanCategorizer":
     """Create a SpanCategorizer component. The span categorizer consists of two
     parts: a suggester function that proposes candidate spans, and a labeller
@@ -148,6 +151,7 @@ def make_spancat(
         max_positive=max_positive,
         name=name,
         scorer=scorer,
+        store_activations=store_activations,
     )


@@ -186,6 +190,7 @@ class SpanCategorizer(TrainablePipe):
         threshold: float = 0.5,
         max_positive: Optional[int] = None,
         scorer: Optional[Callable] = spancat_score,
+        store_activations=False,
     ) -> None:
         """Initialize the span categorizer.
         vocab (Vocab): The shared vocabulary.
@@ -218,6 +223,7 @@ class SpanCategorizer(TrainablePipe):
         self.model = model
         self.name = name
         self.scorer = scorer
+        self.store_activations = store_activations

     @property
     def key(self) -> str:
@@ -270,7 +276,7 @@ class SpanCategorizer(TrainablePipe):
         """
         indices = self.suggester(docs, ops=self.model.ops)
         scores = self.model.predict((docs, indices))  # type: ignore
-        return indices, scores
+        return {"indices": indices, "scores": scores}

     def set_candidates(
         self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
@@ -290,7 +296,7 @@ class SpanCategorizer(TrainablePipe):
             for index in candidates.dataXd:
                 doc.spans[candidates_key].append(doc[index[0] : index[1]])

-    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
+    def set_annotations(self, docs: Iterable[Doc], activations) -> None:
         """Modify a batch of Doc objects, using pre-computed scores.

         docs (Iterable[Doc]): The documents to modify.
@@ -299,10 +305,19 @@ class SpanCategorizer(TrainablePipe):
         DOCS: https://spacy.io/api/spancategorizer#set_annotations
         """
         labels = self.labels
-        indices, scores = indices_scores
+
+        indices = activations["indices"]
+        scores = activations["scores"]
         offset = 0
         for i, doc in enumerate(docs):
             indices_i = indices[i].dataXd
+            doc.activations[self.name] = {}
+            if "indices" in self.store_activations:
+                doc.activations[self.name]["indices"] = indices_i
+            if "scores" in self.store_activations:
+                doc.activations[self.name]["scores"] = scores[
+                    offset : offset + indices.lengths[i]
+                ]
             doc.spans[self.key] = self._make_span_group(
                 doc, indices_i, scores[offset : offset + indices.lengths[i]], labels  # type: ignore[arg-type]
             )
@@ -474,3 +489,7 @@ class SpanCategorizer(TrainablePipe):
                     spans.append(Span(doc, start, end, label=labels[j]))

         return spans
+
+    @property
+    def activations(self):
+        return ["indices", "scores"]

spacy/tests/pipeline/test_morphologizer.py

@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_equal

@@ -7,6 +8,7 @@ from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tests.util import make_tempdir
 from spacy.morphology import Morphology
+from spacy.pipeline import TrainablePipe
 from spacy.attrs import MORPH
 from spacy.tokens import Doc
@@ -197,3 +199,29 @@ def test_overfitting_IO():
     gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
     assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags
+
+
+def test_store_activations():
+    # The morphologizer should only store the requested activations on the Doc.
+    nlp = English()
+    morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
+    train_examples = []
+    for inst in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    doc = nlp("This is a test.")
+    assert len(list(doc.activations["morphologizer"].keys())) == 0
+
+    morphologizer.store_activations = True
+    doc = nlp("This is a test.")
+    assert "morphologizer" in doc.activations
+    assert set(doc.activations["morphologizer"].keys()) == {"guesses", "probs"}
+    assert doc.activations["morphologizer"]["probs"].shape == (5, 6)
+    assert doc.activations["morphologizer"]["guesses"].shape == (5,)
+
+    morphologizer.store_activations = ["probs"]
+    doc = nlp("This is a test.")
+    assert "morphologizer" in doc.activations
+    assert set(doc.activations["morphologizer"].keys()) == {"probs"}
+    assert doc.activations["morphologizer"]["probs"].shape == (5, 6)

spacy/tests/pipeline/test_spancat.py

@@ -419,3 +419,29 @@ def test_set_candidates():
     assert len(docs[0].spans["candidates"]) == 9
     assert docs[0].spans["candidates"][0].text == "Just"
     assert docs[0].spans["candidates"][4].text == "Just a"
+
+
+def test_store_activations():
+    # The spancat component should only store the requested activations on the Doc.
+    nlp = English()
+    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    train_examples = make_examples(nlp)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    nO = spancat.model.get_dim("nO")
+    assert nO == 2
+    assert set(spancat.labels) == {"LOC", "PERSON"}
+
+    doc = nlp("This is a test.")
+    assert len(list(doc.activations["spancat"].keys())) == 0
+
+    spancat.store_activations = True
+    doc = nlp("This is a test.")
+    assert set(doc.activations["spancat"].keys()) == {"indices", "scores"}
+    assert doc.activations["spancat"]["indices"].shape == (12, 2)
+    assert doc.activations["spancat"]["scores"].shape == (12, nO)
+
+    spancat.store_activations = True
+    spancat.store_activations = ["scores"]
+    doc = nlp("This is a test.")
+    assert set(doc.activations["spancat"].keys()) == {"scores"}
+    assert doc.activations["spancat"]["scores"].shape == (12, nO)