From acf47e8e48401baaf612a0148ea39ed3fac2b26a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Wed, 22 Jun 2022 16:29:52 +0200 Subject: [PATCH] Support store_activations in spancat and morphologizer --- spacy/pipeline/morphologizer.pyx | 19 ++++++++++++--- spacy/pipeline/spancat.py | 25 ++++++++++++++++--- spacy/tests/pipeline/test_morphologizer.py | 28 ++++++++++++++++++++++ spacy/tests/pipeline/test_spancat.py | 26 ++++++++++++++++++++ 4 files changed, 92 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index d0c62649f..882de3cd0 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,5 +1,5 @@ # cython: infer_types=True, profile=True, binding=True -from typing import Optional, Union, Dict, Callable +from typing import Callable, Dict, List, Optional, Union import srsly from thinc.api import SequenceCategoricalCrossentropy, Model, Config from itertools import islice @@ -52,7 +52,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "morphologizer", assigns=["token.morph", "token.pos"], - default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}}, + default_config={ + "model": DEFAULT_MORPH_MODEL, + "overwrite": True, + "extend": False, + "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, + "store_activations": False + }, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( @@ -62,8 +68,10 @@ def make_morphologizer( overwrite: bool, extend: bool, scorer: Optional[Callable], + store_activations: Union[bool, List[str]], ): - return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer) + return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer, + store_activations=store_activations) def morphologizer_score(examples, **kwargs): @@ -95,6 +103,7 @@ class Morphologizer(Tagger): overwrite: bool = BACKWARD_OVERWRITE, extend: bool = BACKWARD_EXTEND, scorer: Optional[Callable] = morphologizer_score, + store_activations=False, ): """Initialize a morphologizer. @@ -124,6 +133,7 @@ class Morphologizer(Tagger): } self.cfg = dict(sorted(cfg.items())) self.scorer = scorer + self.store_activations = store_activations @property def labels(self): @@ -234,6 +244,9 @@ class Morphologizer(Tagger): cdef bint extend = self.cfg["extend"] labels = self.labels for i, doc in enumerate(docs): + doc.activations[self.name] = {} + for activation in self.store_activations: + doc.activations[self.name][activation] = activations[activation][i] doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 1b7a9eecb..a1bce5f5e 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,4 +1,5 @@ from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast +from typing import Union from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops from thinc.api import Optimizer from thinc.types import Ragged, Ints2d, Floats2d, Ints1d @@ -106,6 +107,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester: "model": DEFAULT_SPANCAT_MODEL, "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, + "store_activations": False, }, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, ) @@ -118,6 +120,7 @@ def make_spancat( scorer: Optional[Callable], threshold: float, max_positive: Optional[int], + store_activations: Union[bool, List[str]], ) -> "SpanCategorizer": """Create a SpanCategorizer component. The span categorizer consists of two parts: a suggester function that proposes candidate spans, and a labeller @@ -148,6 +151,7 @@ def make_spancat( max_positive=max_positive, name=name, scorer=scorer, + store_activations=store_activations, ) @@ -186,6 +190,7 @@ class SpanCategorizer(TrainablePipe): threshold: float = 0.5, max_positive: Optional[int] = None, scorer: Optional[Callable] = spancat_score, + store_activations=False, ) -> None: """Initialize the span categorizer. vocab (Vocab): The shared vocabulary. @@ -218,6 +223,7 @@ class SpanCategorizer(TrainablePipe): self.model = model self.name = name self.scorer = scorer + self.store_activations = store_activations @property def key(self) -> str: @@ -270,7 +276,7 @@ class SpanCategorizer(TrainablePipe): """ indices = self.suggester(docs, ops=self.model.ops) scores = self.model.predict((docs, indices)) # type: ignore - return indices, scores + return {"indices": indices, "scores": scores} def set_candidates( self, docs: Iterable[Doc], *, candidates_key: str = "candidates" @@ -290,7 +296,7 @@ class SpanCategorizer(TrainablePipe): for index in candidates.dataXd: doc.spans[candidates_key].append(doc[index[0] : index[1]]) - def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None: + def set_annotations(self, docs: Iterable[Doc], activations) -> None: """Modify a batch of Doc objects, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. @@ -299,10 +305,19 @@ class SpanCategorizer(TrainablePipe): DOCS: https://spacy.io/api/spancategorizer#set_annotations """ labels = self.labels - indices, scores = indices_scores + + indices = activations["indices"] + scores = activations["scores"] offset = 0 for i, doc in enumerate(docs): indices_i = indices[i].dataXd + doc.activations[self.name] = {} + if "indices" in self.store_activations: + doc.activations[self.name]["indices"] = indices_i + if "scores" in self.store_activations: + doc.activations[self.name]["scores"] = scores[ + offset : offset + indices.lengths[i] + ] doc.spans[self.key] = self._make_span_group( doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type] ) @@ -474,3 +489,7 @@ class SpanCategorizer(TrainablePipe): spans.append(Span(doc, start, end, label=labels[j])) return spans + + @property + def activations(self): + return ["indices", "scores"] diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 33696bfd8..fc2f18730 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -1,3 +1,4 @@ +from typing import cast import pytest from numpy.testing import assert_equal @@ -7,6 +8,7 @@ from spacy.lang.en import English from spacy.language import Language from spacy.tests.util import make_tempdir from spacy.morphology import Morphology +from spacy.pipeline import TrainablePipe from spacy.attrs import MORPH from spacy.tokens import Doc @@ -197,3 +199,29 @@ def test_overfitting_IO(): gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"] assert [str(t.morph) for t in doc] == gold_morphs assert [t.pos_ for t in doc] == gold_pos_tags + + +def test_store_activations(): + # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly + nlp = English() + morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer")) + train_examples = [] + for inst in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) + nlp.initialize(get_examples=lambda: train_examples) + + doc = nlp("This is a test.") + assert len(list(doc.activations["morphologizer"].keys())) == 0 + + morphologizer.store_activations = True + doc = nlp("This is a test.") + assert "morphologizer" in doc.activations + assert set(doc.activations["morphologizer"].keys()) == {"guesses", "probs"} + assert doc.activations["morphologizer"]["probs"].shape == (5, 6) + assert doc.activations["morphologizer"]["guesses"].shape == (5,) + + morphologizer.store_activations = ["probs"] + doc = nlp("This is a test.") + assert "morphologizer" in doc.activations + assert set(doc.activations["morphologizer"].keys()) == {"probs"} + assert doc.activations["morphologizer"]["probs"].shape == (5, 6) diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 15256a763..d88de1d09 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -419,3 +419,29 @@ def test_set_candidates(): assert len(docs[0].spans["candidates"]) == 9 assert docs[0].spans["candidates"][0].text == "Just" assert docs[0].spans["candidates"][4].text == "Just a" + + +def test_store_activations(): + # Simple test to try and quickly overfit the spancat component - ensuring the ML models work correctly + nlp = English() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + train_examples = make_examples(nlp) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + nO = spancat.model.get_dim("nO") + assert nO == 2 + assert set(spancat.labels) == {"LOC", "PERSON"} + + doc = nlp("This is a test.") + assert len(list(doc.activations["spancat"].keys())) == 0 + + spancat.store_activations = True + doc = nlp("This is a test.") + assert set(doc.activations["spancat"].keys()) == {"indices", "scores"} + assert doc.activations["spancat"]["indices"].shape == (12, 2) + assert doc.activations["spancat"]["scores"].shape == (12, nO) + spancat.store_activations = True + + spancat.store_activations = ["scores"] + doc = nlp("This is a test.") + assert set(doc.activations["spancat"].keys()) == {"scores"} + assert doc.activations["spancat"]["scores"].shape == (12, nO)