diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 24f98508f..d7a9569f1 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -217,7 +217,7 @@ class Morphologizer(Tagger): assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - def set_annotations(self, docs, batch_tag_ids): + def set_annotations(self, docs, scores_guesses): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. @@ -225,6 +225,7 @@ class Morphologizer(Tagger): DOCS: https://spacy.io/api/morphologizer#set_annotations """ + _, batch_tag_ids = scores_guesses if isinstance(docs, Doc): docs = [docs] cdef Doc doc diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 6808fe70e..2caeee5c1 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -38,11 +38,16 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "senter", assigns=["token.is_sent_start"], - default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, + default_config={ + "model": DEFAULT_SENTER_MODEL, + "overwrite": False, + "scorer": {"@scorers": "spacy.senter_scorer.v1"}, + "store_activations": False + }, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) -def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]): - return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer) +def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable], store_activations: bool): + return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, store_activations=store_activations) def senter_score(examples, **kwargs): @@ -72,6 +77,7 @@ class SentenceRecognizer(Tagger): *, 
overwrite=BACKWARD_OVERWRITE, scorer=senter_score, + store_activations=False, ): """Initialize a sentence recognizer. @@ -90,6 +96,7 @@ class SentenceRecognizer(Tagger): self._rehearsal_model = None self.cfg = {"overwrite": overwrite} self.scorer = scorer + self.store_activations = store_activations @property def labels(self): @@ -107,7 +114,7 @@ class SentenceRecognizer(Tagger): def label_data(self): return None - def set_annotations(self, docs, batch_tag_ids): + def set_annotations(self, docs, scores_guesses): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. @@ -115,11 +122,17 @@ class SentenceRecognizer(Tagger): DOCS: https://spacy.io/api/sentencerecognizer#set_annotations """ + _, batch_tag_ids = scores_guesses if isinstance(docs, Doc): docs = [docs] cdef Doc doc cdef bint overwrite = self.cfg["overwrite"] for i, doc in enumerate(docs): + if self.store_activations: + doc.activations[self.name] = { + "probs": scores_guesses[0][i], + "guesses": scores_guesses[1][i], + } doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index d6ecbf084..3505cbaaf 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -45,7 +45,13 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tagger", assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!"}, + default_config={ + "model": DEFAULT_TAGGER_MODEL, + "overwrite": False, + "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, + "neg_prefix": "!", + "store_activations": False + }, default_score_weights={"tag_acc": 1.0}, ) def make_tagger( @@ -55,6 +61,7 @@ def make_tagger( overwrite: bool, scorer: Optional[Callable], neg_prefix: str, + store_activations: bool, ): """Construct a part-of-speech tagger 
component. @@ -63,7 +70,7 @@ def make_tagger( in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to 1). """ - return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix) + return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, store_activations=store_activations) def tagger_score(examples, **kwargs): @@ -89,6 +96,7 @@ class Tagger(TrainablePipe): overwrite=BACKWARD_OVERWRITE, scorer=tagger_score, neg_prefix="!", + store_activations=False, ): """Initialize a part-of-speech tagger. @@ -108,6 +116,7 @@ class Tagger(TrainablePipe): cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix} self.cfg = dict(sorted(cfg.items())) self.scorer = scorer + self.store_activations = store_activations @property def labels(self): @@ -139,12 +148,12 @@ class Tagger(TrainablePipe): n_labels = len(self.labels) guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] assert len(guesses) == len(docs) - return guesses + return guesses, guesses scores = self.model.predict(docs) assert len(scores) == len(docs), (len(scores), len(docs)) guesses = self._scores2guesses(scores) assert len(guesses) == len(docs) - return guesses + return scores, guesses def _scores2guesses(self, scores): guesses = [] @@ -155,7 +164,7 @@ class Tagger(TrainablePipe): guesses.append(doc_guesses) return guesses - def set_annotations(self, docs, batch_tag_ids): + def set_annotations(self, docs, scores_guesses): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. 
@@ -163,6 +172,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#set_annotations """ + _, batch_tag_ids = scores_guesses if isinstance(docs, Doc): docs = [docs] cdef Doc doc @@ -170,6 +180,11 @@ class Tagger(TrainablePipe): cdef bint overwrite = self.cfg["overwrite"] labels = self.labels for i, doc in enumerate(docs): + if self.store_activations: + doc.activations[self.name] = { + "probs": scores_guesses[0][i], + "guesses": scores_guesses[1][i], + } doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd index 65daa8b22..411f0819d 100644 --- a/spacy/pipeline/trainable_pipe.pxd +++ b/spacy/pipeline/trainable_pipe.pxd @@ -6,3 +6,4 @@ cdef class TrainablePipe(Pipe): cdef public object model cdef public object cfg cdef public object scorer + cdef public bint store_activations diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 047f59bef..91ceacf00 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -1,3 +1,4 @@ +from typing import cast import pytest from numpy.testing import assert_equal from spacy.attrs import SENT_START @@ -6,6 +7,7 @@ from spacy import util from spacy.training import Example from spacy.lang.en import English from spacy.language import Language +from spacy.pipeline import TrainablePipe from spacy.tests.util import make_tempdir @@ -101,3 +103,23 @@ def test_overfitting_IO(): # test internal pipe labels vs. 
Language.pipe_labels with hidden labels assert nlp.get_pipe("senter").labels == ("I", "S") assert "senter" not in nlp.pipe_labels + + +def test_store_activations(): + # Check that enabling store_activations makes the senter save its "probs" and "guesses" in doc.activations + nlp = English() + senter = cast(TrainablePipe, nlp.add_pipe("senter")) + + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + nlp.initialize(get_examples=lambda: train_examples) + + senter.store_activations = True + + doc = nlp("This is a test.") + assert "senter" in doc.activations + assert set(doc.activations["senter"].keys()) == {"guesses", "probs"} + assert doc.activations["senter"]["probs"].shape == (5, 2) + assert doc.activations["senter"]["guesses"].shape == (5,) diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 96e75851e..6a8f75648 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,3 +1,4 @@ +from typing import cast import pytest from numpy.testing import assert_equal from spacy.attrs import TAG @@ -6,6 +7,7 @@ from spacy import util from spacy.training import Example from spacy.lang.en import English from spacy.language import Language +from spacy.pipeline import TrainablePipe from thinc.api import compounding from ..util import make_tempdir @@ -211,6 +213,25 @@ def test_overfitting_IO(): assert doc3[0].tag_ != "N" +def test_store_activations(): + # Check that enabling store_activations makes the tagger save its "probs" and "guesses" in doc.activations + nlp = English() + tagger = cast(TrainablePipe, nlp.add_pipe("tagger")) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.initialize(get_examples=lambda: train_examples) + + tagger.store_activations = True + + doc = nlp("This is a test.") + + assert "tagger" in doc.activations + assert set(doc.activations["tagger"].keys()) == {"guesses", "probs"} 
+ assert doc.activations["tagger"]["probs"].shape == (5, len(TAGS)) + assert doc.activations["tagger"]["guesses"].shape == (5,) + + def test_tagger_requires_labels(): nlp = English() nlp.add_pipe("tagger") diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 57d087958..83a940cbb 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -50,6 +50,8 @@ cdef class Doc: cdef public float sentiment + cdef public dict activations + cdef public dict user_hooks cdef public dict user_token_hooks cdef public dict user_span_hooks diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index e38de02b4..92605bc3d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -245,6 +245,7 @@ cdef class Doc: self.length = 0 self.sentiment = 0.0 self.cats = {} + self.activations = {} self.user_hooks = {} self.user_token_hooks = {} self.user_span_hooks = {}