Support store_activations in spancat and morphologizer

Daniël de Kok 2022-06-22 16:29:52 +02:00
parent 789a44755e
commit acf47e8e48
4 changed files with 92 additions and 6 deletions
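The new setting is exposed through each component's factory config. A minimal sketch of enabling it at pipeline-construction time (the blank pipeline and the add_pipe calls are illustrative assumptions, not part of this commit; the components still have to be initialized or trained before they can be run):

import spacy

nlp = spacy.blank("en")
# "store_activations" accepts a bool (store all activations) or a list of activation names.
nlp.add_pipe("morphologizer", config={"store_activations": True})
nlp.add_pipe("spancat", config={"store_activations": ["scores"]})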

spacy/pipeline/morphologizer.pyx

@@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Optional, Union, Dict, Callable
+from typing import Callable, Dict, List, Optional, Union
 import srsly
 from thinc.api import SequenceCategoricalCrossentropy, Model, Config
 from itertools import islice
@@ -52,7 +52,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "morphologizer",
     assigns=["token.morph", "token.pos"],
-    default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
+    default_config={
+        "model": DEFAULT_MORPH_MODEL,
+        "overwrite": True,
+        "extend": False,
+        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
+        "store_activations": False
+    },
     default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
 )
 def make_morphologizer(
@@ -62,8 +68,10 @@ def make_morphologizer(
     overwrite: bool,
     extend: bool,
     scorer: Optional[Callable],
+    store_activations: Union[bool, List[str]],
 ):
-    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
+    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
+                         store_activations=store_activations)


 def morphologizer_score(examples, **kwargs):
@@ -95,6 +103,7 @@ class Morphologizer(Tagger):
         overwrite: bool = BACKWARD_OVERWRITE,
         extend: bool = BACKWARD_EXTEND,
         scorer: Optional[Callable] = morphologizer_score,
+        store_activations=False,
     ):
         """Initialize a morphologizer.

@@ -124,6 +133,7 @@ class Morphologizer(Tagger):
         }
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
+        self.store_activations = store_activations

     @property
     def labels(self):
@@ -234,6 +244,9 @@ class Morphologizer(Tagger):
         cdef bint extend = self.cfg["extend"]
         labels = self.labels
         for i, doc in enumerate(docs):
+            doc.activations[self.name] = {}
+            for activation in self.store_activations:
+                doc.activations[self.name][activation] = activations[activation][i]
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, "get"):
                 doc_tag_ids = doc_tag_ids.get()
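Based on the morphologizer test added further down, reading the stored activations would look roughly like this (a sketch assuming an already initialized pipeline `nlp`; the activation names "probs" and "guesses" are taken from that test):

# Toggle activation storage on an existing, initialized pipeline.
morphologizer = nlp.get_pipe("morphologizer")
morphologizer.store_activations = True  # or a subset, e.g. ["probs"]
doc = nlp("This is a test.")
probs = doc.activations["morphologizer"]["probs"]      # per-token scores over the morph labels
guesses = doc.activations["morphologizer"]["guesses"]  # per-token predicted label indices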

spacy/pipeline/spancat.py

@@ -1,4 +1,5 @@
 from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
+from typing import Union
 from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
 from thinc.api import Optimizer
 from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
@@ -106,6 +107,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
         "model": DEFAULT_SPANCAT_MODEL,
         "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
         "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
+        "store_activations": False,
     },
     default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
 )
@@ -118,6 +120,7 @@ def make_spancat(
     scorer: Optional[Callable],
     threshold: float,
     max_positive: Optional[int],
+    store_activations: Union[bool, List[str]],
 ) -> "SpanCategorizer":
     """Create a SpanCategorizer component. The span categorizer consists of two
     parts: a suggester function that proposes candidate spans, and a labeller
@@ -148,6 +151,7 @@ def make_spancat(
         max_positive=max_positive,
         name=name,
         scorer=scorer,
+        store_activations=store_activations,
     )


@@ -186,6 +190,7 @@ class SpanCategorizer(TrainablePipe):
         threshold: float = 0.5,
         max_positive: Optional[int] = None,
         scorer: Optional[Callable] = spancat_score,
+        store_activations=False,
     ) -> None:
         """Initialize the span categorizer.
         vocab (Vocab): The shared vocabulary.
@@ -218,6 +223,7 @@ class SpanCategorizer(TrainablePipe):
         self.model = model
         self.name = name
         self.scorer = scorer
+        self.store_activations = store_activations

     @property
     def key(self) -> str:
@@ -270,7 +276,7 @@ class SpanCategorizer(TrainablePipe):
         """
         indices = self.suggester(docs, ops=self.model.ops)
         scores = self.model.predict((docs, indices))  # type: ignore
-        return indices, scores
+        return {"indices": indices, "scores": scores}

     def set_candidates(
         self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
@@ -290,7 +296,7 @@ class SpanCategorizer(TrainablePipe):
             for index in candidates.dataXd:
                 doc.spans[candidates_key].append(doc[index[0] : index[1]])

-    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
+    def set_annotations(self, docs: Iterable[Doc], activations) -> None:
         """Modify a batch of Doc objects, using pre-computed scores.

         docs (Iterable[Doc]): The documents to modify.
@@ -299,10 +305,19 @@ class SpanCategorizer(TrainablePipe):
         DOCS: https://spacy.io/api/spancategorizer#set_annotations
         """
         labels = self.labels
-        indices, scores = indices_scores
+
+        indices = activations["indices"]
+        scores = activations["scores"]
         offset = 0
         for i, doc in enumerate(docs):
             indices_i = indices[i].dataXd
+            doc.activations[self.name] = {}
+            if "indices" in self.store_activations:
+                doc.activations[self.name]["indices"] = indices_i
+            if "scores" in self.store_activations:
+                doc.activations[self.name]["scores"] = scores[
+                    offset : offset + indices.lengths[i]
+                ]
             doc.spans[self.key] = self._make_span_group(
                 doc, indices_i, scores[offset : offset + indices.lengths[i]], labels  # type: ignore[arg-type]
             )
@@ -474,3 +489,7 @@ class SpanCategorizer(TrainablePipe):
                     spans.append(Span(doc, start, end, label=labels[j]))

         return spans
+
+    @property
+    def activations(self):
+        return ["indices", "scores"]

spacy/tests/pipeline/test_morphologizer.py

@@ -1,3 +1,4 @@
+from typing import cast
 import pytest
 from numpy.testing import assert_equal

@@ -7,6 +8,7 @@ from spacy.lang.en import English
 from spacy.language import Language
 from spacy.tests.util import make_tempdir
 from spacy.morphology import Morphology
+from spacy.pipeline import TrainablePipe
 from spacy.attrs import MORPH
 from spacy.tokens import Doc
@@ -197,3 +199,29 @@ def test_overfitting_IO():
     gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
     assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags
+
+
+def test_store_activations():
+    # The morphologizer should only store the requested activations on the Doc.
+    nlp = English()
+    morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
+    train_examples = []
+    for inst in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    doc = nlp("This is a test.")
+    assert len(list(doc.activations["morphologizer"].keys())) == 0
+
+    morphologizer.store_activations = True
+    doc = nlp("This is a test.")
+    assert "morphologizer" in doc.activations
+    assert set(doc.activations["morphologizer"].keys()) == {"guesses", "probs"}
+    assert doc.activations["morphologizer"]["probs"].shape == (5, 6)
+    assert doc.activations["morphologizer"]["guesses"].shape == (5,)
+
+    morphologizer.store_activations = ["probs"]
+    doc = nlp("This is a test.")
+    assert "morphologizer" in doc.activations
+    assert set(doc.activations["morphologizer"].keys()) == {"probs"}
+    assert doc.activations["morphologizer"]["probs"].shape == (5, 6)

spacy/tests/pipeline/test_spancat.py

@@ -419,3 +419,29 @@ def test_set_candidates():
     assert len(docs[0].spans["candidates"]) == 9
     assert docs[0].spans["candidates"][0].text == "Just"
     assert docs[0].spans["candidates"][4].text == "Just a"
+
+
+def test_store_activations():
+    # The spancat component should only store the requested activations on the Doc.
+    nlp = English()
+    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    train_examples = make_examples(nlp)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    nO = spancat.model.get_dim("nO")
+    assert nO == 2
+    assert set(spancat.labels) == {"LOC", "PERSON"}
+
+    doc = nlp("This is a test.")
+    assert len(list(doc.activations["spancat"].keys())) == 0
+
+    spancat.store_activations = True
+    doc = nlp("This is a test.")
+    assert set(doc.activations["spancat"].keys()) == {"indices", "scores"}
+    assert doc.activations["spancat"]["indices"].shape == (12, 2)
+    assert doc.activations["spancat"]["scores"].shape == (12, nO)
+
+    spancat.store_activations = True
+    spancat.store_activations = ["scores"]
+    doc = nlp("This is a test.")
+    assert set(doc.activations["spancat"].keys()) == {"scores"}
+    assert doc.activations["spancat"]["scores"].shape == (12, nO)