Support store_activations in spancat and morphologizer

This commit is contained in:
Daniël de Kok 2022-06-22 16:29:52 +02:00
parent 789a44755e
commit acf47e8e48
4 changed files with 92 additions and 6 deletions

View File

@ -1,5 +1,5 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from typing import Optional, Union, Dict, Callable from typing import Callable, Dict, List, Optional, Union
import srsly import srsly
from thinc.api import SequenceCategoricalCrossentropy, Model, Config from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from itertools import islice from itertools import islice
@ -52,7 +52,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory( @Language.factory(
"morphologizer", "morphologizer",
assigns=["token.morph", "token.pos"], assigns=["token.morph", "token.pos"],
default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}}, default_config={
"model": DEFAULT_MORPH_MODEL,
"overwrite": True,
"extend": False,
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
"store_activations": False
},
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
) )
def make_morphologizer( def make_morphologizer(
@ -62,8 +68,10 @@ def make_morphologizer(
overwrite: bool, overwrite: bool,
extend: bool, extend: bool,
scorer: Optional[Callable], scorer: Optional[Callable],
store_activations: Union[bool, List[str]],
): ):
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer) return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
store_activations=store_activations)
def morphologizer_score(examples, **kwargs): def morphologizer_score(examples, **kwargs):
@ -95,6 +103,7 @@ class Morphologizer(Tagger):
overwrite: bool = BACKWARD_OVERWRITE, overwrite: bool = BACKWARD_OVERWRITE,
extend: bool = BACKWARD_EXTEND, extend: bool = BACKWARD_EXTEND,
scorer: Optional[Callable] = morphologizer_score, scorer: Optional[Callable] = morphologizer_score,
store_activations=False,
): ):
"""Initialize a morphologizer. """Initialize a morphologizer.
@ -124,6 +133,7 @@ class Morphologizer(Tagger):
} }
self.cfg = dict(sorted(cfg.items())) self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer self.scorer = scorer
self.store_activations = store_activations
@property @property
def labels(self): def labels(self):
@ -234,6 +244,9 @@ class Morphologizer(Tagger):
cdef bint extend = self.cfg["extend"] cdef bint extend = self.cfg["extend"]
labels = self.labels labels = self.labels
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
doc.activations[self.name] = {}
for activation in self.store_activations:
doc.activations[self.name][activation] = activations[activation][i]
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"): if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get() doc_tag_ids = doc_tag_ids.get()

View File

@ -1,4 +1,5 @@
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
from typing import Union
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
from thinc.api import Optimizer from thinc.api import Optimizer
from thinc.types import Ragged, Ints2d, Floats2d, Ints1d from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
@ -106,6 +107,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
"model": DEFAULT_SPANCAT_MODEL, "model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"}, "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
"store_activations": False,
}, },
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
) )
@ -118,6 +120,7 @@ def make_spancat(
scorer: Optional[Callable], scorer: Optional[Callable],
threshold: float, threshold: float,
max_positive: Optional[int], max_positive: Optional[int],
store_activations: Union[bool, List[str]],
) -> "SpanCategorizer": ) -> "SpanCategorizer":
"""Create a SpanCategorizer component. The span categorizer consists of two """Create a SpanCategorizer component. The span categorizer consists of two
parts: a suggester function that proposes candidate spans, and a labeller parts: a suggester function that proposes candidate spans, and a labeller
@ -148,6 +151,7 @@ def make_spancat(
max_positive=max_positive, max_positive=max_positive,
name=name, name=name,
scorer=scorer, scorer=scorer,
store_activations=store_activations,
) )
@ -186,6 +190,7 @@ class SpanCategorizer(TrainablePipe):
threshold: float = 0.5, threshold: float = 0.5,
max_positive: Optional[int] = None, max_positive: Optional[int] = None,
scorer: Optional[Callable] = spancat_score, scorer: Optional[Callable] = spancat_score,
store_activations=False,
) -> None: ) -> None:
"""Initialize the span categorizer. """Initialize the span categorizer.
vocab (Vocab): The shared vocabulary. vocab (Vocab): The shared vocabulary.
@ -218,6 +223,7 @@ class SpanCategorizer(TrainablePipe):
self.model = model self.model = model
self.name = name self.name = name
self.scorer = scorer self.scorer = scorer
self.store_activations = store_activations
@property @property
def key(self) -> str: def key(self) -> str:
@ -270,7 +276,7 @@ class SpanCategorizer(TrainablePipe):
""" """
indices = self.suggester(docs, ops=self.model.ops) indices = self.suggester(docs, ops=self.model.ops)
scores = self.model.predict((docs, indices)) # type: ignore scores = self.model.predict((docs, indices)) # type: ignore
return indices, scores return {"indices": indices, "scores": scores}
def set_candidates( def set_candidates(
self, docs: Iterable[Doc], *, candidates_key: str = "candidates" self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
@ -290,7 +296,7 @@ class SpanCategorizer(TrainablePipe):
for index in candidates.dataXd: for index in candidates.dataXd:
doc.spans[candidates_key].append(doc[index[0] : index[1]]) doc.spans[candidates_key].append(doc[index[0] : index[1]])
def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None: def set_annotations(self, docs: Iterable[Doc], activations) -> None:
"""Modify a batch of Doc objects, using pre-computed scores. """Modify a batch of Doc objects, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify. docs (Iterable[Doc]): The documents to modify.
@ -299,10 +305,19 @@ class SpanCategorizer(TrainablePipe):
DOCS: https://spacy.io/api/spancategorizer#set_annotations DOCS: https://spacy.io/api/spancategorizer#set_annotations
""" """
labels = self.labels labels = self.labels
indices, scores = indices_scores
indices = activations["indices"]
scores = activations["scores"]
offset = 0 offset = 0
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
indices_i = indices[i].dataXd indices_i = indices[i].dataXd
doc.activations[self.name] = {}
if "indices" in self.store_activations:
doc.activations[self.name]["indices"] = indices_i
if "scores" in self.store_activations:
doc.activations[self.name]["scores"] = scores[
offset : offset + indices.lengths[i]
]
doc.spans[self.key] = self._make_span_group( doc.spans[self.key] = self._make_span_group(
doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type] doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type]
) )
@ -474,3 +489,7 @@ class SpanCategorizer(TrainablePipe):
spans.append(Span(doc, start, end, label=labels[j])) spans.append(Span(doc, start, end, label=labels[j]))
return spans return spans
@property
def activations(self):
return ["indices", "scores"]

View File

@ -1,3 +1,4 @@
from typing import cast
import pytest import pytest
from numpy.testing import assert_equal from numpy.testing import assert_equal
@ -7,6 +8,7 @@ from spacy.lang.en import English
from spacy.language import Language from spacy.language import Language
from spacy.tests.util import make_tempdir from spacy.tests.util import make_tempdir
from spacy.morphology import Morphology from spacy.morphology import Morphology
from spacy.pipeline import TrainablePipe
from spacy.attrs import MORPH from spacy.attrs import MORPH
from spacy.tokens import Doc from spacy.tokens import Doc
@ -197,3 +199,29 @@ def test_overfitting_IO():
gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"] gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
assert [str(t.morph) for t in doc] == gold_morphs assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags assert [t.pos_ for t in doc] == gold_pos_tags
def test_store_activations():
# Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly
nlp = English()
morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
train_examples = []
for inst in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
nlp.initialize(get_examples=lambda: train_examples)
doc = nlp("This is a test.")
assert len(list(doc.activations["morphologizer"].keys())) == 0
morphologizer.store_activations = True
doc = nlp("This is a test.")
assert "morphologizer" in doc.activations
assert set(doc.activations["morphologizer"].keys()) == {"guesses", "probs"}
assert doc.activations["morphologizer"]["probs"].shape == (5, 6)
assert doc.activations["morphologizer"]["guesses"].shape == (5,)
morphologizer.store_activations = ["probs"]
doc = nlp("This is a test.")
assert "morphologizer" in doc.activations
assert set(doc.activations["morphologizer"].keys()) == {"probs"}
assert doc.activations["morphologizer"]["probs"].shape == (5, 6)

View File

@ -419,3 +419,29 @@ def test_set_candidates():
assert len(docs[0].spans["candidates"]) == 9 assert len(docs[0].spans["candidates"]) == 9
assert docs[0].spans["candidates"][0].text == "Just" assert docs[0].spans["candidates"][0].text == "Just"
assert docs[0].spans["candidates"][4].text == "Just a" assert docs[0].spans["candidates"][4].text == "Just a"
def test_store_activations():
# Simple test to try and quickly overfit the spancat component - ensuring the ML models work correctly
nlp = English()
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
train_examples = make_examples(nlp)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
nO = spancat.model.get_dim("nO")
assert nO == 2
assert set(spancat.labels) == {"LOC", "PERSON"}
doc = nlp("This is a test.")
assert len(list(doc.activations["spancat"].keys())) == 0
spancat.store_activations = True
doc = nlp("This is a test.")
assert set(doc.activations["spancat"].keys()) == {"indices", "scores"}
assert doc.activations["spancat"]["indices"].shape == (12, 2)
assert doc.activations["spancat"]["scores"].shape == (12, nO)
spancat.store_activations = True
spancat.store_activations = ["scores"]
doc = nlp("This is a test.")
assert set(doc.activations["spancat"].keys()) == {"scores"}
assert doc.activations["spancat"]["scores"].shape == (12, nO)