spaCy/spacy/pipeline/textcat.py
Daniël de Kok efdbb722c5
Store activations in Docs when save_activations is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled

This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.

As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.

* Change type of `store_activations` to `Union[bool, List[str]]`

When the value is:

- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored

* Formatting fixes in Tagger

* Support store_activations in spancat and morphologizer

* Make Doc.activations type visible to MyPy

* textcat/textcat_multilabel: add store_activations option

* trainable_lemmatizer/entity_linker: add store_activations option

* parser/ner: do not currently support returning activations

* Extend tagger and senter tests

So that they, like the other tests, also check that we get no
activations if no activations were requested.

* Document `Doc.activations` and `store_activations` in the relevant pipes

* Start errors/warnings at higher numbers to avoid merge conflicts

Between the master and v4 branches.

* Add `store_activations` to docstrings.

* Replace store_activations setter by set_store_activations method

Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.

* Use dict comprehension suggested by @svlandeg

* Revert "Use dict comprehension suggested by @svlandeg"

This reverts commit 6e7b958f70.

* EntityLinker: add type annotations to _add_activations

* _store_activations: make kwarg-only, remove doc_scores_lens arg

* set_annotations: add type annotations

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* TextCat.predict: return dict

* Make the `TrainablePipe.store_activations` property a bool

This means that we can also bring back `store_activations` setter.

* Remove `TrainablePipe.activations`

We do not need to enumerate the activations anymore since `store_activations` is
`bool`.

* Add type annotations for activations in predict/set_annotations

* Rename `TrainablePipe.store_activations` to `save_activations`

* Error E1400 is not used anymore

This error was used when activations were still `Union[bool, List[str]]`.

* Change wording in API docs after store -> save change

* docs: tag (save_)activations as new in spaCy 4.0

* Fix copied line in morphologizer activations test

* Don't train in any test_save_activations test

* Rename activations

- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
  "guesses" -> "tree_ids".

* Remove unused W400 warning.

This warning was used when we still allowed the user to specify
which activations to save.

* Formatting fixes

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Replace "kb_ids" by a constant

* spancat: replace a cast by an assertion

* Fix EOF spacing

* Fix comments in test_save_activations tests

* Do not set RNG seed in activation saving tests

* Revert "spancat: replace a cast by an assertion"

This reverts commit 0bd5730d16.

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 09:51:12 +02:00

420 lines
15 KiB
Python

from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
from thinc.types import Floats2d
import numpy
from itertools import islice
from .trainable_pipe import TrainablePipe
from ..language import Language
from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors
from ..scorer import Scorer
from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab
ActivationsT = Dict[str, Floats2d]
single_label_default_config = """
[model]
@architectures = "spacy.TextCatEnsemble.v2"
[model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 2
[model.linear_model]
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
"""
DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["model"]
single_label_bow_config = """
[model]
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
"""
single_label_cnn_config = """
[model]
@architectures = "spacy.TextCatCNN.v2"
exclusive_classes = true
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
@Language.factory(
"textcat",
assigns=["doc.cats"],
default_config={
"threshold": 0.5,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
"save_activations": False,
},
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
"cats_micro_p": None,
"cats_micro_r": None,
"cats_micro_f": None,
"cats_macro_p": None,
"cats_macro_r": None,
"cats_macro_f": None,
"cats_macro_auc": None,
"cats_f_per_type": None,
"cats_macro_auc_per_type": None,
},
)
def make_textcat(
nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
save_activations: bool,
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
to be mutually exclusive (i.e. one true label per doc).
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
save_activations (bool): save model activations in Doc when annotating.
"""
return TextCategorizer(
nlp.vocab,
model,
name,
threshold=threshold,
scorer=scorer,
save_activations=save_activations,
)
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=False,
**kwargs,
)
@registry.scorers("spacy.textcat_scorer.v1")
def make_textcat_scorer():
return textcat_score
class TextCategorizer(TrainablePipe):
"""Pipeline component for single-label text classification.
DOCS: https://spacy.io/api/textcategorizer
"""
def __init__(
self,
vocab: Vocab,
model: Model,
name: str = "textcat",
*,
threshold: float,
scorer: Optional[Callable] = textcat_score,
save_activations: bool = False,
) -> None:
"""Initialize a text categorizer for single-label classification.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_cats for the attribute "cats".
DOCS: https://spacy.io/api/textcategorizer#init
"""
self.vocab = vocab
self.model = model
self.name = name
self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold, "positive_label": None}
self.cfg = dict(cfg)
self.scorer = scorer
self.save_activations = save_activations
@property
def support_missing_values(self):
# There are no missing values as the textcat should always
# predict exactly one label. All other labels are 0.0
# Subclasses may override this property to change internal behaviour.
return False
@property
def labels(self) -> Tuple[str]:
"""RETURNS (Tuple[str]): The labels currently added to the component.
DOCS: https://spacy.io/api/textcategorizer#labels
"""
return tuple(self.cfg["labels"]) # type: ignore[arg-type, return-value]
@property
def label_data(self) -> List[str]:
"""RETURNS (List[str]): Information about the component's labels.
DOCS: https://spacy.io/api/textcategorizer#label_data
"""
return self.labels # type: ignore[return-value]
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them.
docs (Iterable[Doc]): The documents to predict.
RETURNS: The models prediction for each document.
DOCS: https://spacy.io/api/textcategorizer#predict
"""
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
tensors = [doc.tensor for doc in docs]
xp = self.model.ops.xp
scores = xp.zeros((len(list(docs)), len(self.labels)))
return {"probabilities": scores}
scores = self.model.predict(docs)
scores = self.model.ops.asarray(scores)
return {"probabilities": scores}
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
"""Modify a batch of Doc objects, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
scores: The scores to set, produced by TextCategorizer.predict.
DOCS: https://spacy.io/api/textcategorizer#set_annotations
"""
probs = activations["probabilities"]
for i, doc in enumerate(docs):
if self.save_activations:
doc.activations[self.name] = {}
doc.activations[self.name]["probabilities"] = probs[i]
for j, label in enumerate(self.labels):
doc.cats[label] = float(probs[i, j])
def update(
self,
examples: Iterable[Example],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.
DOCS: https://spacy.io/api/textcategorizer#update
"""
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
validate_examples(examples, "TextCategorizer.update")
self._validate_categories(examples)
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
# Handle cases where there are no tokens in any docs.
return losses
set_dropout_rate(self.model, drop)
scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
loss, d_scores = self.get_loss(examples, scores)
bp_scores(d_scores)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
def rehearse(
self,
examples: Iterable[Example],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
teach the current model to make predictions similar to an initial model,
to try to address the "catastrophic forgetting" problem. This feature is
experimental.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.
DOCS: https://spacy.io/api/textcategorizer#rehearse
"""
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
if self._rehearsal_model is None:
return losses
validate_examples(examples, "TextCategorizer.rehearse")
self._validate_categories(examples)
docs = [eg.predicted for eg in examples]
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return losses
set_dropout_rate(self.model, drop)
scores, bp_scores = self.model.begin_update(docs)
target, _ = self._rehearsal_model.begin_update(docs)
gradient = scores - target
bp_scores(gradient)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += (gradient**2).sum()
return losses
def _examples_to_truth(
self, examples: Iterable[Example]
) -> Tuple[numpy.ndarray, numpy.ndarray]:
nr_examples = len(list(examples))
truths = numpy.zeros((nr_examples, len(self.labels)), dtype="f")
not_missing = numpy.ones((nr_examples, len(self.labels)), dtype="f")
for i, eg in enumerate(examples):
for j, label in enumerate(self.labels):
if label in eg.reference.cats:
truths[i, j] = eg.reference.cats[label]
elif self.support_missing_values:
not_missing[i, j] = 0.0
truths = self.model.ops.asarray(truths) # type: ignore
return truths, not_missing # type: ignore
def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
"""Find the loss and gradient of loss for the batch of documents and
their predicted scores.
examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://spacy.io/api/textcategorizer#get_loss
"""
validate_examples(examples, "TextCategorizer.get_loss")
self._validate_categories(examples)
truths, not_missing = self._examples_to_truth(examples)
not_missing = self.model.ops.asarray(not_missing) # type: ignore
d_scores = scores - truths
d_scores *= not_missing
mean_square_error = (d_scores**2).mean()
return float(mean_square_error), d_scores
def add_label(self, label: str) -> int:
"""Add a new label to the pipe.
label (str): The label to add.
RETURNS (int): 0 if label is already present, otherwise 1.
DOCS: https://spacy.io/api/textcategorizer#add_label
"""
if not isinstance(label, str):
raise ValueError(Errors.E187)
if label in self.labels:
return 0
self._allow_extra_label()
self.cfg["labels"].append(label) # type: ignore[attr-defined]
if self.model and "resize_output" in self.model.attrs:
self.model = self.model.attrs["resize_output"](self.model, len(self.labels))
self.vocab.strings.add(label)
return 1
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
nlp: Optional[Language] = None,
labels: Optional[Iterable[str]] = None,
positive_label: Optional[str] = None,
) -> None:
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
labels (Optional[Iterable[str]]): The labels to add to the component, typically generated by the
`init labels` command. If no labels are provided, the get_examples
callback is used to extract the labels from the data.
positive_label (Optional[str]): The positive label for a binary task with exclusive classes,
`None` otherwise and by default.
DOCS: https://spacy.io/api/textcategorizer#initialize
"""
validate_get_examples(get_examples, "TextCategorizer.initialize")
self._validate_categories(get_examples())
if labels is None:
for example in get_examples():
for cat in example.y.cats:
self.add_label(cat)
else:
for label in labels:
self.add_label(label)
if len(self.labels) < 2:
raise ValueError(Errors.E867)
if positive_label is not None:
if positive_label not in self.labels:
err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
raise ValueError(err)
if len(self.labels) != 2:
err = Errors.E919.format(pos_label=positive_label, labels=self.labels)
raise ValueError(err)
self.cfg["positive_label"] = positive_label
subbatch = list(islice(get_examples(), 10))
doc_sample = [eg.reference for eg in subbatch]
label_sample, _ = self._examples_to_truth(subbatch)
self._require_labels()
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
def _validate_categories(self, examples: Iterable[Example]):
"""Check whether the provided examples all have single-label cats annotations."""
for ex in examples:
if list(ex.reference.cats.values()).count(1.0) > 1:
raise ValueError(Errors.E895.format(value=ex.reference.cats))