spaCy/spacy/pipeline/tagger.pyx
2023-07-19 16:37:31 +02:00

368 lines
14 KiB
Cython

# cython: infer_types=True, profile=True, binding=True
from itertools import islice
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
import numpy
from thinc.api import Config, Model, set_dropout_rate
from thinc.legacy import LegacySequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints1d
from ..tokens.doc cimport Doc
from .. import util
from ..errors import Errors
from ..language import Language
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
from .trainable_pipe import TrainablePipe
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
default_model_config = """
[model]
@architectures = "spacy.Tagger.v2"
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"tagger",
assigns=["token.tag"],
default_config={
"model": DEFAULT_TAGGER_MODEL,
"overwrite": False,
"scorer": {"@scorers": "spacy.tagger_scorer.v1"},
"neg_prefix": "!",
"label_smoothing": 0.0,
"save_activations": False,
},
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(
nlp: Language,
name: str,
model: Model,
overwrite: bool,
scorer: Optional[Callable],
neg_prefix: str,
label_smoothing: float,
save_activations: bool,
):
"""Construct a part-of-speech tagger component.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
the tag probabilities. The output vectors should match the number of tags
in size, and be normalized as probabilities (all scores between 0 and 1,
with the rows summing to 1).
"""
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
label_smoothing=label_smoothing, save_activations=save_activations)
def tagger_score(examples, **kwargs):
return Scorer.score_token_attr(examples, "tag", **kwargs)
@registry.scorers("spacy.tagger_scorer.v1")
def make_tagger_scorer():
return tagger_score
class Tagger(TrainablePipe):
"""Pipeline component for part-of-speech tagging.
DOCS: https://spacy.io/api/tagger
"""
def __init__(
self,
vocab,
model,
name="tagger",
*,
overwrite=False,
scorer=tagger_score,
neg_prefix="!",
label_smoothing=0.0,
save_activations: bool = False,
):
"""Initialize a part-of-speech tagger.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
overwrite (bool): Whether to overwrite existing annotations.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "tag".
save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/tagger#init
"""
self.vocab = vocab
self.model = model
self.name = name
self._rehearsal_model = None
cfg = {
"labels": [],
"overwrite": overwrite,
"neg_prefix": neg_prefix,
"label_smoothing": label_smoothing
}
self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
self.save_activations = save_activations
@property
def labels(self):
"""The labels currently added to the component. Note that even for a
blank component, this will always include the built-in coarse-grained
part-of-speech tags by default.
RETURNS (Tuple[str]): The labels.
DOCS: https://spacy.io/api/tagger#labels
"""
return tuple(self.cfg["labels"])
@property
def label_data(self):
"""Data about the labels currently added to the component."""
return tuple(self.cfg["labels"])
def predict(self, docs) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them.
docs (Iterable[Doc]): The documents to predict.
RETURNS: The models prediction for each document.
DOCS: https://spacy.io/api/tagger#predict
"""
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
n_labels = len(self.labels)
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
assert len(guesses) == len(docs)
return {"probabilities": guesses, "label_ids": guesses}
scores = self.model.predict(docs)
assert len(scores) == len(docs), (len(scores), len(docs))
guesses = self._scores2guesses(scores)
assert len(guesses) == len(docs)
return {"probabilities": scores, "label_ids": guesses}
def _scores2guesses(self, scores):
guesses = []
for doc_scores in scores:
doc_guesses = doc_scores.argmax(axis=1)
if not isinstance(doc_guesses, numpy.ndarray):
doc_guesses = doc_guesses.get()
guesses.append(doc_guesses)
return guesses
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
DOCS: https://spacy.io/api/tagger#set_annotations
"""
batch_tag_ids = activations["label_ids"]
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"]
labels = self.labels
for i, doc in enumerate(docs):
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
doc.activations[self.name][act_name] = acts[i]
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
if doc.c[j].tag == 0 or overwrite:
doc.c[j].tag = self.vocab.strings[labels[tag_id]]
def update(self, examples, *, drop=0., sgd=None, losses=None):
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.
DOCS: https://spacy.io/api/tagger#update
"""
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
validate_examples(examples, "Tagger.update")
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
# Handle cases where there are no tokens in any docs.
return losses
set_dropout_rate(self.model, drop)
tag_scores, bp_tag_scores = self.model.begin_update([eg.predicted for eg in examples])
for sc in tag_scores:
if self.model.ops.xp.isnan(sc.sum()):
raise ValueError(Errors.E940)
loss, d_tag_scores = self.get_loss(examples, tag_scores)
bp_tag_scores(d_tag_scores)
if sgd not in (None, False):
self.finish_update(sgd)
losses[self.name] += loss
return losses
def rehearse(self, examples, *, drop=0., sgd=None, losses=None):
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
teach the current model to make predictions similar to an initial model,
to try to address the "catastrophic forgetting" problem. This feature is
experimental.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.
DOCS: https://spacy.io/api/tagger#rehearse
"""
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
validate_examples(examples, "Tagger.rehearse")
docs = [eg.predicted for eg in examples]
if self._rehearsal_model is None:
return losses
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return losses
set_dropout_rate(self.model, drop)
tag_scores, bp_tag_scores = self.model.begin_update(docs)
tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
bp_tag_scores(grads)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
def get_teacher_student_loss(
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
) -> Tuple[float, List[Floats2d]]:
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
teacher_scores: Scores representing the teacher model's predictions.
student_scores: Scores representing the student model's predictions.
RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
"""
loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
d_scores, loss = loss_func(student_scores, teacher_scores)
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and
their predicted scores.
examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://spacy.io/api/tagger#get_loss
"""
validate_examples(examples, "Tagger.get_loss")
loss_func = LegacySequenceCategoricalCrossentropy(
names=self.labels,
normalize=False,
neg_prefix=self.cfg["neg_prefix"],
label_smoothing=self.cfg["label_smoothing"]
)
# Convert empty tag "" to missing value None so that both misaligned
# tokens and tokens with missing annotation have the default missing
# value None.
truths = []
for eg in examples:
eg_truths = [tag if tag is not "" else None for tag in eg.get_aligned("TAG", as_string=True)]
truths.append(eg_truths)
d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
def initialize(self, get_examples, *, nlp=None, labels=None):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects..
nlp (Language): The current nlp object the component is part of.
labels: The labels to add to the component, typically generated by the
`init labels` command. If no labels are provided, the get_examples
callback is used to extract the labels from the data.
DOCS: https://spacy.io/api/tagger#initialize
"""
validate_get_examples(get_examples, "Tagger.initialize")
util.check_lexeme_norms(self.vocab, "tagger")
if labels is not None:
for tag in labels:
self.add_label(tag)
else:
tags = set()
for example in get_examples():
for token in example.y:
if token.tag_:
tags.add(token.tag_)
for tag in sorted(tags):
self.add_label(tag)
doc_sample = []
label_sample = []
for example in islice(get_examples(), 10):
doc_sample.append(example.x)
gold_tags = example.get_aligned("TAG", as_string=True)
gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags]
label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
self._require_labels()
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
def add_label(self, label):
"""Add a new label to the pipe.
label (str): The label to add.
RETURNS (int): 0 if label is already present, otherwise 1.
DOCS: https://spacy.io/api/tagger#add_label
"""
if not isinstance(label, str):
raise ValueError(Errors.E187)
if label in self.labels:
return 0
self._allow_extra_label()
self.cfg["labels"].append(label)
self.vocab.strings.add(label)
return 1