spaCy/spacy/pipeline/tagger.pyx

# cython: infer_types=True, profile=True, binding=True
from typing import Callable, Optional
import numpy
import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
from thinc.types import Floats2d
import warnings
from itertools import islice

from ..tokens.doc cimport Doc
from ..morphology cimport Morphology
from ..vocab cimport Vocab

from .trainable_pipe import TrainablePipe
from .pipe import deserialize_config
from ..language import Language
from ..attrs import POS, ID
from ..parts_of_speech import X
from ..errors import Errors, Warnings
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util


default_model_config = """
[model]
@architectures = "spacy.Tagger.v1"

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]


@Language.factory(
    "tagger",
    assigns=["token.tag"],
    default_config={"model": DEFAULT_TAGGER_MODEL, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
    default_score_weights={"tag_acc": 1.0},
)
def make_tagger(nlp: Language, name: str, model: Model, scorer: Optional[Callable]):
    """Construct a part-of-speech tagger component.

    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
        the tag probabilities. The output vectors should match the number of tags
        in size, and be normalized as probabilities (all scores between 0 and 1,
        with the rows summing to 1).
    """
    return Tagger(nlp.vocab, model, name, scorer=scorer)


def tagger_score(examples, **kwargs):
    return Scorer.score_token_attr(examples, "tag", **kwargs)


@registry.scorers("spacy.tagger_scorer.v1")
def make_tagger_scorer():
    return tagger_score


class Tagger(TrainablePipe):
    """Pipeline component for part-of-speech tagging.

    DOCS: https://spacy.io/api/tagger
    """
    def __init__(self, vocab, model, name="tagger", *, scorer=tagger_score):
        """Initialize a part-of-speech tagger.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_token_attr for the attribute "tag".

        DOCS: https://spacy.io/api/tagger#init
        """
        self.vocab = vocab
        self.model = model
        self.name = name
        self._rehearsal_model = None
        cfg = {"labels": []}
        self.cfg = dict(sorted(cfg.items()))
        self.scorer = scorer

    @property
    def labels(self):
        """The labels currently added to the component. Note that even for a
        blank component, this will always include the built-in coarse-grained
        part-of-speech tags by default.

        RETURNS (Tuple[str]): The labels.

        DOCS: https://spacy.io/api/tagger#labels
        """
        return tuple(self.cfg["labels"])

    @property
    def label_data(self):
        """Data about the labels currently added to the component."""
        return tuple(self.cfg["labels"])

    def predict(self, docs):
        """Apply the pipeline's model to a batch of docs, without modifying them.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS: The models prediction for each document.

        DOCS: https://spacy.io/api/tagger#predict
        """
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            n_labels = len(self.labels)
            guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
            assert len(guesses) == len(docs)
            return guesses
        scores = self.model.predict(docs)
        assert len(scores) == len(docs), (len(scores), len(docs))
        guesses = self._scores2guesses(scores)
        assert len(guesses) == len(docs)
        return guesses

    def _scores2guesses(self, scores):
        guesses = []
        for doc_scores in scores:
            doc_guesses = doc_scores.argmax(axis=1)
            if not isinstance(doc_guesses, numpy.ndarray):
                doc_guesses = doc_guesses.get()
            guesses.append(doc_guesses)
        return guesses

    def set_annotations(self, docs, batch_tag_ids):
        """Modify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        batch_tag_ids: The IDs to set, produced by Tagger.predict.

        DOCS: https://spacy.io/api/tagger#set_annotations
        """
        if isinstance(docs, Doc):
            docs = [docs]
        cdef Doc doc
        cdef Vocab vocab = self.vocab
        for i, doc in enumerate(docs):
            doc_tag_ids = batch_tag_ids[i]
            if hasattr(doc_tag_ids, "get"):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
                # Don't clobber preset POS tags
                if doc.c[j].tag == 0:
                    doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]

    def update(self, examples, *, drop=0., sgd=None, losses=None):
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.

        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.

        DOCS: https://spacy.io/api/tagger#update
        """
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        validate_examples(examples, "Tagger.update")
        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
            # Handle cases where there are no tokens in any docs.
            return losses
        set_dropout_rate(self.model, drop)
        tag_scores, bp_tag_scores = self.model.begin_update([eg.predicted for eg in examples])
        for sc in tag_scores:
            if self.model.ops.xp.isnan(sc.sum()):
                raise ValueError(Errors.E940)
        loss, d_tag_scores = self.get_loss(examples, tag_scores)
        bp_tag_scores(d_tag_scores)
        if sgd not in (None, False):
            self.finish_update(sgd)

        losses[self.name] += loss
        return losses

    def rehearse(self, examples, *, drop=0., sgd=None, losses=None):
        """Perform a "rehearsal" update from a batch of data. Rehearsal updates
        teach the current model to make predictions similar to an initial model,
        to try to address the "catastrophic forgetting" problem. This feature is
        experimental.

        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.

        DOCS: https://spacy.io/api/tagger#rehearse
        """
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        validate_examples(examples, "Tagger.rehearse")
        docs = [eg.predicted for eg in examples]
        if self._rehearsal_model is None:
            return losses
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            return losses
        set_dropout_rate(self.model, drop)
        guesses, backprop = self.model.begin_update(docs)
        target = self._rehearsal_model(examples)
        gradient = guesses - target
        backprop(gradient)
        self.finish_update(sgd)
        losses[self.name] += (gradient**2).sum()
        return losses

    def get_loss(self, examples, scores):
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.

        examples (Iterable[Examples]): The batch of examples.
        scores: Scores representing the model's predictions.
        RETURNS (Tuple[float, float]): The loss and the gradient.

        DOCS: https://spacy.io/api/tagger#get_loss
        """
        validate_examples(examples, "Tagger.get_loss")
        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix="!")
        # Convert empty tag "" to missing value None so that both misaligned
        # tokens and tokens with missing annotation have the default missing
        # value None.
        truths = []
        for eg in examples:
            eg_truths = [tag if tag is not "" else None for tag in eg.get_aligned("TAG", as_string=True)]
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
            raise ValueError(Errors.E910.format(name=self.name))
        return float(loss), d_scores

    def initialize(self, get_examples, *, nlp=None, labels=None):
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects..
        nlp (Language): The current nlp object the component is part of.
        labels: The labels to add to the component, typically generated by the
            `init labels` command. If no labels are provided, the get_examples
            callback is used to extract the labels from the data.

        DOCS: https://spacy.io/api/tagger#initialize
        """
        validate_get_examples(get_examples, "Tagger.initialize")
        util.check_lexeme_norms(self.vocab, "tagger")
        if labels is not None:
            for tag in labels:
                self.add_label(tag)
        else:
            tags = set()
            for example in get_examples():
                for token in example.y:
                    if token.tag_:
                        tags.add(token.tag_)
            for tag in sorted(tags):
                self.add_label(tag)
        doc_sample = []
        label_sample = []
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)
            gold_tags = example.get_aligned("TAG", as_string=True)
            gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags]
            label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
        self._require_labels()
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=label_sample)

    def add_label(self, label):
        """Add a new label to the pipe.

        label (str): The label to add.
        RETURNS (int): 0 if label is already present, otherwise 1.

        DOCS: https://spacy.io/api/tagger#add_label
        """
        if not isinstance(label, str):
            raise ValueError(Errors.E187)
        if label in self.labels:
            return 0
        self._allow_extra_label()
        self.cfg["labels"].append(label)
        self.vocab.strings.add(label)
        return 1