Tidy up pipes (#5906)

* Tidy up pipes

* Fix init, defaults and raise custom errors

* Update docs

* Update docs [ci skip]

* Apply suggestions from code review

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Tidy up error handling and validation, fix consistency

* Simplify get_examples check

* Remove unused import [ci skip]

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Ines Montani 2020-08-11 23:29:31 +02:00, committed by GitHub
parent b7ec06e331
commit 950832f087
33 changed files with 354 additions and 209 deletions
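Taken together, the diff below makes two user-facing changes: `begin_training` no longer defaults `get_examples` to `lambda: []` and now rejects non-callables with the new E930 error, and every batch passed to `update`/`rehearse`/`get_loss`/`score` is checked up front by a shared `validate_examples` helper that raises E978. A minimal sketch of the new contract, assuming a blank English pipeline (it mirrors the updated regression tests further down):

    import spacy

    nlp = spacy.blank("en")
    tagger = nlp.add_pipe("tagger")
    tagger.add_label("A")

    # get_examples must now be a callable that returns an iterable of
    # Example objects; the old pattern of passing a bare list raises E930.
    tagger.begin_training(lambda: [])

    # Batches are validated before any model code runs: anything that is
    # not an iterable of Example objects raises TypeError with E978.
    tagger.update(["not an Example"])  # TypeError (E978)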

View File

@@ -295,7 +295,11 @@ def train_while_improving(
             nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude)
         # TODO: refactor this so we don't have to run it separately in here
         for name, proc in nlp.pipeline:
-            if name not in exclude and hasattr(proc, "model"):
+            if (
+                name not in exclude
+                and hasattr(proc, "model")
+                and proc.model not in (True, False, None)
+            ):
                 proc.model.finish_update(optimizer)
         optimizer.step_schedules()
         if not (step % eval_frequency):

View File

@@ -482,6 +482,15 @@ class Errors:
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")

     # TODO: fix numbering after merging develop into master
+    E930 = ("Received invalid get_examples callback in {name}.begin_training. "
+            "Expected function that returns an iterable of Example objects but "
+            "got: {obj}")
+    E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
+            "'{name}'. If the component is trainable and you want to use this "
+            "method, make sure it's overwritten on the subclass. If your "
+            "component isn't trainable, add a method that does nothing or "
+            "don't use the Pipe base class.")
+    E940 = ("Found NaN values in scores.")
     E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
             "model from a shortcut, which is deprecated as of spaCy v3.0. To "
             "load the model, use its full name instead:\n\n"

@@ -578,8 +587,7 @@ class Errors:
             "but received None.")
     E977 = ("Can not compare a MorphAnalysis with a string object. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
-    E978 = ("The '{method}' method of {name} takes a list of Example objects, "
-            "but found {types} instead.")
+    E978 = ("The {name} method takes a list of Example objects, but got: {types}")
     E979 = ("Cannot convert {type} to an Example object.")
     E980 = ("Each link annotation should refer to a dictionary with at most one "
             "identifier mapping to 1.0, and all others to 0.0.")

View File

@@ -1,5 +1,5 @@
 from .corpus import Corpus  # noqa: F401
-from .example import Example  # noqa: F401
+from .example import Example, validate_examples  # noqa: F401
 from .align import Alignment  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
 from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags  # noqa: F401

View File

@@ -1,5 +1,5 @@
+from collections import Iterable as IterableInstance
 import warnings
-
 import numpy

 from ..tokens.doc cimport Doc

@@ -26,6 +26,22 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
     return output


+def validate_examples(examples, method):
+    """Check that a batch of examples received during processing is valid.
+    This function lives here to prevent circular imports.
+
+    examples (Iterable[Examples]): A batch of examples.
+    method (str): The method name to show in error messages.
+    """
+    if not isinstance(examples, IterableInstance):
+        err = Errors.E978.format(name=method, types=type(examples))
+        raise TypeError(err)
+    wrong = set([type(eg) for eg in examples if not isinstance(eg, Example)])
+    if wrong:
+        err = Errors.E978.format(name=method, types=wrong)
+        raise TypeError(err)
+
+
 cdef class Example:
     def __init__(self, Doc predicted, Doc reference, *, alignment=None):
         if predicted is None:

@@ -263,12 +279,10 @@ def _annot2array(vocab, tok_annot, doc_annot):
                 values.append([vocab.morphology.add(v) for v in value])
             else:
                 attrs.append(key)
-                try:
-                    values.append([vocab.strings.add(v) for v in value])
-                except TypeError:
-                    types= set([type(v) for v in value])
+                if not all(isinstance(v, str) for v in value):
+                    types = set([type(v) for v in value])
                     raise TypeError(Errors.E969.format(field=key, types=types)) from None
+                values.append([vocab.strings.add(v) for v in value])
     array = numpy.asarray(values, dtype="uint64")
     return attrs, array.T
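For orientation, a short sketch of how the new helper behaves when called directly; the name "MyComponent.update" is just an illustrative label for the error message:

    from spacy.gold import Example, validate_examples
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    words = ["Some", "words"]
    examples = [Example(Doc(vocab, words=words), Doc(vocab, words=words))]

    # A well-formed batch of Example objects passes silently.
    validate_examples(examples, "MyComponent.update")

    # Each of the following raises TypeError with the E978 message:
    validate_examples(42, "MyComponent.update")        # not an iterable
    validate_examples(["text"], "MyComponent.update")  # iterable, but no Examples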

View File

@@ -5,7 +5,6 @@ import random
 import itertools
 import weakref
 import functools
-from collections import Iterable as IterableInstance
 from contextlib import contextmanager
 from copy import copy, deepcopy
 from pathlib import Path

@@ -19,7 +18,7 @@ from timeit import default_timer as timer
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .gold import Example
+from .gold import Example, validate_examples
 from .scorer import Scorer
 from .util import create_default_optimizer, registry
 from .util import SimpleFrozenDict, combine_score_weights

@@ -935,17 +934,7 @@ class Language:
             losses = {}
         if len(examples) == 0:
             return losses
-        if not isinstance(examples, IterableInstance):
-            raise TypeError(
-                Errors.E978.format(
-                    name="language", method="update", types=type(examples)
-                )
-            )
-        wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])
-        if wrong_types:
-            raise TypeError(
-                Errors.E978.format(name="language", method="update", types=wrong_types)
-            )
+        validate_examples(examples, "Language.update")
         if sgd is None:
             if self._optimizer is None:
                 self._optimizer = create_default_optimizer()

@@ -962,7 +951,11 @@ class Language:
                 proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
         if sgd not in (None, False):
             for name, proc in self.pipeline:
-                if name not in exclude and hasattr(proc, "model"):
+                if (
+                    name not in exclude
+                    and hasattr(proc, "model")
+                    and proc.model not in (True, False, None)
+                ):
                     proc.model.finish_update(sgd)
         return losses

@@ -999,19 +992,7 @@ class Language:
         """
         if len(examples) == 0:
             return
-        if not isinstance(examples, IterableInstance):
-            raise TypeError(
-                Errors.E978.format(
-                    name="language", method="rehearse", types=type(examples)
-                )
-            )
-        wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])
-        if wrong_types:
-            raise TypeError(
-                Errors.E978.format(
-                    name="language", method="rehearse", types=wrong_types
-                )
-            )
+        validate_examples(examples, "Language.rehearse")
         if sgd is None:
             if self._optimizer is None:
                 self._optimizer = create_default_optimizer()

@@ -1060,7 +1041,15 @@ class Language:
         if get_examples is None:
             get_examples = lambda: []
         else:  # Populate vocab
+            if not hasattr(get_examples, "__call__"):
+                err = Errors.E930.format(name="Language", obj=type(get_examples))
+                raise ValueError(err)
             for example in get_examples():
+                if not isinstance(example, Example):
+                    err = Errors.E978.format(
+                        name="Language.begin_training", types=type(example)
+                    )
+                    raise ValueError(err)
                 for word in [t.text for t in example.reference]:
                     _ = self.vocab[word]  # noqa: F841
         if device >= 0:  # TODO: do we need this here?

@@ -1133,17 +1122,7 @@ class Language:

         DOCS: https://spacy.io/api/language#evaluate
         """
-        if not isinstance(examples, IterableInstance):
-            err = Errors.E978.format(
-                name="language", method="evaluate", types=type(examples)
-            )
-            raise TypeError(err)
-        wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])
-        if wrong_types:
-            err = Errors.E978.format(
-                name="language", method="evaluate", types=wrong_types
-            )
-            raise TypeError(err)
+        validate_examples(examples, "Language.evaluate")
         if component_cfg is None:
             component_cfg = {}
         if scorer_cfg is None:

@@ -1663,7 +1642,7 @@ def _fix_pretrained_vectors_name(nlp: Language) -> None:
     else:
         raise ValueError(Errors.E092)
     for name, proc in nlp.pipeline:
-        if not hasattr(proc, "cfg"):
+        if not hasattr(proc, "cfg") or not isinstance(proc.cfg, dict):
             continue
         proc.cfg.setdefault("deprecation_fixes", {})
         proc.cfg["deprecation_fixes"]["vectors_name"] = nlp.vocab.vectors.name

View File

@@ -9,6 +9,7 @@ from .functions import merge_subtokens
 from ..language import Language
 from ._parser_internals import nonproj
 from ..scorer import Scorer
+from ..gold import validate_examples


 default_model_config = """

@@ -147,6 +148,7 @@ cdef class DependencyParser(Parser):

         DOCS: https://spacy.io/api/dependencyparser#score
         """
+        validate_examples(examples, "DependencyParser.score")
         def dep_getter(token, attr):
             dep = getattr(token, attr)
             dep = token.vocab.strings.as_string(dep).lower()

View File

@@ -11,7 +11,7 @@ from ..tokens import Doc
 from .pipe import Pipe, deserialize_config
 from ..language import Language
 from ..vocab import Vocab
-from ..gold import Example
+from ..gold import Example, validate_examples
 from ..errors import Errors, Warnings
 from .. import util

@@ -142,7 +142,7 @@ class EntityLinker(Pipe):

     def begin_training(
         self,
-        get_examples: Callable[[], Iterable[Example]] = lambda: [],
+        get_examples: Callable[[], Iterable[Example]],
         *,
         pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
         sgd: Optional[Optimizer] = None,

@@ -197,14 +197,9 @@ class EntityLinker(Pipe):
         losses.setdefault(self.name, 0.0)
         if not examples:
             return losses
+        validate_examples(examples, "EntityLinker.update")
         sentence_docs = []
-        try:
-            docs = [eg.predicted for eg in examples]
-        except AttributeError:
-            types = set([type(eg) for eg in examples])
-            raise TypeError(
-                Errors.E978.format(name="EntityLinker", method="update", types=types)
-            ) from None
+        docs = [eg.predicted for eg in examples]
         if set_annotations:
             # This seems simpler than other ways to get that exact output -- but
             # it does run the model twice :(

@@ -250,6 +245,7 @@ class EntityLinker(Pipe):
         return losses

     def get_loss(self, examples: Iterable[Example], sentence_encodings):
+        validate_examples(examples, "EntityLinker.get_loss")
         entity_encodings = []
         for eg in examples:
             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)

View File

@@ -9,6 +9,7 @@ from ..util import ensure_path, to_disk, from_disk
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
 from ..scorer import Scorer
+from ..gold import validate_examples


 DEFAULT_ENT_ID_SEP = "||"

@@ -312,6 +313,7 @@ class EntityRuler:
         return label

     def score(self, examples, **kwargs):
+        validate_examples(examples, "EntityRuler.score")
         return Scorer.score_spans(examples, "ents", **kwargs)

     def from_bytes(

View File

@@ -1,5 +1,4 @@
 from typing import Optional, List, Dict, Any
-
 from thinc.api import Model

 from .pipe import Pipe

@@ -9,6 +8,7 @@ from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
+from ..gold import validate_examples
 from .. import util

@@ -135,10 +135,10 @@ class Lemmatizer(Pipe):
         elif self.mode == "rule":
             self.lemmatize = self.rule_lemmatize
         else:
-            try:
-                self.lemmatize = getattr(self, f"{self.mode}_lemmatize")
-            except AttributeError:
+            mode_attr = f"{self.mode}_lemmatize"
+            if not hasattr(self, mode_attr):
                 raise ValueError(Errors.E1003.format(mode=mode))
+            self.lemmatize = getattr(self, mode_attr)
         self.cache = {}

     @property

@@ -271,6 +271,7 @@ class Lemmatizer(Pipe):

         DOCS: https://spacy.io/api/lemmatizer#score
         """
+        validate_examples(examples, "Lemmatizer.score")
         return Scorer.score_token_attr(examples, "lemma", **kwargs)

     def to_disk(self, path, *, exclude=tuple()):

View File

@@ -6,15 +6,16 @@ from thinc.api import SequenceCategoricalCrossentropy, Model, Config
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 from ..morphology cimport Morphology

 from ..parts_of_speech import IDS as POS_IDS
 from ..symbols import POS
 from ..language import Language
 from ..errors import Errors
 from .pipe import deserialize_config
 from .tagger import Tagger
 from .. import util
 from ..scorer import Scorer
+from ..gold import validate_examples

@@ -126,7 +127,7 @@ class Morphologizer(Tagger):
             self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         return 1

-    def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
+    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using data examples if available.

         get_examples (Callable[[], Iterable[Example]]): Optional function that

@@ -140,6 +141,9 @@ class Morphologizer(Tagger):

         DOCS: https://spacy.io/api/morphologizer#begin_training
         """
+        if not hasattr(get_examples, "__call__"):
+            err = Errors.E930.format(name="Morphologizer", obj=type(get_examples))
+            raise ValueError(err)
         for example in get_examples():
             for i, token in enumerate(example.reference):
                 pos = token.pos_

@@ -192,6 +196,7 @@ class Morphologizer(Tagger):

         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
+        validate_examples(examples, "Morphologizer.get_loss")
         loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
         truths = []
         for eg in examples:

@@ -228,6 +233,7 @@ class Morphologizer(Tagger):

         DOCS: https://spacy.io/api/morphologizer#score
         """
+        validate_examples(examples, "Morphologizer.score")
         results = {}
         results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
         results.update(Scorer.score_token_attr(examples, "morph", **kwargs))

View File

@@ -8,6 +8,7 @@ from ..tokens.doc cimport Doc

 from .pipe import Pipe
 from .tagger import Tagger
+from ..gold import validate_examples
 from ..language import Language
 from ._parser_internals import nonproj
 from ..attrs import POS, ID

@@ -80,10 +81,11 @@ class MultitaskObjective(Tagger):
     def set_annotations(self, docs, dep_ids):
         pass

-    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
-        gold_examples = nonproj.preprocess_training_data(get_examples())
-        # for raw_text, doc_annot in gold_tuples:
-        for example in gold_examples:
+    def begin_training(self, get_examples, pipeline=None, sgd=None):
+        if not hasattr(get_examples, "__call__"):
+            err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
+            raise ValueError(err)
+        for example in get_examples():
             for token in example.y:
                 label = self.make_label(token)
                 if label is not None and label not in self.labels:

@@ -175,7 +177,7 @@ class ClozeMultitask(Pipe):
     def set_annotations(self, docs, dep_ids):
         pass

-    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
+    def begin_training(self, get_examples, pipeline=None, sgd=None):
         self.model.initialize()
         X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
         self.model.output_layer.begin_training(X)

@@ -189,6 +191,7 @@ class ClozeMultitask(Pipe):
         return tokvecs, vectors

     def get_loss(self, examples, vectors, prediction):
+        validate_examples(examples, "ClozeMultitask.get_loss")
         # The simplest way to implement this would be to vstack the
         # token.vector values, but that's a bit inefficient, especially on GPU.
         # Instead we fetch the index into the vectors table for each of our tokens,

@@ -206,18 +209,16 @@ class ClozeMultitask(Pipe):
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
         set_dropout_rate(self.model, drop)
-        try:
-            predictions, bp_predictions = self.model.begin_update([eg.predicted for eg in examples])
-        except AttributeError:
-            types = set([type(eg) for eg in examples])
-            raise TypeError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types)) from None
+        validate_examples(examples, "ClozeMultitask.rehearse")
+        docs = [eg.predicted for eg in examples]
+        predictions, bp_predictions = self.model.begin_update(docs)
         loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
         bp_predictions(d_predictions)
         if sgd is not None:
             self.model.finish_update(sgd)
         if losses is not None:
             losses[self.name] += loss
+        return losses

     def add_label(self, label):
         raise NotImplementedError

View File

@@ -7,6 +7,7 @@ from ._parser_internals.ner cimport BiluoPushDown

 from ..language import Language
 from ..scorer import Scorer
+from ..gold import validate_examples


 default_model_config = """

@@ -120,4 +121,5 @@ cdef class EntityRecognizer(Parser):

         DOCS: https://spacy.io/api/entityrecognizer#score
         """
+        validate_examples(examples, "EntityRecognizer.score")
         return Scorer.score_spans(examples, "ents", **kwargs)

View File

@@ -1,2 +1,5 @@
 cdef class Pipe:
+    cdef public object vocab
+    cdef public object model
     cdef public str name
+    cdef public object cfg

View File

@@ -1,9 +1,10 @@
 # cython: infer_types=True, profile=True
 import srsly
+from thinc.api import set_dropout_rate, Model

 from ..tokens.doc cimport Doc
-from ..util import create_default_optimizer
+from ..gold import validate_examples
 from ..errors import Errors
 from .. import util

@@ -16,7 +17,6 @@ cdef class Pipe:

     DOCS: https://spacy.io/api/pipe
     """
-
     def __init__(self, vocab, model, name, **cfg):
         """Initialize a pipeline component.

@@ -27,7 +27,10 @@ cdef class Pipe:

         DOCS: https://spacy.io/api/pipe#init
         """
-        raise NotImplementedError
+        self.vocab = vocab
+        self.model = model
+        self.name = name
+        self.cfg = dict(cfg)

     def __call__(self, Doc doc):
         """Apply the pipe to one document. The document is modified in place,

@@ -68,7 +71,7 @@ cdef class Pipe:

         DOCS: https://spacy.io/api/pipe#predict
         """
-        raise NotImplementedError
+        raise NotImplementedError(Errors.E931.format(method="predict", name=self.name))

     def set_annotations(self, docs, scores):
         """Modify a batch of documents, using pre-computed scores.

@@ -78,7 +81,43 @@ cdef class Pipe:

         DOCS: https://spacy.io/api/pipe#set_annotations
         """
-        raise NotImplementedError
+        raise NotImplementedError(Errors.E931.format(method="set_annotations", name=self.name))
+
+    def update(self, examples, *, drop=0.0, set_annotations=False, sgd=None, losses=None):
+        """Learn from a batch of documents and gold-standard information,
+        updating the pipe's model. Delegates to predict and get_loss.
+
+        examples (Iterable[Example]): A batch of Example objects.
+        drop (float): The dropout rate.
+        set_annotations (bool): Whether or not to update the Example objects
+            with the predictions.
+        sgd (thinc.api.Optimizer): The optimizer.
+        losses (Dict[str, float]): Optional record of the loss during training.
+            Updated using the component name as the key.
+        RETURNS (Dict[str, float]): The updated losses dictionary.
+
+        DOCS: https://spacy.io/api/pipe#update
+        """
+        if losses is None:
+            losses = {}
+        if not hasattr(self, "model") or self.model in (None, True, False):
+            return losses
+        losses.setdefault(self.name, 0.0)
+        validate_examples(examples, "Pipe.update")
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
+            # Handle cases where there are no tokens in any docs.
+            return
+        set_dropout_rate(self.model, drop)
+        scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
+        loss, d_scores = self.get_loss(examples, scores)
+        bp_scores(d_scores)
+        if sgd not in (None, False):
+            self.model.finish_update(sgd)
+        losses[self.name] += loss
+        if set_annotations:
+            docs = [eg.predicted for eg in examples]
+            self.set_annotations(docs, scores=scores)
+        return losses

     def rehearse(self, examples, *, sgd=None, losses=None, **config):
         """Perform a "rehearsal" update from a batch of data. Rehearsal updates

@@ -107,7 +146,7 @@ cdef class Pipe:

         DOCS: https://spacy.io/api/pipe#get_loss
         """
-        raise NotImplementedError
+        raise NotImplementedError(Errors.E931.format(method="get_loss", name=self.name))

     def add_label(self, label):
         """Add an output label, to be predicted by the model. It's possible to

@@ -119,7 +158,7 @@ cdef class Pipe:

         DOCS: https://spacy.io/api/pipe#add_label
         """
-        raise NotImplementedError
+        raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))

     def create_optimizer(self):
         """Create an optimizer for the pipeline component.

@@ -128,9 +167,9 @@ cdef class Pipe:

         DOCS: https://spacy.io/api/pipe#create_optimizer
         """
-        return create_default_optimizer()
+        return util.create_default_optimizer()

-    def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
+    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using data examples if available.

         get_examples (Callable[[], Iterable[Example]]): Optional function that
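Since the base class now stores `vocab`/`model`/`name`/`cfg` in `__init__` and ships a default `update` that delegates to `predict` and `get_loss`, a trainable subclass only has to fill in the model-specific pieces. A hedged sketch of the resulting pattern; the class name and method bodies are illustrative, not part of the diff:

    from spacy.pipeline import Pipe

    class MyPipe(Pipe):
        # vocab, model, name and cfg are stored by the inherited __init__

        def predict(self, docs):
            # run the model on a batch of Doc objects
            return self.model.predict(docs)

        def set_annotations(self, docs, scores):
            ...  # write the predicted scores back onto the docs

        def get_loss(self, examples, scores):
            ...  # return (loss, d_scores) for the inherited update()

Any of the base methods left unimplemented now raise NotImplementedError with the descriptive E931 message instead of a bare exception.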

View File

@@ -7,6 +7,7 @@ from ..tokens.doc cimport Doc
 from .pipe import Pipe
 from ..language import Language
 from ..scorer import Scorer
+from ..gold import validate_examples
 from .. import util

@@ -58,7 +59,7 @@ class Sentencizer(Pipe):
         else:
             self.punct_chars = set(self.default_punct_chars)

-    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
+    def begin_training(self, get_examples, pipeline=None, sgd=None):
         pass

     def __call__(self, doc):

@@ -158,6 +159,7 @@ class Sentencizer(Pipe):

         DOCS: https://spacy.io/api/sentencizer#score
         """
+        validate_examples(examples, "Sentencizer.score")
         results = Scorer.score_spans(examples, "sents", **kwargs)
         del results["sents_per_type"]
         return results

View File

@@ -9,6 +9,7 @@ from .tagger import Tagger
 from ..language import Language
 from ..errors import Errors
 from ..scorer import Scorer
+from ..gold import validate_examples
 from .. import util

@@ -102,6 +103,7 @@ class SentenceRecognizer(Tagger):

         DOCS: https://spacy.io/api/sentencerecognizer#get_loss
         """
+        validate_examples(examples, "SentenceRecognizer.get_loss")
         labels = self.labels
         loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
         truths = []

@@ -121,7 +123,7 @@ class SentenceRecognizer(Tagger):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores

-    def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
+    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using data examples if available.

         get_examples (Callable[[], Iterable[Example]]): Optional function that

@@ -151,6 +153,7 @@ class SentenceRecognizer(Tagger):
         RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.

         DOCS: https://spacy.io/api/sentencerecognizer#score
         """
+        validate_examples(examples, "SentenceRecognizer.score")
         results = Scorer.score_spans(examples, "sents", **kwargs)
         del results["sents_per_type"]
         return results

View File

@@ -1,4 +1,4 @@
-from typing import List, Iterable, Optional, Dict, Tuple, Callable
+from typing import List, Iterable, Optional, Dict, Tuple, Callable, Set
 from thinc.types import Floats2d
 from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model
 from thinc.api import Optimizer, Config

@@ -6,6 +6,7 @@ from thinc.util import to_numpy

 from ..errors import Errors
 from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
+from ..gold import validate_examples
 from ..tokens import Doc
 from ..language import Language
 from ..vocab import Vocab

@@ -127,6 +128,7 @@ class SimpleNER(Pipe):
         if losses is None:
             losses = {}
         losses.setdefault("ner", 0.0)
+        validate_examples(examples, "SimpleNER.update")
         if not any(_has_ner(eg) for eg in examples):
             return losses
         docs = [eg.predicted for eg in examples]

@@ -142,6 +144,7 @@ class SimpleNER(Pipe):
         return losses

     def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]:
+        validate_examples(examples, "SimpleNER.get_loss")
         truths = []
         for eg in examples:
             tags = eg.get_aligned_ner()

@@ -161,14 +164,17 @@ class SimpleNER(Pipe):

     def begin_training(
         self,
-        get_examples: Callable,
+        get_examples: Callable[[], Iterable[Example]],
         pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
         sgd: Optional[Optimizer] = None,
     ):
+        all_labels = set()
         if not hasattr(get_examples, "__call__"):
-            gold_tuples = get_examples
-            get_examples = lambda: gold_tuples
-        for label in _get_labels(get_examples()):
+            err = Errors.E930.format(name="SimpleNER", obj=type(get_examples))
+            raise ValueError(err)
+        for example in get_examples():
+            all_labels.update(_get_labels(example))
+        for label in sorted(all_labels):
             self.add_label(label)
         labels = self.labels
         n_actions = self.model.attrs["get_num_actions"](len(labels))

@@ -185,6 +191,7 @@ class SimpleNER(Pipe):
         pass

     def score(self, examples, **kwargs):
+        validate_examples(examples, "SimpleNER.score")
         return Scorer.score_spans(examples, "ents", **kwargs)

@@ -196,10 +203,9 @@ def _has_ner(example: Example) -> bool:
         return False


-def _get_labels(examples: List[Example]) -> List[str]:
+def _get_labels(example: Example) -> Set[str]:
     labels = set()
-    for eg in examples:
-        for ner_tag in eg.get_aligned("ENT_TYPE", as_string=True):
-            if ner_tag != "O" and ner_tag != "-":
-                labels.add(ner_tag)
-    return list(sorted(labels))
+    for ner_tag in example.get_aligned("ENT_TYPE", as_string=True):
+        if ner_tag != "O" and ner_tag != "-":
+            labels.add(ner_tag)
+    return labels

View File

@@ -16,6 +16,7 @@ from ..attrs import POS, ID
 from ..parts_of_speech import X
 from ..errors import Errors, TempErrors, Warnings
 from ..scorer import Scorer
+from ..gold import validate_examples
 from .. import util

@@ -187,19 +188,15 @@ class Tagger(Pipe):
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
-        try:
-            if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
-                # Handle cases where there are no tokens in any docs.
-                return
-        except AttributeError:
-            types = set([type(eg) for eg in examples])
-            raise TypeError(Errors.E978.format(name="Tagger", method="update", types=types)) from None
+        validate_examples(examples, "Tagger.update")
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
+            # Handle cases where there are no tokens in any docs.
+            return
         set_dropout_rate(self.model, drop)
-        tag_scores, bp_tag_scores = self.model.begin_update(
-            [eg.predicted for eg in examples])
+        tag_scores, bp_tag_scores = self.model.begin_update([eg.predicted for eg in examples])
         for sc in tag_scores:
             if self.model.ops.xp.isnan(sc.sum()):
-                raise ValueError("nan value in scores")
+                raise ValueError(Errors.E940)
         loss, d_tag_scores = self.get_loss(examples, tag_scores)
         bp_tag_scores(d_tag_scores)
         if sgd not in (None, False):

@@ -226,11 +223,8 @@ class Tagger(Pipe):

         DOCS: https://spacy.io/api/tagger#rehearse
         """
-        try:
-            docs = [eg.predicted for eg in examples]
-        except AttributeError:
-            types = set([type(eg) for eg in examples])
-            raise TypeError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) from None
+        validate_examples(examples, "Tagger.rehearse")
+        docs = [eg.predicted for eg in examples]
         if self._rehearsal_model is None:
             return
         if not any(len(doc) for doc in docs):

@@ -256,6 +250,7 @@ class Tagger(Pipe):

         DOCS: https://spacy.io/api/tagger#get_loss
         """
+        validate_examples(examples, "Tagger.get_loss")
         loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
         truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
         d_scores, loss = loss_func(scores, truths)

@@ -263,7 +258,7 @@ class Tagger(Pipe):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores

-    def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
+    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using data examples if available.

         get_examples (Callable[[], Iterable[Example]]): Optional function that

@@ -277,13 +272,12 @@ class Tagger(Pipe):

         DOCS: https://spacy.io/api/tagger#begin_training
         """
+        if not hasattr(get_examples, "__call__"):
+            err = Errors.E930.format(name="Tagger", obj=type(get_examples))
+            raise ValueError(err)
         tags = set()
         for example in get_examples():
-            try:
-                y = example.y
-            except AttributeError:
-                raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) from None
-            for token in y:
+            for token in example.y:
                 tags.add(token.tag_)
         for tag in sorted(tags):
             self.add_label(tag)

@@ -318,6 +312,7 @@ class Tagger(Pipe):

         DOCS: https://spacy.io/api/tagger#score
         """
+        validate_examples(examples, "Tagger.score")
         return Scorer.score_token_attr(examples, "tag", **kwargs)

     def to_bytes(self, *, exclude=tuple()):

View File

@@ -5,7 +5,7 @@ import numpy

 from .pipe import Pipe
 from ..language import Language
-from ..gold import Example
+from ..gold import Example, validate_examples
 from ..errors import Errors
 from ..scorer import Scorer
 from .. import util

@@ -209,15 +209,10 @@ class TextCategorizer(Pipe):
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
-        try:
-            if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
-                # Handle cases where there are no tokens in any docs.
-                return losses
-        except AttributeError:
-            types = set([type(eg) for eg in examples])
-            raise TypeError(
-                Errors.E978.format(name="TextCategorizer", method="update", types=types)
-            ) from None
+        validate_examples(examples, "TextCategorizer.update")
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
+            # Handle cases where there are no tokens in any docs.
+            return losses
         set_dropout_rate(self.model, drop)
         scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
         loss, d_scores = self.get_loss(examples, scores)

@@ -252,19 +247,12 @@ class TextCategorizer(Pipe):

         DOCS: https://spacy.io/api/textcategorizer#rehearse
         """
         if losses is not None:
             losses.setdefault(self.name, 0.0)
         if self._rehearsal_model is None:
             return losses
-        try:
-            docs = [eg.predicted for eg in examples]
-        except AttributeError:
-            types = set([type(eg) for eg in examples])
-            err = Errors.E978.format(
-                name="TextCategorizer", method="rehearse", types=types
-            )
-            raise TypeError(err) from None
+        validate_examples(examples, "TextCategorizer.rehearse")
+        docs = [eg.predicted for eg in examples]
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             return losses

@@ -303,6 +291,7 @@ class TextCategorizer(Pipe):

         DOCS: https://spacy.io/api/textcategorizer#get_loss
         """
+        validate_examples(examples, "TextCategorizer.get_loss")
         truths, not_missing = self._examples_to_truth(examples)
         not_missing = self.model.ops.asarray(not_missing)
         d_scores = (scores - truths) / scores.shape[0]

@@ -338,7 +327,7 @@ class TextCategorizer(Pipe):

     def begin_training(
         self,
-        get_examples: Callable[[], Iterable[Example]] = lambda: [],
+        get_examples: Callable[[], Iterable[Example]],
         *,
         pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
         sgd: Optional[Optimizer] = None,

@@ -356,21 +345,20 @@ class TextCategorizer(Pipe):

         DOCS: https://spacy.io/api/textcategorizer#begin_training
         """
-        # TODO: begin_training is not guaranteed to see all data / labels ?
-        examples = list(get_examples())
-        for example in examples:
-            try:
-                y = example.y
-            except AttributeError:
-                err = Errors.E978.format(
-                    name="TextCategorizer", method="update", types=type(example)
-                )
-                raise TypeError(err) from None
-            for cat in y.cats:
+        if not hasattr(get_examples, "__call__"):
+            err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples))
+            raise ValueError(err)
+        subbatch = []  # Select a subbatch of examples to initialize the model
+        for example in get_examples():
+            if len(subbatch) < 2:
+                subbatch.append(example)
+            for cat in example.y.cats:
                 self.add_label(cat)
         self.require_labels()
-        docs = [Doc(self.vocab, words=["hello"])]
-        truths, _ = self._examples_to_truth(examples)
+        docs = [eg.reference for eg in subbatch]
+        if not docs:  # need at least one doc
+            docs = [Doc(self.vocab, words=["hello"])]
+        truths, _ = self._examples_to_truth(subbatch)
         self.set_output(len(self.labels))
         self.model.initialize(X=docs, Y=truths)
         if sgd is None:

@@ -392,6 +380,7 @@ class TextCategorizer(Pipe):

         DOCS: https://spacy.io/api/textcategorizer#score
         """
+        validate_examples(examples, "TextCategorizer.score")
         return Scorer.score_cats(
             examples,
             "cats",

View File

@@ -2,7 +2,7 @@ from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List,
 from thinc.api import Model, set_dropout_rate, Optimizer, Config

 from .pipe import Pipe
-from ..gold import Example
+from ..gold import Example, validate_examples
 from ..tokens import Doc
 from ..vocab import Vocab
 from ..language import Language

@@ -166,9 +166,8 @@ class Tok2Vec(Pipe):
         """
         if losses is None:
             losses = {}
+        validate_examples(examples, "Tok2Vec.update")
         docs = [eg.predicted for eg in examples]
-        if isinstance(docs, Doc):
-            docs = [docs]
         set_dropout_rate(self.model, drop)
         tokvecs, bp_tokvecs = self.model.begin_update(docs)
         d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]

@@ -204,7 +203,7 @@ class Tok2Vec(Pipe):

     def begin_training(
         self,
-        get_examples: Callable[[], Iterable[Example]] = lambda: [],
+        get_examples: Callable[[], Iterable[Example]],
         *,
         pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
         sgd: Optional[Optimizer] = None,

View File

@@ -8,11 +8,8 @@ from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC


 cdef class Parser(Pipe):
-    cdef readonly Vocab vocab
-    cdef public object model
     cdef public object _rehearsal_model
     cdef readonly TransitionSystem moves
-    cdef readonly object cfg
     cdef public object _multitasks

     cdef void _parseC(self, StateC** states,

View File

@@ -8,22 +8,21 @@ from libc.string cimport memset
 from libc.stdlib cimport calloc, free
 import srsly
+from thinc.api import set_dropout_rate
+import numpy.random
+import numpy
+import warnings

 from ._parser_internals.stateclass cimport StateClass
 from ..ml.parser_model cimport alloc_activations, free_activations
 from ..ml.parser_model cimport predict_states, arg_max_if_valid
 from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
 from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
+from ..gold import validate_examples
 from ..errors import Errors, Warnings
 from .. import util
-from ..util import create_default_optimizer
-
-from thinc.api import set_dropout_rate
-import numpy.random
-import numpy
-import warnings

@@ -266,6 +265,7 @@ cdef class Parser(Pipe):
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.)
+        validate_examples(examples, "Parser.update")
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
         n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])

@@ -329,7 +329,7 @@ cdef class Parser(Pipe):
         if self._rehearsal_model is None:
             return None
         losses.setdefault(self.name, 0.)
+        validate_examples(examples, "Parser.rehearse")
         docs = [eg.predicted for eg in examples]
         states = self.moves.init_batch(docs)
         # This is pretty dirty, but the NER can resize itself in init_batch,

@@ -398,21 +398,18 @@ cdef class Parser(Pipe):
             losses[self.name] += (d_scores**2).sum()
         return d_scores

-    def create_optimizer(self):
-        return create_default_optimizer()
-
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)

     def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
+        if not hasattr(get_examples, "__call__"):
+            err = Errors.E930.format(name="DependencyParser/EntityRecognizer", obj=type(get_examples))
+            raise ValueError(err)
         self.cfg.update(kwargs)
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
         if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
             langs = ", ".join(util.LEXEME_NORM_LANGS)
             warnings.warn(Warnings.W033.format(model="parser or NER", langs=langs))
-        if not hasattr(get_examples, '__call__'):
-            gold_tuples = get_examples
-            get_examples = lambda: gold_tuples
         actions = self.moves.get_actions(
             examples=get_examples(),
             min_freq=self.cfg['min_action_freq'],

View File

@@ -18,7 +18,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training([])
+    ner.begin_training(lambda: [])
     ner(doc)
     assert len(list(doc.ents)) == 0
     assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))

@@ -41,7 +41,7 @@ def test_ents_reset(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training([])
+    ner.begin_training(lambda: [])
     ner(doc)
     assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
     doc.ents = list(doc.ents)

View File

@@ -35,7 +35,7 @@ def test_init_parser(parser):
 def _train_parser(parser):
     fix_random_seed(1)
     parser.add_label("left")
-    parser.begin_training([], **parser.cfg)
+    parser.begin_training(lambda: [], **parser.cfg)
     sgd = Adam(0.001)

     for i in range(5):

@@ -75,7 +75,7 @@ def test_add_label_deserializes_correctly():
     ner1.add_label("C")
     ner1.add_label("B")
     ner1.add_label("A")
-    ner1.begin_training([])
+    ner1.begin_training(lambda: [])
     ner2 = EntityRecognizer(Vocab(), model, **config)

     # the second model needs to be resized before we can call from_bytes

View File

@@ -28,7 +28,7 @@ def parser(vocab):
     parser.cfg["hidden_width"] = 32
     # parser.add_label('right')
     parser.add_label("left")
-    parser.begin_training([], **parser.cfg)
+    parser.begin_training(lambda: [], **parser.cfg)
     sgd = Adam(0.001)

     for i in range(10):

View File

@@ -136,7 +136,7 @@ def test_kb_undefined(nlp):
     """Test that the EL can't train without defining a KB"""
     entity_linker = nlp.add_pipe("entity_linker", config={})
     with pytest.raises(ValueError):
-        entity_linker.begin_training()
+        entity_linker.begin_training(lambda: [])


 def test_kb_empty(nlp):

@@ -145,7 +145,7 @@ def test_kb_empty(nlp):
     entity_linker = nlp.add_pipe("entity_linker", config=config)
     assert len(entity_linker.kb) == 0
     with pytest.raises(ValueError):
-        entity_linker.begin_training()
+        entity_linker.begin_training(lambda: [])


 def test_candidate_generation(nlp):

@@ -249,7 +249,7 @@ def test_preserving_links_asdoc(nlp):
     ruler.add_patterns(patterns)
     el_config = {"kb": {"@assets": "myLocationsKB.v1"}, "incl_prior": False}
     el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True)
-    el_pipe.begin_training()
+    el_pipe.begin_training(lambda: [])
     el_pipe.incl_context = False
     el_pipe.incl_prior = True

View File

@@ -54,7 +54,7 @@ def test_textcat_learns_multilabel():
     textcat = TextCategorizer(nlp.vocab, width=8)
     for letter in letters:
         textcat.add_label(letter)
-    optimizer = textcat.begin_training()
+    optimizer = textcat.begin_training(lambda: [])
     for i in range(30):
         losses = {}
         examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]

View File

@@ -20,7 +20,7 @@ def test_issue2564():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    tagger.begin_training()
+    tagger.begin_training(lambda: [])
     doc = nlp("hello world")
     assert doc.is_tagged
     docs = nlp.pipe(["hello", "world"])

View File

@@ -303,7 +303,7 @@ def test_issue4313():
     config = {}
     ner = nlp.create_pipe("ner", config=config)
     ner.add_label("SOME_LABEL")
-    ner.begin_training([])
+    ner.begin_training(lambda: [])
     # add a new label to the doc
     doc = nlp("What do you think about Apple ?")
     assert len(ner.labels) == 1

View File

@@ -62,7 +62,7 @@ def tagger():
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
-    tagger.begin_training(pipeline=nlp.pipeline)
+    tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
     return tagger

@@ -81,7 +81,7 @@ def entity_linker():
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
-    entity_linker.begin_training(pipeline=nlp.pipeline)
+    entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
     return entity_linker

View File

@@ -24,6 +24,7 @@ from .util import registry
 from .attrs import intify_attrs
 from .symbols import ORTH
 from .scorer import Scorer
+from .gold import validate_examples


 cdef class Tokenizer:

@@ -712,6 +713,7 @@ cdef class Tokenizer:
         return tokens

     def score(self, examples, **kwargs):
+        validate_examples(examples, "Tokenizer.score")
         return Scorer.score_tokenization(examples)

     def to_disk(self, path, **kwargs):

View File

@@ -45,18 +45,12 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#create_pipe).

-<Infobox variant="danger">
-
-This method needs to be overwritten with your own custom `__init__` method.
-
-</Infobox>
-
 | Name | Type | Description |
-| ------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
+| ------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab` | `Vocab` | The shared vocabulary. |
 | `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
 | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
-| `**cfg` | | Additional config parameters and settings. |
+| `**cfg` | | Additional config parameters and settings. Will be available as the dictionary `Pipe.cfg` and is serialized with the component. |

 ## Pipe.\_\_call\_\_ {#call tag="method"}
@@ -182,12 +176,6 @@ method.
 Learn from a batch of [`Example`](/api/example) objects containing the
 predictions and gold-standard annotations, and update the component's model.

-<Infobox variant="danger">
-
-This method needs to be overwritten with your own custom `update` method.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
@@ -384,6 +372,15 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
 | `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
 | **RETURNS** | `Pipe` | The pipe. |

+## Attributes {#attributes}
+
+| Name | Type | Description |
+| ------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------- |
+| `vocab` | [`Vocab`](/api/vocab) | The shared vocabulary that's passed in on initialization. |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model powering the component. |
+| `name` | str | The name of the component instance in the pipeline. Can be used in the losses. |
+| `cfg` | dict | Keyword arguments passed to [`Pipe.__init__`](/api/pipe#init). Will be serialized with the component. |
+
 ## Serialization fields {#serialization-fields}

 During serialization, spaCy will export several data fields used to restore

View File

@@ -5,7 +5,6 @@ menu:
   - ['Processing Text', 'processing']
   - ['How Pipelines Work', 'pipelines']
   - ['Custom Components', 'custom-components']
-  # - ['Trainable Components', 'trainable-components']
   - ['Extension Attributes', 'custom-components-attributes']
   - ['Plugins & Wrappers', 'plugins']
 ---
@@ -885,15 +884,117 @@ available, falls back to looking up the regular factory name.
 </Infobox>

-<!-- TODO: ### Trainable components {#trainable-components new="3"}
+## Trainable components {#trainable-components new="3"}

 spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
 components that have their own model instance, make predictions over `Doc`
 objects and can be updated using [`spacy train`](/api/cli#train). This lets you
-plug fully custom machine learning components into your pipeline.
--->
+plug fully custom machine learning components into your pipeline. You'll need
+the following:
+
+1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
+   can be a model using [layers](https://thinc.ai/docs/api-layers) implemented
+   in Thinc, or a [wrapped model](https://thinc.ai/docs/usage-frameworks)
+   implemented in PyTorch, TensorFlow, MXNet or a fully custom solution. The
+   model must take a list of [`Doc`](/api/doc) objects as input and can have
+   any type of output.
+2. **Pipe subclass:** A subclass of [`Pipe`](/api/pipe) that implements at
+   least two methods: [`Pipe.predict`](/api/pipe#predict) and
+   [`Pipe.set_annotations`](/api/pipe#set_annotations).
+3. **Component factory:** A component factory registered with
+   [`@Language.factory`](/api/language#factory) that takes the `nlp` object
+   and component `name` and optional settings provided by the config and
+   returns an instance of your trainable component.
+
+> #### Example
+>
+> ```python
+> from spacy.pipeline import Pipe
+> from spacy.language import Language
+>
+> class TrainableComponent(Pipe):
+>     def predict(self, docs):
+>         ...
+>
+>     def set_annotations(self, docs, scores):
+>         ...
+>
+> @Language.factory("my_trainable_component")
+> def make_component(nlp, name, model):
+>     return TrainableComponent(nlp.vocab, model, name=name)
+> ```
+
+| Name                                           | Description                                                                                                          |
+| ---------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
+| [`predict`](/api/pipe#predict)                 | Apply the component's model to a batch of [`Doc`](/api/doc) objects (without modifying them) and return the scores.  |
+| [`set_annotations`](/api/pipe#set_annotations) | Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores generated by `predict`.                        |
+
+By default, [`Pipe.__init__`](/api/pipe#init) takes the shared vocab, the
+[`Model`](https://thinc.ai/docs/api-model) and the name of the component
+instance in the pipeline, which you can use as a key in the losses. All other
+keyword arguments will become available as [`Pipe.cfg`](/api/pipe#cfg) and will
+also be serialized with the component.
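A hedged illustration of that default signature, assuming the base `__init__` documented in the table above (the `positive_label` setting and the no-op model are made up for the sketch):

```python
# Illustrative sketch only: a no-op Thinc model and a hypothetical
# "positive_label" setting, showing where extra kwargs end up.
from spacy.pipeline import Pipe
from spacy.vocab import Vocab
from thinc.api import Model

noop = Model("noop", lambda model, X, is_train: (X, lambda dY: dY))
pipe = Pipe(Vocab(), noop, name="my_component", positive_label="POS")
assert pipe.name == "my_component"
# Extra keyword arguments land in pipe.cfg and are serialized with it.
assert pipe.cfg["positive_label"] == "POS"
```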
<Accordion title="Why components should be passed a Model instance, not create it" spaced>
spaCy's [config system](/usage/training#config) resolves the config describing
the pipeline components and models **bottom-up**. This means that it will
_first_ create a `Model` from a [registered architecture](/api/architectures),
validate its arguments and _then_ pass the object forward to the component. This
means that the config can express very complex, nested trees of objects but
the objects don't have to pass the model settings all the way down to the
components. It also makes the components more **modular** and lets you swap
different architectures in your config, and re-use model definitions.
```ini
### config.cfg (excerpt)
[components]
[components.textcat]
factory = "textcat"
labels = []
# This function is created and then passed to the "textcat" component as
# the argument "model"
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v1"
exclusive_classes = false
pretrained_vectors = null
width = 64
conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
dropout = null
[components.other_textcat]
factory = "textcat"
# This references the [components.textcat.model] block above
model = ${components.textcat.model}
labels = []
```
Your trainable pipeline component factories should therefore always take a
`model` argument instead of instantiating the
[`Model`](https://thinc.ai/docs/api-model) inside the component. To register
custom architectures, you can use the
[`@spacy.registry.architectures`](/api/top-level#registry) decorator. Also see
the [training guide](/usage/training#config) for details.
</Accordion>
+For some use cases, it makes sense to also overwrite additional methods to
+customize how the model is updated from examples, how it's initialized, how
+the loss is calculated and to add evaluation scores to the training output.
+
+| Name                                           | Description                                                                                                                                                                                                                                                                                                        |
+| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [`update`](/api/pipe#update)                   | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                 |
+| [`begin_training`](/api/pipe#begin_training)   | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided.                                                                                                                  |
+| [`get_loss`](/api/pipe#get_loss)               | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                       |
+| [`score`](/api/pipe#score)                     | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
+
+<!-- TODO: add more details, examples and maybe an example project -->
 ## Extension attributes {#custom-components-attributes new="2"}