begin_training -> initialize

Ines Montani 2020-09-28 21:35:09 +02:00
parent 046f655d86
commit ff9a63bfbd
57 changed files with 301 additions and 253 deletions
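For downstream code, the change is a one-line rename: `nlp.begin_training()` becomes `nlp.initialize()`, and the old name is kept as a deprecated alias that emits the new W089 warning. A minimal migration sketch (the pipeline and training data here are illustrative, not taken from this commit):

    import spacy
    from spacy.training import Example

    nlp = spacy.blank("en")
    nlp.add_pipe("textcat").add_label("POSITIVE")
    train_examples = [
        Example.from_dict(nlp.make_doc("so happy"), {"cats": {"POSITIVE": 1.0}}),
    ]

    # Before this commit:
    # optimizer = nlp.begin_training(get_examples=lambda: train_examples)
    # After this commit (the old name still works, but warns with W089):
    optimizer = nlp.initialize(get_examples=lambda: train_examples)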


@@ -103,12 +103,12 @@ def debug_model(
     with data_validation(False):
         try:
             train_corpus = dot_to_object(config, config["training"]["train_corpus"])
-            nlp.begin_training(lambda: train_corpus(nlp))
+            nlp.initialize(lambda: train_corpus(nlp))
             msg.info("Initialized the model with the training corpus.")
         except ValueError:
             try:
                 _set_output_dim(nO=7, model=model)
-                nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
+                nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
                 msg.info("Initialized the model with dummy data.")
             except Exception:
                 msg.fail(


@@ -85,6 +85,7 @@ class Warnings:
             "attribute or operator.")
     # TODO: fix numbering after merging develop into master
+    W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
     W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
@@ -306,7 +307,7 @@ class Errors:
             "settings: {opts}")
     E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
     E109 = ("Component '{name}' could not be run. Did you forget to "
-            "call begin_training()?")
+            "call initialize()?")
     E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
     E111 = ("Pickling a token is not supported, because tokens are only views "
             "of the parent Doc and can't exist on their own. A pickled token "
@@ -376,7 +377,7 @@ class Errors:
             "provided {found}.")
     E143 = ("Labels for component '{name}' not initialized. This can be fixed "
             "by calling add_label, or by providing a representative batch of "
-            "examples to the component's begin_training method.")
+            "examples to the component's initialize method.")
     E145 = ("Error reading `{param}` from input file.")
     E146 = ("Could not access `{path}`.")
     E147 = ("Unexpected error in the {method} functionality of the "
@@ -517,7 +518,7 @@ class Errors:
             "but the provided argument {loc} points to a file.")
     E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
             "not seem to exist.")
-    E930 = ("Received invalid get_examples callback in {name}.begin_training. "
+    E930 = ("Received invalid get_examples callback in {name}.initialize. "
             "Expected function that returns an iterable of Example objects but "
             "got: {obj}")
     E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "


@@ -1154,6 +1154,16 @@ class Language:
         *,
         sgd: Optional[Optimizer] = None,
         device: int = -1,
+    ) -> Optimizer:
+        warnings.warn(Warnings.W089, DeprecationWarning)
+        return self.initialize(get_examples, sgd=sgd, device=device)
+
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        sgd: Optional[Optimizer] = None,
+        device: int = -1,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.
@@ -1163,11 +1173,11 @@ class Language:
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/language#begin_training
+        DOCS: https://nightly.spacy.io/api/language#initialize
         """
         if get_examples is None:
             util.logger.debug(
-                "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
+                "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
             )
             doc = Doc(self.vocab, words=["x", "y", "z"])
             get_examples = lambda: [Example.from_dict(doc, {})]
@@ -1179,7 +1189,7 @@ class Language:
         for example in get_examples():
             if not isinstance(example, Example):
                 err = Errors.E978.format(
-                    name="Language.begin_training", types=type(example)
+                    name="Language.initialize", types=type(example)
                 )
                 raise ValueError(err)
             else:
@@ -1198,8 +1208,8 @@ class Language:
             sgd = create_default_optimizer()
         self._optimizer = sgd
         for name, proc in self.pipeline:
-            if hasattr(proc, "begin_training"):
-                proc.begin_training(
+            if hasattr(proc, "initialize"):
+                proc.initialize(
                     get_examples, pipeline=self.pipeline, sgd=self._optimizer
                 )
         self._link_components()
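Because `begin_training` now only warns and forwards its arguments, existing training scripts keep working unchanged. A quick sketch of surfacing the warning (assuming the nightly v3 API at this commit):

    import warnings
    import spacy

    nlp = spacy.blank("en")
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        nlp.begin_training()  # forwards to nlp.initialize() and warns with W089
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)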


@@ -132,7 +132,7 @@ cdef class DependencyParser(Parser):
         labeller.model.set_dim("nO", len(self.labels))
         if labeller.model.has_ref("output_layer"):
             labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-        labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
+        labeller.initialize(get_examples, pipeline=pipeline, sgd=sgd)

     @property
     def labels(self):


@@ -140,7 +140,7 @@ class EntityLinker(Pipe):
         if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))

-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
@@ -159,7 +159,7 @@ class EntityLinker(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
+        DOCS: https://nightly.spacy.io/api/entitylinker#initialize
         """
         self._ensure_examples(get_examples)
         self._require_kb()


@@ -129,7 +129,7 @@ class Morphologizer(Tagger):
             self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         return 1

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
@@ -142,7 +142,7 @@ class Morphologizer(Tagger):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
+        DOCS: https://nightly.spacy.io/api/morphologizer#initialize
         """
         self._ensure_examples(get_examples)
         # First, fetch all labels from the data


@@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
     def set_annotations(self, docs, dep_ids):
         pass

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None, sgd=None):
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
             raise ValueError(err)
@@ -177,10 +177,10 @@ class ClozeMultitask(Pipe):
     def set_annotations(self, docs, dep_ids):
         pass

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None, sgd=None):
         self.model.initialize()  # TODO: fix initialization by defining X and Y
         X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
-        self.model.output_layer.begin_training(X)
+        self.model.output_layer.initialize(X)
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd


@@ -103,7 +103,7 @@ cdef class EntityRecognizer(Parser):
         labeller.model.set_dim("nO", len(self.labels))
         if labeller.model.has_ref("output_layer"):
             labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-        labeller.begin_training(get_examples, pipeline=pipeline)
+        labeller.initialize(get_examples, pipeline=pipeline)

     @property
     def labels(self):


@@ -183,7 +183,7 @@ cdef class Pipe:
         """
         return util.create_default_optimizer()

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using data examples if available.
         This method needs to be implemented by each Pipe component,
         ensuring the internal model (if available) is initialized properly
@@ -198,7 +198,7 @@ cdef class Pipe:
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/pipe#begin_training
+        DOCS: https://nightly.spacy.io/api/pipe#initialize
         """
         raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))


@@ -58,7 +58,7 @@ class Sentencizer(Pipe):
         else:
             self.punct_chars = set(self.default_punct_chars)

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None, sgd=None):
         pass

     def __call__(self, doc):


@@ -124,7 +124,7 @@ class SentenceRecognizer(Tagger):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
@@ -137,7 +137,7 @@ class SentenceRecognizer(Tagger):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
+        DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []


@@ -256,7 +256,7 @@ class Tagger(Pipe):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
@@ -269,7 +269,7 @@ class Tagger(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/tagger#begin_training
+        DOCS: https://nightly.spacy.io/api/tagger#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []


@@ -334,7 +334,7 @@ class TextCategorizer(Pipe):
         self.labels = tuple(list(self.labels) + [label])
         return 1

-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
@@ -353,7 +353,7 @@ class TextCategorizer(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
+        DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
         """
         self._ensure_examples(get_examples)
         subbatch = []  # Select a subbatch of examples to initialize the model


@@ -203,7 +203,7 @@ class Tok2Vec(Pipe):
     def get_loss(self, examples, scores) -> None:
         pass

-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
@@ -222,7 +222,7 @@ class Tok2Vec(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
+        DOCS: https://nightly.spacy.io/api/tok2vec#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []


@@ -405,7 +405,7 @@ cdef class Parser(Pipe):
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)

-    def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
+    def initialize(self, get_examples, pipeline=None, sgd=None, **kwargs):
         self._ensure_examples(get_examples)
         self.cfg.update(kwargs)
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})


@@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training(lambda: [_ner_example(ner)])
+    ner.initialize(lambda: [_ner_example(ner)])
     ner(doc)
     doc.ents = [("ANIMAL", 3, 4)]
@@ -48,7 +48,7 @@ def test_ents_reset(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training(lambda: [_ner_example(ner)])
+    ner.initialize(lambda: [_ner_example(ner)])
     ner(doc)
     orig_iobs = [t.ent_iob_ for t in doc]
     doc.ents = list(doc.ents)


@@ -35,7 +35,7 @@ def test_init_parser(parser):
 def _train_parser(parser):
     fix_random_seed(1)
     parser.add_label("left")
-    parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
+    parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
     sgd = Adam(0.001)

     for i in range(5):
@@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly():
     ner1.add_label("C")
     ner1.add_label("B")
     ner1.add_label("A")
-    ner1.begin_training(lambda: [_ner_example(ner1)])
+    ner1.initialize(lambda: [_ner_example(ner1)])
     ner2 = EntityRecognizer(Vocab(), model, **config)
     # the second model needs to be resized before we can call from_bytes


@@ -202,7 +202,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.begin_training()
+    nlp.initialize()
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -213,7 +213,7 @@ def test_train_empty():
 def test_overwrite_token():
     nlp = English()
     nlp.add_pipe("ner")
-    nlp.begin_training()
+    nlp.initialize()
     # The untrained NER will predict O for each token
     doc = nlp("I live in New York")
     assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
@@ -235,7 +235,7 @@ def test_empty_ner():
     nlp = English()
     ner = nlp.add_pipe("ner")
     ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("John is watching the news about Croatia's elections")
     # if this goes wrong, the initialization of the parser's upper layer is probably broken
     result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
@@ -254,7 +254,7 @@ def test_ruler_before_ner():
     # 2: untrained NER - should set everything else to O
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("This is Antti Korhonen speaking in Finland")
     expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
     expected_types = ["THING", "", "", "", "", "", ""]
@@ -269,7 +269,7 @@ def test_ner_before_ruler():
     # 1: untrained NER - should set everything to O
     untrained_ner = nlp.add_pipe("ner", name="uner")
     untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     # 2 : Entity Ruler - should set "this" to B and keep everything else O
     patterns = [{"label": "THING", "pattern": "This"}]
@@ -290,7 +290,7 @@ def test_block_ner():
     nlp.add_pipe("blocker", config={"start": 2, "end": 5})
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("This is Antti L Korhonen speaking in Finland")
     expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
     expected_types = ["", "", "", "", "", "", "", ""]
@@ -307,7 +307,7 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for ent in annotations.get("entities"):
             ner.add_label(ent[2])
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(50):
         losses = {}
@@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog):
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
     with caplog.at_level(logging.DEBUG):
-        nlp.begin_training()
+        nlp.initialize()
         assert "W033" in caplog.text
     caplog.clear()
     nlp.vocab.lookups.add_table("lexeme_norm")
     nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
     with caplog.at_level(logging.DEBUG):
-        nlp.begin_training()
+        nlp.initialize()
         assert "W033" not in caplog.text
@@ -358,5 +358,5 @@ class BlockerComponent1:
         self.name = name

     def __call__(self, doc):
-        doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
+        doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified")
         return doc


@@ -191,7 +191,7 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for dep in annotations.get("deps", []):
             parser.add_label(dep)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(100):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)


@@ -34,7 +34,7 @@ def parser(vocab):
     parser.cfg["hidden_width"] = 32
     # parser.add_label('right')
     parser.add_label("left")
-    parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
+    parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
     sgd = Adam(0.001)

     for i in range(10):


@@ -134,7 +134,7 @@ def test_kb_undefined(nlp):
     """Test that the EL can't train without defining a KB"""
     entity_linker = nlp.add_pipe("entity_linker", config={})
     with pytest.raises(ValueError):
-        entity_linker.begin_training(lambda: [])
+        entity_linker.initialize(lambda: [])

 def test_kb_empty(nlp):
@@ -143,7 +143,7 @@ def test_kb_empty(nlp):
     entity_linker = nlp.add_pipe("entity_linker", config=config)
     assert len(entity_linker.kb) == 0
     with pytest.raises(ValueError):
-        entity_linker.begin_training(lambda: [])
+        entity_linker.initialize(lambda: [])

 def test_kb_serialize(nlp):
@@ -360,7 +360,7 @@ def test_preserving_links_asdoc(nlp):
     ruler.add_patterns(patterns)
     el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
     entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
-    nlp.begin_training()
+    nlp.initialize()
     assert entity_linker.model.get_dim("nO") == vector_length

     # test whether the entity links are preserved by the `as_doc()` function
@@ -463,7 +463,7 @@ def test_overfitting_IO():
     )
     # train the NEL pipe
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert entity_linker.model.get_dim("nO") == vector_length
     assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length


@@ -33,7 +33,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("morphologizer")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()

 def test_implicit_label():
@@ -42,7 +42,7 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)

 def test_no_resize():
@@ -50,13 +50,13 @@ def test_no_resize():
     morphologizer = nlp.add_pipe("morphologizer")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
-    nlp.begin_training()
+    nlp.initialize()
     # this throws an error because the morphologizer can't be resized after initialization
     with pytest.raises(ValueError):
         morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")

-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     morphologizer = nlp.add_pipe("morphologizer")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
@@ -64,12 +64,12 @@ def test_begin_training_examples():
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)

 def test_overfitting_IO():
@@ -79,7 +79,7 @@ def test_overfitting_IO():
     train_examples = []
     for inst in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     for i in range(50):
         losses = {}


@@ -31,19 +31,19 @@ TRAIN_DATA = [
 ]

-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     nlp.add_pipe("senter")
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)

 def test_overfitting_IO():
@@ -58,7 +58,7 @@ def test_overfitting_IO():
     train_examples[1].reference[11].is_sent_start = False
     nlp.add_pipe("senter")
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(200):
         losses = {}


@@ -15,14 +15,14 @@ def test_label_types():
         tagger.add_label(9)

-def test_tagger_begin_training_tag_map():
-    """Test that Tagger.begin_training() without gold tuples does not clobber
+def test_tagger_initialize_tag_map():
+    """Test that Tagger.initialize() without gold tuples does not clobber
     the tag map."""
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     orig_tag_count = len(tagger.labels)
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
@@ -38,7 +38,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("tagger")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()

 def test_no_resize():
@@ -47,7 +47,7 @@ def test_no_resize():
     tagger.add_label("N")
     tagger.add_label("V")
     assert tagger.labels == ("N", "V")
-    nlp.begin_training()
+    nlp.initialize()
     assert tagger.model.get_dim("nO") == 2
     # this throws an error because the tagger can't be resized after initialization
     with pytest.raises(ValueError):
@@ -60,10 +60,10 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)

-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     train_examples = []
@@ -72,16 +72,16 @@ def test_begin_training_examples():
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: train_examples[0])
+        nlp.initialize(get_examples=lambda: train_examples[0])
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=lambda: [])
+        nlp.initialize(get_examples=lambda: [])
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)

 def test_overfitting_IO():
@@ -91,7 +91,7 @@ def test_overfitting_IO():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert tagger.model.get_dim("nO") == len(TAGS)

     for i in range(50):
@@ -122,4 +122,4 @@ def test_tagger_requires_labels():
     nlp = English()
     nlp.add_pipe("tagger")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


@@ -26,7 +26,7 @@ def test_simple_train():
     nlp = Language()
     textcat = nlp.add_pipe("textcat")
     textcat.add_label("answer")
-    nlp.begin_training()
+    nlp.initialize()
     for i in range(5):
         for text, answer in [
             ("aaaa", 1.0),
@@ -56,7 +56,7 @@ def test_textcat_learns_multilabel():
     textcat = TextCategorizer(nlp.vocab, width=8)
     for letter in letters:
         textcat.add_label(letter)
-    optimizer = textcat.begin_training(lambda: [])
+    optimizer = textcat.initialize(lambda: [])
     for i in range(30):
         losses = {}
         examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
@@ -86,7 +86,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("textcat")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()

 def test_implicit_label():
@@ -95,7 +95,7 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)

 def test_no_resize():
@@ -103,14 +103,14 @@ def test_no_resize():
     textcat = nlp.add_pipe("textcat")
     textcat.add_label("POSITIVE")
     textcat.add_label("NEGATIVE")
-    nlp.begin_training()
+    nlp.initialize()
     assert textcat.model.get_dim("nO") == 2
     # this throws an error because the textcat can't be resized after initialization
     with pytest.raises(ValueError):
         textcat.add_label("NEUTRAL")

-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     textcat = nlp.add_pipe("textcat")
     train_examples = []
@@ -119,12 +119,12 @@ def test_begin_training_examples():
         for label, value in annotations.get("cats").items():
             textcat.add_label(label)
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)

 def test_overfitting_IO():
@@ -139,7 +139,7 @@ def test_overfitting_IO():
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert textcat.model.get_dim("nO") == 2

     for i in range(50):
@@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config):
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for label, value in annotations.get("cats").items():
             textcat.add_label(label)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)


@@ -88,7 +88,7 @@ def test_init_tok2vec():
     nlp = English()
     tok2vec = nlp.add_pipe("tok2vec")
     assert tok2vec.listeners == []
-    nlp.begin_training()
+    nlp.initialize()
     assert tok2vec.model.get_dim("nO")
@@ -154,7 +154,7 @@ def test_tok2vec_listener():
     # Check that the Tok2Vec component finds it listeners
     assert tok2vec.listeners == []
-    optimizer = nlp.begin_training(lambda: train_examples)
+    optimizer = nlp.initialize(lambda: train_examples)
     assert tok2vec.listeners == [tagger_tok2vec]

     for i in range(5):


@@ -428,7 +428,7 @@ def test_issue999():
     for _, offsets in TRAIN_DATA:
         for start, end, label in offsets:
             ner.add_label(label)
-    nlp.begin_training()
+    nlp.initialize()
     for itn in range(20):
         random.shuffle(TRAIN_DATA)
         for raw_text, entity_offsets in TRAIN_DATA:
for raw_text, entity_offsets in TRAIN_DATA: for raw_text, entity_offsets in TRAIN_DATA:


@@ -250,7 +250,7 @@ def test_issue1915():
     ner = nlp.add_pipe("ner")
     ner.add_label("answer")
     with pytest.raises(ValueError):
-        nlp.begin_training(**cfg)
+        nlp.initialize(**cfg)

 def test_issue1945():


@@ -30,7 +30,7 @@ def test_issue2179():
     nlp = Italian()
     ner = nlp.add_pipe("ner")
     ner.add_label("CITIZENSHIP")
-    nlp.begin_training()
+    nlp.initialize()
     nlp2 = Italian()
     nlp2.add_pipe("ner")
     assert len(nlp2.get_pipe("ner").labels) == 0


@@ -18,7 +18,7 @@ def test_issue2564():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("hello world")
     assert doc.has_annotation("TAG")
     docs = nlp.pipe(["hello", "world"])
@@ -149,7 +149,7 @@ def test_issue2800():
     ner = nlp.add_pipe("ner")
     for entity_type in list(entity_types):
         ner.add_label(entity_type)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(20):
         losses = {}
         random.shuffle(train_data)


@@ -92,7 +92,7 @@ def test_issue3209():
     nlp = English()
     ner = nlp.add_pipe("ner")
     ner.add_label("ANIMAL")
-    nlp.begin_training()
+    nlp.initialize()
     move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
     assert ner.move_names == move_names
     nlp2 = English()
@@ -239,7 +239,7 @@ def test_issue3456():
     nlp = English()
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     list(nlp.pipe(["hi", ""]))


@@ -223,7 +223,7 @@ def test_issue3611():
         textcat.add_label(label)
     # training the network
     with nlp.select_pipes(enable="textcat"):
-        optimizer = nlp.begin_training()
+        optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@@ -268,7 +268,7 @@ def test_issue3830_no_subtok():
     parser = DependencyParser(Vocab(), model, **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
-    parser.begin_training(lambda: [_parser_example(parser)])
+    parser.initialize(lambda: [_parser_example(parser)])
     assert "subtok" not in parser.labels

@@ -283,7 +283,7 @@ def test_issue3830_with_subtok():
     parser = DependencyParser(Vocab(), model, **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
-    parser.begin_training(lambda: [_parser_example(parser)])
+    parser.initialize(lambda: [_parser_example(parser)])
     assert "subtok" in parser.labels

@@ -342,7 +342,7 @@ def test_issue3880():
     nlp.add_pipe("parser").add_label("dep")
     nlp.add_pipe("ner").add_label("PERSON")
     nlp.add_pipe("tagger").add_label("NN")
-    nlp.begin_training()
+    nlp.initialize()
     for doc in nlp.pipe(texts):
         pass


@@ -66,7 +66,7 @@ def test_issue4030():
         textcat.add_label(label)
     # training the network
     with nlp.select_pipes(enable="textcat"):
-        optimizer = nlp.begin_training()
+        optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@@ -87,7 +87,7 @@ def test_issue4042():
     # add ner pipe
     ner = nlp.add_pipe("ner")
     ner.add_label("SOME_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     # Add entity ruler
     patterns = [
         {"label": "MY_ORG", "pattern": "Apple"},
@@ -118,7 +118,7 @@ def test_issue4042_bug2():
     # add ner pipe
     ner1 = nlp1.add_pipe("ner")
     ner1.add_label("SOME_LABEL")
-    nlp1.begin_training()
+    nlp1.initialize()
     # add a new label to the doc
     doc1 = nlp1("What do you think about Apple ?")
     assert len(ner1.labels) == 1
@@ -244,7 +244,7 @@ def test_issue4267():
     nlp = English()
     ner = nlp.add_pipe("ner")
     ner.add_label("PEOPLE")
-    nlp.begin_training()
+    nlp.initialize()
     assert "ner" in nlp.pipe_names
     # assert that we have correct IOB annotations
     doc1 = nlp("hi")
@@ -299,7 +299,7 @@ def test_issue4313():
     config = {}
     ner = nlp.create_pipe("ner", config=config)
     ner.add_label("SOME_LABEL")
-    ner.begin_training(lambda: [])
+    ner.initialize(lambda: [])
     # add a new label to the doc
     doc = nlp("What do you think about Apple ?")
     assert len(ner.labels) == 1
@@ -327,7 +327,7 @@ def test_issue4348():
     TRAIN_DATA = [example, example]
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))


@@ -180,7 +180,7 @@ def test_issue4725_2():
     vocab.set_vector("dog", data[1])
     nlp = English(vocab=vocab)
     nlp.add_pipe("ner")
-    nlp.begin_training()
+    nlp.initialize()
     docs = ["Kurt is in London."] * 10
     for _ in nlp.pipe(docs, batch_size=2, n_process=2):
         pass


@@ -64,7 +64,7 @@ def tagger():
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     return tagger

@@ -85,7 +85,7 @@ def entity_linker():
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
-    nlp.begin_training()
+    nlp.initialize()
     return entity_linker


@@ -25,7 +25,7 @@ def test_issue5551():
         pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
         for label in set(example[1]["cats"]):
             pipe.add_label(label)
-        nlp.begin_training()
+        nlp.initialize()
         # Store the result of each iteration
         result = pipe.model.predict([nlp.make_doc(example[0])])


@@ -152,7 +152,7 @@ def test_serialize_nlp():
     nlp_config = Config().from_str(nlp_config_string)
     nlp = load_model_from_config(nlp_config, auto_fill=True)
     nlp.get_pipe("tagger").add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     assert "tok2vec" in nlp.pipe_names
     assert "tagger" in nlp.pipe_names
     assert "parser" not in nlp.pipe_names
@@ -173,7 +173,7 @@ def test_serialize_custom_nlp():
     parser_cfg = dict()
     parser_cfg["model"] = {"@architectures": "my_test_parser"}
     nlp.add_pipe("parser", config=parser_cfg)
-    nlp.begin_training()
+    nlp.initialize()
     with make_tempdir() as d:
         nlp.to_disk(d)
@@ -191,7 +191,7 @@ def test_serialize_parser():
     model_config = Config().from_str(parser_config_string)
     parser = nlp.add_pipe("parser", config=model_config)
     parser.add_label("nsubj")
-    nlp.begin_training()
+    nlp.initialize()
     with make_tempdir() as d:
         nlp.to_disk(d)


@@ -18,7 +18,7 @@ def nlp():
     textcat = nlp.add_pipe("textcat")
     for label in ("POSITIVE", "NEGATIVE"):
         textcat.add_label(label)
-    nlp.begin_training()
+    nlp.initialize()
     return nlp


@@ -47,7 +47,7 @@ def test_readers():
     )
     optimizer = T["optimizer"]
     # simulate a training loop
-    nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
         nlp.update([example], sgd=optimizer)
     scores = nlp.evaluate(list(dev_corpus(nlp)))
@@ -99,7 +99,7 @@ def test_cat_readers(reader, additional_config):
     )
     optimizer = T["optimizer"]
     # simulate a training loop
-    nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
         assert example.y.cats
         # this shouldn't fail if each training example has at least one positive label


@@ -600,7 +600,7 @@ def _train_tuples(train_data):
     train_examples = []
     for t in train_data:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))


@@ -49,9 +49,9 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
         msg.info(f"Resuming training for: {resume_components}")
         nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
         msg.good(f"Initialized pipeline components")
-    # Verify the config after calling 'begin_training' to ensure labels
+    # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
     if "pretraining" in config and config["pretraining"]:


@@ -517,18 +517,18 @@ specific data and challenge.

Stacked ensemble of a bag-of-words model and a neural network model. The neural
network has an internal CNN Tok2Vec layer and uses attention.

| Name | Description |
| ----------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
| `width` | Output dimension of the feature encoding step. ~~int~~ |
| `embed_size` | Input dimension of the feature encoding step. ~~int~~ |
| `conv_depth` | Depth of the tok2vec layer. ~~int~~ |
| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `dropout` | The dropout rate. ~~float~~ |
-| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
+| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

### spacy.TextCatCNN.v1 {#TextCatCNN}

@@ -555,12 +555,12 @@ A neural network model where token vectors are calculated using a CNN. The
vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.

| Name | Description |
| ----------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
-| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
+| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

### spacy.TextCatBOW.v1 {#TextCatBOW}

@@ -578,13 +578,13 @@ architecture is usually less accurate than the ensemble, but runs faster.
An ngram "bag-of-words" model. This architecture should run much faster than the
others, but may not be as accurate, especially if texts are short.

| Name | Description |
| ----------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
-| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
+| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
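As a rough sketch of how one of these architectures is wired up in practice, the block below plugs the BOW model into a `textcat` component via `nlp.add_pipe`; the parameter values are illustrative rather than tuned defaults, and the other textcat architectures above are referenced the same way:

```python
import spacy

# Sketch only: "spacy.TextCatBOW.v1" is the registered architecture name from
# the table above; the parameter values are arbitrary example settings.
config = {
    "model": {
        "@architectures": "spacy.TextCatBOW.v1",
        "exclusive_classes": True,
        "ngram_size": 1,
        "no_output_layer": False,
        # nO is left unset so the TextCategorizer can infer it in initialize()
    }
}
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat", config=config)
```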
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
@@ -629,11 +629,11 @@ into the "real world". This requires 3 main components:

The `EntityLinker` model architecture is a Thinc `Model` with a
[`Linear`](https://thinc.ai/api-layers#linear) output layer.

| Name | Description |
| ----------- | ----------- |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
-| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ |
+| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

### spacy.EmptyKB.v1 {#EmptyKB}


@@ -140,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## DependencyParser.begin_training {#begin_training tag="method"}
+## DependencyParser.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -151,11 +151,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
> #### Example
>
> ```python
> parser = nlp.add_pipe("parser")
-> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = parser.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -210,7 +216,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
>
> ```python
> parser = nlp.add_pipe("parser")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = parser.update(examples, sgd=optimizer)
> ```

@@ -294,11 +300,10 @@ context, the original parameters are restored.
## DependencyParser.add_label {#add_label tag="method"}

Add a new label to the pipe. Note that you don't have to call this method if you
-provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+provide a **representative data sample** to the [`initialize`](#initialize)
+method. In this case, all labels found in the sample will be automatically added
+to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

> #### Example
>


@@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## EntityLinker.begin_training {#begin_training tag="method"}
+## EntityLinker.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -150,11 +150,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
> #### Example
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker", last=True)
-> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = entity_linker.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -211,7 +217,7 @@ pipe's entity linking model and context encoder. Delegates to
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = entity_linker.update(examples, sgd=optimizer)
> ```


@@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## EntityRecognizer.begin_training {#begin_training tag="method"}
+## EntityRecognizer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -140,11 +140,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
> #### Example
>
> ```python
> ner = nlp.add_pipe("ner")
-> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = ner.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -199,7 +205,7 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
>
> ```python
> ner = nlp.add_pipe("ner")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = ner.update(examples, sgd=optimizer)
> ```

@@ -282,11 +288,10 @@ context, the original parameters are restored.
## EntityRecognizer.add_label {#add_label tag="method"}

Add a new label to the pipe. Note that you don't have to call this method if you
-provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+provide a **representative data sample** to the [`initialize`](#initialize)
+method. In this case, all labels found in the sample will be automatically added
+to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

> #### Example
>


@@ -201,30 +201,31 @@ more efficient than processing texts one-by-one.
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |

-## Language.begin_training {#begin_training tag="method"}
+## Language.initialize {#initialize tag="method"}

Initialize the pipeline for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
function that returns an iterable of [`Example`](/api/example) objects. The data
examples can either be the full training data or a representative sample. They
are used to **initialize the models** of trainable pipeline components and are
-passed each component's [`begin_training`](/api/pipe#begin_training) method, if
+passed to each component's [`initialize`](/api/pipe#initialize) method, if
available. Initialization includes validating the network,
[inferring missing shapes](/usage/layers-architectures#thinc-shape-inference)
and setting up the label scheme based on the data.

-If no `get_examples` function is provided when calling `nlp.begin_training`, the
+If no `get_examples` function is provided when calling `nlp.initialize`, the
pipeline components will be initialized with generic data. In this case, it is
crucial that the output dimension of each component has already been defined
either in the [config](/usage/training#config), or by calling
[`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for
the tagger or textcat).

-<Infobox variant="warning" title="Changed in v3.0">
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

-The `Language.update` method now takes a **function** that is called with no
-arguments and returns a sequence of [`Example`](/api/example) objects instead of
-tuples of `Doc` and `GoldParse` objects.
+This method was previously called `begin_training`. It now also takes a
+**function** that is called with no arguments and returns a sequence of
+[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse`
+objects.

</Infobox>
@@ -232,7 +233,7 @@ tuples of `Doc` and `GoldParse` objects.
>
> ```python
> get_examples = lambda: examples
-> optimizer = nlp.begin_training(get_examples)
+> optimizer = nlp.initialize(get_examples)
> ```
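For the case described above where no `get_examples` function is passed, a minimal sketch (with made-up labels) looks like this; the output dimension has to be known before the call, here via `add_label`:

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
# With no get_examples function, components are initialized with generic
# data, so the label scheme must already be defined at this point.
for label in ("POSITIVE", "NEGATIVE"):  # hypothetical labels
    textcat.add_label(label)
optimizer = nlp.initialize()
```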
| Name | Description |

@@ -636,13 +637,13 @@ list, will be disabled. Under the hood, this method calls into
>
> ```python
> with nlp.select_pipes(disable=["tagger", "parser"]):
->     nlp.begin_training()
+>     nlp.initialize()
>
> with nlp.select_pipes(enable="ner"):
->     nlp.begin_training()
+>     nlp.initialize()
>
> disabled = nlp.select_pipes(disable=["tagger", "parser"])
-> nlp.begin_training()
+> nlp.initialize()
> disabled.restore()
> ```


@@ -117,7 +117,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## Morphologizer.begin_training {#begin_training tag="method"}
+## Morphologizer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -133,7 +133,7 @@ setting up the label scheme based on the data.
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
> nlp.pipeline.append(morphologizer)
-> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = morphologizer.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -189,7 +189,7 @@ Delegates to [`predict`](/api/morphologizer#predict) and
>
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = morphologizer.update(examples, sgd=optimizer)
> ```

@@ -259,12 +259,11 @@ context, the original parameters are restored.
Add a new label to the pipe. If the `Morphologizer` should set annotations for
both `pos` and `morph`, the label should include the UPOS as the feature `POS`.
Raises an error if the output dimension is already set, or if the model has
-already been fully [initialized](#begin_training). Note that you don't have to
-call this method if you provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+already been fully [initialized](#initialize). Note that you don't have to call
+this method if you provide a **representative data sample** to the
+[`initialize`](#initialize) method. In this case, all labels found in the sample
+will be automatically added to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

> #### Example
>


@@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## Pipe.begin_training {#begin_training tag="method"}
+## Pipe.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -109,11 +109,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
> #### Example
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
-> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = pipe.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -180,7 +186,7 @@ predictions and gold-standard annotations, and update the component's model.
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = pipe.update(examples, sgd=optimizer)
> ```

@@ -296,9 +302,9 @@ context, the original parameters are restored.
Add a new label to the pipe, to be predicted by the model. The actual
implementation depends on the specific component, but in general `add_label`
shouldn't be called if the output dimension is already set, or if the model has
-already been fully [initialized](#begin_training). If these conditions are
-violated, the function will raise an Error. The exception to this rule is when
-the component is [resizable](#is_resizable), in which case
+already been fully [initialized](#initialize). If these conditions are violated,
+the function will raise an Error. The exception to this rule is when the
+component is [resizable](#is_resizable), in which case
[`set_output`](#set_output) should be called to ensure that the model is
properly resized.
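A minimal sketch of that exception, following the API described on this page (whether a given component is actually resizable depends on its model, and the label is invented for the example):

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")  # hypothetical label
# Instead of calling add_label on a fully initialized model, resize the
# output dimension directly if the model supports it.
if textcat.is_resizable():
    textcat.set_output(len(textcat.labels) + 1)
```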
@@ -314,9 +320,9 @@ This method needs to be overwritten with your own custom `add_label` method.
| **RETURNS** | 0 if the label is already present, otherwise 1. ~~int~~ |

Note that in general, you don't have to call `pipe.add_label` if you provide a
-representative data sample to the [`begin_training`](#begin_training) method. In
-this case, all labels found in the sample will be automatically added to the
-model, and the output dimension will be
+representative data sample to the [`initialize`](#initialize) method. In this
+case, all labels found in the sample will be automatically added to the model,
+and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
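As a sketch of both options side by side (the tag names are invented, and the `Example` import path assumes the final v3 API):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")

# Option 1: declare each label explicitly
tagger.add_label("NOUN")

# Option 2: let initialize() infer the labels from a representative sample
doc = nlp.make_doc("I like trees")
examples = [Example.from_dict(doc, {"tags": ["PRON", "VERB", "NOUN"]})]
optimizer = nlp.initialize(lambda: examples)
```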
## Pipe.is_resizable {#is_resizable tag="method"}


@@ -114,7 +114,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## SentenceRecognizer.begin_training {#begin_training tag="method"}
+## SentenceRecognizer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -129,7 +129,7 @@ setting up the label scheme based on the data.
>
> ```python
> senter = nlp.add_pipe("senter")
-> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = senter.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -185,7 +185,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
>
> ```python
> senter = nlp.add_pipe("senter")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = senter.update(examples, sgd=optimizer)
> ```


@@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## Tagger.begin_training {#begin_training tag="method"}
+## Tagger.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -123,11 +123,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
> #### Example
>
> ```python
> tagger = nlp.add_pipe("tagger")
-> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = tagger.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -183,7 +189,7 @@ Delegates to [`predict`](/api/tagger#predict) and
>
> ```python
> tagger = nlp.add_pipe("tagger")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = tagger.update(examples, sgd=optimizer)
> ```

@@ -289,12 +295,12 @@ context, the original parameters are restored.
## Tagger.add_label {#add_label tag="method"}

Add a new label to the pipe. Raises an error if the output dimension is already
-set, or if the model has already been fully [initialized](#begin_training). Note
+set, or if the model has already been fully [initialized](#initialize). Note
that you don't have to call this method if you provide a **representative data
-sample** to the [`begin_training`](#begin_training) method. In this case, all
-labels found in the sample will be automatically added to the model, and the
-output dimension will be
-[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
+sample** to the [`initialize`](#initialize) method. In this case, all labels
+found in the sample will be automatically added to the model, and the output
+dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
+automatically.

> #### Example
>


@@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## TextCategorizer.begin_training {#begin_training tag="method"}
+## TextCategorizer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -136,11 +136,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
> #### Example
>
> ```python
> textcat = nlp.add_pipe("textcat")
-> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = textcat.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -196,14 +202,14 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
>
> ```python
> textcat = nlp.add_pipe("textcat")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = textcat.update(examples, sgd=optimizer)
> ```

| Name | Description |
| ----------- | ----------- |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |

@@ -227,7 +233,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
| Name | Description |
| ----------- | ----------- |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
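Putting `update` and `rehearse` together, one interleaved step could look roughly like this (a sketch: `nlp` and `textcat` as above, `examples` holding new annotations and `initial_examples` resembling the original training data):

```python
# Sketch of one combined step against the "catastrophic forgetting" problem.
optimizer = nlp.resume_training()  # sets up the models for rehearsing
losses = {}
textcat.update(examples, sgd=optimizer, losses=losses)
textcat.rehearse(initial_examples, sgd=optimizer, losses=losses)
```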
@@ -303,12 +309,12 @@ Modify the pipe's model, to use the given parameter values.
## TextCategorizer.add_label {#add_label tag="method"}

Add a new label to the pipe. Raises an error if the output dimension is already
-set, or if the model has already been fully [initialized](#begin_training). Note
+set, or if the model has already been fully [initialized](#initialize). Note
that you don't have to call this method if you provide a **representative data
-sample** to the [`begin_training`](#begin_training) method. In this case, all
-labels found in the sample will be automatically added to the model, and the
-output dimension will be
-[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
+sample** to the [`initialize`](#initialize) method. In this case, all labels
+found in the sample will be automatically added to the model, and the output
+dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
+automatically.

> #### Example
>


@@ -123,7 +123,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## Tok2Vec.begin_training {#begin_training tag="method"}
+## Tok2Vec.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -138,7 +138,7 @@ setting up the label scheme based on the data.
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
-> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = tok2vec.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -193,7 +193,7 @@ Delegates to [`predict`](/api/tok2vec#predict).
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = tok2vec.update(examples, sgd=optimizer)
> ```


@@ -158,7 +158,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

-## Transformer.begin_training {#begin_training tag="method"}
+## Transformer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -173,7 +173,7 @@ setting up the label scheme based on the data.
>
> ```python
> trf = nlp.add_pipe("transformer")
-> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = trf.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -241,7 +241,7 @@ and call the optimizer, while the others simply increment the gradients.
>
> ```python
> trf = nlp.add_pipe("transformer")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
> losses = trf.update(examples, sgd=optimizer)
> ```


@@ -460,8 +460,8 @@ The built-in [pipeline components](/usage/processing-pipelines) in spaCy ensure
that their internal models are **always initialized** with appropriate sample
data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a
~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This
-functionality is triggered when
-[`nlp.begin_training`](/api/language#begin_training) is called.
+functionality is triggered when [`nlp.initialize`](/api/language#initialize) is
+called.
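From user code, that initialization can be sketched like this (the labels and text are invented, and the `Example` import path assumes the final v3 API):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")  # the model's output dim (nO) is unset
doc = nlp.make_doc("This is great")
examples = [Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})]
# Triggers validation and shape inference from the sample data
nlp.initialize(lambda: examples)
```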
### Dropout and normalization in Thinc {#thinc-dropout-norm}

@@ -491,7 +491,7 @@ with Model.define_operators({">>": chain}):
<!-- TODO: write trainable component section
- Interaction with `predict`, `get_loss` and `set_annotations`
-- Initialization life-cycle with `begin_training`, correlation with add_label
+- Initialization life-cycle with `initialize`, correlation with add_label
Example: relation extraction component (implemented as project template)
Avoid duplication with usage/processing-pipelines#trainable-components ?
-->


@@ -1126,12 +1126,12 @@ For some use cases, it makes sense to also overwrite additional methods to
customize how the model is updated from examples, how it's initialized, how the
loss is calculated and to add evaluation scores to the training output.

| Name | Description |
| ----------- | ----------- |
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
-| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
+| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
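To make the division of labor concrete, here is a heavily abridged skeleton of such a component, for a hypothetical relation extractor; the signatures are indicative only, based on the table above, not a complete implementation:

```python
from spacy.pipeline import Pipe

class RelationExtractor(Pipe):
    """Sketch of a trainable component overriding the methods above."""

    def initialize(self, get_examples, *, nlp=None):
        # Validate the network, infer shapes and set up the label scheme,
        # typically by calling Model.initialize with sample data.
        ...

    def update(self, examples, *, drop=0.0, sgd=None, losses=None):
        # Predict, compare predictions against the gold annotations in the
        # Example objects, and backpropagate the gradient.
        ...

    def get_loss(self, examples, scores):
        # Return a (loss, gradient) tuple for the batch.
        ...

    def score(self, examples, **kwargs):
        # Return a dictionary of scores shown in the training output.
        ...
```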
<Infobox title="Custom trainable components and models" emoji="📖">


@@ -1045,8 +1045,8 @@ of being dropped.
> - [`nlp`](/api/language): The `nlp` object with the pipeline components and
>   their models.
-> - [`nlp.begin_training`](/api/language#begin_training): Start the training and
->   return an optimizer to update the component model weights.
+> - [`nlp.initialize`](/api/language#initialize): Start the training and return
+>   an optimizer to update the component model weights.
> - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds
>   state between updates.
> - [`nlp.update`](/api/language#update): Update component models with examples.

@@ -1057,7 +1057,7 @@ of being dropped.
```python
### Example training loop
-optimizer = nlp.begin_training()
+optimizer = nlp.initialize()
for itn in range(100):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:


@@ -526,10 +526,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
  [`Pipe.update`](/api/pipe#update) methods now all take batches of
  [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
  raw text and a dictionary of annotations.
-  [`Language.begin_training`](/api/language#begin_training) and
-  [`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
-  returns a sequence of `Example` objects to initialize the model instead of a
-  list of tuples.
+  [`Language.initialize`](/api/language#initialize) and
+  [`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
+  sequence of `Example` objects to initialize the model instead of a list of
+  tuples.
+- The `begin_training` methods have been renamed to `initialize`.
- [`Matcher.add`](/api/matcher#add) and
  [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
  patterns as the second argument (instead of a variable number of arguments).

@@ -555,6 +556,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
| Removed | Replacement |
| ----------- | ----------- |
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
+| `Language.begin_training`, `Pipe.begin_training`, ... | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... |
| `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) |
| `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) |

@@ -936,7 +938,7 @@ TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London.", {"entities": [(7, 13, "LOC")]}),
]
-nlp.begin_training()
+nlp.initialize()
for i in range(20):
    random.shuffle(TRAIN_DATA)
    for batch in minibatch(TRAIN_DATA):
@@ -946,17 +948,18 @@ for i in range(20):
        nlp.update(examples)
```

-[`Language.begin_training`](/api/language#begin_training) and
-[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
-returns a sequence of `Example` objects to initialize the model instead of a
-list of tuples. The data examples are used to **initialize the models** of
+`Language.begin_training` and `Pipe.begin_training` have been renamed to
+[`Language.initialize`](/api/language#initialize) and
+[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function
+that returns a sequence of `Example` objects to initialize the model instead of
+a list of tuples. The data examples are used to **initialize the models** of
trainable pipeline components, which includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme.

```diff
- nlp.begin_training(examples)
+ nlp.initialize(lambda: examples)
```
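Concretely, migrating a v2-style call can be sketched as follows, reusing `nlp` and `TRAIN_DATA` from the example above (the `Example` import path assumes the final v3 API):

```python
from spacy.training import Example

examples = []
for text, annotations in TRAIN_DATA:
    examples.append(Example.from_dict(nlp.make_doc(text), annotations))
# v3: pass a no-argument function that returns the examples
optimizer = nlp.initialize(lambda: examples)
```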
#### Packaging trained pipelines {#migrating-training-packaging}