begin_training -> initialize

Ines Montani 2020-09-28 21:35:09 +02:00
parent 046f655d86
commit ff9a63bfbd
57 changed files with 301 additions and 253 deletions

View File

@ -103,12 +103,12 @@ def debug_model(
with data_validation(False):
try:
train_corpus = dot_to_object(config, config["training"]["train_corpus"])
nlp.begin_training(lambda: train_corpus(nlp))
nlp.initialize(lambda: train_corpus(nlp))
msg.info("Initialized the model with the training corpus.")
except ValueError:
try:
_set_output_dim(nO=7, model=model)
nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
msg.info("Initialized the model with dummy data.")
except Exception:
msg.fail(
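For context, the fallback above builds throwaway `Example` objects just to get the model initialized. A minimal sketch of that pattern, assuming spaCy v3's `spacy.training.Example` and a plain English pipeline (the snippet is illustrative, not part of this commit):

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")  # define the label scheme up front, since the dummy data carries no tags

# Dummy-data fallback: wrap plain Docs in empty Examples and pass a zero-argument callback
docs = [nlp.make_doc("hello world")]
nlp.initialize(lambda: [Example.from_dict(doc, {}) for doc in docs])
```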

View File

@ -85,6 +85,7 @@ class Warnings:
"attribute or operator.")
# TODO: fix numbering after merging develop into master
W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
W090 = ("Could not locate any {format} files in path '{path}'.")
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
@ -306,7 +307,7 @@ class Errors:
"settings: {opts}")
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
E109 = ("Component '{name}' could not be run. Did you forget to "
"call begin_training()?")
"call initialize()?")
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
E111 = ("Pickling a token is not supported, because tokens are only views "
"of the parent Doc and can't exist on their own. A pickled token "
@ -376,7 +377,7 @@ class Errors:
"provided {found}.")
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
"by calling add_label, or by providing a representative batch of "
"examples to the component's begin_training method.")
"examples to the component's initialize method.")
E145 = ("Error reading `{param}` from input file.")
E146 = ("Could not access `{path}`.")
E147 = ("Unexpected error in the {method} functionality of the "
@ -517,7 +518,7 @@ class Errors:
"but the provided argument {loc} points to a file.")
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
"not seem to exist.")
E930 = ("Received invalid get_examples callback in {name}.begin_training. "
E930 = ("Received invalid get_examples callback in {name}.initialize. "
"Expected function that returns an iterable of Example objects but "
"got: {obj}")
E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "

View File

@ -1154,6 +1154,16 @@ class Language:
*,
sgd: Optional[Optimizer] = None,
device: int = -1,
) -> Optimizer:
warnings.warn(Warnings.W089, DeprecationWarning)
return self.initialize(get_examples, sgd=sgd, device=device)
def initialize(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
sgd: Optional[Optimizer] = None,
device: int = -1,
) -> Optimizer:
"""Initialize the pipe for training, using data examples if available.
@ -1163,11 +1173,11 @@ class Language:
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/language#begin_training
DOCS: https://nightly.spacy.io/api/language#initialize
"""
if get_examples is None:
util.logger.debug(
"No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
)
doc = Doc(self.vocab, words=["x", "y", "z"])
get_examples = lambda: [Example.from_dict(doc, {})]
@ -1179,7 +1189,7 @@ class Language:
for example in get_examples():
if not isinstance(example, Example):
err = Errors.E978.format(
name="Language.begin_training", types=type(example)
name="Language.initialize", types=type(example)
)
raise ValueError(err)
else:
@ -1198,8 +1208,8 @@ class Language:
sgd = create_default_optimizer()
self._optimizer = sgd
for name, proc in self.pipeline:
if hasattr(proc, "begin_training"):
proc.begin_training(
if hasattr(proc, "initialize"):
proc.initialize(
get_examples, pipeline=self.pipeline, sgd=self._optimizer
)
self._link_components()
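In practice the rename is backwards compatible: the old method is kept as a thin shim that warns and delegates, as shown above. A minimal before/after sketch, assuming an English pipeline with an `ner` component (illustrative only):

```python
from spacy.lang.en import English

nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("PERSON")

# New name: initialize the pipeline (optionally passing a get_examples callback)
optimizer = nlp.initialize()

# Old name: still callable, but emits DeprecationWarning W089 and
# delegates straight to nlp.initialize(), per the shim above.
# optimizer = nlp.begin_training()
```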

View File

@ -132,7 +132,7 @@ cdef class DependencyParser(Parser):
labeller.model.set_dim("nO", len(self.labels))
if labeller.model.has_ref("output_layer"):
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
labeller.initialize(get_examples, pipeline=pipeline, sgd=sgd)
@property
def labels(self):

View File

@ -140,7 +140,7 @@ class EntityLinker(Pipe):
if len(self.kb) == 0:
raise ValueError(Errors.E139.format(name=self.name))
def begin_training(
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
@ -159,7 +159,7 @@ class EntityLinker(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
DOCS: https://nightly.spacy.io/api/entitylinker#initialize
"""
self._ensure_examples(get_examples)
self._require_kb()

View File

@ -129,7 +129,7 @@ class Morphologizer(Tagger):
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
return 1
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using a representative set
of data examples.
@ -142,7 +142,7 @@ class Morphologizer(Tagger):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
DOCS: https://nightly.spacy.io/api/morphologizer#initialize
"""
self._ensure_examples(get_examples)
# First, fetch all labels from the data

View File

@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
def set_annotations(self, docs, dep_ids):
pass
def begin_training(self, get_examples, pipeline=None, sgd=None):
def initialize(self, get_examples, pipeline=None, sgd=None):
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
raise ValueError(err)
@ -177,10 +177,10 @@ class ClozeMultitask(Pipe):
def set_annotations(self, docs, dep_ids):
pass
def begin_training(self, get_examples, pipeline=None, sgd=None):
def initialize(self, get_examples, pipeline=None, sgd=None):
self.model.initialize() # TODO: fix initialization by defining X and Y
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.begin_training(X)
self.model.output_layer.initialize(X)
if sgd is None:
sgd = self.create_optimizer()
return sgd

View File

@ -103,7 +103,7 @@ cdef class EntityRecognizer(Parser):
labeller.model.set_dim("nO", len(self.labels))
if labeller.model.has_ref("output_layer"):
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline)
labeller.initialize(get_examples, pipeline=pipeline)
@property
def labels(self):

View File

@ -183,7 +183,7 @@ cdef class Pipe:
"""
return util.create_default_optimizer()
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available.
This method needs to be implemented by each Pipe component,
ensuring the internal model (if available) is initialized properly
@ -198,7 +198,7 @@ cdef class Pipe:
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/pipe#begin_training
DOCS: https://nightly.spacy.io/api/pipe#initialize
"""
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
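Since the base class only raises `NotImplementedError`, each trainable component supplies its own `initialize`. A rough sketch of what a custom subclass might do, assuming the base-class helpers `_ensure_examples` and `create_optimizer` shown elsewhere in this diff; the component and all of its names are hypothetical:

```python
from spacy.pipeline import Pipe

class CustomPipe(Pipe):
    """Hypothetical trainable component, not part of this commit."""

    def __init__(self, vocab, model, name="custom"):
        self.vocab = vocab
        self.model = model
        self.name = name

    def initialize(self, get_examples, *, pipeline=None, sgd=None):
        # Validate that get_examples is a callable yielding Example objects
        self._ensure_examples(get_examples)
        # Use the predicted docs as a sample batch to infer missing shapes
        doc_sample = [eg.predicted for eg in get_examples()]
        self.model.initialize(X=doc_sample)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
```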

View File

@ -58,7 +58,7 @@ class Sentencizer(Pipe):
else:
self.punct_chars = set(self.default_punct_chars)
def begin_training(self, get_examples, pipeline=None, sgd=None):
def initialize(self, get_examples, pipeline=None, sgd=None):
pass
def __call__(self, doc):

View File

@ -124,7 +124,7 @@ class SentenceRecognizer(Tagger):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using a representative set
of data examples.
@ -137,7 +137,7 @@ class SentenceRecognizer(Tagger):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
"""
self._ensure_examples(get_examples)
doc_sample = []

View File

@ -256,7 +256,7 @@ class Tagger(Pipe):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using a representative set
of data examples.
@ -269,7 +269,7 @@ class Tagger(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/tagger#begin_training
DOCS: https://nightly.spacy.io/api/tagger#initialize
"""
self._ensure_examples(get_examples)
doc_sample = []

View File

@ -334,7 +334,7 @@ class TextCategorizer(Pipe):
self.labels = tuple(list(self.labels) + [label])
return 1
def begin_training(
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
@ -353,7 +353,7 @@ class TextCategorizer(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
"""
self._ensure_examples(get_examples)
subbatch = [] # Select a subbatch of examples to initialize the model

View File

@ -203,7 +203,7 @@ class Tok2Vec(Pipe):
def get_loss(self, examples, scores) -> None:
pass
def begin_training(
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
@ -222,7 +222,7 @@ class Tok2Vec(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
DOCS: https://nightly.spacy.io/api/tok2vec#initialize
"""
self._ensure_examples(get_examples)
doc_sample = []

View File

@ -405,7 +405,7 @@ cdef class Parser(Pipe):
def set_output(self, nO):
self.model.attrs["resize_output"](self.model, nO)
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
def initialize(self, get_examples, pipeline=None, sgd=None, **kwargs):
self._ensure_examples(get_examples)
self.cfg.update(kwargs)
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})

View File

@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: [_ner_example(ner)])
ner.initialize(lambda: [_ner_example(ner)])
ner(doc)
doc.ents = [("ANIMAL", 3, 4)]
@ -48,7 +48,7 @@ def test_ents_reset(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: [_ner_example(ner)])
ner.initialize(lambda: [_ner_example(ner)])
ner(doc)
orig_iobs = [t.ent_iob_ for t in doc]
doc.ents = list(doc.ents)

View File

@ -35,7 +35,7 @@ def test_init_parser(parser):
def _train_parser(parser):
fix_random_seed(1)
parser.add_label("left")
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
sgd = Adam(0.001)
for i in range(5):
@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly():
ner1.add_label("C")
ner1.add_label("B")
ner1.add_label("A")
ner1.begin_training(lambda: [_ner_example(ner1)])
ner1.initialize(lambda: [_ner_example(ner1)])
ner2 = EntityRecognizer(Vocab(), model, **config)
# the second model needs to be resized before we can call from_bytes

View File

@ -202,7 +202,7 @@ def test_train_empty():
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
ner = nlp.add_pipe("ner", last=True)
ner.add_label("PERSON")
nlp.begin_training()
nlp.initialize()
for itn in range(2):
losses = {}
batches = util.minibatch(train_examples, size=8)
@ -213,7 +213,7 @@ def test_train_empty():
def test_overwrite_token():
nlp = English()
nlp.add_pipe("ner")
nlp.begin_training()
nlp.initialize()
# The untrained NER will predict O for each token
doc = nlp("I live in New York")
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
@ -235,7 +235,7 @@ def test_empty_ner():
nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
doc = nlp("John is watching the news about Croatia's elections")
# if this goes wrong, the initialization of the parser's upper layer is probably broken
result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
@ -254,7 +254,7 @@ def test_ruler_before_ner():
# 2: untrained NER - should set everything else to O
untrained_ner = nlp.add_pipe("ner")
untrained_ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
doc = nlp("This is Antti Korhonen speaking in Finland")
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
expected_types = ["THING", "", "", "", "", "", ""]
@ -269,7 +269,7 @@ def test_ner_before_ruler():
# 1: untrained NER - should set everything to O
untrained_ner = nlp.add_pipe("ner", name="uner")
untrained_ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
# 2 : Entity Ruler - should set "this" to B and keep everything else O
patterns = [{"label": "THING", "pattern": "This"}]
@ -290,7 +290,7 @@ def test_block_ner():
nlp.add_pipe("blocker", config={"start": 2, "end": 5})
untrained_ner = nlp.add_pipe("ner")
untrained_ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
doc = nlp("This is Antti L Korhonen speaking in Finland")
expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
expected_types = ["", "", "", "", "", "", "", ""]
@ -307,7 +307,7 @@ def test_overfitting_IO():
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for ent in annotations.get("entities"):
ner.add_label(ent[2])
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(50):
losses = {}
@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog):
assert not len(nlp.vocab.lookups)
nlp.add_pipe("ner")
with caplog.at_level(logging.DEBUG):
nlp.begin_training()
nlp.initialize()
assert "W033" in caplog.text
caplog.clear()
nlp.vocab.lookups.add_table("lexeme_norm")
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
with caplog.at_level(logging.DEBUG):
nlp.begin_training()
nlp.initialize()
assert "W033" not in caplog.text

View File

@ -191,7 +191,7 @@ def test_overfitting_IO():
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for dep in annotations.get("deps", []):
parser.add_label(dep)
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(100):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)

View File

@ -34,7 +34,7 @@ def parser(vocab):
parser.cfg["hidden_width"] = 32
# parser.add_label('right')
parser.add_label("left")
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
sgd = Adam(0.001)
for i in range(10):

View File

@ -134,7 +134,7 @@ def test_kb_undefined(nlp):
"""Test that the EL can't train without defining a KB"""
entity_linker = nlp.add_pipe("entity_linker", config={})
with pytest.raises(ValueError):
entity_linker.begin_training(lambda: [])
entity_linker.initialize(lambda: [])
def test_kb_empty(nlp):
@ -143,7 +143,7 @@ def test_kb_empty(nlp):
entity_linker = nlp.add_pipe("entity_linker", config=config)
assert len(entity_linker.kb) == 0
with pytest.raises(ValueError):
entity_linker.begin_training(lambda: [])
entity_linker.initialize(lambda: [])
def test_kb_serialize(nlp):
@ -360,7 +360,7 @@ def test_preserving_links_asdoc(nlp):
ruler.add_patterns(patterns)
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
nlp.begin_training()
nlp.initialize()
assert entity_linker.model.get_dim("nO") == vector_length
# test whether the entity links are preserved by the `as_doc()` function
@ -463,7 +463,7 @@ def test_overfitting_IO():
)
# train the NEL pipe
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert entity_linker.model.get_dim("nO") == vector_length
assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length

View File

@ -33,7 +33,7 @@ def test_no_label():
nlp = Language()
nlp.add_pipe("morphologizer")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()
def test_implicit_label():
@ -42,7 +42,7 @@ def test_implicit_label():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize(get_examples=lambda: train_examples)
def test_no_resize():
@ -50,13 +50,13 @@ def test_no_resize():
morphologizer = nlp.add_pipe("morphologizer")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
nlp.begin_training()
nlp.initialize()
# this throws an error because the morphologizer can't be resized after initialization
with pytest.raises(ValueError):
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")
def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
morphologizer = nlp.add_pipe("morphologizer")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
@ -64,12 +64,12 @@ def test_begin_training_examples():
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)
def test_overfitting_IO():
@ -79,7 +79,7 @@ def test_overfitting_IO():
train_examples = []
for inst in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
for i in range(50):
losses = {}

View File

@ -31,19 +31,19 @@ TRAIN_DATA = [
]
def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
nlp.add_pipe("senter")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)
def test_overfitting_IO():
@ -58,7 +58,7 @@ def test_overfitting_IO():
train_examples[1].reference[11].is_sent_start = False
nlp.add_pipe("senter")
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(200):
losses = {}

View File

@ -15,14 +15,14 @@ def test_label_types():
tagger.add_label(9)
def test_tagger_begin_training_tag_map():
"""Test that Tagger.begin_training() without gold tuples does not clobber
def test_tagger_initialize_tag_map():
"""Test that Tagger.initialize() without gold tuples does not clobber
the tag map."""
nlp = Language()
tagger = nlp.add_pipe("tagger")
orig_tag_count = len(tagger.labels)
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
@ -38,7 +38,7 @@ def test_no_label():
nlp = Language()
nlp.add_pipe("tagger")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()
def test_no_resize():
@ -47,7 +47,7 @@ def test_no_resize():
tagger.add_label("N")
tagger.add_label("V")
assert tagger.labels == ("N", "V")
nlp.begin_training()
nlp.initialize()
assert tagger.model.get_dim("nO") == 2
# this throws an error because the tagger can't be resized after initialization
with pytest.raises(ValueError):
@ -60,10 +60,10 @@ def test_implicit_label():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize(get_examples=lambda: train_examples)
def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
tagger = nlp.add_pipe("tagger")
train_examples = []
@ -72,16 +72,16 @@ def test_begin_training_examples():
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: train_examples[0])
nlp.initialize(get_examples=lambda: train_examples[0])
with pytest.raises(ValueError):
nlp.begin_training(get_examples=lambda: [])
nlp.initialize(get_examples=lambda: [])
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)
def test_overfitting_IO():
@ -91,7 +91,7 @@ def test_overfitting_IO():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert tagger.model.get_dim("nO") == len(TAGS)
for i in range(50):
@ -122,4 +122,4 @@ def test_tagger_requires_labels():
nlp = English()
nlp.add_pipe("tagger")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()

View File

@ -26,7 +26,7 @@ def test_simple_train():
nlp = Language()
textcat = nlp.add_pipe("textcat")
textcat.add_label("answer")
nlp.begin_training()
nlp.initialize()
for i in range(5):
for text, answer in [
("aaaa", 1.0),
@ -56,7 +56,7 @@ def test_textcat_learns_multilabel():
textcat = TextCategorizer(nlp.vocab, width=8)
for letter in letters:
textcat.add_label(letter)
optimizer = textcat.begin_training(lambda: [])
optimizer = textcat.initialize(lambda: [])
for i in range(30):
losses = {}
examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
@ -86,7 +86,7 @@ def test_no_label():
nlp = Language()
nlp.add_pipe("textcat")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()
def test_implicit_label():
@ -95,7 +95,7 @@ def test_implicit_label():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize(get_examples=lambda: train_examples)
def test_no_resize():
@ -103,14 +103,14 @@ def test_no_resize():
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.begin_training()
nlp.initialize()
assert textcat.model.get_dim("nO") == 2
# this throws an error because the textcat can't be resized after initialization
with pytest.raises(ValueError):
textcat.add_label("NEUTRAL")
def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
textcat = nlp.add_pipe("textcat")
train_examples = []
@ -119,12 +119,12 @@ def test_begin_training_examples():
for label, value in annotations.get("cats").items():
textcat.add_label(label)
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)
def test_overfitting_IO():
@ -139,7 +139,7 @@ def test_overfitting_IO():
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert textcat.model.get_dim("nO") == 2
for i in range(50):
@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config):
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for label, value in annotations.get("cats").items():
textcat.add_label(label)
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(5):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)

View File

@ -88,7 +88,7 @@ def test_init_tok2vec():
nlp = English()
tok2vec = nlp.add_pipe("tok2vec")
assert tok2vec.listeners == []
nlp.begin_training()
nlp.initialize()
assert tok2vec.model.get_dim("nO")
@ -154,7 +154,7 @@ def test_tok2vec_listener():
# Check that the Tok2Vec component finds its listeners
assert tok2vec.listeners == []
optimizer = nlp.begin_training(lambda: train_examples)
optimizer = nlp.initialize(lambda: train_examples)
assert tok2vec.listeners == [tagger_tok2vec]
for i in range(5):

View File

@ -428,7 +428,7 @@ def test_issue999():
for _, offsets in TRAIN_DATA:
for start, end, label in offsets:
ner.add_label(label)
nlp.begin_training()
nlp.initialize()
for itn in range(20):
random.shuffle(TRAIN_DATA)
for raw_text, entity_offsets in TRAIN_DATA:

View File

@ -250,7 +250,7 @@ def test_issue1915():
ner = nlp.add_pipe("ner")
ner.add_label("answer")
with pytest.raises(ValueError):
nlp.begin_training(**cfg)
nlp.initialize(**cfg)
def test_issue1945():

View File

@ -30,7 +30,7 @@ def test_issue2179():
nlp = Italian()
ner = nlp.add_pipe("ner")
ner.add_label("CITIZENSHIP")
nlp.begin_training()
nlp.initialize()
nlp2 = Italian()
nlp2.add_pipe("ner")
assert len(nlp2.get_pipe("ner").labels) == 0

View File

@ -18,7 +18,7 @@ def test_issue2564():
nlp = Language()
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
doc = nlp("hello world")
assert doc.has_annotation("TAG")
docs = nlp.pipe(["hello", "world"])
@ -149,7 +149,7 @@ def test_issue2800():
ner = nlp.add_pipe("ner")
for entity_type in list(entity_types):
ner.add_label(entity_type)
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(20):
losses = {}
random.shuffle(train_data)

View File

@ -92,7 +92,7 @@ def test_issue3209():
nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("ANIMAL")
nlp.begin_training()
nlp.initialize()
move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
assert ner.move_names == move_names
nlp2 = English()
@ -239,7 +239,7 @@ def test_issue3456():
nlp = English()
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
list(nlp.pipe(["hi", ""]))

View File

@ -223,7 +223,7 @@ def test_issue3611():
textcat.add_label(label)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@ -268,7 +268,7 @@ def test_issue3830_no_subtok():
parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [_parser_example(parser)])
parser.initialize(lambda: [_parser_example(parser)])
assert "subtok" not in parser.labels
@ -283,7 +283,7 @@ def test_issue3830_with_subtok():
parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [_parser_example(parser)])
parser.initialize(lambda: [_parser_example(parser)])
assert "subtok" in parser.labels
@ -342,7 +342,7 @@ def test_issue3880():
nlp.add_pipe("parser").add_label("dep")
nlp.add_pipe("ner").add_label("PERSON")
nlp.add_pipe("tagger").add_label("NN")
nlp.begin_training()
nlp.initialize()
for doc in nlp.pipe(texts):
pass

View File

@ -66,7 +66,7 @@ def test_issue4030():
textcat.add_label(label)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@ -87,7 +87,7 @@ def test_issue4042():
# add ner pipe
ner = nlp.add_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.begin_training()
nlp.initialize()
# Add entity ruler
patterns = [
{"label": "MY_ORG", "pattern": "Apple"},
@ -118,7 +118,7 @@ def test_issue4042_bug2():
# add ner pipe
ner1 = nlp1.add_pipe("ner")
ner1.add_label("SOME_LABEL")
nlp1.begin_training()
nlp1.initialize()
# add a new label to the doc
doc1 = nlp1("What do you think about Apple ?")
assert len(ner1.labels) == 1
@ -244,7 +244,7 @@ def test_issue4267():
nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("PEOPLE")
nlp.begin_training()
nlp.initialize()
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")
@ -299,7 +299,7 @@ def test_issue4313():
config = {}
ner = nlp.create_pipe("ner", config=config)
ner.add_label("SOME_LABEL")
ner.begin_training(lambda: [])
ner.initialize(lambda: [])
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1
@ -327,7 +327,7 @@ def test_issue4348():
TRAIN_DATA = [example, example]
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(5):
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))

View File

@ -180,7 +180,7 @@ def test_issue4725_2():
vocab.set_vector("dog", data[1])
nlp = English(vocab=vocab)
nlp.add_pipe("ner")
nlp.begin_training()
nlp.initialize()
docs = ["Kurt is in London."] * 10
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
pass

View File

@ -64,7 +64,7 @@ def tagger():
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
return tagger
@ -85,7 +85,7 @@ def entity_linker():
# need to add model for two reasons:
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
nlp.begin_training()
nlp.initialize()
return entity_linker

View File

@ -25,7 +25,7 @@ def test_issue5551():
pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
for label in set(example[1]["cats"]):
pipe.add_label(label)
nlp.begin_training()
nlp.initialize()
# Store the result of each iteration
result = pipe.model.predict([nlp.make_doc(example[0])])

View File

@ -152,7 +152,7 @@ def test_serialize_nlp():
nlp_config = Config().from_str(nlp_config_string)
nlp = load_model_from_config(nlp_config, auto_fill=True)
nlp.get_pipe("tagger").add_label("A")
nlp.begin_training()
nlp.initialize()
assert "tok2vec" in nlp.pipe_names
assert "tagger" in nlp.pipe_names
assert "parser" not in nlp.pipe_names
@ -173,7 +173,7 @@ def test_serialize_custom_nlp():
parser_cfg = dict()
parser_cfg["model"] = {"@architectures": "my_test_parser"}
nlp.add_pipe("parser", config=parser_cfg)
nlp.begin_training()
nlp.initialize()
with make_tempdir() as d:
nlp.to_disk(d)
@ -191,7 +191,7 @@ def test_serialize_parser():
model_config = Config().from_str(parser_config_string)
parser = nlp.add_pipe("parser", config=model_config)
parser.add_label("nsubj")
nlp.begin_training()
nlp.initialize()
with make_tempdir() as d:
nlp.to_disk(d)

View File

@ -18,7 +18,7 @@ def nlp():
textcat = nlp.add_pipe("textcat")
for label in ("POSITIVE", "NEGATIVE"):
textcat.add_label(label)
nlp.begin_training()
nlp.initialize()
return nlp

View File

@ -47,7 +47,7 @@ def test_readers():
)
optimizer = T["optimizer"]
# simulate a training loop
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
for example in train_corpus(nlp):
nlp.update([example], sgd=optimizer)
scores = nlp.evaluate(list(dev_corpus(nlp)))
@ -99,7 +99,7 @@ def test_cat_readers(reader, additional_config):
)
optimizer = T["optimizer"]
# simulate a training loop
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
for example in train_corpus(nlp):
assert example.y.cats
# this shouldn't fail if each training example has at least one positive label

View File

@ -600,7 +600,7 @@ def _train_tuples(train_data):
train_examples = []
for t in train_data:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(5):
losses = {}
batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))

View File

@ -49,9 +49,9 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
msg.info(f"Resuming training for: {resume_components}")
nlp.resume_training(sgd=optimizer)
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
msg.good(f"Initialized pipeline components")
# Verify the config after calling 'begin_training' to ensure labels
# Verify the config after calling 'initialize' to ensure labels
# are properly initialized
verify_config(nlp)
if "pretraining" in config and config["pretraining"]:

View File

@ -518,7 +518,7 @@ Stacked ensemble of a bag-of-words model and a neural network model. The neural
network has an internal CNN Tok2Vec layer and uses attention.
| Name | Description |
| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
| `width` | Output dimension of the feature encoding step. ~~int~~ |
@ -527,7 +527,7 @@ network has an internal CNN Tok2Vec layer and uses attention.
| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `dropout` | The dropout rate. ~~float~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TextCatCNN.v1 {#TextCatCNN}
@ -556,10 +556,10 @@ vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.
| Name | Description |
| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TextCatBOW.v1 {#TextCatBOW}
@ -579,11 +579,11 @@ An ngram "bag-of-words" model. This architecture should run much faster than the
others, but may not be as accurate, especially if texts are short.
| Name | Description |
| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
@ -630,9 +630,9 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
[`Linear`](https://thinc.ai/api-layers#linear) output layer.
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ |
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.EmptyKB.v1 {#EmptyKB}

View File

@ -140,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## DependencyParser.begin_training {#begin_training tag="method"}
## DependencyParser.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -151,11 +151,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> parser = nlp.add_pipe("parser")
> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = parser.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -210,7 +216,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
>
> ```python
> parser = nlp.add_pipe("parser")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = parser.update(examples, sgd=optimizer)
> ```
@ -294,11 +300,10 @@ context, the original parameters are restored.
## DependencyParser.add_label {#add_label tag="method"}
Add a new label to the pipe. Note that you don't have to call this method if you
provide a **representative data sample** to the
[`begin_training`](#begin_training) method. In this case, all labels found in
the sample will be automatically added to the model, and the output dimension
will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
provide a **representative data sample** to the [`initialize`](#initialize)
method. In this case, all labels found in the sample will be automatically added
to the model, and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
> #### Example
>

View File

@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityLinker.begin_training {#begin_training tag="method"}
## EntityLinker.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -150,11 +150,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker", last=True)
> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = entity_linker.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -211,7 +217,7 @@ pipe's entity linking model and context encoder. Delegates to
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = entity_linker.update(examples, sgd=optimizer)
> ```

View File

@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityRecognizer.begin_training {#begin_training tag="method"}
## EntityRecognizer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -140,11 +140,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> ner = nlp.add_pipe("ner")
> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = ner.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -199,7 +205,7 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
>
> ```python
> ner = nlp.add_pipe("ner")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = ner.update(examples, sgd=optimizer)
> ```
@ -282,11 +288,10 @@ context, the original parameters are restored.
## EntityRecognizer.add_label {#add_label tag="method"}
Add a new label to the pipe. Note that you don't have to call this method if you
provide a **representative data sample** to the
[`begin_training`](#begin_training) method. In this case, all labels found in
the sample will be automatically added to the model, and the output dimension
will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
provide a **representative data sample** to the [`initialize`](#initialize)
method. In this case, all labels found in the sample will be automatically added
to the model, and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
> #### Example
>

View File

@ -201,30 +201,31 @@ more efficient than processing texts one-by-one.
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
## Language.begin_training {#begin_training tag="method"}
## Language.initialize {#initialize tag="method"}
Initialize the pipeline for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
function that returns an iterable of [`Example`](/api/example) objects. The data
examples can either be the full training data or a representative sample. They
are used to **initialize the models** of trainable pipeline components and are
passed each component's [`begin_training`](/api/pipe#begin_training) method, if
passed each component's [`initialize`](/api/pipe#initialize) method, if
available. Initialization includes validating the network,
[inferring missing shapes](/usage/layers-architectures#thinc-shape-inference)
and setting up the label scheme based on the data.
If no `get_examples` function is provided when calling `nlp.begin_training`, the
If no `get_examples` function is provided when calling `nlp.initialize`, the
pipeline components will be initialized with generic data. In this case, it is
crucial that the output dimension of each component has already been defined
either in the [config](/usage/training#config), or by calling
[`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for
the tagger or textcat).
<Infobox variant="warning" title="Changed in v3.0">
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
The `Language.update` method now takes a **function** that is called with no
arguments and returns a sequence of [`Example`](/api/example) objects instead of
tuples of `Doc` and `GoldParse` objects.
This method was previously called `begin_training`. It now also takes a
**function** that is called with no arguments and returns a sequence of
[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse`
objects.
</Infobox>
@ -232,7 +233,7 @@ tuples of `Doc` and `GoldParse` objects.
>
> ```python
> get_examples = lambda: examples
> optimizer = nlp.begin_training(get_examples)
> optimizer = nlp.initialize(get_examples)
> ```
| Name | Description |
@ -636,13 +637,13 @@ list, will be disabled. Under the hood, this method calls into
>
> ```python
> with nlp.select_pipes(disable=["tagger", "parser"]):
> nlp.begin_training()
> nlp.initialize()
>
> with nlp.select_pipes(enable="ner"):
> nlp.begin_training()
> nlp.initialize()
>
> disabled = nlp.select_pipes(disable=["tagger", "parser"])
> nlp.begin_training()
> nlp.initialize()
> disabled.restore()
> ```
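The `get_examples` argument has a strict contract: it must be a function that takes no arguments and returns an iterable of `Example` objects. A short sketch of valid and invalid calls, mirroring the tests in this commit (the training example itself is made up):

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
nlp.add_pipe("tagger")
train_examples = [
    Example.from_dict(nlp.make_doc("I like eggs"), {"tags": ["N", "V", "N"]})
]

# Valid: a zero-argument callable returning Example objects
optimizer = nlp.initialize(get_examples=lambda: train_examples)

# Invalid: passing the list itself raises a ValueError,
# and a callback that returns None raises a TypeError.
# nlp.initialize(get_examples=train_examples)
# nlp.initialize(get_examples=lambda: None)
```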

View File

@ -117,7 +117,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Morphologizer.begin_training {#begin_training tag="method"}
## Morphologizer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -133,7 +133,7 @@ setting up the label scheme based on the data.
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
> nlp.pipeline.append(morphologizer)
> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = morphologizer.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -189,7 +189,7 @@ Delegates to [`predict`](/api/morphologizer#predict) and
>
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = morphologizer.update(examples, sgd=optimizer)
> ```
@ -259,12 +259,11 @@ context, the original parameters are restored.
Add a new label to the pipe. If the `Morphologizer` should set annotations for
both `pos` and `morph`, the label should include the UPOS as the feature `POS`.
Raises an error if the output dimension is already set, or if the model has
already been fully [initialized](#begin_training). Note that you don't have to
call this method if you provide a **representative data sample** to the
[`begin_training`](#begin_training) method. In this case, all labels found in
the sample will be automatically added to the model, and the output dimension
will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
already been fully [initialized](#initialize). Note that you don't have to call
this method if you provide a **representative data sample** to the
[`initialize`](#initialize) method. In this case, all labels found in the sample
will be automatically added to the model, and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
> #### Example
>

View File

@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Pipe.begin_training {#begin_training tag="method"}
## Pipe.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -109,11 +109,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = pipe.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -180,7 +186,7 @@ predictions and gold-standard annotations, and update the component's model.
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = pipe.update(examples, sgd=optimizer)
> ```
@ -296,9 +302,9 @@ context, the original parameters are restored.
Add a new label to the pipe, to be predicted by the model. The actual
implementation depends on the specific component, but in general `add_label`
shouldn't be called if the output dimension is already set, or if the model has
already been fully [initialized](#begin_training). If these conditions are
violated, the function will raise an Error. The exception to this rule is when
the component is [resizable](#is_resizable), in which case
already been fully [initialized](#initialize). If these conditions are violated,
the function will raise an Error. The exception to this rule is when the
component is [resizable](#is_resizable), in which case
[`set_output`](#set_output) should be called to ensure that the model is
properly resized.
@ -314,9 +320,9 @@ This method needs to be overwritten with your own custom `add_label` method.
| **RETURNS** | 0 if the label is already present, otherwise 1. ~~int~~ |
Note that in general, you don't have to call `pipe.add_label` if you provide a
representative data sample to the [`begin_training`](#begin_training) method. In
this case, all labels found in the sample will be automatically added to the
model, and the output dimension will be
representative data sample to the [`initialize`](#initialize) method. In this
case, all labels found in the sample will be automatically added to the model,
and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
## Pipe.is_resizable {#is_resizable tag="method"}

View File

@ -114,7 +114,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## SentenceRecognizer.begin_training {#begin_training tag="method"}
## SentenceRecognizer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -129,7 +129,7 @@ setting up the label scheme based on the data.
>
> ```python
> senter = nlp.add_pipe("senter")
> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = senter.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -185,7 +185,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
>
> ```python
> senter = nlp.add_pipe("senter")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = senter.update(examples, sgd=optimizer)
> ```

View File

@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Tagger.begin_training {#begin_training tag="method"}
## Tagger.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -123,11 +123,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> tagger = nlp.add_pipe("tagger")
> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = tagger.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -183,7 +189,7 @@ Delegates to [`predict`](/api/tagger#predict) and
>
> ```python
> tagger = nlp.add_pipe("tagger")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = tagger.update(examples, sgd=optimizer)
> ```
@ -289,12 +295,12 @@ context, the original parameters are restored.
## Tagger.add_label {#add_label tag="method"}
Add a new label to the pipe. Raises an error if the output dimension is already
set, or if the model has already been fully [initialized](#begin_training). Note
set, or if the model has already been fully [initialized](#initialize). Note
that you don't have to call this method if you provide a **representative data
sample** to the [`begin_training`](#begin_training) method. In this case, all
labels found in the sample will be automatically added to the model, and the
output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
sample** to the [`initialize`](#initialize) method. In this case, all labels
found in the sample will be automatically added to the model, and the output
dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
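
For example, a minimal sketch of the representative-sample path (assuming the v3 `spacy.training.Example` import path and a made-up one-sentence sample):

```python
import spacy
from spacy.training import Example  # assumed v3 import path

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")

# No add_label calls needed: the tags in the sample are added automatically
examples = [Example.from_dict(nlp.make_doc("I ran."), {"tags": ["PRON", "VERB", "PUNCT"]})]
optimizer = nlp.initialize(lambda: examples)
print(tagger.labels)  # tag set inferred from the sample
```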
> #### Example
>

View File

@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## TextCategorizer.begin_training {#begin_training tag="method"}
## TextCategorizer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -136,11 +136,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> textcat = nlp.add_pipe("textcat")
> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = textcat.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -196,7 +202,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
>
> ```python
> textcat = nlp.add_pipe("textcat")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = textcat.update(examples, sgd=optimizer)
> ```
@ -303,12 +309,12 @@ Modify the pipe's model, to use the given parameter values.
## TextCategorizer.add_label {#add_label tag="method"}
Add a new label to the pipe. Raises an error if the output dimension is already
set, or if the model has already been fully [initialized](#begin_training). Note
set, or if the model has already been fully [initialized](#initialize). Note
that you don't have to call this method if you provide a **representative data
sample** to the [`begin_training`](#begin_training) method. In this case, all
labels found in the sample will be automatically added to the model, and the
output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
sample** to the [`initialize`](#initialize) method. In this case, all labels
found in the sample will be automatically added to the model, and the output
dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
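
A minimal sketch of the explicit `add_label` path, with made-up labels, might look like this:

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")

# Labels must be added before the model is fully initialized ...
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
optimizer = nlp.initialize()
# ... adding another label at this point would raise an error (see above)
```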
> #### Example
>

View File

@ -123,7 +123,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Tok2Vec.begin_training {#begin_training tag="method"}
## Tok2Vec.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -138,7 +138,7 @@ setting up the label scheme based on the data.
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = tok2vec.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -193,7 +193,7 @@ Delegates to [`predict`](/api/tok2vec#predict).
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = tok2vec.update(examples, sgd=optimizer)
> ```

View File

@ -158,7 +158,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Transformer.begin_training {#begin_training tag="method"}
## Transformer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -173,7 +173,7 @@ setting up the label scheme based on the data.
>
> ```python
> trf = nlp.add_pipe("transformer")
> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = trf.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -241,7 +241,7 @@ and call the optimizer, while the others simply increment the gradients.
>
> ```python
> trf = nlp.add_pipe("transformer")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = trf.update(examples, sgd=optimizer)
> ```

View File

@ -460,8 +460,8 @@ The built-in [pipeline components](/usage/processing-pipelines) in spaCy ensure
that their internal models are **always initialized** with appropriate sample
data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a
~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This
functionality is triggered when
[`nlp.begin_training`](/api/language#begin_training) is called.
functionality is triggered when [`nlp.initialize`](/api/language#initialize) is
called.
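
For a sense of what that shape inference looks like at the Thinc level, here's a minimal standalone sketch with made-up dimensions (not spaCy-specific):

```python
import numpy
from thinc.api import Linear

model = Linear()  # no input or output dimension set yet
X = numpy.zeros((8, 300), dtype="f")  # sample inputs
Y = numpy.zeros((8, 5), dtype="f")    # sample outputs
model.initialize(X=X, Y=Y)            # dimensions inferred from the sample data
print(model.get_dim("nI"), model.get_dim("nO"))  # 300 5
```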
### Dropout and normalization in Thinc {#thinc-dropout-norm}
@ -491,7 +491,7 @@ with Model.define_operators({">>": chain}):
<!-- TODO: write trainable component section
- Interaction with `predict`, `get_loss` and `set_annotations`
- Initialization life-cycle with `begin_training`, correlation with add_label
- Initialization life-cycle with `initialize`, correlation with add_label
Example: relation extraction component (implemented as project template)
Avoid duplication with usage/processing-pipelines#trainable-components ?
-->

View File

@ -1127,9 +1127,9 @@ customize how the model is updated from examples, how it's initialized, how the
loss is calculated and to add evaluation scores to the training output.
| Name | Description |
| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
| [`score`](/api/pipe#score)                   | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
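
Roughly, and assuming `examples` is a list of [`Example`](/api/example) objects, the methods above are triggered like this during a training run:

```python
optimizer = nlp.initialize(lambda: examples)  # calls each component's initialize()
losses = nlp.update(examples, sgd=optimizer)  # calls update(), which uses get_loss()
scores = nlp.evaluate(examples)               # calls score()
```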

View File

@ -1045,8 +1045,8 @@ of being dropped.
> - [`nlp`](/api/language): The `nlp` object with the pipeline components and
> their models.
> - [`nlp.begin_training`](/api/language#begin_training): Start the training and
> return an optimizer to update the component model weights.
> - [`nlp.initialize`](/api/language#initialize): Start the training and return
> an optimizer to update the component model weights.
> - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds
> state between updates.
> - [`nlp.update`](/api/language#update): Update component models with examples.
@ -1057,7 +1057,7 @@ of being dropped.
```python
### Example training loop
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for itn in range(100):
random.shuffle(train_data)
for raw_text, entity_offsets in train_data:

View File

@ -526,10 +526,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
[`Pipe.update`](/api/pipe#update) methods now all take batches of
[`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
raw text and a dictionary of annotations.
[`Language.begin_training`](/api/language#begin_training) and
[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
returns a sequence of `Example` objects to initialize the model instead of a
list of tuples.
[`Language.initialize`](/api/language#initialize) and
[`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
sequence of `Example` objects to initialize the model instead of a list of
tuples.
- The `begin_training` methods have been renamed to `initialize`.
- [`Matcher.add`](/api/matcher#add) and
[`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
patterns as the second argument (instead of a variable number of arguments).
@ -555,6 +556,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
| Removed | Replacement |
| -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
| `Language.begin_training`, `Pipe.begin_training`, ... | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... |
| `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) |
| `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) |
@ -936,7 +938,7 @@ TRAIN_DATA = [
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
("I like London.", {"entities": [(7, 13, "LOC")]}),
]
nlp.begin_training()
nlp.initialize()
for i in range(20):
random.shuffle(TRAIN_DATA)
for batch in minibatch(TRAIN_DATA):
@ -946,17 +948,18 @@ for i in range(20):
nlp.update(examples)
```
[`Language.begin_training`](/api/language#begin_training) and
[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
returns a sequence of `Example` objects to initialize the model instead of a
list of tuples. The data examples are used to **initialize the models** of
`Language.begin_training` and `Pipe.begin_training` have been renamed to
[`Language.initialize`](/api/language#initialize) and
[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function
that returns a sequence of `Example` objects to initialize the model instead of
a list of tuples. The data examples are used to **initialize the models** of
trainable pipeline components, which includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme.
```diff
- nlp.begin_training(examples)
+ nlp.begin_training(lambda: examples)
- nlp.initialize(examples)
+ nlp.initialize(lambda: examples)
```
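
Assuming the v3 `spacy.training.Example` import path and the `TRAIN_DATA` and `nlp` from the loop above, the `examples` passed to the callback might be built like this:

```python
from spacy.training import Example  # assumed v3 import path

examples = []
for text, annots in TRAIN_DATA:
    examples.append(Example.from_dict(nlp.make_doc(text), annots))

optimizer = nlp.initialize(lambda: examples)
```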
#### Packaging trained pipelines {#migrating-training-packaging}