diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 0b4db70b6..eca85dc04 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -103,12 +103,12 @@ def debug_model( with data_validation(False): try: train_corpus = dot_to_object(config, config["training"]["train_corpus"]) - nlp.begin_training(lambda: train_corpus(nlp)) + nlp.initialize(lambda: train_corpus(nlp)) msg.info("Initialized the model with the training corpus.") except ValueError: try: _set_output_dim(nO=7, model=model) - nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X]) + nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X]) msg.info("Initialized the model with dummy data.") except Exception: msg.fail( diff --git a/spacy/errors.py b/spacy/errors.py index 640419182..1f9bcb0ae 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -85,6 +85,7 @@ class Warnings: "attribute or operator.") # TODO: fix numbering after merging develop into master + W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.") W090 = ("Could not locate any {format} files in path '{path}'.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") @@ -306,7 +307,7 @@ class Errors: "settings: {opts}") E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}") E109 = ("Component '{name}' could not be run. Did you forget to " - "call begin_training()?") + "call initialize()?") E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}") E111 = ("Pickling a token is not supported, because tokens are only views " "of the parent Doc and can't exist on their own. A pickled token " @@ -376,7 +377,7 @@ class Errors: "provided {found}.") E143 = ("Labels for component '{name}' not initialized. This can be fixed " "by calling add_label, or by providing a representative batch of " - "examples to the component's begin_training method.") + "examples to the component's initialize method.") E145 = ("Error reading `{param}` from input file.") E146 = ("Could not access `{path}`.") E147 = ("Unexpected error in the {method} functionality of the " @@ -517,7 +518,7 @@ class Errors: "but the provided argument {loc} points to a file.") E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does " "not seem to exist.") - E930 = ("Received invalid get_examples callback in {name}.begin_training. " + E930 = ("Received invalid get_examples callback in {name}.initialize. " "Expected function that returns an iterable of Example objects but " "got: {obj}") E931 = ("Encountered Pipe subclass without Pipe.{method} method in component " diff --git a/spacy/language.py b/spacy/language.py index c1d2df026..a5b78b178 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1154,6 +1154,16 @@ class Language: *, sgd: Optional[Optimizer] = None, device: int = -1, + ) -> Optimizer: + warnings.warn(Warnings.W089, DeprecationWarning) + return self.initialize(get_examples, sgd=sgd, device=device) + + def initialize( + self, + get_examples: Optional[Callable[[], Iterable[Example]]] = None, + *, + sgd: Optional[Optimizer] = None, + device: int = -1, ) -> Optimizer: """Initialize the pipe for training, using data examples if available. @@ -1163,11 +1173,11 @@ class Language: create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. 
- DOCS: https://nightly.spacy.io/api/language#begin_training + DOCS: https://nightly.spacy.io/api/language#initialize """ if get_examples is None: util.logger.debug( - "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples" + "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples" ) doc = Doc(self.vocab, words=["x", "y", "z"]) get_examples = lambda: [Example.from_dict(doc, {})] @@ -1179,7 +1189,7 @@ class Language: for example in get_examples(): if not isinstance(example, Example): err = Errors.E978.format( - name="Language.begin_training", types=type(example) + name="Language.initialize", types=type(example) ) raise ValueError(err) else: @@ -1198,8 +1208,8 @@ class Language: sgd = create_default_optimizer() self._optimizer = sgd for name, proc in self.pipeline: - if hasattr(proc, "begin_training"): - proc.begin_training( + if hasattr(proc, "initialize"): + proc.initialize( get_examples, pipeline=self.pipeline, sgd=self._optimizer ) self._link_components() diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index a447434d2..95effac59 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -132,7 +132,7 @@ cdef class DependencyParser(Parser): labeller.model.set_dim("nO", len(self.labels)) if labeller.model.has_ref("output_layer"): labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) - labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd) + labeller.initialize(get_examples, pipeline=pipeline, sgd=sgd) @property def labels(self): diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 039e2a891..0f33378b4 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -140,7 +140,7 @@ class EntityLinker(Pipe): if len(self.kb) == 0: raise ValueError(Errors.E139.format(name=self.name)) - def begin_training( + def initialize( self, get_examples: Callable[[], Iterable[Example]], *, @@ -159,7 +159,7 @@ class EntityLinker(Pipe): create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/entitylinker#begin_training + DOCS: https://nightly.spacy.io/api/entitylinker#initialize """ self._ensure_examples(get_examples) self._require_kb() diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 5fee9a900..d035172a8 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -129,7 +129,7 @@ class Morphologizer(Tagger): self.cfg["labels_pos"][norm_label] = POS_IDS[pos] return 1 - def begin_training(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, pipeline=None, sgd=None): """Initialize the pipe for training, using a representative set of data examples. @@ -142,7 +142,7 @@ class Morphologizer(Tagger): create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. 
- DOCS: https://nightly.spacy.io/api/morphologizer#begin_training + DOCS: https://nightly.spacy.io/api/morphologizer#initialize """ self._ensure_examples(get_examples) # First, fetch all labels from the data diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index 2f8940124..3fd034b30 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -81,7 +81,7 @@ class MultitaskObjective(Tagger): def set_annotations(self, docs, dep_ids): pass - def begin_training(self, get_examples, pipeline=None, sgd=None): + def initialize(self, get_examples, pipeline=None, sgd=None): if not hasattr(get_examples, "__call__"): err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples)) raise ValueError(err) @@ -177,10 +177,10 @@ class ClozeMultitask(Pipe): def set_annotations(self, docs, dep_ids): pass - def begin_training(self, get_examples, pipeline=None, sgd=None): + def initialize(self, get_examples, pipeline=None, sgd=None): self.model.initialize() # TODO: fix initialization by defining X and Y X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) - self.model.output_layer.begin_training(X) + self.model.output_layer.initialize(X) if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index fc0dda40d..effcef2e3 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -103,7 +103,7 @@ cdef class EntityRecognizer(Parser): labeller.model.set_dim("nO", len(self.labels)) if labeller.model.has_ref("output_layer"): labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) - labeller.begin_training(get_examples, pipeline=pipeline) + labeller.initialize(get_examples, pipeline=pipeline) @property def labels(self): diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 324c8e19c..bff2be1af 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -183,7 +183,7 @@ cdef class Pipe: """ return util.create_default_optimizer() - def begin_training(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, pipeline=None, sgd=None): """Initialize the pipe for training, using data examples if available. This method needs to be implemented by each Pipe component, ensuring the internal model (if available) is initialized properly @@ -198,7 +198,7 @@ cdef class Pipe: create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. 
- DOCS: https://nightly.spacy.io/api/pipe#begin_training + DOCS: https://nightly.spacy.io/api/pipe#initialize """ raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name)) diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 2882f6f8b..0f49033ff 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -58,7 +58,7 @@ class Sentencizer(Pipe): else: self.punct_chars = set(self.default_punct_chars) - def begin_training(self, get_examples, pipeline=None, sgd=None): + def initialize(self, get_examples, pipeline=None, sgd=None): pass def __call__(self, doc): diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index da85a9cf2..68a9860a5 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -124,7 +124,7 @@ class SentenceRecognizer(Tagger): raise ValueError("nan value when computing loss") return float(loss), d_scores - def begin_training(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, pipeline=None, sgd=None): """Initialize the pipe for training, using a representative set of data examples. @@ -137,7 +137,7 @@ class SentenceRecognizer(Tagger): create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training + DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize """ self._ensure_examples(get_examples) doc_sample = [] diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 3efe29916..66f8b38b6 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -256,7 +256,7 @@ class Tagger(Pipe): raise ValueError("nan value when computing loss") return float(loss), d_scores - def begin_training(self, get_examples, *, pipeline=None, sgd=None): + def initialize(self, get_examples, *, pipeline=None, sgd=None): """Initialize the pipe for training, using a representative set of data examples. @@ -269,7 +269,7 @@ class Tagger(Pipe): create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/tagger#begin_training + DOCS: https://nightly.spacy.io/api/tagger#initialize """ self._ensure_examples(get_examples) doc_sample = [] diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 6b8c0ca65..37665adfc 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -334,7 +334,7 @@ class TextCategorizer(Pipe): self.labels = tuple(list(self.labels) + [label]) return 1 - def begin_training( + def initialize( self, get_examples: Callable[[], Iterable[Example]], *, @@ -353,7 +353,7 @@ class TextCategorizer(Pipe): create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. - DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training + DOCS: https://nightly.spacy.io/api/textcategorizer#initialize """ self._ensure_examples(get_examples) subbatch = [] # Select a subbatch of examples to initialize the model diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 9ab4e42b7..7c8bbf5e5 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -203,7 +203,7 @@ class Tok2Vec(Pipe): def get_loss(self, examples, scores) -> None: pass - def begin_training( + def initialize( self, get_examples: Callable[[], Iterable[Example]], *, @@ -222,7 +222,7 @@ class Tok2Vec(Pipe): create_optimizer if it doesn't exist. RETURNS (thinc.api.Optimizer): The optimizer. 
- DOCS: https://nightly.spacy.io/api/tok2vec#begin_training + DOCS: https://nightly.spacy.io/api/tok2vec#initialize """ self._ensure_examples(get_examples) doc_sample = [] diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 1350e1f12..5a4503cf9 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -405,7 +405,7 @@ cdef class Parser(Pipe): def set_output(self, nO): self.model.attrs["resize_output"](self.model, nO) - def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): + def initialize(self, get_examples, pipeline=None, sgd=None, **kwargs): self._ensure_examples(get_examples) self.cfg.update(kwargs) lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 86aa883bd..fa0206fdd 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab): cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] ner = EntityRecognizer(en_vocab, model, **config) - ner.begin_training(lambda: [_ner_example(ner)]) + ner.initialize(lambda: [_ner_example(ner)]) ner(doc) doc.ents = [("ANIMAL", 3, 4)] @@ -48,7 +48,7 @@ def test_ents_reset(en_vocab): cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] ner = EntityRecognizer(en_vocab, model, **config) - ner.begin_training(lambda: [_ner_example(ner)]) + ner.initialize(lambda: [_ner_example(ner)]) ner(doc) orig_iobs = [t.ent_iob_ for t in doc] doc.ents = list(doc.ents) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index cd376e0fc..fb1eabf7d 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -35,7 +35,7 @@ def test_init_parser(parser): def _train_parser(parser): fix_random_seed(1) parser.add_label("left") - parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg) + parser.initialize(lambda: [_parser_example(parser)], **parser.cfg) sgd = Adam(0.001) for i in range(5): @@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly(): ner1.add_label("C") ner1.add_label("B") ner1.add_label("A") - ner1.begin_training(lambda: [_ner_example(ner1)]) + ner1.initialize(lambda: [_ner_example(ner1)]) ner2 = EntityRecognizer(Vocab(), model, **config) # the second model needs to be resized before we can call from_bytes diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index cd5581769..b657ae2e8 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -202,7 +202,7 @@ def test_train_empty(): train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) ner = nlp.add_pipe("ner", last=True) ner.add_label("PERSON") - nlp.begin_training() + nlp.initialize() for itn in range(2): losses = {} batches = util.minibatch(train_examples, size=8) @@ -213,7 +213,7 @@ def test_train_empty(): def test_overwrite_token(): nlp = English() nlp.add_pipe("ner") - nlp.begin_training() + nlp.initialize() # The untrained NER will predict O for each token doc = nlp("I live in New York") assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"] @@ -235,7 +235,7 @@ def test_empty_ner(): nlp = English() ner = nlp.add_pipe("ner") ner.add_label("MY_LABEL") - nlp.begin_training() + nlp.initialize() doc = nlp("John is watching the news about Croatia's elections") # if this goes wrong, 
the initialization of the parser's upper layer is probably broken result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"] @@ -254,7 +254,7 @@ def test_ruler_before_ner(): # 2: untrained NER - should set everything else to O untrained_ner = nlp.add_pipe("ner") untrained_ner.add_label("MY_LABEL") - nlp.begin_training() + nlp.initialize() doc = nlp("This is Antti Korhonen speaking in Finland") expected_iobs = ["B", "O", "O", "O", "O", "O", "O"] expected_types = ["THING", "", "", "", "", "", ""] @@ -269,7 +269,7 @@ def test_ner_before_ruler(): # 1: untrained NER - should set everything to O untrained_ner = nlp.add_pipe("ner", name="uner") untrained_ner.add_label("MY_LABEL") - nlp.begin_training() + nlp.initialize() # 2 : Entity Ruler - should set "this" to B and keep everything else O patterns = [{"label": "THING", "pattern": "This"}] @@ -290,7 +290,7 @@ def test_block_ner(): nlp.add_pipe("blocker", config={"start": 2, "end": 5}) untrained_ner = nlp.add_pipe("ner") untrained_ner.add_label("MY_LABEL") - nlp.begin_training() + nlp.initialize() doc = nlp("This is Antti L Korhonen speaking in Finland") expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"] expected_types = ["", "", "", "", "", "", "", ""] @@ -307,7 +307,7 @@ def test_overfitting_IO(): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for ent in annotations.get("entities"): ner.add_label(ent[2]) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(50): losses = {} @@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog): assert not len(nlp.vocab.lookups) nlp.add_pipe("ner") with caplog.at_level(logging.DEBUG): - nlp.begin_training() + nlp.initialize() assert "W033" in caplog.text caplog.clear() nlp.vocab.lookups.add_table("lexeme_norm") nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" with caplog.at_level(logging.DEBUG): - nlp.begin_training() + nlp.initialize() assert "W033" not in caplog.text @@ -358,5 +358,5 @@ class BlockerComponent1: self.name = name def __call__(self, doc): - doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified") + doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified") return doc diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 8648f2018..ffb6f23f1 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -191,7 +191,7 @@ def test_overfitting_IO(): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for dep in annotations.get("deps", []): parser.add_label(dep) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(100): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index e8dfa68c7..d8f861b02 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -34,7 +34,7 @@ def parser(vocab): parser.cfg["hidden_width"] = 32 # parser.add_label('right') parser.add_label("left") - parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg) + parser.initialize(lambda: [_parser_example(parser)], **parser.cfg) sgd = Adam(0.001) for i in range(10): diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 878f41a28..d5c8de36b 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -134,7 +134,7 @@ def test_kb_undefined(nlp): """Test that the EL can't 
train without defining a KB""" entity_linker = nlp.add_pipe("entity_linker", config={}) with pytest.raises(ValueError): - entity_linker.begin_training(lambda: []) + entity_linker.initialize(lambda: []) def test_kb_empty(nlp): @@ -143,7 +143,7 @@ def test_kb_empty(nlp): entity_linker = nlp.add_pipe("entity_linker", config=config) assert len(entity_linker.kb) == 0 with pytest.raises(ValueError): - entity_linker.begin_training(lambda: []) + entity_linker.initialize(lambda: []) def test_kb_serialize(nlp): @@ -360,7 +360,7 @@ def test_preserving_links_asdoc(nlp): ruler.add_patterns(patterns) el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False} entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True) - nlp.begin_training() + nlp.initialize() assert entity_linker.model.get_dim("nO") == vector_length # test whether the entity links are preserved by the `as_doc()` function @@ -463,7 +463,7 @@ def test_overfitting_IO(): ) # train the NEL pipe - optimizer = nlp.begin_training(get_examples=lambda: train_examples) + optimizer = nlp.initialize(get_examples=lambda: train_examples) assert entity_linker.model.get_dim("nO") == vector_length assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 864c7332e..c86ee3617 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -33,7 +33,7 @@ def test_no_label(): nlp = Language() nlp.add_pipe("morphologizer") with pytest.raises(ValueError): - nlp.begin_training() + nlp.initialize() def test_implicit_label(): @@ -42,7 +42,7 @@ def test_implicit_label(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize(get_examples=lambda: train_examples) def test_no_resize(): @@ -50,13 +50,13 @@ def test_no_resize(): morphologizer = nlp.add_pipe("morphologizer") morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN") morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB") - nlp.begin_training() + nlp.initialize() # this throws an error because the morphologizer can't be resized after initialization with pytest.raises(ValueError): morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ") -def test_begin_training_examples(): +def test_initialize_examples(): nlp = Language() morphologizer = nlp.add_pipe("morphologizer") morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN") @@ -64,12 +64,12 @@ def test_begin_training_examples(): for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) # you shouldn't really call this more than once, but for testing it should be fine - nlp.begin_training() - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: None) + nlp.initialize(get_examples=lambda: None) with pytest.raises(ValueError): - nlp.begin_training(get_examples=train_examples) + nlp.initialize(get_examples=train_examples) def test_overfitting_IO(): @@ -79,7 +79,7 @@ def test_overfitting_IO(): train_examples = [] for inst in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) - optimizer = nlp.begin_training(get_examples=lambda: train_examples) + optimizer = nlp.initialize(get_examples=lambda: 
train_examples) for i in range(50): losses = {} diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 5827f8ff1..5d8a8be41 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -31,19 +31,19 @@ TRAIN_DATA = [ ] -def test_begin_training_examples(): +def test_initialize_examples(): nlp = Language() nlp.add_pipe("senter") train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) # you shouldn't really call this more than once, but for testing it should be fine - nlp.begin_training() - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: None) + nlp.initialize(get_examples=lambda: None) with pytest.raises(ValueError): - nlp.begin_training(get_examples=train_examples) + nlp.initialize(get_examples=train_examples) def test_overfitting_IO(): @@ -58,7 +58,7 @@ def test_overfitting_IO(): train_examples[1].reference[11].is_sent_start = False nlp.add_pipe("senter") - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(200): losses = {} diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index cd5927675..69a6dd414 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -15,14 +15,14 @@ def test_label_types(): tagger.add_label(9) -def test_tagger_begin_training_tag_map(): - """Test that Tagger.begin_training() without gold tuples does not clobber +def test_tagger_initialize_tag_map(): + """Test that Tagger.initialize() without gold tuples does not clobber the tag map.""" nlp = Language() tagger = nlp.add_pipe("tagger") orig_tag_count = len(tagger.labels) tagger.add_label("A") - nlp.begin_training() + nlp.initialize() assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels) @@ -38,7 +38,7 @@ def test_no_label(): nlp = Language() nlp.add_pipe("tagger") with pytest.raises(ValueError): - nlp.begin_training() + nlp.initialize() def test_no_resize(): @@ -47,7 +47,7 @@ def test_no_resize(): tagger.add_label("N") tagger.add_label("V") assert tagger.labels == ("N", "V") - nlp.begin_training() + nlp.initialize() assert tagger.model.get_dim("nO") == 2 # this throws an error because the tagger can't be resized after initialization with pytest.raises(ValueError): @@ -60,10 +60,10 @@ def test_implicit_label(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize(get_examples=lambda: train_examples) -def test_begin_training_examples(): +def test_initialize_examples(): nlp = Language() tagger = nlp.add_pipe("tagger") train_examples = [] @@ -72,16 +72,16 @@ def test_begin_training_examples(): for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) # you shouldn't really call this more than once, but for testing it should be fine - nlp.begin_training() - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: None) + nlp.initialize(get_examples=lambda: None) with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: train_examples[0]) + nlp.initialize(get_examples=lambda: train_examples[0]) with pytest.raises(ValueError): - 
nlp.begin_training(get_examples=lambda: []) + nlp.initialize(get_examples=lambda: []) with pytest.raises(ValueError): - nlp.begin_training(get_examples=train_examples) + nlp.initialize(get_examples=train_examples) def test_overfitting_IO(): @@ -91,7 +91,7 @@ def test_overfitting_IO(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - optimizer = nlp.begin_training(get_examples=lambda: train_examples) + optimizer = nlp.initialize(get_examples=lambda: train_examples) assert tagger.model.get_dim("nO") == len(TAGS) for i in range(50): @@ -122,4 +122,4 @@ def test_tagger_requires_labels(): nlp = English() nlp.add_pipe("tagger") with pytest.raises(ValueError): - nlp.begin_training() + nlp.initialize() diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 02e189834..2870229c8 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -26,7 +26,7 @@ def test_simple_train(): nlp = Language() textcat = nlp.add_pipe("textcat") textcat.add_label("answer") - nlp.begin_training() + nlp.initialize() for i in range(5): for text, answer in [ ("aaaa", 1.0), @@ -56,7 +56,7 @@ def test_textcat_learns_multilabel(): textcat = TextCategorizer(nlp.vocab, width=8) for letter in letters: textcat.add_label(letter) - optimizer = textcat.begin_training(lambda: []) + optimizer = textcat.initialize(lambda: []) for i in range(30): losses = {} examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs] @@ -86,7 +86,7 @@ def test_no_label(): nlp = Language() nlp.add_pipe("textcat") with pytest.raises(ValueError): - nlp.begin_training() + nlp.initialize() def test_implicit_label(): @@ -95,7 +95,7 @@ def test_implicit_label(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize(get_examples=lambda: train_examples) def test_no_resize(): @@ -103,14 +103,14 @@ def test_no_resize(): textcat = nlp.add_pipe("textcat") textcat.add_label("POSITIVE") textcat.add_label("NEGATIVE") - nlp.begin_training() + nlp.initialize() assert textcat.model.get_dim("nO") == 2 # this throws an error because the textcat can't be resized after initialization with pytest.raises(ValueError): textcat.add_label("NEUTRAL") -def test_begin_training_examples(): +def test_initialize_examples(): nlp = Language() textcat = nlp.add_pipe("textcat") train_examples = [] @@ -119,12 +119,12 @@ def test_begin_training_examples(): for label, value in annotations.get("cats").items(): textcat.add_label(label) # you shouldn't really call this more than once, but for testing it should be fine - nlp.begin_training() - nlp.begin_training(get_examples=lambda: train_examples) + nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) with pytest.raises(TypeError): - nlp.begin_training(get_examples=lambda: None) + nlp.initialize(get_examples=lambda: None) with pytest.raises(ValueError): - nlp.begin_training(get_examples=train_examples) + nlp.initialize(get_examples=train_examples) def test_overfitting_IO(): @@ -139,7 +139,7 @@ def test_overfitting_IO(): train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - optimizer = nlp.begin_training(get_examples=lambda: train_examples) + optimizer = nlp.initialize(get_examples=lambda: train_examples) assert textcat.model.get_dim("nO") == 2 for i in range(50): @@ -195,7 
+195,7 @@ def test_textcat_configs(textcat_config): train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for label, value in annotations.get("cats").items(): textcat.add_label(label) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(5): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 558b9079c..f84b78247 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -88,7 +88,7 @@ def test_init_tok2vec(): nlp = English() tok2vec = nlp.add_pipe("tok2vec") assert tok2vec.listeners == [] - nlp.begin_training() + nlp.initialize() assert tok2vec.model.get_dim("nO") @@ -154,7 +154,7 @@ def test_tok2vec_listener(): # Check that the Tok2Vec component finds it listeners assert tok2vec.listeners == [] - optimizer = nlp.begin_training(lambda: train_examples) + optimizer = nlp.initialize(lambda: train_examples) assert tok2vec.listeners == [tagger_tok2vec] for i in range(5): diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index d841ee24b..6bb71f6f4 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -428,7 +428,7 @@ def test_issue999(): for _, offsets in TRAIN_DATA: for start, end, label in offsets: ner.add_label(label) - nlp.begin_training() + nlp.initialize() for itn in range(20): random.shuffle(TRAIN_DATA) for raw_text, entity_offsets in TRAIN_DATA: diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index dce3e8298..f85ec70e1 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -250,7 +250,7 @@ def test_issue1915(): ner = nlp.add_pipe("ner") ner.add_label("answer") with pytest.raises(ValueError): - nlp.begin_training(**cfg) + nlp.initialize(**cfg) def test_issue1945(): diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index c4c755153..09baab4d8 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -30,7 +30,7 @@ def test_issue2179(): nlp = Italian() ner = nlp.add_pipe("ner") ner.add_label("CITIZENSHIP") - nlp.begin_training() + nlp.initialize() nlp2 = Italian() nlp2.add_pipe("ner") assert len(nlp2.get_pipe("ner").labels) == 0 diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 5895b616e..4952a545d 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -18,7 +18,7 @@ def test_issue2564(): nlp = Language() tagger = nlp.add_pipe("tagger") tagger.add_label("A") - nlp.begin_training() + nlp.initialize() doc = nlp("hello world") assert doc.has_annotation("TAG") docs = nlp.pipe(["hello", "world"]) @@ -149,7 +149,7 @@ def test_issue2800(): ner = nlp.add_pipe("ner") for entity_type in list(entity_types): ner.add_label(entity_type) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(20): losses = {} random.shuffle(train_data) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 56ef23dbf..6fc42e83f 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -92,7 +92,7 @@ def test_issue3209(): nlp = English() ner = 
nlp.add_pipe("ner") ner.add_label("ANIMAL") - nlp.begin_training() + nlp.initialize() move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] assert ner.move_names == move_names nlp2 = English() @@ -239,7 +239,7 @@ def test_issue3456(): nlp = English() tagger = nlp.add_pipe("tagger") tagger.add_label("A") - nlp.begin_training() + nlp.initialize() list(nlp.pipe(["hi", ""])) diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index 304e654c3..31e441d86 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -223,7 +223,7 @@ def test_issue3611(): textcat.add_label(label) # training the network with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(3): losses = {} batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) @@ -268,7 +268,7 @@ def test_issue3830_no_subtok(): parser = DependencyParser(Vocab(), model, **config) parser.add_label("nsubj") assert "subtok" not in parser.labels - parser.begin_training(lambda: [_parser_example(parser)]) + parser.initialize(lambda: [_parser_example(parser)]) assert "subtok" not in parser.labels @@ -283,7 +283,7 @@ def test_issue3830_with_subtok(): parser = DependencyParser(Vocab(), model, **config) parser.add_label("nsubj") assert "subtok" not in parser.labels - parser.begin_training(lambda: [_parser_example(parser)]) + parser.initialize(lambda: [_parser_example(parser)]) assert "subtok" in parser.labels @@ -342,7 +342,7 @@ def test_issue3880(): nlp.add_pipe("parser").add_label("dep") nlp.add_pipe("ner").add_label("PERSON") nlp.add_pipe("tagger").add_label("NN") - nlp.begin_training() + nlp.initialize() for doc in nlp.pipe(texts): pass diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 7b7ddfe0d..753cff37f 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -66,7 +66,7 @@ def test_issue4030(): textcat.add_label(label) # training the network with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(3): losses = {} batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) @@ -87,7 +87,7 @@ def test_issue4042(): # add ner pipe ner = nlp.add_pipe("ner") ner.add_label("SOME_LABEL") - nlp.begin_training() + nlp.initialize() # Add entity ruler patterns = [ {"label": "MY_ORG", "pattern": "Apple"}, @@ -118,7 +118,7 @@ def test_issue4042_bug2(): # add ner pipe ner1 = nlp1.add_pipe("ner") ner1.add_label("SOME_LABEL") - nlp1.begin_training() + nlp1.initialize() # add a new label to the doc doc1 = nlp1("What do you think about Apple ?") assert len(ner1.labels) == 1 @@ -244,7 +244,7 @@ def test_issue4267(): nlp = English() ner = nlp.add_pipe("ner") ner.add_label("PEOPLE") - nlp.begin_training() + nlp.initialize() assert "ner" in nlp.pipe_names # assert that we have correct IOB annotations doc1 = nlp("hi") @@ -299,7 +299,7 @@ def test_issue4313(): config = {} ner = nlp.create_pipe("ner", config=config) ner.add_label("SOME_LABEL") - ner.begin_training(lambda: []) + ner.initialize(lambda: []) # add a new label to the doc doc = nlp("What do you think about Apple ?") assert len(ner.labels) == 1 @@ -327,7 +327,7 @@ def test_issue4348(): TRAIN_DATA = [example, example] tagger = nlp.add_pipe("tagger") tagger.add_label("A") - optimizer = nlp.begin_training() + optimizer = 
nlp.initialize() for i in range(5): losses = {} batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index e351858f5..6dbbc233b 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -180,7 +180,7 @@ def test_issue4725_2(): vocab.set_vector("dog", data[1]) nlp = English(vocab=vocab) nlp.add_pipe("ner") - nlp.begin_training() + nlp.initialize() docs = ["Kurt is in London."] * 10 for _ in nlp.pipe(docs, batch_size=2, n_process=2): pass diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 531e48ec3..5e320996a 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -64,7 +64,7 @@ def tagger(): # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization tagger.add_label("A") - nlp.begin_training() + nlp.initialize() return tagger @@ -85,7 +85,7 @@ def entity_linker(): # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization - nlp.begin_training() + nlp.initialize() return entity_linker diff --git a/spacy/tests/regression/test_issue5551.py b/spacy/tests/regression/test_issue5551.py index b7139d463..655764362 100644 --- a/spacy/tests/regression/test_issue5551.py +++ b/spacy/tests/regression/test_issue5551.py @@ -25,7 +25,7 @@ def test_issue5551(): pipe = nlp.add_pipe(component, config=pipe_cfg, last=True) for label in set(example[1]["cats"]): pipe.add_label(label) - nlp.begin_training() + nlp.initialize() # Store the result of each iteration result = pipe.model.predict([nlp.make_doc(example[0])]) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index eb5f15007..663e76550 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -152,7 +152,7 @@ def test_serialize_nlp(): nlp_config = Config().from_str(nlp_config_string) nlp = load_model_from_config(nlp_config, auto_fill=True) nlp.get_pipe("tagger").add_label("A") - nlp.begin_training() + nlp.initialize() assert "tok2vec" in nlp.pipe_names assert "tagger" in nlp.pipe_names assert "parser" not in nlp.pipe_names @@ -173,7 +173,7 @@ def test_serialize_custom_nlp(): parser_cfg = dict() parser_cfg["model"] = {"@architectures": "my_test_parser"} nlp.add_pipe("parser", config=parser_cfg) - nlp.begin_training() + nlp.initialize() with make_tempdir() as d: nlp.to_disk(d) @@ -191,7 +191,7 @@ def test_serialize_parser(): model_config = Config().from_str(parser_config_string) parser = nlp.add_pipe("parser", config=model_config) parser.add_label("nsubj") - nlp.begin_training() + nlp.initialize() with make_tempdir() as d: nlp.to_disk(d) diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index da46ad424..6a487303e 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -18,7 +18,7 @@ def nlp(): textcat = nlp.add_pipe("textcat") for label in ("POSITIVE", "NEGATIVE"): textcat.add_label(label) - nlp.begin_training() + nlp.initialize() return nlp diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 5c02aca36..ea39e8b90 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -47,7 +47,7 @@ def test_readers(): ) optimizer = 
T["optimizer"] # simulate a training loop - nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) for example in train_corpus(nlp): nlp.update([example], sgd=optimizer) scores = nlp.evaluate(list(dev_corpus(nlp))) @@ -99,7 +99,7 @@ def test_cat_readers(reader, additional_config): ) optimizer = T["optimizer"] # simulate a training loop - nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) for example in train_corpus(nlp): assert example.y.cats # this shouldn't fail if each training example has at least one positive label diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index a04e6aadd..9655dd1b6 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -600,7 +600,7 @@ def _train_tuples(train_data): train_examples = [] for t in train_data: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - optimizer = nlp.begin_training() + optimizer = nlp.initialize() for i in range(5): losses = {} batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 24b00a764..23debfb28 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -49,9 +49,9 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu msg.info(f"Resuming training for: {resume_components}") nlp.resume_training(sgd=optimizer) with nlp.select_pipes(disable=[*frozen_components, *resume_components]): - nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) msg.good(f"Initialized pipeline components") - # Verify the config after calling 'begin_training' to ensure labels + # Verify the config after calling 'initialize' to ensure labels # are properly initialized verify_config(nlp) if "pretraining" in config and config["pretraining"]: diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index ef2666ec0..3f6258be9 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -517,18 +517,18 @@ specific data and challenge. Stacked ensemble of a bag-of-words model and a neural network model. The neural network has an internal CNN Tok2Vec layer and uses attention. -| Name | Description | -| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | -| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ | -| `width` | Output dimension of the feature encoding step. ~~int~~ | -| `embed_size` | Input dimension of the feature encoding step. ~~int~~ | -| `conv_depth` | Depth of the tok2vec layer. ~~int~~ | -| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ | -| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | -| `dropout` | The dropout rate. ~~float~~ | -| `nO` | Output dimension, determined by the number of different labels. 
If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | +| Name | Description | +| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ | +| `width` | Output dimension of the feature encoding step. ~~int~~ | +| `embed_size` | Input dimension of the feature encoding step. ~~int~~ | +| `conv_depth` | Depth of the tok2vec layer. ~~int~~ | +| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ | +| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | +| `dropout` | The dropout rate. ~~float~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ### spacy.TextCatCNN.v1 {#TextCatCNN} @@ -555,12 +555,12 @@ A neural network model where token vectors are calculated using a CNN. The vectors are mean pooled and used as features in a feed-forward network. This architecture is usually less accurate than the ensemble, but runs faster. -| Name | Description | -| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | -| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | -| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | +| Name | Description | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ### spacy.TextCatBOW.v1 {#TextCatBOW} @@ -578,13 +578,13 @@ architecture is usually less accurate than the ensemble, but runs faster. An ngram "bag-of-words" model. This architecture should run much faster than the others, but may not be as accurate, especially if texts are short. 
-| Name | Description | -| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | -| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | -| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ | -| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | +| Name | Description | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | +| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"} @@ -629,11 +629,11 @@ into the "real world". This requires 3 main components: The `EntityLinker` model architecture is a Thinc `Model` with a [`Linear`](https://thinc.ai/api-layers#linear) output layer. -| Name | Description | -| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | -| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | +| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. 
~~Model[List[Doc], Floats2d]~~ | ### spacy.EmptyKB.v1 {#EmptyKB} diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 8af4455d3..c7c41f2a1 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -140,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## DependencyParser.begin_training {#begin_training tag="method"} +## DependencyParser.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -151,11 +151,17 @@ validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. + + +This method was previously called `begin_training`. + + + > #### Example > > ```python > parser = nlp.add_pipe("parser") -> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = parser.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -210,7 +216,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and > > ```python > parser = nlp.add_pipe("parser") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = parser.update(examples, sgd=optimizer) > ``` @@ -294,11 +300,10 @@ context, the original parameters are restored. ## DependencyParser.add_label {#add_label tag="method"} Add a new label to the pipe. Note that you don't have to call this method if you -provide a **representative data sample** to the -[`begin_training`](#begin_training) method. In this case, all labels found in -the sample will be automatically added to the model, and the output dimension -will be [inferred](/usage/layers-architectures#thinc-shape-inference) -automatically. +provide a **representative data sample** to the [`initialize`](#initialize) +method. In this case, all labels found in the sample will be automatically added +to the model, and the output dimension will be +[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. > #### Example > diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 945a1568a..1dbe78703 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## EntityLinker.begin_training {#begin_training tag="method"} +## EntityLinker.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -150,11 +150,17 @@ validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. + + +This method was previously called `begin_training`. + + + > #### Example > > ```python > entity_linker = nlp.add_pipe("entity_linker", last=True) -> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = entity_linker.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -211,7 +217,7 @@ pipe's entity linking model and context encoder. 
Delegates to > > ```python > entity_linker = nlp.add_pipe("entity_linker") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = entity_linker.update(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 6d710f425..2c32ff753 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## EntityRecognizer.begin_training {#begin_training tag="method"} +## EntityRecognizer.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -140,11 +140,17 @@ validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. + + +This method was previously called `begin_training`. + + + > #### Example > > ```python > ner = nlp.add_pipe("ner") -> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = ner.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -199,7 +205,7 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and > > ```python > ner = nlp.add_pipe("ner") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = ner.update(examples, sgd=optimizer) > ``` @@ -282,11 +288,10 @@ context, the original parameters are restored. ## EntityRecognizer.add_label {#add_label tag="method"} Add a new label to the pipe. Note that you don't have to call this method if you -provide a **representative data sample** to the -[`begin_training`](#begin_training) method. In this case, all labels found in -the sample will be automatically added to the model, and the output dimension -will be [inferred](/usage/layers-architectures#thinc-shape-inference) -automatically. +provide a **representative data sample** to the [`initialize`](#initialize) +method. In this case, all labels found in the sample will be automatically added +to the model, and the output dimension will be +[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. > #### Example > diff --git a/website/docs/api/language.md b/website/docs/api/language.md index dd3cc57dd..11631502c 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -201,30 +201,31 @@ more efficient than processing texts one-by-one. | `n_process` 2.2.2 | Number of processors to use. Defaults to `1`. ~~int~~ | | **YIELDS** | Documents in the order of the original text. ~~Doc~~ | -## Language.begin_training {#begin_training tag="method"} +## Language.initialize {#initialize tag="method"} Initialize the pipeline for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a function that returns an iterable of [`Example`](/api/example) objects. The data examples can either be the full training data or a representative sample. They are used to **initialize the models** of trainable pipeline components and are -passed each component's [`begin_training`](/api/pipe#begin_training) method, if +passed each component's [`initialize`](/api/pipe#initialize) method, if available. 
Initialization includes validating the network, [inferring missing shapes](/usage/layers-architectures#thinc-shape-inference) and setting up the label scheme based on the data. -If no `get_examples` function is provided when calling `nlp.begin_training`, the +If no `get_examples` function is provided when calling `nlp.initialize`, the pipeline components will be initialized with generic data. In this case, it is crucial that the output dimension of each component has already been defined either in the [config](/usage/training#config), or by calling [`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for the tagger or textcat). - + -The `Language.update` method now takes a **function** that is called with no -arguments and returns a sequence of [`Example`](/api/example) objects instead of -tuples of `Doc` and `GoldParse` objects. +This method was previously called `begin_training`. It now also takes a +**function** that is called with no arguments and returns a sequence of +[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse` +objects. @@ -232,7 +233,7 @@ tuples of `Doc` and `GoldParse` objects. > > ```python > get_examples = lambda: examples -> optimizer = nlp.begin_training(get_examples) +> optimizer = nlp.initialize(get_examples) > ``` | Name | Description | @@ -636,13 +637,13 @@ list, will be disabled. Under the hood, this method calls into > > ```python > with nlp.select_pipes(disable=["tagger", "parser"]): -> nlp.begin_training() +> nlp.initialize() > > with nlp.select_pipes(enable="ner"): -> nlp.begin_training() +> nlp.initialize() > > disabled = nlp.select_pipes(disable=["tagger", "parser"]) -> nlp.begin_training() +> nlp.initialize() > disabled.restore() > ``` diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index e1a166474..4f00a09ef 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -117,7 +117,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Morphologizer.begin_training {#begin_training tag="method"} +## Morphologizer.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -133,7 +133,7 @@ setting up the label scheme based on the data. > ```python > morphologizer = nlp.add_pipe("morphologizer") > nlp.pipeline.append(morphologizer) -> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = morphologizer.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -189,7 +189,7 @@ Delegates to [`predict`](/api/morphologizer#predict) and > > ```python > morphologizer = nlp.add_pipe("morphologizer") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = morphologizer.update(examples, sgd=optimizer) > ``` @@ -259,12 +259,11 @@ context, the original parameters are restored. Add a new label to the pipe. If the `Morphologizer` should set annotations for both `pos` and `morph`, the label should include the UPOS as the feature `POS`. Raises an error if the output dimension is already set, or if the model has -already been fully [initialized](#begin_training). Note that you don't have to -call this method if you provide a **representative data sample** to the -[`begin_training`](#begin_training) method. 
In this case, all labels found in -the sample will be automatically added to the model, and the output dimension -will be [inferred](/usage/layers-architectures#thinc-shape-inference) -automatically. +already been fully [initialized](#initialize). Note that you don't have to call +this method if you provide a **representative data sample** to the +[`initialize`](#initialize) method. In this case, all labels found in the sample +will be automatically added to the model, and the output dimension will be +[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. > #### Example > diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index e4e1e97f1..17752ed5e 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Pipe.begin_training {#begin_training tag="method"} +## Pipe.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -109,11 +109,17 @@ validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. + + +This method was previously called `begin_training`. + + + > #### Example > > ```python > pipe = nlp.add_pipe("your_custom_pipe") -> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = pipe.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -180,7 +186,7 @@ predictions and gold-standard annotations, and update the component's model. > > ```python > pipe = nlp.add_pipe("your_custom_pipe") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = pipe.update(examples, sgd=optimizer) > ``` @@ -296,9 +302,9 @@ context, the original parameters are restored. Add a new label to the pipe, to be predicted by the model. The actual implementation depends on the specific component, but in general `add_label` shouldn't be called if the output dimension is already set, or if the model has -already been fully [initialized](#begin_training). If these conditions are -violated, the function will raise an Error. The exception to this rule is when -the component is [resizable](#is_resizable), in which case +already been fully [initialized](#initialize). If these conditions are violated, +the function will raise an Error. The exception to this rule is when the +component is [resizable](#is_resizable), in which case [`set_output`](#set_output) should be called to ensure that the model is properly resized. @@ -314,9 +320,9 @@ This method needs to be overwritten with your own custom `add_label` method. | **RETURNS** | 0 if the label is already present, otherwise 1. ~~int~~ | Note that in general, you don't have to call `pipe.add_label` if you provide a -representative data sample to the [`begin_training`](#begin_training) method. In -this case, all labels found in the sample will be automatically added to the -model, and the output dimension will be +representative data sample to the [`initialize`](#initialize) method. In this +case, all labels found in the sample will be automatically added to the model, +and the output dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference) automatically. 
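As a concrete illustration of the note above, here is a minimal sketch of relying on `initialize` to set up the label scheme instead of calling `add_label` explicitly. It uses the built-in `textcat` component as a stand-in for any trainable pipe, with made-up labels and a single made-up example, and assumes the renamed v3 `initialize` API introduced in this diff:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")

# No add_label calls: the labels are picked up from the sample below and the
# model's output dimension is inferred from it.
doc = nlp.make_doc("This is great")
examples = [Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})]
optimizer = nlp.initialize(lambda: examples)
assert "POSITIVE" in textcat.labels
```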
## Pipe.is_resizable {#is_resizable tag="method"} diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index acf94fb8e..d81725343 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -114,7 +114,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## SentenceRecognizer.begin_training {#begin_training tag="method"} +## SentenceRecognizer.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -129,7 +129,7 @@ setting up the label scheme based on the data. > > ```python > senter = nlp.add_pipe("senter") -> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = senter.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -185,7 +185,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and > > ```python > senter = nlp.add_pipe("senter") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = senter.update(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index d428d376e..6ca554f49 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Tagger.begin_training {#begin_training tag="method"} +## Tagger.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -123,11 +123,17 @@ validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. + + +This method was previously called `begin_training`. + + + > #### Example > > ```python > tagger = nlp.add_pipe("tagger") -> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = tagger.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -183,7 +189,7 @@ Delegates to [`predict`](/api/tagger#predict) and > > ```python > tagger = nlp.add_pipe("tagger") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = tagger.update(examples, sgd=optimizer) > ``` @@ -289,12 +295,12 @@ context, the original parameters are restored. ## Tagger.add_label {#add_label tag="method"} Add a new label to the pipe. Raises an error if the output dimension is already -set, or if the model has already been fully [initialized](#begin_training). Note +set, or if the model has already been fully [initialized](#initialize). Note that you don't have to call this method if you provide a **representative data -sample** to the [`begin_training`](#begin_training) method. In this case, all -labels found in the sample will be automatically added to the model, and the -output dimension will be -[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. +sample** to the [`initialize`](#initialize) method. 
In this case, all labels +found in the sample will be automatically added to the model, and the output +dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference) +automatically. > #### Example > diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index b68039094..4c99d6984 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## TextCategorizer.begin_training {#begin_training tag="method"} +## TextCategorizer.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -136,11 +136,17 @@ validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme based on the data. + + +This method was previously called `begin_training`. + + + > #### Example > > ```python > textcat = nlp.add_pipe("textcat") -> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = textcat.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -196,14 +202,14 @@ Delegates to [`predict`](/api/textcategorizer#predict) and > > ```python > textcat = nlp.add_pipe("textcat") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = textcat.update(examples, sgd=optimizer) > ``` | Name | Description | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | +| _keyword-only_ | | | `drop` | The dropout rate. ~~float~~ | | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | @@ -227,7 +233,7 @@ the "catastrophic forgetting" problem. This feature is experimental. | Name | Description | | -------------- | ------------------------------------------------------------------------------------------------------------------------ | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | +| _keyword-only_ | | | `drop` | The dropout rate. ~~float~~ | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | @@ -303,12 +309,12 @@ Modify the pipe's model, to use the given parameter values. ## TextCategorizer.add_label {#add_label tag="method"} Add a new label to the pipe. Raises an error if the output dimension is already -set, or if the model has already been fully [initialized](#begin_training). Note +set, or if the model has already been fully [initialized](#initialize). Note that you don't have to call this method if you provide a **representative data -sample** to the [`begin_training`](#begin_training) method. 
In this case, all -labels found in the sample will be automatically added to the model, and the -output dimension will be -[inferred](/usage/layers-architectures#thinc-shape-inference) automatically. +sample** to the [`initialize`](#initialize) method. In this case, all labels +found in the sample will be automatically added to the model, and the output +dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference) +automatically. > #### Example > diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 5c7214edc..8269ad7cf 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -123,7 +123,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods. | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Tok2Vec.begin_training {#begin_training tag="method"} +## Tok2Vec.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -138,7 +138,7 @@ setting up the label scheme based on the data. > > ```python > tok2vec = nlp.add_pipe("tok2vec") -> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = tok2vec.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -193,7 +193,7 @@ Delegates to [`predict`](/api/tok2vec#predict). > > ```python > tok2vec = nlp.add_pipe("tok2vec") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = tok2vec.update(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index d5bcef229..712214fec 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -158,7 +158,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Transformer.begin_training {#begin_training tag="method"} +## Transformer.initialize {#initialize tag="method"} Initialize the component for training and return an [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a @@ -173,7 +173,7 @@ setting up the label scheme based on the data. > > ```python > trf = nlp.add_pipe("transformer") -> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline) +> optimizer = trf.initialize(lambda: [], pipeline=nlp.pipeline) > ``` | Name | Description | @@ -241,7 +241,7 @@ and call the optimizer, while the others simply increment the gradients. > > ```python > trf = nlp.add_pipe("transformer") -> optimizer = nlp.begin_training() +> optimizer = nlp.initialize() > losses = trf.update(examples, sgd=optimizer) > ``` diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index a58ba2ba9..b65c3d903 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -460,8 +460,8 @@ The built-in [pipeline components](/usage/processing-pipelines) in spaCy ensure that their internal models are **always initialized** with appropriate sample data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a ~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This -functionality is triggered when -[`nlp.begin_training`](/api/language#begin_training) is called. 
+functionality is triggered when [`nlp.initialize`](/api/language#initialize) is +called. ### Dropout and normalization in Thinc {#thinc-dropout-norm} @@ -491,7 +491,7 @@ with Model.define_operators({">>": chain}): diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index dbf0881ac..b1cf2723b 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1126,12 +1126,12 @@ For some use cases, it makes sense to also overwrite additional methods to customize how the model is updated from examples, how it's initialized, how the loss is calculated and to add evaluation scores to the training output. -| Name | Description | -| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. | -| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. | -| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. | -| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_socre_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. | +| Name | Description | +| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. | +| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. | +| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. | +| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_socre_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. | diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 54be6b367..1c1b92e03 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -1045,8 +1045,8 @@ of being dropped. 
> - [`nlp`](/api/language): The `nlp` object with the pipeline components and > their models. -> - [`nlp.begin_training`](/api/language#begin_training): Start the training and -> return an optimizer to update the component model weights. +> - [`nlp.initialize`](/api/language#initialize): Start the training and return +> an optimizer to update the component model weights. > - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds > state between updates. > - [`nlp.update`](/api/language#update): Update component models with examples. @@ -1057,7 +1057,7 @@ of being dropped. ```python ### Example training loop -optimizer = nlp.begin_training() +optimizer = nlp.initialize() for itn in range(100): random.shuffle(train_data) for raw_text, entity_offsets in train_data: diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 94c50e1ec..44f902cd5 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -526,10 +526,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**. [`Pipe.update`](/api/pipe#update) methods now all take batches of [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or raw text and a dictionary of annotations. - [`Language.begin_training`](/api/language#begin_training) and - [`Pipe.begin_training`](/api/pipe#begin_training) now take a function that - returns a sequence of `Example` objects to initialize the model instead of a - list of tuples. + [`Language.initialize`](/api/language#initialize) and + [`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a + sequence of `Example` objects to initialize the model instead of a list of + tuples. +- The `begin_training` methods have been renamed to `initialize`. - [`Matcher.add`](/api/matcher#add) and [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of patterns as the second argument (instead of a variable number of arguments). @@ -555,6 +556,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**. | Removed | Replacement | | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) | +| `Language.begin_training`, `Pipe.begin_training`, ... | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... | | `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) | | `GoldParse` | [`Example`](/api/example) | | `GoldCorpus` | [`Corpus`](/api/corpus) | @@ -936,7 +938,7 @@ TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), ("I like London.", {"entities": [(7, 13, "LOC")]}), ] -nlp.begin_training() +nlp.initialize() for i in range(20): random.shuffle(TRAIN_DATA) for batch in minibatch(TRAIN_DATA): @@ -946,17 +948,18 @@ for i in range(20): nlp.update(examples) ``` -[`Language.begin_training`](/api/language#begin_training) and -[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that -returns a sequence of `Example` objects to initialize the model instead of a -list of tuples. 
The data examples are used to **initialize the models** of +`Language.begin_training` and `Pipe.begin_training` have been renamed to +[`Language.initialize`](/api/language#initialize) and +[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function +that returns a sequence of `Example` objects to initialize the model instead of +a list of tuples. The data examples are used to **initialize the models** of trainable pipeline components, which includes validating the network, [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and setting up the label scheme. ```diff -- nlp.begin_training(examples) -+ nlp.begin_training(lambda: examples) +- nlp.initialize(examples) ++ nlp.initialize(lambda: examples) ``` #### Packaging trained pipelines {#migrating-training-packaging}
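To make the migration concrete, here is a minimal sketch that reuses the `TRAIN_DATA` from the snippet above, builds `Example` objects from the old `(text, annotations)` tuples, and passes them to `nlp.initialize` as a zero-argument callback. This is a sketch against the v3 API described here, not code taken from the diff:

```python
import spacy
from spacy.training import Example

TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London.", {"entities": [(7, 13, "LOC")]}),
]

nlp = spacy.blank("en")
nlp.add_pipe("ner")

# Build Example objects from the old (text, annotations) tuples.
examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAIN_DATA
]

# initialize expects a zero-argument function returning the examples,
# not the list itself.
optimizer = nlp.initialize(lambda: examples)
```

The same callback pattern applies to the per-component `initialize` methods, which take the same kind of `get_examples` function.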