Merge pull request #6160 from explosion/feature/prepare

2025-09-16 09:02:35 +03:00 · 2020-09-29 20:55:13 +02:00 · 2020-09-29 20:55:13 +02:00 · 78021089f9
commit 78021089f9
parent c3f8c09d7d d3c63b7965
79 changed files with 1940 additions and 1755 deletions
--- a/setup.cfg
+++ b/setup.cfg
@ -98,7 +98,7 @@ universal = false
 formats = gztar

 [flake8]
-ignore = E203, E266, E501, E731, W503
+ignore = E203, E266, E501, E731, W503, E741
 max-line-length = 80
 select = B,C,E,F,W,T4,B9
 exclude =
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -15,7 +15,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
-from .init_model import init_model  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@ -10,7 +10,7 @@ from click import NoSuchOption
 from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config, ConfigValidationError, require_gpu
 from configparser import InterpolationError
 import os

@ -275,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
            msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)


-def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
-    """RETURNS (List[str]): All sourced components in the original config,
-        e.g. {"source": "en_core_web_sm"}. If the config contains a key
-        "factory", we assume it refers to a component factory.
-    """
-    return [
-        name
-        for name, cfg in config.get("components", {}).items()
-        if "factory" not in cfg and "source" in cfg
-    ]
-
-
 def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
    """Upload a file.

@ -458,3 +446,12 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
            p = int(p)
        result.append(p)
    return result
+
+
+def setup_gpu(use_gpu: int) -> None:
+    """Configure the GPU and log info.
+
+    use_gpu (int): ID of the GPU to require. Any negative value takes the
+        CPU branch, in which nothing is configured and only an info message
+        is printed.
+    """
+    if use_gpu >= 0:
+        # Report the requested device ID before asking thinc to require it.
+        msg.info(f"Using GPU: {use_gpu}")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@ -1,12 +1,14 @@
 from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config
 from thinc.config import VARIABLE_RE
 import typer

 from ._util import Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util


@ -52,10 +54,10 @@ def debug_config(
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
        nlp = util.load_model_from_config(config)
-        # Use the resolved config here in case user has one function returning
-        # a dict of corpora etc.
-        resolved = util.resolve_training_config(nlp.config)
-        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
+        config = nlp.config.interpolate()
+        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+        dot_names = [T["train_corpus"], T["dev_corpus"]]
+        util.resolve_dot_names(config, dot_names)
    msg.good("Config is valid")
    if show_vars:
        variables = get_variables(config)
@ -97,23 +99,3 @@ def get_variables(config: Config) -> Dict[str, Any]:
        value = util.dot_to_object(config, path)
        result[variable] = repr(value)
    return result
-
-
-def check_section_refs(config: Config, fields: List[str]) -> None:
-    """Validate fields in the config that refer to other sections or values
-    (e.g. in the corpora) and make sure that those references exist.
-    """
-    errors = []
-    for field in fields:
-        # If the field doesn't exist in the config, we ignore it
-        try:
-            value = util.dot_to_object(config, field)
-        except KeyError:
-            continue
-        try:
-            util.dot_to_object(config, value)
-        except KeyError:
-            msg = f"not a valid section reference: {value}"
-            errors.append({"loc": field.split("."), "msg": msg})
-    if errors:
-        raise ConfigValidationError(config=config, errors=errors)
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
 import typer

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, get_sourced_components
+from ._util import import_code, debug_cli
 from ..training import Corpus, Example
+from ..training.initialize import get_sourced_components
+from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
+from ..util import registry
 from .. import util


@ -94,26 +97,15 @@ def debug_data(
    with show_validation_error(config_path):
        cfg = util.load_config(config_path, overrides=config_overrides)
        nlp = util.load_model_from_config(cfg)
-        C = util.resolve_training_config(nlp.config)
+        T = registry.resolve(
+            nlp.config.interpolate()["training"], schema=ConfigSchemaTraining
+        )
    # Use original config here, not resolved version
    sourced_components = get_sourced_components(cfg)
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]
    resume_components = [p for p in sourced_components if p not in frozen_components]
    pipeline = nlp.pipe_names
    factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
-    tag_map_path = util.ensure_path(C["training"]["tag_map"])
-    tag_map = {}
-    if tag_map_path is not None:
-        tag_map = srsly.read_json(tag_map_path)
-    morph_rules_path = util.ensure_path(C["training"]["morph_rules"])
-    morph_rules = {}
-    if morph_rules_path is not None:
-        morph_rules = srsly.read_json(morph_rules_path)
-    # Replace tag map with provided mapping
-    nlp.vocab.morphology.load_tag_map(tag_map)
-    # Load morph rules
-    nlp.vocab.morphology.load_morph_exceptions(morph_rules)
-
    msg.divider("Data file validation")

    # Create the gold corpus to be able to better analyze data
@ -145,10 +137,10 @@ def debug_data(

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]

    msg.divider("Training stats")
-    msg.text(f"Language: {C['nlp']['lang']}")
+    msg.text(f"Language: {nlp.lang}")
    msg.text(f"Training pipeline: {', '.join(pipeline)}")
    if resume_components:
        msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
@ -355,6 +347,7 @@ def debug_data(
    if "tagger" in factory_names:
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_train_data["tags"]]
+        # TODO: does this need to be updated?
        tag_map = nlp.vocab.morphology.tag_map
        msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
        labels_with_counts = _format_labels(
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@ -2,18 +2,23 @@ from typing import Dict, Any, Optional, Iterable
 from pathlib import Path

 from spacy.training import Example
-from spacy.util import dot_to_object
+from spacy.util import resolve_dot_names
 from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+from thinc.api import fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation, set_gpu_allocator
 import typer

 from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list
+from ._util import parse_config_overrides, string_to_list, setup_gpu
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util


-@debug_cli.command("model")
+@debug_cli.command(
+    "model",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
 def debug_model_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
@ -37,11 +42,7 @@ def debug_model_cli(

    DOCS: https://nightly.spacy.io/api/cli#debug-model
    """
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
    layers = string_to_list(layers, intify=True)
    print_settings = {
        "dimensions": dimensions,
@ -59,14 +60,15 @@ def debug_model_cli(
        raw_config = util.load_config(
            config_path, overrides=config_overrides, interpolate=False
        )
-    config = raw_config.iterpolate()
+    config = raw_config.interpolate()
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    with show_validation_error(config_path):
        nlp = util.load_model_from_config(raw_config)
-        C = util.resolve_training_config(nlp.config)
-    seed = C["training"]["seed"]
+        config = nlp.config.interpolate()
+        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    seed = T["seed"]
    if seed is not None:
        msg.info(f"Fixing random seed: {seed}")
        fix_random_seed(seed)
@ -77,11 +79,16 @@ def debug_model_cli(
            exits=1,
        )
    model = pipe.model
-    debug_model(C, nlp, model, print_settings=print_settings)
+    debug_model(config, T, nlp, model, print_settings=print_settings)


 def debug_model(
-    config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None
+    config,
+    resolved_train_config,
+    nlp,
+    model: Model,
+    *,
+    print_settings: Optional[Dict[str, Any]] = None,
 ):
    if not isinstance(model, Model):
        msg.fail(
@ -102,13 +109,16 @@ def debug_model(
    # The output vector might differ from the official type of the output layer
    with data_validation(False):
        try:
-            train_corpus = dot_to_object(config, config["training"]["train_corpus"])
-            nlp.begin_training(lambda: train_corpus(nlp))
+            dot_names = [resolved_train_config["train_corpus"]]
+            with show_validation_error():
+                (train_corpus,) = resolve_dot_names(config, dot_names)
+                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
        except ValueError:
            try:
                _set_output_dim(nO=7, model=model)
-                nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
+                with show_validation_error():
+                    nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@ -3,11 +3,11 @@ from wasabi import Printer
 from pathlib import Path
 import re
 import srsly
-from thinc.api import require_gpu, fix_random_seed
+from thinc.api import fix_random_seed

 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, setup_gpu
 from ..scorer import Scorer
 from .. import util
 from .. import displacy
@ -61,8 +61,7 @@ def evaluate(
 ) -> Scorer:
    msg = Printer(no_print=silent, pretty=not silent)
    fix_random_seed()
-    if use_gpu >= 0:
-        require_gpu(use_gpu)
+    setup_gpu(use_gpu)
    data_path = util.ensure_path(data_path)
    output_path = util.ensure_path(output)
    displacy_path = util.ensure_path(displacy_path)
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@ -1,360 +0,0 @@
-from typing import Optional, List, Dict, Any, Union, IO
-import math
-from tqdm import tqdm
-import numpy
-from ast import literal_eval
-from pathlib import Path
-from preshed.counter import PreshCounter
-import tarfile
-import gzip
-import zipfile
-import srsly
-import warnings
-from wasabi import msg, Printer
-import typer
-
-from ._util import app, init_cli, Arg, Opt
-from ..vectors import Vectors
-from ..errors import Errors, Warnings
-from ..language import Language
-from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
-
-try:
-    import ftfy
-except ImportError:
-    ftfy = None
-
-
-DEFAULT_OOV_PROB = -20
-
-
-@init_cli.command("vocab")
-@app.command(
-    "init-model",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-    hidden=True,  # hide this from main CLI help but still allow it to work with warning
-)
-def init_model_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    lang: str = Arg(..., help="Pipeline language"),
-    output_dir: Path = Arg(..., help="Pipeline output directory"),
-    freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
-    clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
-    jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
-    vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
-    prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
-    truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
-    vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
-    model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
-    base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)")
-    # fmt: on
-):
-    """
-    Create a new blank pipeline directory with vocab and vectors from raw data.
-    If vectors are provided in Word2Vec format, they can be either a .txt or
-    zipped as a .zip or .tar.gz.
-
-    DOCS: https://nightly.spacy.io/api/cli#init-vocab
-    """
-    if ctx.command.name == "init-model":
-        msg.warn(
-            "The init-model command is now called 'init vocab'. You can run "
-            "'python -m spacy init --help' for an overview of the other "
-            "available initialization commands."
-        )
-    init_model(
-        lang,
-        output_dir,
-        freqs_loc=freqs_loc,
-        clusters_loc=clusters_loc,
-        jsonl_loc=jsonl_loc,
-        vectors_loc=vectors_loc,
-        prune_vectors=prune_vectors,
-        truncate_vectors=truncate_vectors,
-        vectors_name=vectors_name,
-        model_name=model_name,
-        base_model=base_model,
-        silent=False,
-    )
-
-
-def init_model(
-    lang: str,
-    output_dir: Path,
-    freqs_loc: Optional[Path] = None,
-    clusters_loc: Optional[Path] = None,
-    jsonl_loc: Optional[Path] = None,
-    vectors_loc: Optional[Path] = None,
-    prune_vectors: int = -1,
-    truncate_vectors: int = 0,
-    vectors_name: Optional[str] = None,
-    model_name: Optional[str] = None,
-    base_model: Optional[str] = None,
-    silent: bool = True,
-) -> Language:
-    msg = Printer(no_print=silent, pretty=not silent)
-    if jsonl_loc is not None:
-        if freqs_loc is not None or clusters_loc is not None:
-            settings = ["-j"]
-            if freqs_loc:
-                settings.append("-f")
-            if clusters_loc:
-                settings.append("-c")
-            msg.warn(
-                "Incompatible arguments",
-                "The -f and -c arguments are deprecated, and not compatible "
-                "with the -j argument, which should specify the same "
-                "information. Either merge the frequencies and clusters data "
-                "into the JSONL-formatted file (recommended), or use only the "
-                "-f and -c files, without the other lexical attributes.",
-            )
-        jsonl_loc = ensure_path(jsonl_loc)
-        lex_attrs = srsly.read_jsonl(jsonl_loc)
-    else:
-        clusters_loc = ensure_path(clusters_loc)
-        freqs_loc = ensure_path(freqs_loc)
-        if freqs_loc is not None and not freqs_loc.exists():
-            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
-        lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
-
-    with msg.loading("Creating blank pipeline..."):
-        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
-
-    msg.good("Successfully created blank pipeline")
-    if vectors_loc is not None:
-        add_vectors(
-            msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
-        )
-    vec_added = len(nlp.vocab.vectors)
-    lex_added = len(nlp.vocab)
-    msg.good(
-        "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
-    )
-    if not output_dir.exists():
-        output_dir.mkdir()
-    nlp.to_disk(output_dir)
-    return nlp
-
-
-def open_file(loc: Union[str, Path]) -> IO:
-    """Handle .gz, .tar.gz or unzipped files"""
-    loc = ensure_path(loc)
-    if tarfile.is_tarfile(str(loc)):
-        return tarfile.open(str(loc), "r:gz")
-    elif loc.parts[-1].endswith("gz"):
-        return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
-    elif loc.parts[-1].endswith("zip"):
-        zip_file = zipfile.ZipFile(str(loc))
-        names = zip_file.namelist()
-        file_ = zip_file.open(names[0])
-        return (line.decode("utf8") for line in file_)
-    else:
-        return loc.open("r", encoding="utf8")
-
-
-def read_attrs_from_deprecated(
-    msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
-) -> List[Dict[str, Any]]:
-    if freqs_loc is not None:
-        with msg.loading("Counting frequencies..."):
-            probs, _ = read_freqs(freqs_loc)
-        msg.good("Counted frequencies")
-    else:
-        probs, _ = ({}, DEFAULT_OOV_PROB)  # noqa: F841
-    if clusters_loc:
-        with msg.loading("Reading clusters..."):
-            clusters = read_clusters(clusters_loc)
-        msg.good("Read clusters")
-    else:
-        clusters = {}
-    lex_attrs = []
-    sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
-    if len(sorted_probs):
-        for i, (word, prob) in tqdm(enumerate(sorted_probs)):
-            attrs = {"orth": word, "id": i, "prob": prob}
-            # Decode as a little-endian string, so that we can do & 15 to get
-            # the first 4 bits. See _parse_features.pyx
-            if word in clusters:
-                attrs["cluster"] = int(clusters[word][::-1], 2)
-            else:
-                attrs["cluster"] = 0
-            lex_attrs.append(attrs)
-    return lex_attrs
-
-
-def create_model(
-    lang: str,
-    lex_attrs: List[Dict[str, Any]],
-    name: Optional[str] = None,
-    base_model: Optional[Union[str, Path]] = None,
-) -> Language:
-    if base_model:
-        nlp = load_model(base_model)
-        # keep the tokenizer but remove any existing pipeline components due to
-        # potentially conflicting vectors
-        for pipe in nlp.pipe_names:
-            nlp.remove_pipe(pipe)
-    else:
-        lang_class = get_lang_class(lang)
-        nlp = lang_class()
-    for lexeme in nlp.vocab:
-        lexeme.rank = OOV_RANK
-    for attrs in lex_attrs:
-        if "settings" in attrs:
-            continue
-        lexeme = nlp.vocab[attrs["orth"]]
-        lexeme.set_attrs(**attrs)
-    if len(nlp.vocab):
-        oov_prob = min(lex.prob for lex in nlp.vocab) - 1
-    else:
-        oov_prob = DEFAULT_OOV_PROB
-    nlp.vocab.cfg.update({"oov_prob": oov_prob})
-    if name:
-        nlp.meta["name"] = name
-    return nlp
-
-
-def add_vectors(
-    msg: Printer,
-    nlp: Language,
-    vectors_loc: Optional[Path],
-    truncate_vectors: int,
-    prune_vectors: int,
-    name: Optional[str] = None,
-) -> None:
-    vectors_loc = ensure_path(vectors_loc)
-    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
-        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
-        for lex in nlp.vocab:
-            if lex.rank and lex.rank != OOV_RANK:
-                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
-    else:
-        if vectors_loc:
-            with msg.loading(f"Reading vectors from {vectors_loc}"):
-                vectors_data, vector_keys = read_vectors(
-                    msg, vectors_loc, truncate_vectors
-                )
-            msg.good(f"Loaded vectors from {vectors_loc}")
-        else:
-            vectors_data, vector_keys = (None, None)
-        if vector_keys is not None:
-            for word in vector_keys:
-                if word not in nlp.vocab:
-                    nlp.vocab[word]
-        if vectors_data is not None:
-            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
-    if name is None:
-        # TODO: Is this correct? Does this matter?
-        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
-    else:
-        nlp.vocab.vectors.name = name
-    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
-    if prune_vectors >= 1:
-        nlp.vocab.prune_vectors(prune_vectors)
-
-
-def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
-    f = open_file(vectors_loc)
-    f = ensure_shape(f)
-    shape = tuple(int(size) for size in next(f).split())
-    if truncate_vectors >= 1:
-        shape = (truncate_vectors, shape[1])
-    vectors_data = numpy.zeros(shape=shape, dtype="f")
-    vectors_keys = []
-    for i, line in enumerate(tqdm(f)):
-        line = line.rstrip()
-        pieces = line.rsplit(" ", vectors_data.shape[1])
-        word = pieces.pop(0)
-        if len(pieces) != vectors_data.shape[1]:
-            msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
-        vectors_data[i] = numpy.asarray(pieces, dtype="f")
-        vectors_keys.append(word)
-        if i == truncate_vectors - 1:
-            break
-    return vectors_data, vectors_keys
-
-
-def ensure_shape(lines):
-    """Ensure that the first line of the data is the vectors shape.
-
-    If it's not, we read in the data and output the shape as the first result,
-    so that the reader doesn't have to deal with the problem.
-    """
-    first_line = next(lines)
-    try:
-        shape = tuple(int(size) for size in first_line.split())
-    except ValueError:
-        shape = None
-    if shape is not None:
-        # All good, give the data
-        yield first_line
-        yield from lines
-    else:
-        # Figure out the shape, make it the first value, and then give the
-        # rest of the data.
-        width = len(first_line.split()) - 1
-        captured = [first_line] + list(lines)
-        length = len(captured)
-        yield f"{length} {width}"
-        yield from captured
-
-
-def read_freqs(
-    freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
-):
-    counts = PreshCounter()
-    total = 0
-    with freqs_loc.open() as f:
-        for i, line in enumerate(f):
-            freq, doc_freq, key = line.rstrip().split("\t", 2)
-            freq = int(freq)
-            counts.inc(i + 1, freq)
-            total += freq
-    counts.smooth()
-    log_total = math.log(total)
-    probs = {}
-    with freqs_loc.open() as f:
-        for line in tqdm(f):
-            freq, doc_freq, key = line.rstrip().split("\t", 2)
-            doc_freq = int(doc_freq)
-            freq = int(freq)
-            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
-                try:
-                    word = literal_eval(key)
-                except SyntaxError:
-                    # Take odd strings literally.
-                    word = literal_eval(f"'{key}'")
-                smooth_count = counts.smoother(int(freq))
-                probs[word] = math.log(smooth_count) - log_total
-    oov_prob = math.log(counts.smoother(0)) - log_total
-    return probs, oov_prob
-
-
-def read_clusters(clusters_loc: Path) -> dict:
-    clusters = {}
-    if ftfy is None:
-        warnings.warn(Warnings.W004)
-    with clusters_loc.open() as f:
-        for line in tqdm(f):
-            try:
-                cluster, word, freq = line.split()
-                if ftfy is not None:
-                    word = ftfy.fix_text(word)
-            except ValueError:
-                continue
-            # If the clusterer has only seen the word a few times, its
-            # cluster is unreliable.
-            if int(freq) >= 3:
-                clusters[word] = cluster
-            else:
-                clusters[word] = "0"
-    # Expand clusters with re-casing
-    for word, cluster in list(clusters.items()):
-        if word.lower() not in clusters:
-            clusters[word.lower()] = cluster
-        if word.title() not in clusters:
-            clusters[word.title()] = cluster
-        if word.upper() not in clusters:
-            clusters[word.upper()] = cluster
-    return clusters
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@ -0,0 +1,99 @@
+from typing import Optional
+import logging
+from pathlib import Path
+from wasabi import msg
+import typer
+import srsly
+
+from .. import util
+from ..training.initialize import init_nlp, convert_vectors
+from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, setup_gpu
+
+
+@init_cli.command("vectors")
+def init_vectors_cli(
+    # fmt: off
+    lang: str = Arg(..., help="The language of the nlp object to create"),
+    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
+    output_dir: Path = Arg(..., help="Pipeline output directory"),
+    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
+    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
+    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    # fmt: on
+):
+    """Convert word vectors for use with spaCy. Will export an nlp object that
+    you can use in the [initialize.vocab] block of your config to initialize
+    a model with vectors.
+    """
+    # With --verbose the spaCy logger emits debug records; otherwise it is
+    # restricted to errors.
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    msg.info(f"Creating blank nlp object for language '{lang}'")
+    # Look up the language class for `lang` and instantiate it with no
+    # arguments.
+    nlp = util.get_lang_class(lang)()
+    # The CLI options are forwarded unchanged; convert_vectors lives in
+    # ..training.initialize and receives the nlp object created above.
+    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
+    # The reported count is read back from the nlp object's vocab after the
+    # conversion.
+    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
+    nlp.to_disk(output_dir)
+    msg.good(
+        "Saved nlp object with vectors to output directory. You can now use the "
+        "path to it in your config as the 'vectors' setting in [initialize.vocab].",
+        output_dir,
+    )
+
+
+# Hidden "nlp" sub-command of the init CLI: loads a config file, builds an
+# initialized nlp object from it via init_nlp and saves it to output_path.
+# Extra/unknown command-line arguments are accepted (see context_settings)
+# and are passed on as config overrides.
+@init_cli.command(
+    "nlp",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    hidden=True,
+)
+def init_pipeline_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    output_path: Path = Arg(..., help="Output directory for the prepared data"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    # With --verbose the spaCy logger emits debug records; otherwise it is
+    # restricted to errors.
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    # The arguments typer did not consume are parsed into config overrides.
+    overrides = parse_config_overrides(ctx.args)
+    # Runs before the config is loaded -- presumably so that functions
+    # registered in the user's file are available by then (TODO confirm).
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    # Loading and initialization are wrapped separately: the first context
+    # manager is given the config path, the second disables hint_fill.
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
+    nlp.to_disk(output_path)
+    msg.good(f"Saved initialized pipeline to {output_path}")
+
+
+# "labels" sub-command of the init CLI: builds an initialized nlp object from
+# a config file and writes one JSON file per pipeline component that exposes
+# label data. Extra/unknown command-line arguments are accepted (see
+# context_settings) and are passed on as config overrides.
+@init_cli.command(
+    "labels",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def init_labels_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    output_path: Path = Arg(..., help="Output directory for the labels"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    # Create the output directory up front. mkdir() is called without
+    # parents=True, so only the last path component is created.
+    # NOTE(review): a missing parent directory would make this raise -- confirm
+    # whether that is intended.
+    if not output_path.exists():
+        output_path.mkdir()
+    # With --verbose the spaCy logger emits debug records; otherwise it is
+    # restricted to errors.
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    # The arguments typer did not consume are parsed into config overrides.
+    overrides = parse_config_overrides(ctx.args)
+    # Runs before the config is loaded -- presumably so that functions
+    # registered in the user's file are available by then (TODO confirm).
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides)
+    # Unlike the "nlp" command, init_nlp is called here without silent=False.
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu)
+    for name, component in nlp.pipeline:
+        # Components without a label_data attribute, or with it set to None,
+        # are reported and skipped.
+        if getattr(component, "label_data", None) is not None:
+            srsly.write_json(output_path / f"{name}.json", component.label_data)
+            # This message is printed after the file has already been written.
+            msg.good(f"Saving {name} labels to {output_path}/{name}.json")
+        else:
+            msg.info(f"No labels found for {name}")
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@ -1,25 +1,13 @@
 from typing import Optional
-import numpy
-import time
-import re
-from collections import Counter
 from pathlib import Path
-from thinc.api import require_gpu, set_gpu_allocator
-from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
-from thinc.api import Config, CosineDistance, L2Distance
 from wasabi import msg
-import srsly
-from functools import partial
 import typer
+import re

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
-from ..tokens import Doc
-from ..attrs import ID
-from .. import util
-from ..util import dot_to_object
+from ._util import import_code, setup_gpu
+from ..training.pretrain import pretrain
+from ..util import load_config


@app.command(
@ -61,15 +49,11 @@ def pretrain_cli(
    config_overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
    msg.info(f"Loading config from: {config_path}")

    with show_validation_error(config_path):
-        raw_config = util.load_config(
+        raw_config = load_config(
            config_path, overrides=config_overrides, interpolate=False
        )
    config = raw_config.interpolate()
@ -89,250 +73,11 @@ def pretrain_cli(
        resume_path=resume_path,
        epoch_resume=epoch_resume,
        use_gpu=use_gpu,
+        silent=False,
    )
-
-
-def pretrain(
-    config: Config,
-    output_dir: Path,
-    resume_path: Optional[Path] = None,
-    epoch_resume: Optional[int] = None,
-    use_gpu: int = -1,
-):
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    nlp = util.load_model_from_config(config)
-    C = util.resolve_training_config(nlp.config)
-    P_cfg = C["pretraining"]
-    corpus = dot_to_object(C, P_cfg["corpus"])
-    batcher = P_cfg["batcher"]
-    model = create_pretraining_model(nlp, C["pretraining"])
-    optimizer = C["pretraining"]["optimizer"]
-    # Load in pretrained weights to resume from
-    if resume_path is not None:
-        _resume_model(model, resume_path, epoch_resume)
-    else:
-        # Without '--resume-path' the '--epoch-resume' argument is ignored
-        epoch_resume = 0
-
-    tracker = ProgressTracker(frequency=10000)
-    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
-    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
-    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
-
-    def _save_model(epoch, is_temp=False):
-        is_temp_str = ".temp" if is_temp else ""
-        with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
-                file_.write(model.get_ref("tok2vec").to_bytes())
-            log = {
-                "nr_word": tracker.nr_word,
-                "loss": tracker.loss,
-                "epoch_loss": tracker.epoch_loss,
-                "epoch": epoch,
-            }
-            with (output_dir / "log.jsonl").open("a") as file_:
-                file_.write(srsly.json_dumps(log) + "\n")
-
-    objective = create_objective(P_cfg["objective"])
-    # TODO: I think we probably want this to look more like the
-    # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P_cfg["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
-                _save_model(epoch, is_temp=True)
-        _save_model(epoch)
-        tracker.epoch_loss = 0.0
    msg.good("Successfully finished pretrain")


-def ensure_docs(examples_or_docs):
-    docs = []
-    for eg_or_doc in examples_or_docs:
-        if isinstance(eg_or_doc, Doc):
-            docs.append(eg_or_doc)
-        else:
-            docs.append(eg_or_doc.reference)
-    return docs
-
-
-def _resume_model(model, resume_path, epoch_resume):
-    msg.info(f"Resume training tok2vec from: {resume_path}")
-    with resume_path.open("rb") as file_:
-        weights_data = file_.read()
-        model.get_ref("tok2vec").from_bytes(weights_data)
-    # Parse the epoch number from the given weight file
-    model_name = re.search(r"model\d+\.bin", str(resume_path))
-    if model_name:
-        # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
-        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-    else:
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-
-
-def make_update(model, docs, optimizer, objective_func):
-    """Perform an update over a single batch of documents.
-
-    docs (iterable): A batch of `Doc` objects.
-    optimizer (callable): An optimizer.
-    RETURNS loss: A float for the loss.
-    """
-    predictions, backprop = model.begin_update(docs)
-    loss, gradients = objective_func(model.ops, docs, predictions)
-    backprop(gradients)
-    model.finish_update(optimizer)
-    # Don't want to return a cupy object here
-    # The gradients are modified in-place by the BERT MLM,
-    # so we get an accurate loss
-    return float(loss)
-
-
-def create_objective(config):
-    """Create the objective for pretraining.
-
-    We'd like to replace this with a registry function but it's tricky because
-    we're also making a model choice based on this. For now we hard-code support
-    for two types (characters, vectors). For characters you can specify
-    n_characters, for vectors you can specify the loss.
-
-    Bleh.
-    """
-    objective_type = config["type"]
-    if objective_type == "characters":
-        return partial(get_characters_loss, nr_char=config["n_characters"])
-    elif objective_type == "vectors":
-        if config["loss"] == "cosine":
-            return partial(
-                get_vectors_loss,
-                distance=CosineDistance(normalize=True, ignore_zeros=True),
-            )
-        elif config["loss"] == "L2":
-            return partial(
-                get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
-            )
-        else:
-            raise ValueError("Unexpected loss type", config["loss"])
-    else:
-        raise ValueError("Unexpected objective_type", objective_type)
-
-
-def get_vectors_loss(ops, docs, prediction, distance):
-    """Compute a loss based on a distance between the documents' vectors and
-    the prediction.
-    """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    d_target, loss = distance(prediction, target)
-    return loss, d_target
-
-
-def get_characters_loss(ops, docs, prediction, nr_char):
-    """Compute a loss based on a number of characters predicted from the docs."""
-    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
-    target_ids = target_ids.reshape((-1,))
-    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
-    target = target.reshape((-1, 256 * nr_char))
-    diff = prediction - target
-    loss = (diff ** 2).sum()
-    d_target = diff / float(prediction.shape[0])
-    return loss, d_target
-
-
-def create_pretraining_model(nlp, pretrain_config):
-    """Define a network for the pretraining. We simply add an output layer onto
-    the tok2vec input model. The tok2vec input model needs to be a model that
-    takes a batch of Doc objects (as a list), and returns a list of arrays.
-    Each array in the output needs to have one row per token in the doc.
-    The actual tok2vec layer is stored as a reference, and only this bit will be
-    serialized to file and read back in when calling the 'train' command.
-    """
-    component = nlp.get_pipe(pretrain_config["component"])
-    if pretrain_config.get("layer"):
-        tok2vec = component.model.get_ref(pretrain_config["layer"])
-    else:
-        tok2vec = component.model
-
-    # TODO
-    maxout_pieces = 3
-    hidden_size = 300
-    if pretrain_config["objective"]["type"] == "vectors":
-        model = build_cloze_multi_task_model(
-            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
-        )
-    elif pretrain_config["objective"]["type"] == "characters":
-        model = build_cloze_characters_multi_task_model(
-            nlp.vocab,
-            tok2vec,
-            hidden_size=hidden_size,
-            maxout_pieces=maxout_pieces,
-            nr_char=pretrain_config["objective"]["n_characters"],
-        )
-    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
-    set_dropout_rate(model, pretrain_config["dropout"])
-    return model
-
-
-class ProgressTracker:
-    def __init__(self, frequency=1000000):
-        self.loss = 0.0
-        self.prev_loss = 0.0
-        self.nr_word = 0
-        self.words_per_epoch = Counter()
-        self.frequency = frequency
-        self.last_time = time.time()
-        self.last_update = 0
-        self.epoch_loss = 0.0
-
-    def update(self, epoch, loss, docs):
-        self.loss += loss
-        self.epoch_loss += loss
-        words_in_batch = sum(len(doc) for doc in docs)
-        self.words_per_epoch[epoch] += words_in_batch
-        self.nr_word += words_in_batch
-        words_since_update = self.nr_word - self.last_update
-        if words_since_update >= self.frequency:
-            wps = words_since_update / (time.time() - self.last_time)
-            self.last_update = self.nr_word
-            self.last_time = time.time()
-            loss_per_word = self.loss - self.prev_loss
-            status = (
-                epoch,
-                self.nr_word,
-                _smart_round(self.loss, width=10),
-                _smart_round(loss_per_word, width=6),
-                int(wps),
-            )
-            self.prev_loss = float(self.loss)
-            return status
-        else:
-            return None
-
-
-def _smart_round(figure, width=10, max_decimal=4):
-    """Round large numbers as integers, smaller numbers as decimals."""
-    n_digits = len(str(int(figure)))
-    n_decimal = width - (n_digits + 1)
-    if n_decimal <= 1:
-        return str(int(figure))
-    else:
-        n_decimal = min(n_decimal, max_decimal)
-        format_str = "%." + str(n_decimal) + "f"
-        return format_str % figure
-
-
 def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
    if not config_path or not config_path.exists():
        msg.fail("Config file not found", config_path, exits=1)
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@ -278,11 +278,6 @@ path = ${paths.dev}
 max_length = 0

 [training]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
 {% endif -%}
@ -318,3 +313,10 @@ start = 100
 stop = 1000
 compound = 1.001
 {% endif %}
+
+[initialize]
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -1,23 +1,16 @@
-from typing import Optional, Dict, Any, Tuple, Union, Callable, List
-from timeit import default_timer as timer
-import srsly
-import tqdm
+from typing import Optional
 from pathlib import Path
 from wasabi import msg
-import thinc
-import thinc.schedules
-from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
-import random
+from thinc.api import Config
 import typer
 import logging

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
+from ._util import import_code, setup_gpu
 from ..language import Language
+from ..training.loop import train
+from ..training.initialize import init_nlp, must_reinitialize
 from .. import util
-from ..training.example import Example
-from ..errors import Errors
-from ..util import dot_to_object


@app.command(
@ -30,8 +23,7 @@ def train_cli(
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
-    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
-    resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
 ):
    """
@ -52,389 +44,37 @@ def train_cli(
    verify_cli_args(config_path, output_path)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
-    train(
-        config_path,
-        output_path=output_path,
-        config_overrides=overrides,
-        use_gpu=use_gpu,
-        resume_training=resume,
-    )
-
-
-def train(
-    config_path: Path,
-    output_path: Optional[Path] = None,
-    config_overrides: Dict[str, Any] = {},
-    use_gpu: int = -1,
-    resume_training: bool = False,
-) -> None:
-    if use_gpu >= 0:
-        msg.info(f"Using GPU: {use_gpu}")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
-    msg.info(f"Loading config and nlp from: {config_path}")
+    setup_gpu(use_gpu)
    with show_validation_error(config_path):
-        # Keep an un-interpolated config so we can preserve variables in
-        # the final nlp object we train and serialize
-        raw_config = util.load_config(
-            config_path, overrides=config_overrides, interpolate=False
-        )
-    config = raw_config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    # Use original config here before it's resolved to functions
-    sourced_components = get_sourced_components(config)
-    with show_validation_error(config_path):
-        nlp = util.load_model_from_config(raw_config)
-        # Resolve all training-relevant sections using the filled nlp config
-        C = util.resolve_training_config(nlp.config)
-    util.load_vocab_data_into_model(nlp, lookups=C["training"]["lookups"])
-    if C["training"]["vectors"] is not None:
-        add_vectors(nlp, C["training"]["vectors"])
-    raw_text, tag_map, morph_rules, weights_data = load_from_paths(C)
-    T_cfg = C["training"]
-    optimizer = T_cfg["optimizer"]
-    train_corpus = dot_to_object(C, T_cfg["train_corpus"])
-    dev_corpus = dot_to_object(C, T_cfg["dev_corpus"])
-    batcher = T_cfg["batcher"]
-    train_logger = T_cfg["logger"]
-    before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T_cfg["frozen_components"]
-    # Sourced components that require resume_training
-    resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if resume_components:
-        with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
-            nlp.resume_training(sgd=optimizer)
-    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
-    # Verify the config after calling 'begin_training' to ensure labels are properly initialized
-    verify_config(nlp)
-
-    if tag_map:
-        # Replace tag map with provided mapping
-        nlp.vocab.morphology.load_tag_map(tag_map)
-    if morph_rules:
-        # Load morph rules
-        nlp.vocab.morphology.load_morph_exceptions(morph_rules)
-
-    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
-    if weights_data is not None:
-        tok2vec_component = C["pretraining"]["component"]
-        if tok2vec_component is None:
-            msg.fail(
-                f"To use pretrained tok2vec weights, [pretraining.component] "
-                f"needs to specify the component that should load them.",
-                exits=1,
-            )
-        layer = nlp.get_pipe(tok2vec_component).model
-        tok2vec_layer = C["pretraining"]["layer"]
-        if tok2vec_layer:
-            layer = layer.get_ref(tok2vec_layer)
-        layer.from_bytes(weights_data)
-        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
-
-    # Create iterator, which yields out info after each optimization step.
-    msg.info("Start training")
-    score_weights = T_cfg["score_weights"]
-    training_step_iterator = train_while_improving(
-        nlp,
-        optimizer,
-        create_train_batches(train_corpus(nlp), batcher, T_cfg["max_epochs"]),
-        create_evaluation_callback(nlp, dev_corpus, score_weights),
-        dropout=T_cfg["dropout"],
-        accumulate_gradient=T_cfg["accumulate_gradient"],
-        patience=T_cfg["patience"],
-        max_steps=T_cfg["max_steps"],
-        eval_frequency=T_cfg["eval_frequency"],
-        raw_text=None,
-        exclude=frozen_components,
-    )
-    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
-    with nlp.select_pipes(disable=frozen_components):
-        print_row, finalize_logger = train_logger(nlp)
-
-    try:
-        progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
-        progress.set_description(f"Epoch 1")
-        for batch, info, is_best_checkpoint in training_step_iterator:
-            progress.update(1)
-            if is_best_checkpoint is not None:
-                progress.close()
-                print_row(info)
-                if is_best_checkpoint and output_path is not None:
-                    with nlp.select_pipes(disable=frozen_components):
-                        update_meta(T_cfg, nlp, info)
-                    with nlp.use_params(optimizer.averages):
-                        nlp = before_to_disk(nlp)
-                        nlp.to_disk(output_path / "model-best")
-                progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
-                progress.set_description(f"Epoch {info['epoch']}")
-    except Exception as e:
-        finalize_logger()
-        if output_path is not None:
-            # We don't want to swallow the traceback if we don't have a
-            # specific error.
-            msg.warn(
-                f"Aborting and saving the final best model. "
-                f"Encountered exception: {str(e)}"
-            )
-            nlp = before_to_disk(nlp)
-            nlp.to_disk(output_path / "model-final")
-        raise e
-    finally:
-        finalize_logger()
-        if output_path is not None:
-            final_model_path = output_path / "model-final"
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    nlp.to_disk(final_model_path)
-            else:
-                nlp.to_disk(final_model_path)
-            msg.good(f"Saved pipeline to output directory {final_model_path}")
+        config = util.load_config(config_path, overrides=overrides, interpolate=False)
+    msg.divider("Initializing pipeline")
+    nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
+    msg.divider("Training pipeline")
+    train(nlp, output_path, use_gpu=use_gpu, silent=False)


-def add_vectors(nlp: Language, vectors: str) -> None:
-    title = f"Config validation error for vectors {vectors}"
-    desc = (
-        "This typically means that there's a problem in the config.cfg included "
-        "with the packaged vectors. Make sure that the vectors package you're "
-        "loading is compatible with the current version of spaCy."
-    )
-    with show_validation_error(
-        title=title, desc=desc, hint_fill=False, show_config=False
-    ):
-        util.load_vectors_into_model(nlp, vectors)
-
-
-def create_train_batches(iterator, batcher, max_epochs: int):
-    epoch = 0
-    examples = list(iterator)
-    if not examples:
-        # Raise error if no data
-        raise ValueError(Errors.E986)
-    while max_epochs < 1 or epoch != max_epochs:
-        random.shuffle(examples)
-        for batch in batcher(examples):
-            yield epoch, batch
-        epoch += 1
-
-
-def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
-) -> Callable[[], Tuple[float, Dict[str, float]]]:
-    weights = {key: value for key, value in weights.items() if value is not None}
-
-    def evaluate() -> Tuple[float, Dict[str, float]]:
-        dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
-        # Calculate a weighted sum based on score_weights for the main score.
-        # We can only consider scores that are ints/floats, not dicts like
-        # entity scores per type etc.
-        for key, value in scores.items():
-            if key in weights and not isinstance(value, (int, float)):
-                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
-        try:
-            weighted_score = sum(
-                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
-            )
-        except KeyError as e:
-            keys = list(scores.keys())
-            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
-            raise KeyError(err) from None
-        return weighted_score, scores
-
-    return evaluate
-
-
-def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
-    def before_to_disk(nlp: Language) -> Language:
-        if not callback:
-            return nlp
-        modified_nlp = callback(nlp)
-        if not isinstance(modified_nlp, Language):
-            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
-            raise ValueError(err)
-        return modified_nlp
-
-    return before_to_disk
-
-
-def train_while_improving(
-    nlp: Language,
-    optimizer: Optimizer,
-    train_data,
-    evaluate,
-    *,
-    dropout: float,
-    eval_frequency: int,
-    accumulate_gradient: int,
-    patience: int,
-    max_steps: int,
-    raw_text: List[Dict[str, str]],
-    exclude: List[str],
-):
-    """Train until an evaluation stops improving. Works as a generator,
-    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
-    where info is a dict, and is_best_checkpoint is in [True, False, None] --
-    None indicating that the iteration was not evaluated as a checkpoint.
-    The evaluation is conducted by calling the evaluate callback.
-
-    Positional arguments:
-        nlp: The spaCy pipeline to evaluate.
-        optimizer: The optimizer callable.
-        train_data (Iterable[Batch]): A generator of batches, with the training
-            data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
-            data iterable needs to take care of iterating over the epochs and
-            shuffling.
-        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
-            The callback should take no arguments and return a tuple
-            `(main_score, other_scores)`. The main_score should be a float where
-            higher is better. other_scores can be any object.
-
-    Every iteration, the function yields out a tuple with:
-
-    * batch: A list of Example objects.
-    * info: A dict with various information about the last update (see below).
-    * is_best_checkpoint: A value in None, False, True, indicating whether this
-        was the best evaluation so far. You should use this to save the model
-        checkpoints during training. If None, evaluation was not conducted on
-        that iteration. False means evaluation was conducted, but a previous
-        evaluation was better.
-
-    The info dict provides the following information:
-
-        epoch (int): How many passes over the data have been completed.
-        step (int): How many steps have been completed.
-        score (float): The main score from the last evaluation.
-        other_scores: : The other scores from the last evaluation.
-        losses: The accumulated losses throughout training.
-        checkpoints: A list of previous results, where each result is a
-            (score, step, epoch) tuple.
-    """
-    if isinstance(dropout, float):
-        dropouts = thinc.schedules.constant(dropout)
-    else:
-        dropouts = dropout
-    results = []
-    losses = {}
-    if raw_text:
-        random.shuffle(raw_text)
-        raw_examples = [
-            Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
-        ]
-        raw_batches = util.minibatch(raw_examples, size=8)
-
-    words_seen = 0
-    start_time = timer()
-    for step, (epoch, batch) in enumerate(train_data):
-        dropout = next(dropouts)
-        for subbatch in subdivide_batch(batch, accumulate_gradient):
-
-            nlp.update(
-                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
-            )
-            if raw_text:
-                # If raw text is available, perform 'rehearsal' updates,
-                # which use unlabelled data to reduce overfitting.
-                raw_batch = list(next(raw_batches))
-                nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude)
-        # TODO: refactor this so we don't have to run it separately in here
-        for name, proc in nlp.pipeline:
-            if (
-                name not in exclude
-                and hasattr(proc, "model")
-                and proc.model not in (True, False, None)
-            ):
-                proc.model.finish_update(optimizer)
-        optimizer.step_schedules()
-        if not (step % eval_frequency):
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    score, other_scores = evaluate()
-            else:
-                score, other_scores = evaluate()
-            results.append((score, step))
-            is_best_checkpoint = score == max(results)[0]
+def init_pipeline(
+    config: Config, output_path: Optional[Path], *, use_gpu: int = -1
+) -> Language:
+    init_kwargs = {"use_gpu": use_gpu}
+    if output_path is not None:
+        init_path = output_path / "model-initial"
+        if not init_path.exists():
+            msg.info(f"Initializing the pipeline in {init_path}")
+            nlp = init_nlp(config, **init_kwargs)
+            nlp.to_disk(init_path)
+            msg.good(f"Saved initialized pipeline to {init_path}")
        else:
-            score, other_scores = (None, None)
-            is_best_checkpoint = None
-        words_seen += sum(len(eg) for eg in batch)
-        info = {
-            "epoch": epoch,
-            "step": step,
-            "score": score,
-            "other_scores": other_scores,
-            "losses": losses,
-            "checkpoints": results,
-            "seconds": int(timer() - start_time),
-            "words": words_seen,
-        }
-        yield batch, info, is_best_checkpoint
-        if is_best_checkpoint is not None:
-            losses = {}
-        # Stop if no improvement in `patience` updates (if specified)
-        best_score, best_step = max(results)
-        if patience and (step - best_step) >= patience:
-            break
-        # Stop if we've exhausted our max steps (if specified)
-        if max_steps and step >= max_steps:
-            break
-
-
-def subdivide_batch(batch, accumulate_gradient):
-    batch = list(batch)
-    batch.sort(key=lambda eg: len(eg.predicted))
-    sub_len = len(batch) // accumulate_gradient
-    start = 0
-    for i in range(accumulate_gradient):
-        subbatch = batch[start : start + sub_len]
-        if subbatch:
-            yield subbatch
-        start += len(subbatch)
-    subbatch = batch[start:]
-    if subbatch:
-        yield subbatch
-
-
-def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
-) -> None:
-    nlp.meta["performance"] = {}
-    for metric in training["score_weights"]:
-        if metric is not None:
-            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
-    for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
-
-
-def load_from_paths(
-    config: Config,
-) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
-    # TODO: separate checks from loading
-    raw_text = util.ensure_path(config["training"]["raw_text"])
-    if raw_text is not None:
-        if not raw_text.exists():
-            msg.fail("Can't find raw text", raw_text, exits=1)
-        raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
-    tag_map = {}
-    morph_rules = {}
-    weights_data = None
-    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
-    if init_tok2vec is not None:
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    return raw_text, tag_map, morph_rules, weights_data
+            nlp = util.load_model(init_path)
+            if must_reinitialize(config, nlp.config):
+                msg.warn("Config has changed: need to re-initialize pipeline")
+                nlp = init_nlp(config, **init_kwargs)
+                nlp.to_disk(init_path)
+                msg.good(f"Re-initialized pipeline in {init_path}")
+            else:
+                msg.good(f"Loaded initialized pipeline from {init_path}")
+        return nlp
+    return init_nlp(config, **init_kwargs)


 def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
@ -445,30 +85,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No
        if not output_path.exists():
            output_path.mkdir()
            msg.good(f"Created output directory: {output_path}")
-
-
-def verify_config(nlp: Language) -> None:
-    """Perform additional checks based on the config, loaded nlp object and training data."""
-    # TODO: maybe we should validate based on the actual components, the list
-    # in config["nlp"]["pipeline"] instead?
-    for pipe_config in nlp.config["components"].values():
-        # We can't assume that the component name == the factory
-        factory = pipe_config["factory"]
-        if factory == "textcat":
-            verify_textcat_config(nlp, pipe_config)
-
-
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
-    # if 'positive_label' is provided: double check whether it's in the data and
-    # the task is binary
-    if pipe_config.get("positive_label"):
-        textcat_labels = nlp.get_pipe("textcat").labels
-        pos_label = pipe_config.get("positive_label")
-        if pos_label not in textcat_labels:
-            raise ValueError(
-                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
-            )
-        if len(list(textcat_labels)) != 2:
-            raise ValueError(
-                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
-            )
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@ -1,7 +1,8 @@
 [paths]
 train = ""
 dev = ""
-raw = null
+vectors = null
+vocab_data = null
 init_tok2vec = null

 [system]
@ -59,11 +60,6 @@ seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
-# Extra resources for transfer-learning or pseudo-rehearsal
-init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
-vectors = null
-lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0
@ -104,3 +100,18 @@ grad_clip = 1.0
 use_averages = false
 eps = 1e-8
 learn_rate = 0.001
+
+# The 'initialize' step runs before training or pretraining. Components and the
+# tokenizer can each define arguments on their .initialize methods, which are
+# populated from this block. This lets them gather resources like lookup
+# tables, build label sets, construct vocabularies, etc.
+[initialize]
+vocab_data = ${paths.vocab_data}
+lookups = null
+vectors = ${paths.vectors}
+# Extra resources for transfer-learning or pseudo-rehearsal
+init_tok2vec = ${paths.init_tok2vec}
+# Arguments passed to the tokenizer's initialize method
+tokenizer = {}
+# Arguments passed to the initialize methods of the components (keyed by component name)
+components = {}
--- a/spacy/default_config_pretraining.cfg
+++ b/spacy/default_config_pretraining.cfg
@ -1,3 +1,6 @@
+[paths]
+raw_text = null
+
 [pretraining]
 max_epochs = 1000
 dropout = 0.2
@ -32,7 +35,7 @@ learn_rate = 0.001

 [corpora.pretrain]
@readers = "spacy.JsonlReader.v1"
-path = ${paths.raw}
+path = ${paths.raw_text}
 min_length = 5
 max_length = 500
 limit = 0
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -85,6 +85,7 @@ class Warnings:
            "attribute or operator.")

    # TODO: fix numbering after merging develop into master
+    W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
    W090 = ("Could not locate any {format} files in path '{path}'.")
    W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
    W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
@ -306,7 +307,7 @@ class Errors:
            "settings: {opts}")
    E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
    E109 = ("Component '{name}' could not be run. Did you forget to "
-            "call begin_training()?")
+            "call initialize()?")
    E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
    E111 = ("Pickling a token is not supported, because tokens are only views "
            "of the parent Doc and can't exist on their own. A pickled token "
@ -376,7 +377,7 @@ class Errors:
            "provided {found}.")
    E143 = ("Labels for component '{name}' not initialized. This can be fixed "
            "by calling add_label, or by providing a representative batch of "
-            "examples to the component's begin_training method.")
+            "examples to the component's initialize method.")
    E145 = ("Error reading `{param}` from input file.")
    E146 = ("Could not access `{path}`.")
    E147 = ("Unexpected error in the {method} functionality of the "
@ -517,7 +518,7 @@ class Errors:
            "but the provided argument {loc} points to a file.")
    E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
            "not seem to exist.")
-    E930 = ("Received invalid get_examples callback in {name}.begin_training. "
+    E930 = ("Received invalid get_examples callback in {name}.initialize. "
            "Expected function that returns an iterable of Example objects but "
            "got: {obj}")
    E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
--- a/spacy/language.py
+++ b/spacy/language.py
@ -8,7 +8,7 @@ from contextlib import contextmanager
 from copy import deepcopy
 from pathlib import Path
 import warnings
-from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer
+from thinc.api import Model, get_current_ops, Config, Optimizer
 import srsly
 import multiprocessing as mp
 from itertools import chain, cycle
@ -18,8 +18,9 @@ from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .training import Example, validate_examples
+from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
-from .util import create_default_optimizer, registry, SimpleFrozenList
+from .util import registry, SimpleFrozenList
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
 from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp
+from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
+from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
 from . import util
 from . import about
@ -1065,7 +1067,7 @@ class Language:
        validate_examples(examples, "Language.update")
        if sgd is None:
            if self._optimizer is None:
-                self._optimizer = create_default_optimizer()
+                self._optimizer = self.create_optimizer()
            sgd = self._optimizer
        if component_cfg is None:
            component_cfg = {}
@ -1123,7 +1125,7 @@ class Language:
        validate_examples(examples, "Language.rehearse")
        if sgd is None:
            if self._optimizer is None:
-                self._optimizer = create_default_optimizer()
+                self._optimizer = self.create_optimizer()
            sgd = self._optimizer
        pipes = list(self.pipeline)
        random.shuffle(pipes)
@ -1153,61 +1155,73 @@ class Language:
        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
        *,
        sgd: Optional[Optimizer] = None,
-        device: int = -1,
+    ) -> Optimizer:
+        warnings.warn(Warnings.W089, DeprecationWarning)
+        return self.initialize(get_examples, sgd=sgd)
+
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        sgd: Optional[Optimizer] = None,
    ) -> Optimizer:
        """Initialize the pipe for training, using data examples if available.

        get_examples (Callable[[], Iterable[Example]]): Optional function that
            returns gold-standard Example objects.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
+        sgd (Optional[Optimizer]): An optimizer to use for updates. If not
+            provided, will be created using the .create_optimizer() method.
        RETURNS (thinc.api.Optimizer): The optimizer.

-        DOCS: https://nightly.spacy.io/api/language#begin_training
+        DOCS: https://nightly.spacy.io/api/language#initialize
        """
        if get_examples is None:
            util.logger.debug(
-                "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
+                "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
            )
            doc = Doc(self.vocab, words=["x", "y", "z"])
            get_examples = lambda: [Example.from_dict(doc, {})]
-        # Populate vocab
        if not hasattr(get_examples, "__call__"):
            err = Errors.E930.format(name="Language", obj=type(get_examples))
            raise ValueError(err)
-        valid_examples = False
-        for example in get_examples():
-            if not isinstance(example, Example):
-                err = Errors.E978.format(
-                    name="Language.begin_training", types=type(example)
-                )
-                raise ValueError(err)
-            else:
-                valid_examples = True
-            for word in [t.text for t in example.reference]:
-                _ = self.vocab[word]  # noqa: F841
-        if not valid_examples:
-            err = Errors.E930.format(name="Language", obj="empty list")
-            raise ValueError(err)
-        if device >= 0:  # TODO: do we need this here?
-            require_gpu(device)
-            if self.vocab.vectors.data.shape[1] >= 1:
-                ops = get_current_ops()
-                self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        if sgd is None:
-            sgd = create_default_optimizer()
-        self._optimizer = sgd
+        # Make sure the config is interpolated so we can resolve subsections
+        config = self.config.interpolate()
+        # These are the settings provided in the [initialize] block in the config
+        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        init_vocab(
+            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"],
+        )
+        pretrain_cfg = config.get("pretraining")
+        if pretrain_cfg:
+            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
+            init_tok2vec(self, P, I)
+        if self.vocab.vectors.data.shape[1] >= 1:
+            ops = get_current_ops()
+            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
+        if hasattr(self.tokenizer, "initialize"):
+            tok_settings = validate_init_settings(
+                self.tokenizer.initialize,
+                I["tokenizer"],
+                section="tokenizer",
+                name="tokenizer",
+            )
+            self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
        for name, proc in self.pipeline:
-            if hasattr(proc, "begin_training"):
-                proc.begin_training(
-                    get_examples, pipeline=self.pipeline, sgd=self._optimizer
+            if hasattr(proc, "initialize"):
+                p_settings = I["components"].get(name, {})
+                p_settings = validate_init_settings(
+                    proc.initialize, p_settings, section="components", name=name
                )
+                proc.initialize(get_examples, nlp=self, **p_settings)
        self._link_components()
+        self._optimizer = sgd
+        if sgd is not None:
+            self._optimizer = sgd
+        elif self._optimizer is None:
+            self._optimizer = self.create_optimizer()
        return self._optimizer

-    def resume_training(
-        self, *, sgd: Optional[Optimizer] = None, device: int = -1
-    ) -> Optimizer:
+    def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
        """Continue training a pretrained model.

        Create and return an optimizer, and initialize "rehearsal" for any pipeline
@ -1216,22 +1230,20 @@ class Language:
        rehearsal, collect samples of text you want the models to retain performance
        on, and call nlp.rehearse() with a batch of Example objects.

-        sgd (Optional[Optimizer]): An optimizer.
        RETURNS (Optimizer): The optimizer.

        DOCS: https://nightly.spacy.io/api/language#resume_training
        """
-        if device >= 0:  # TODO: do we need this here?
-            require_gpu(device)
-            ops = get_current_ops()
-            if self.vocab.vectors.data.shape[1] >= 1:
-                self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        if sgd is None:
-            sgd = create_default_optimizer()
-        self._optimizer = sgd
+        ops = get_current_ops()
+        if self.vocab.vectors.data.shape[1] >= 1:
+            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
        for name, proc in self.pipeline:
            if hasattr(proc, "_rehearsal_model"):
                proc._rehearsal_model = deepcopy(proc.model)
+        if sgd is not None:
+            self._optimizer = sgd
+        elif self._optimizer is None:
+            self._optimizer = self.create_optimizer()
        return self._optimizer

    def evaluate(
@ -1293,6 +1305,11 @@ class Language:
        results["speed"] = n_words / (end_time - start_time)
        return results

+    def create_optimizer(self):
+        """Create an optimizer, usually using the [training.optimizer] config."""
+        subconfig = {"optimizer": self.config["training"]["optimizer"]}
+        return registry.resolve(subconfig)["optimizer"]
+
    @contextmanager
    def use_params(self, params: Optional[dict]):
        """Replace weights of models in the pipeline with those provided in the
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@ -126,13 +126,13 @@ cdef class DependencyParser(Parser):
    def add_multitask_objective(self, mt_component):
        self._multitasks.append(mt_component)

-    def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+    def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
        # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
        for labeller in self._multitasks:
            labeller.model.set_dim("nO", len(self.labels))
            if labeller.model.has_ref("output_layer"):
                labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-            labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
+            labeller.initialize(get_examples, nlp=nlp)

    @property
    def labels(self):
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@ -1,5 +1,5 @@
 from itertools import islice
-from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple
+from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List
 from pathlib import Path
 import srsly
 import random
@ -140,26 +140,20 @@ class EntityLinker(Pipe):
        if len(self.kb) == 0:
            raise ValueError(Errors.E139.format(name=self.name))

-    def begin_training(
+    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
-        pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
-        sgd: Optional[Optimizer] = None,
-    ) -> Optimizer:
+        nlp: Optional[Language] = None,
+    ):
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.

-        DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
+        DOCS: https://nightly.spacy.io/api/entitylinker#initialize
        """
        self._ensure_examples(get_examples)
        self._require_kb()
@ -174,9 +168,6 @@ class EntityLinker(Pipe):
        self.model.initialize(
            X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
        )
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd

    def update(
        self,
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Optional
+from typing import Optional, Union, Dict
 import srsly
 from thinc.api import SequenceCategoricalCrossentropy, Model, Config
 from itertools import islice
@ -101,6 +101,11 @@ class Morphologizer(Tagger):
        """RETURNS (Tuple[str]): The labels currently added to the component."""
        return tuple(self.cfg["labels_morph"].keys())

+    @property
+    def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
+        """A dictionary with all labels data."""
+        return {"morph": self.cfg["labels_morph"], "pos": self.cfg["labels_pos"]}
+
    def add_label(self, label):
        """Add a new label to the pipe.

@ -129,20 +134,15 @@ class Morphologizer(Tagger):
            self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
        return 1

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, nlp=None):
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.

-        DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
+        DOCS: https://nightly.spacy.io/api/morphologizer#initialize
        """
        self._ensure_examples(get_examples)
        # First, fetch all labels from the data
@ -178,9 +178,6 @@ class Morphologizer(Tagger):
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=label_sample)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd

    def set_annotations(self, docs, batch_tag_ids):
        """Modify a batch of documents, using pre-computed scores.
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
    def set_annotations(self, docs, dep_ids):
        pass

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, nlp=None):
        if not hasattr(get_examples, "__call__"):
            err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
            raise ValueError(err)
@ -91,9 +91,6 @@ class MultitaskObjective(Tagger):
                if label is not None and label not in self.labels:
                    self.labels[label] = len(self.labels)
        self.model.initialize()   # TODO: fix initialization by defining X and Y
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd

    def predict(self, docs):
        tokvecs = self.model.get_ref("tok2vec")(docs)
@ -177,13 +174,10 @@ class ClozeMultitask(Pipe):
    def set_annotations(self, docs, dep_ids):
        pass

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, nlp=None):
        self.model.initialize()  # TODO: fix initialization by defining X and Y
        X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
-        self.model.output_layer.begin_training(X)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
+        self.model.output_layer.initialize(X)

    def predict(self, docs):
        tokvecs = self.model.get_ref("tok2vec")(docs)
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@ -96,14 +96,14 @@ cdef class EntityRecognizer(Parser):
        """Register another component as a multi-task objective. Experimental."""
        self._multitasks.append(mt_component)

-    def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+    def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
        """Setup multi-task objective components. Experimental and internal."""
        # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
        for labeller in self._multitasks:
            labeller.model.set_dim("nO", len(self.labels))
            if labeller.model.has_ref("output_layer"):
                labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-            labeller.begin_training(get_examples, pipeline=pipeline)
+            labeller.initialize(get_examples, nlp=nlp)

    @property
    def labels(self):
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@ -1,4 +1,5 @@
 # cython: infer_types=True, profile=True
+from typing import Optional, Tuple
 import srsly
 from thinc.api import set_dropout_rate, Model

@ -32,6 +33,17 @@ cdef class Pipe:
        self.name = name
        self.cfg = dict(cfg)

+    @property
+    def labels(self) -> Optional[Tuple[str]]:
+        return []
+    
+    @property
+    def label_data(self):
+        """Optional JSON-serializable data that would be sufficient to recreate
+        the label set if provided to the `pipe.initialize()` method.
+        """
+        return None
+
    def __call__(self, Doc doc):
        """Apply the pipe to one document. The document is modified in place,
        and returned. This usually happens under the hood when the nlp object
@ -183,7 +195,7 @@ cdef class Pipe:
        """
        return util.create_default_optimizer()

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, nlp=None):
        """Initialize the pipe for training, using data examples if available.
        This method needs to be implemented by each Pipe component,
        ensuring the internal model (if available) is initialized properly
@ -191,16 +203,11 @@ cdef class Pipe:

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.

-        DOCS: https://nightly.spacy.io/api/pipe#begin_training
+        DOCS: https://nightly.spacy.io/api/pipe#initialize
        """
-        raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
+        raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name))

    def _ensure_examples(self, get_examples):
        if get_examples is None or not hasattr(get_examples, "__call__"):
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@ -58,7 +58,7 @@ class Sentencizer(Pipe):
        else:
            self.punct_chars = set(self.default_punct_chars)

-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, nlp=None):
        pass

    def __call__(self, doc):
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger):
        # are 0
        return tuple(["I", "S"])

+    @property
+    def label_data(self):
+        return self.labels
+
    def set_annotations(self, docs, batch_tag_ids):
        """Modify a batch of documents, using pre-computed scores.

@ -124,20 +128,15 @@ class SentenceRecognizer(Tagger):
            raise ValueError("nan value when computing loss")
        return float(loss), d_scores

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, nlp=None):
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.

-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
+        DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
        """
        self._ensure_examples(get_examples)
        doc_sample = []
@ -151,9 +150,6 @@ class SentenceRecognizer(Tagger):
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=label_sample)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd

    def add_label(self, label, values=None):
        raise NotImplementedError
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@ -90,6 +90,11 @@ class Tagger(Pipe):
        """
        return tuple(self.cfg["labels"])

+    @property
+    def label_data(self):
+        """Data about the labels currently added to the component."""
+        return tuple(self.cfg["labels"])
+
    def __call__(self, doc):
        """Apply the pipe to a Doc.

@ -256,31 +261,30 @@ class Tagger(Pipe):
            raise ValueError("nan value when computing loss")
        return float(loss), d_scores

-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, nlp=None, labels=None):
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects..
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.

-        DOCS: https://nightly.spacy.io/api/tagger#begin_training
+        DOCS: https://nightly.spacy.io/api/tagger#initialize
        """
        self._ensure_examples(get_examples)
+        if labels is not None:
+            for tag in labels:
+                self.add_label(tag)
+        else:
+            tags = set()
+            for example in get_examples():
+                for token in example.y:
+                    if token.tag_:
+                        tags.add(token.tag_)
+            for tag in sorted(tags):
+                self.add_label(tag)
        doc_sample = []
        label_sample = []
-        tags = set()
-        for example in get_examples():
-            for token in example.y:
-                if token.tag_:
-                    tags.add(token.tag_)
-        for tag in sorted(tags):
-            self.add_label(tag)
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)
            gold_tags = example.get_aligned("TAG", as_string=True)
@ -289,9 +293,6 @@ class Tagger(Pipe):
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=label_sample)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd

    def add_label(self, label):
        """Add a new label to the pipe.
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@ -154,8 +154,16 @@ class TextCategorizer(Pipe):

    @labels.setter
    def labels(self, value: List[str]) -> None:
+        # TODO: This setter really shouldn't exist. It was added together with
+        # the labels property, but overwriting cfg["labels"] directly like this
+        # is nasty and will lead to problems.
        self.cfg["labels"] = tuple(value)

+    @property
+    def label_data(self) -> List[str]:
+        """RETURNS (List[str]): Information about the component's labels."""
+        return self.labels
+
    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
        """Apply the pipe to a stream of documents. This usually happens under
        the hood when the nlp object is called on a text and all components are
@ -334,43 +342,37 @@ class TextCategorizer(Pipe):
        self.labels = tuple(list(self.labels) + [label])
        return 1

-    def begin_training(
+    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
-        pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
-        sgd: Optional[Optimizer] = None,
-    ) -> Optimizer:
+        nlp: Optional[Language] = None,
+        labels: Optional[Dict] = None
+    ):
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.

-        DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
+        DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
        """
        self._ensure_examples(get_examples)
-        subbatch = []  # Select a subbatch of examples to initialize the model
-        for example in islice(get_examples(), 10):
-            if len(subbatch) < 2:
-                subbatch.append(example)
-            for cat in example.y.cats:
-                self.add_label(cat)
+        if labels is None:
+            for example in get_examples():
+                for cat in example.y.cats:
+                    self.add_label(cat)
+        else:
+            for label in labels:
+                self.add_label(label)
+        subbatch = list(islice(get_examples(), 10))
        doc_sample = [eg.reference for eg in subbatch]
        label_sample, _ = self._examples_to_truth(subbatch)
        self._require_labels()
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=label_sample)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd

    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        """Score a batch of examples.
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@ -1,4 +1,4 @@
-from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
+from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List
 from thinc.api import Model, set_dropout_rate, Optimizer, Config
 from itertools import islice

@ -203,26 +203,20 @@ class Tok2Vec(Pipe):
    def get_loss(self, examples, scores) -> None:
        pass

-    def begin_training(
+    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
-        pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
-        sgd: Optional[Optimizer] = None,
+        nlp: Optional[Language] = None,
    ):
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.

-        DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
+        DOCS: https://nightly.spacy.io/api/tok2vec#initialize
        """
        self._ensure_examples(get_examples)
        doc_sample = []
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@ -1,4 +1,4 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
+# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 from __future__ import print_function
 from cymem.cymem cimport Pool
 cimport numpy as np
@ -7,6 +7,7 @@ from libcpp.vector cimport vector
 from libc.string cimport memset
 from libc.stdlib cimport calloc, free
 import random
+from typing import Optional

 import srsly
 from thinc.api import set_dropout_rate
@ -95,6 +96,10 @@ cdef class Parser(Pipe):
        class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
        return class_names

+    @property
+    def label_data(self):
+        """Return the label data of the transition system, i.e. the value of
+        self.moves.labels."""
+        return self.moves.labels
+
    @property
    def tok2vec(self):
        """Return the embedding and convolutional layer of the model."""
@ -354,7 +359,7 @@ cdef class Parser(Pipe):
            # If all weights for an output are 0 in the original model, don't
            # supervise that output. This allows us to add classes.
            loss += (d_scores**2).sum()
-            backprop(d_scores, sgd=sgd)
+            backprop(d_scores)
            # Follow the predicted action
            self.transition_states(states, guesses)
            states = [state for state in states if not state.is_final()]
@ -405,18 +410,20 @@ cdef class Parser(Pipe):
    def set_output(self, nO):
        self.model.attrs["resize_output"](self.model, nO)

-    def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
+    def initialize(self, get_examples, nlp=None, labels=None):
        self._ensure_examples(get_examples)
-        self.cfg.update(kwargs)
        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
            langs = ", ".join(util.LEXEME_NORM_LANGS)
            util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
-        actions = self.moves.get_actions(
-            examples=get_examples(),
-            min_freq=self.cfg['min_action_freq'],
-            learn_tokens=self.cfg["learn_tokens"]
-        )
+        if labels is not None:
+            actions = dict(labels)
+        else:
+            actions = self.moves.get_actions(
+                examples=get_examples(),
+                min_freq=self.cfg['min_action_freq'],
+                learn_tokens=self.cfg["learn_tokens"]
+            )
        for action, labels in self.moves.labels.items():
            actions.setdefault(action, {})
            for label, freq in labels.items():
@ -425,11 +432,9 @@ cdef class Parser(Pipe):
        self.moves.initialize_actions(actions)
        # make sure we resize so we have an appropriate upper layer
        self._resize()
-        if sgd is None:
-            sgd = self.create_optimizer()
        doc_sample = []
-        if pipeline is not None:
-            for name, component in pipeline:
+        if nlp is not None:
+            for name, component in nlp.pipeline:
                if component is self:
                    break
                if hasattr(component, "pipe"):
@ -441,9 +446,8 @@ cdef class Parser(Pipe):
                doc_sample.append(example.predicted)
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(doc_sample)
-        if pipeline is not None:
-            self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
-        return sgd
+        if nlp is not None:
+            self.init_multitask_objectives(get_examples, nlp.pipeline)

    def to_disk(self, path, exclude=tuple()):
        serializers = {
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -1,15 +1,17 @@
 from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
 from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
-from pydantic import BaseModel, Field, ValidationError, validator
+from pydantic import BaseModel, Field, ValidationError, validator, create_model
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
-from pydantic import root_validator
+from pydantic.main import ModelMetaclass
+from thinc.api import Optimizer, ConfigValidationError
 from thinc.config import Promise
 from collections import defaultdict
-from thinc.api import Optimizer
+import inspect

 from .attrs import NAMES
 from .lookups import Lookups
+from .util import is_cython_func

 if TYPE_CHECKING:
    # This lets us add type hints for mypy etc. without causing circular imports
@ -44,6 +46,96 @@ def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
        return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()]


+# Initialization
+
+
+class ArgSchemaConfig:
+    # Pydantic config for argument models created by get_arg_model in strict
+    # mode: values for fields that aren't declared on the model are rejected,
+    # and arbitrary (non-pydantic) classes are accepted as field annotations.
+    extra = "forbid"
+    arbitrary_types_allowed = True
+
+
+class ArgSchemaConfigExtra:
+    # Pydantic config for non-strict argument models: used when the function
+    # accepts variable keyword arguments or when no signature is available
+    # at all. Values for undeclared fields have to be accepted and passed
+    # through in those cases, so extra fields are allowed here (as opposed to
+    # ArgSchemaConfig, which forbids them).
+    extra = "allow"
+    arbitrary_types_allowed = True
+
+
+def get_arg_model(
+    func: Callable,
+    *,
+    exclude: Iterable[str] = tuple(),
+    name: str = "ArgModel",
+    strict: bool = True,
+) -> ModelMetaclass:
+    """Generate a pydantic model for function arguments.
+
+    func (Callable): The function to generate the schema for.
+    exclude (Iterable[str]): Parameter names to ignore.
+    name (str): Name of created model class.
+    strict (bool): Don't allow extra arguments if no variable keyword arguments
+        are allowed on the function.
+    RETURNS (ModelMetaclass): A pydantic model.
+    """
+    sig_args = {}
+    try:
+        sig = inspect.signature(func)
+    except ValueError:
+        # Typically happens if the method is part of a Cython module without
+        # binding=True. No signature is available, so fall back to a model
+        # without declared fields that uses the non-strict config.
+        return create_model(name, __config__=ArgSchemaConfigExtra)
+    # If no default value is specified assume that it's required. Cython
+    # functions/methods will have param.empty for default value None so we
+    # need to treat them differently. This only depends on the function, not
+    # on the individual parameter, so it's computed once up front.
+    default_empty = None if is_cython_func(func) else ...
+    has_variable = False
+    for param in sig.parameters.values():
+        if param.name in exclude:
+            continue
+        if param.kind == param.VAR_KEYWORD:
+            # The function allows variable keyword arguments so we shouldn't
+            # include **kwargs etc. in the schema and switch to non-strict
+            # mode and pass through all other values
+            has_variable = True
+            continue
+        # param.empty is a sentinel, so it's checked by identity: using "!="
+        # would call __ne__ on arbitrary annotation / default objects, which
+        # isn't guaranteed to return a plain bool.
+        # If no annotation is specified assume it's anything
+        annotation = param.annotation if param.annotation is not param.empty else Any
+        default = param.default if param.default is not param.empty else default_empty
+        sig_args[param.name] = (annotation, default)
+    is_strict = strict and not has_variable
+    sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra
+    return create_model(name, **sig_args)
+
+
+def validate_init_settings(
+    func: Callable,
+    settings: Dict[str, Any],
+    *,
+    section: Optional[str] = None,
+    name: str = "",
+    exclude: Iterable[str] = ("get_examples", "nlp"),
+) -> Dict[str, Any]:
+    """Validate initialization settings against the expected arguments in
+    the method signature. Will parse values if possible (e.g. int to string)
+    and return the updated settings dict. Will raise a ConfigValidationError
+    if types don't match or required values are missing.
+
+    func (Callable): The initialize method of a given component etc.
+    settings (Dict[str, Any]): The settings from the respective [initialize] block.
+    section (str): Initialize section, for error message.
+    name (str): Name of the block in the section.
+    exclude (Iterable[str]): Parameter names to exclude from schema.
+    RETURNS (Dict[str, Any]): The validated settings.
+    """
+    # Build a pydantic model from the signature of func, leaving out the
+    # excluded parameter names
+    schema = get_arg_model(func, exclude=exclude, name="InitArgModel")
+    try:
+        # Instantiating the model performs the validation; the result is
+        # returned as a plain dict
+        return schema(**settings).dict()
+    except ValidationError as e:
+        block = "initialize" if not section else f"initialize.{section}"
+        title = f"Error validating initialization settings in [{block}]"
+        # "from None" suppresses the chained pydantic ValidationError so only
+        # the ConfigValidationError is reported
+        raise ConfigValidationError(
+            title=title, errors=e.errors(), config=settings, parent=name,
+        ) from None
+
+
 # Matcher token patterns


@ -205,8 +297,6 @@ class ModelMetaSchema(BaseModel):

 class ConfigSchemaTraining(BaseModel):
    # fmt: off
-    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
-    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
    dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
    train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
    batcher: Batcher = Field(..., title="Batcher for the training data")
@ -219,8 +309,6 @@ class ConfigSchemaTraining(BaseModel):
    gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
    accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
    score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
-    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
    optimizer: Optimizer = Field(..., title="The optimizer to use")
    logger: Logger = Field(..., title="The logger to track training progress")
    frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
@ -273,36 +361,40 @@ class ConfigSchemaPretrain(BaseModel):
        arbitrary_types_allowed = True


+class ConfigSchemaInit(BaseModel):
+    # Schema for the [initialize] block of the config (see ConfigSchema
+    # below, which uses it for its "initialize" field). All fields are
+    # declared as required via Field(...), and extra fields are forbidden.
+    # NOTE(review): "tokenizer" and "components" pass help=... to Field while
+    # the other fields here use title=... -- confirm whether that's intended.
+    # fmt: off
+    vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
+    lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
+    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
+    tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
+    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for Pipe.initialize methods of pipeline components, keyed by component")
+    # fmt: on
+
+    class Config:
+        # Reject unknown keys and allow non-pydantic types (e.g. Lookups) as
+        # field annotations
+        extra = "forbid"
+        arbitrary_types_allowed = True
+
+
 class ConfigSchema(BaseModel):
    training: ConfigSchemaTraining
    nlp: ConfigSchemaNlp
    pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
    components: Dict[str, Dict[str, Any]]
    corpora: Dict[str, Reader]
-
-    @root_validator(allow_reuse=True)
-    def validate_config(cls, values):
-        """Perform additional validation for settings with dependencies."""
-        pt = values.get("pretraining")
-        if pt and not isinstance(pt, ConfigSchemaPretrainEmpty):
-            if pt.objective.get("type") == "vectors" and not values["nlp"].vectors:
-                err = "Need nlp.vectors if pretraining.objective.type is vectors"
-                raise ValueError(err)
-        return values
+    initialize: ConfigSchemaInit

    class Config:
        extra = "allow"
        arbitrary_types_allowed = True


-class TrainingSchema(BaseModel):
-    training: ConfigSchemaTraining
-    pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
-    corpora: Dict[str, Reader]
-
-    class Config:
-        extra = "allow"
-        arbitrary_types_allowed = True
+# Schema classes keyed by the name of the config section they describe
+CONFIG_SCHEMAS = {
+    "nlp": ConfigSchemaNlp,
+    "training": ConfigSchemaTraining,
+    "pretraining": ConfigSchemaPretrain,
+    "initialize": ConfigSchemaInit,
+}


 # Project config Schema
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training(lambda: [_ner_example(ner)])
+    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)

    doc.ents = [("ANIMAL", 3, 4)]
@ -48,7 +48,7 @@ def test_ents_reset(en_vocab):
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training(lambda: [_ner_example(ner)])
+    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    orig_iobs = [t.ent_iob_ for t in doc]
    doc.ents = list(doc.ents)
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@ -35,7 +35,7 @@ def test_init_parser(parser):
 def _train_parser(parser):
    fix_random_seed(1)
    parser.add_label("left")
-    parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
+    parser.initialize(lambda: [_parser_example(parser)])
    sgd = Adam(0.001)

    for i in range(5):
@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly():
    ner1.add_label("C")
    ner1.add_label("B")
    ner1.add_label("A")
-    ner1.begin_training(lambda: [_ner_example(ner1)])
+    ner1.initialize(lambda: [_ner_example(ner1)])
    ner2 = EntityRecognizer(Vocab(), model, **config)

    # the second model needs to be resized before we can call from_bytes
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@ -202,7 +202,7 @@ def test_train_empty():
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label("PERSON")
-    nlp.begin_training()
+    nlp.initialize()
    for itn in range(2):
        losses = {}
        batches = util.minibatch(train_examples, size=8)
@ -213,7 +213,7 @@ def test_train_empty():
 def test_overwrite_token():
    nlp = English()
    nlp.add_pipe("ner")
-    nlp.begin_training()
+    nlp.initialize()
    # The untrained NER will predict O for each token
    doc = nlp("I live in New York")
    assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
@ -235,7 +235,7 @@ def test_empty_ner():
    nlp = English()
    ner = nlp.add_pipe("ner")
    ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
    doc = nlp("John is watching the news about Croatia's elections")
    # if this goes wrong, the initialization of the parser's upper layer is probably broken
    result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
@ -254,7 +254,7 @@ def test_ruler_before_ner():
    # 2: untrained NER - should set everything else to O
    untrained_ner = nlp.add_pipe("ner")
    untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
    doc = nlp("This is Antti Korhonen speaking in Finland")
    expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
    expected_types = ["THING", "", "", "", "", "", ""]
@ -269,7 +269,7 @@ def test_ner_before_ruler():
    # 1: untrained NER - should set everything to O
    untrained_ner = nlp.add_pipe("ner", name="uner")
    untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()

    # 2 : Entity Ruler - should set "this" to B and keep everything else O
    patterns = [{"label": "THING", "pattern": "This"}]
@ -290,7 +290,7 @@ def test_block_ner():
    nlp.add_pipe("blocker", config={"start": 2, "end": 5})
    untrained_ner = nlp.add_pipe("ner")
    untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
    doc = nlp("This is Antti L Korhonen speaking in Finland")
    expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
    expected_types = ["", "", "", "", "", "", "", ""]
@ -307,7 +307,7 @@ def test_overfitting_IO():
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()

    for i in range(50):
        losses = {}
@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog):
    assert not len(nlp.vocab.lookups)
    nlp.add_pipe("ner")
    with caplog.at_level(logging.DEBUG):
-        nlp.begin_training()
+        nlp.initialize()
        assert "W033" in caplog.text
    caplog.clear()
    nlp.vocab.lookups.add_table("lexeme_norm")
    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
    with caplog.at_level(logging.DEBUG):
-        nlp.begin_training()
+        nlp.initialize()
        assert "W033" not in caplog.text


@ -358,5 +358,5 @@ class BlockerComponent1:
        self.name = name

    def __call__(self, doc):
-        doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
+        doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified")
        return doc
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@ -191,7 +191,7 @@ def test_overfitting_IO():
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for dep in annotations.get("deps", []):
            parser.add_label(dep)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
    for i in range(100):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@ -34,7 +34,7 @@ def parser(vocab):
    parser.cfg["hidden_width"] = 32
    # parser.add_label('right')
    parser.add_label("left")
-    parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
+    parser.initialize(lambda: [_parser_example(parser)])
    sgd = Adam(0.001)

    for i in range(10):
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@ -134,7 +134,7 @@ def test_kb_undefined(nlp):
    """Test that the EL can't train without defining a KB"""
    entity_linker = nlp.add_pipe("entity_linker", config={})
    with pytest.raises(ValueError):
-        entity_linker.begin_training(lambda: [])
+        entity_linker.initialize(lambda: [])


 def test_kb_empty(nlp):
@ -143,7 +143,7 @@ def test_kb_empty(nlp):
    entity_linker = nlp.add_pipe("entity_linker", config=config)
    assert len(entity_linker.kb) == 0
    with pytest.raises(ValueError):
-        entity_linker.begin_training(lambda: [])
+        entity_linker.initialize(lambda: [])


 def test_kb_serialize(nlp):
@ -360,7 +360,7 @@ def test_preserving_links_asdoc(nlp):
    ruler.add_patterns(patterns)
    el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
    entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
-    nlp.begin_training()
+    nlp.initialize()
    assert entity_linker.model.get_dim("nO") == vector_length

    # test whether the entity links are preserved by the `as_doc()` function
@ -463,7 +463,7 @@ def test_overfitting_IO():
    )

    # train the NEL pipe
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert entity_linker.model.get_dim("nO") == vector_length
    assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length

--- a/spacy/tests/pipeline/test_initialize.py
+++ b/spacy/tests/pipeline/test_initialize.py
@ -0,0 +1,69 @@
+import pytest
+from spacy.language import Language
+from spacy.lang.en import English
+from spacy.training import Example
+from thinc.api import ConfigValidationError
+from pydantic import StrictBool
+
+
+def test_initialize_arguments():
+    """Test that the settings in the [initialize] block of the config are
+    validated against the signatures of the initialize methods of the
+    tokenizer and the components, and that valid settings are passed in."""
+    name = "test_initialize_arguments"
+
+    class CustomTokenizer:
+        def __init__(self, tokenizer):
+            self.tokenizer = tokenizer
+            self.from_initialize = None
+
+        def __call__(self, text):
+            return self.tokenizer(text)
+
+        def initialize(self, get_examples, nlp, custom: int):
+            self.from_initialize = custom
+
+    class Component:
+        def __init__(self):
+            self.from_initialize = None
+
+        def initialize(
+            self, get_examples, nlp, custom1: str, custom2: StrictBool = False
+        ):
+            self.from_initialize = (custom1, custom2)
+
+    Language.factory(name, func=lambda nlp, name: Component())
+
+    nlp = English()
+    nlp.tokenizer = CustomTokenizer(nlp.tokenizer)
+    example = Example.from_dict(nlp("x"), {})
+    get_examples = lambda: [example]
+    nlp.add_pipe(name)
+    # The settings here will typically come from the [initialize] block
+    # Case 1: a required argument (custom1) is missing
+    init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
+    nlp.config["initialize"].update(init_cfg)
+    with pytest.raises(ConfigValidationError) as e:
+        # Empty config for component, no required custom1 argument
+        nlp.initialize(get_examples)
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ("custom1",)
+    assert errors[0]["type"] == "value_error.missing"
+    # Case 2: custom2 is annotated as StrictBool, so the int 1 is rejected
+    init_cfg = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x", "custom2": 1}},
+    }
+    nlp.config["initialize"].update(init_cfg)
+    with pytest.raises(ConfigValidationError) as e:
+        # Wrong type of custom 2
+        nlp.initialize(get_examples)
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ("custom2",)
+    assert errors[0]["type"] == "value_error.strictbool"
+    # Case 3: valid settings are passed through, custom2 keeps its default
+    init_cfg = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x"}},
+    }
+    nlp.config["initialize"].update(init_cfg)
+    nlp.initialize(get_examples)
+    assert nlp.tokenizer.from_initialize == 1
+    pipe = nlp.get_pipe(name)
+    assert pipe.from_initialize == ("x", False)
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@ -33,7 +33,7 @@ def test_no_label():
    nlp = Language()
    nlp.add_pipe("morphologizer")
    with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


 def test_implicit_label():
@ -42,7 +42,7 @@ def test_implicit_label():
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)


 def test_no_resize():
@ -50,13 +50,13 @@ def test_no_resize():
    morphologizer = nlp.add_pipe("morphologizer")
    morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
    morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
-    nlp.begin_training()
+    nlp.initialize()
    # this throws an error because the morphologizer can't be resized after initialization
    with pytest.raises(ValueError):
        morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")


-def test_begin_training_examples():
+def test_initialize_examples():
    nlp = Language()
    morphologizer = nlp.add_pipe("morphologizer")
    morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
@ -64,12 +64,12 @@ def test_begin_training_examples():
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
    with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=lambda: None)
+    with pytest.raises(ValueError):
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():
@ -79,7 +79,7 @@ def test_overfitting_IO():
    train_examples = []
    for inst in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for i in range(50):
        losses = {}
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@ -31,19 +31,19 @@ TRAIN_DATA = [
 ]


-def test_begin_training_examples():
+def test_initialize_examples():
    nlp = Language()
    nlp.add_pipe("senter")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
    with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=lambda: None)
+    with pytest.raises(ValueError):
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():
@ -58,7 +58,7 @@ def test_overfitting_IO():
    train_examples[1].reference[11].is_sent_start = False

    nlp.add_pipe("senter")
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()

    for i in range(200):
        losses = {}
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@ -15,14 +15,14 @@ def test_label_types():
        tagger.add_label(9)


-def test_tagger_begin_training_tag_map():
-    """Test that Tagger.begin_training() without gold tuples does not clobber
+def test_tagger_initialize_tag_map():
+    """Test that Tagger.initialize() without gold tuples does not clobber
    the tag map."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    orig_tag_count = len(tagger.labels)
    tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
    assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)


@ -38,7 +38,7 @@ def test_no_label():
    nlp = Language()
    nlp.add_pipe("tagger")
    with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


 def test_no_resize():
@ -47,7 +47,7 @@ def test_no_resize():
    tagger.add_label("N")
    tagger.add_label("V")
    assert tagger.labels == ("N", "V")
-    nlp.begin_training()
+    nlp.initialize()
    assert tagger.model.get_dim("nO") == 2
    # this throws an error because the tagger can't be resized after initialization
    with pytest.raises(ValueError):
@ -60,10 +60,10 @@ def test_implicit_label():
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)


-def test_begin_training_examples():
+def test_initialize_examples():
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    train_examples = []
@ -72,16 +72,16 @@ def test_begin_training_examples():
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: train_examples[0])
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
    with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=lambda: [])
+        nlp.initialize(get_examples=lambda: None)
+    with pytest.raises(TypeError):
+        nlp.initialize(get_examples=lambda: train_examples[0])
    with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=lambda: [])
+    with pytest.raises(ValueError):
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():
@ -91,7 +91,7 @@ def test_overfitting_IO():
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert tagger.model.get_dim("nO") == len(TAGS)

    for i in range(50):
@ -122,4 +122,4 @@ def test_tagger_requires_labels():
    nlp = English()
    nlp.add_pipe("tagger")
    with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer
 from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
+from spacy.training import Example
+from spacy.training.initialize import verify_textcat_config

 from ..util import make_tempdir
-from ...cli.train import verify_textcat_config
-from ...training import Example


 TRAIN_DATA = [
@ -26,7 +26,7 @@ def test_simple_train():
    nlp = Language()
    textcat = nlp.add_pipe("textcat")
    textcat.add_label("answer")
-    nlp.begin_training()
+    nlp.initialize()
    for i in range(5):
        for text, answer in [
            ("aaaa", 1.0),
@ -56,7 +56,7 @@ def test_textcat_learns_multilabel():
    textcat = TextCategorizer(nlp.vocab, width=8)
    for letter in letters:
        textcat.add_label(letter)
-    optimizer = textcat.begin_training(lambda: [])
+    optimizer = textcat.initialize(lambda: [])
    for i in range(30):
        losses = {}
        examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
@ -86,7 +86,7 @@ def test_no_label():
    nlp = Language()
    nlp.add_pipe("textcat")
    with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()


 def test_implicit_label():
@ -95,7 +95,7 @@ def test_implicit_label():
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)


 def test_no_resize():
@ -103,14 +103,14 @@ def test_no_resize():
    textcat = nlp.add_pipe("textcat")
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")
-    nlp.begin_training()
+    nlp.initialize()
    assert textcat.model.get_dim("nO") == 2
    # this throws an error because the textcat can't be resized after initialization
    with pytest.raises(ValueError):
        textcat.add_label("NEUTRAL")


-def test_begin_training_examples():
+def test_initialize_examples():
    nlp = Language()
    textcat = nlp.add_pipe("textcat")
    train_examples = []
@ -119,12 +119,12 @@ def test_begin_training_examples():
        for label, value in annotations.get("cats").items():
            textcat.add_label(label)
    # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
    with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=lambda: None)
+    with pytest.raises(ValueError):
+        nlp.initialize(get_examples=train_examples)


 def test_overfitting_IO():
@ -139,7 +139,7 @@ def test_overfitting_IO():
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert textcat.model.get_dim("nO") == 2

    for i in range(50):
@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config):
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for label, value in annotations.get("cats").items():
            textcat.add_label(label)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
    for i in range(5):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@ -88,7 +88,7 @@ def test_init_tok2vec():
    nlp = English()
    tok2vec = nlp.add_pipe("tok2vec")
    assert tok2vec.listeners == []
-    nlp.begin_training()
+    nlp.initialize()
    assert tok2vec.model.get_dim("nO")


@ -154,7 +154,7 @@ def test_tok2vec_listener():

    # Check that the Tok2Vec component finds it listeners
    assert tok2vec.listeners == []
-    optimizer = nlp.begin_training(lambda: train_examples)
+    optimizer = nlp.initialize(lambda: train_examples)
    assert tok2vec.listeners == [tagger_tok2vec]

    for i in range(5):
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@ -428,7 +428,7 @@ def test_issue999():
    for _, offsets in TRAIN_DATA:
        for start, end, label in offsets:
            ner.add_label(label)
-    nlp.begin_training()
+    nlp.initialize()
    for itn in range(20):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@ -250,7 +250,7 @@ def test_issue1915():
    ner = nlp.add_pipe("ner")
    ner.add_label("answer")
    with pytest.raises(ValueError):
-        nlp.begin_training(**cfg)
+        nlp.initialize(**cfg)


 def test_issue1945():
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ b/spacy/tests/regression/test_issue2001-2500.py
@ -30,7 +30,7 @@ def test_issue2179():
    nlp = Italian()
    ner = nlp.add_pipe("ner")
    ner.add_label("CITIZENSHIP")
-    nlp.begin_training()
+    nlp.initialize()
    nlp2 = Italian()
    nlp2.add_pipe("ner")
    assert len(nlp2.get_pipe("ner").labels) == 0
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@ -18,7 +18,7 @@ def test_issue2564():
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
    doc = nlp("hello world")
    assert doc.has_annotation("TAG")
    docs = nlp.pipe(["hello", "world"])
@ -149,7 +149,7 @@ def test_issue2800():
    ner = nlp.add_pipe("ner")
    for entity_type in list(entity_types):
        ner.add_label(entity_type)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
    for i in range(20):
        losses = {}
        random.shuffle(train_data)
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@ -92,7 +92,7 @@ def test_issue3209():
    nlp = English()
    ner = nlp.add_pipe("ner")
    ner.add_label("ANIMAL")
-    nlp.begin_training()
+    nlp.initialize()
    move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
    assert ner.move_names == move_names
    nlp2 = English()
@ -239,7 +239,7 @@ def test_issue3456():
    nlp = English()
    tagger = nlp.add_pipe("tagger")
    tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
    list(nlp.pipe(["hi", ""]))


--- a/spacy/tests/regression/test_issue3501-4000.py
+++ b/spacy/tests/regression/test_issue3501-4000.py
@ -223,7 +223,7 @@ def test_issue3611():
        textcat.add_label(label)
    # training the network
    with nlp.select_pipes(enable="textcat"):
-        optimizer = nlp.begin_training()
+        optimizer = nlp.initialize()
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@ -268,7 +268,7 @@ def test_issue3830_no_subtok():
    parser = DependencyParser(Vocab(), model, **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
-    parser.begin_training(lambda: [_parser_example(parser)])
+    parser.initialize(lambda: [_parser_example(parser)])
    assert "subtok" not in parser.labels


@ -283,7 +283,7 @@ def test_issue3830_with_subtok():
    parser = DependencyParser(Vocab(), model, **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
-    parser.begin_training(lambda: [_parser_example(parser)])
+    parser.initialize(lambda: [_parser_example(parser)])
    assert "subtok" in parser.labels


@ -342,7 +342,7 @@ def test_issue3880():
    nlp.add_pipe("parser").add_label("dep")
    nlp.add_pipe("ner").add_label("PERSON")
    nlp.add_pipe("tagger").add_label("NN")
-    nlp.begin_training()
+    nlp.initialize()
    for doc in nlp.pipe(texts):
        pass

--- a/spacy/tests/regression/test_issue4001-4500.py
+++ b/spacy/tests/regression/test_issue4001-4500.py
@ -66,7 +66,7 @@ def test_issue4030():
        textcat.add_label(label)
    # training the network
    with nlp.select_pipes(enable="textcat"):
-        optimizer = nlp.begin_training()
+        optimizer = nlp.initialize()
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@ -87,7 +87,7 @@ def test_issue4042():
    # add ner pipe
    ner = nlp.add_pipe("ner")
    ner.add_label("SOME_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
    # Add entity ruler
    patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
@ -118,7 +118,7 @@ def test_issue4042_bug2():
    # add ner pipe
    ner1 = nlp1.add_pipe("ner")
    ner1.add_label("SOME_LABEL")
-    nlp1.begin_training()
+    nlp1.initialize()
    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
@ -244,7 +244,7 @@ def test_issue4267():
    nlp = English()
    ner = nlp.add_pipe("ner")
    ner.add_label("PEOPLE")
-    nlp.begin_training()
+    nlp.initialize()
    assert "ner" in nlp.pipe_names
    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
@ -299,7 +299,7 @@ def test_issue4313():
    config = {}
    ner = nlp.create_pipe("ner", config=config)
    ner.add_label("SOME_LABEL")
-    ner.begin_training(lambda: [])
+    ner.initialize(lambda: [])
    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
@ -327,7 +327,7 @@ def test_issue4348():
    TRAIN_DATA = [example, example]
    tagger = nlp.add_pipe("tagger")
    tagger.add_label("A")
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
    for i in range(5):
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
--- a/spacy/tests/regression/test_issue4501-5000.py
+++ b/spacy/tests/regression/test_issue4501-5000.py
@ -180,7 +180,7 @@ def test_issue4725_2():
    vocab.set_vector("dog", data[1])
    nlp = English(vocab=vocab)
    nlp.add_pipe("ner")
-    nlp.begin_training()
+    nlp.initialize()
    docs = ["Kurt is in London."] * 10
    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
        pass
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@ -64,7 +64,7 @@ def tagger():
    # 1. no model leads to error in serialization,
    # 2. the affected line is the one for model serialization
    tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
    return tagger


@ -85,7 +85,7 @@ def entity_linker():
    # need to add model for two reasons:
    # 1. no model leads to error in serialization,
    # 2. the affected line is the one for model serialization
-    nlp.begin_training()
+    nlp.initialize()
    return entity_linker


--- a/spacy/tests/regression/test_issue5551.py
+++ b/spacy/tests/regression/test_issue5551.py
@ -25,7 +25,7 @@ def test_issue5551():
        pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
        for label in set(example[1]["cats"]):
            pipe.add_label(label)
-        nlp.begin_training()
+        nlp.initialize()

        # Store the result of each iteration
        result = pipe.model.predict([nlp.make_doc(example[0])])
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@ -152,7 +152,7 @@ def test_serialize_nlp():
    nlp_config = Config().from_str(nlp_config_string)
    nlp = load_model_from_config(nlp_config, auto_fill=True)
    nlp.get_pipe("tagger").add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
    assert "tok2vec" in nlp.pipe_names
    assert "tagger" in nlp.pipe_names
    assert "parser" not in nlp.pipe_names
@ -173,7 +173,7 @@ def test_serialize_custom_nlp():
    parser_cfg = dict()
    parser_cfg["model"] = {"@architectures": "my_test_parser"}
    nlp.add_pipe("parser", config=parser_cfg)
-    nlp.begin_training()
+    nlp.initialize()

    with make_tempdir() as d:
        nlp.to_disk(d)
@ -191,7 +191,7 @@ def test_serialize_parser():
    model_config = Config().from_str(parser_config_string)
    parser = nlp.add_pipe("parser", config=model_config)
    parser.add_label("nsubj")
-    nlp.begin_training()
+    nlp.initialize()

    with make_tempdir() as d:
        nlp.to_disk(d)
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -7,7 +7,6 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
-from spacy.cli.debug_config import check_section_refs
 from thinc.api import ConfigValidationError, Config
 import srsly
 import os
@ -414,15 +413,3 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
    assert string_to_list(value, intify=False) == ["1", "2", "3"]
    assert string_to_list(value, intify=True) == [1, 2, 3]
-
-
-def test_check_section_refs():
-    config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
-    config = Config(config)
-    # Valid section reference
-    check_section_refs(config, ["a.b.c"])
-    # Section that doesn't exist in this config
-    check_section_refs(config, ["x.y.z"])
-    # Invalid section reference
-    with pytest.raises(ConfigValidationError):
-        check_section_refs(config, ["a.b.c", "f.g"])
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@ -18,7 +18,7 @@ def nlp():
    textcat = nlp.add_pipe("textcat")
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
-    nlp.begin_training()
+    nlp.initialize()
    return nlp


--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@ -7,7 +7,6 @@ from spacy import util
 from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from thinc.api import Optimizer


@pytest.fixture
@ -158,16 +157,3 @@ def test_dot_to_dict(dot_notation, expected):
    result = util.dot_to_dict(dot_notation)
    assert result == expected
    assert util.dict_to_dot(result) == dot_notation
-
-
-def test_resolve_training_config():
-    config = {
-        "nlp": {"lang": "en", "disabled": []},
-        "training": {"dropout": 0.1, "optimizer": {"@optimizers": "Adam.v1"}},
-        "corpora": {},
-    }
-    resolved = util.resolve_training_config(config)
-    assert resolved["training"]["dropout"] == 0.1
-    assert isinstance(resolved["training"]["optimizer"], Optimizer)
-    assert resolved["corpora"] == {}
-    assert "nlp" not in resolved
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@ -1,14 +1,15 @@
 import pytest

-from .util import get_random_doc
-
 from spacy import util
 from spacy.util import dot_to_object, SimpleFrozenList
-from thinc.api import Config, Optimizer
+from thinc.api import Config, Optimizer, ConfigValidationError
 from spacy.training.batchers import minibatch_by_words
-from ..lang.en import English
-from ..lang.nl import Dutch
-from ..language import DEFAULT_CONFIG_PATH
+from spacy.lang.en import English
+from spacy.lang.nl import Dutch
+from spacy.language import DEFAULT_CONFIG_PATH
+from spacy.schemas import ConfigSchemaTraining
+
+from .util import get_random_doc


@pytest.mark.parametrize(
@ -101,8 +102,8 @@ def test_util_dot_section():
        dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
    with pytest.raises(KeyError):
        dot_to_object(en_nlp.config, "nlp.unknownattribute")
-    resolved = util.resolve_training_config(nl_nlp.config)
-    assert isinstance(dot_to_object(resolved, "training.optimizer"), Optimizer)
+    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
+    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)


 def test_simple_frozen_list():
@ -120,3 +121,17 @@ def test_simple_frozen_list():
    t = SimpleFrozenList(["foo", "bar"], error="Error!")
    with pytest.raises(NotImplementedError):
        t.append("baz")
+
+
+def test_resolve_dot_names():
+    """Check that dot names are resolved and invalid ones are reported."""
+    training = {"optimizer": {"@optimizers": "Adam.v1"}}
+    references = {"bar": "training.optimizer", "baz": "training.xyz"}
+    config = {"training": training, "foo": references}
+    # A dot name pointing at a registered function resolves to its result
+    resolved = util.resolve_dot_names(config, ["training.optimizer"])
+    assert isinstance(resolved[0], Optimizer)
+    # Only the name that can't be found is expected in the reported errors
+    with pytest.raises(ConfigValidationError) as excinfo:
+        util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
+    reported = excinfo.value.errors
+    assert len(reported) == 1
+    assert reported[0]["loc"] == ["training", "xyz"]
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@ -2,8 +2,8 @@ from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config
 from spacy import Language
-from spacy.util import load_model_from_config, registry, dot_to_object
-from spacy.util import resolve_training_config
+from spacy.util import load_model_from_config, registry, resolve_dot_names
+from spacy.schemas import ConfigSchemaTraining
 from spacy.training import Example


@ -39,21 +39,24 @@ def test_readers():

    config = Config().from_str(config_string)
    nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
+    T = registry.resolve(
+        nlp.config.interpolate()["training"], schema=ConfigSchemaTraining
+    )
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
    assert isinstance(train_corpus, Callable)
-    optimizer = resolved["training"]["optimizer"]
+    optimizer = T["optimizer"]
    # simulate a training loop
-    nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
    for example in train_corpus(nlp):
        nlp.update([example], sgd=optimizer)
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
    scores = nlp.evaluate(list(dev_corpus(nlp)))
    assert scores["cats_score"]
    # ensure the pipeline runs
    doc = nlp("Quick test")
    assert doc.cats
-    extra_corpus = resolved["corpora"]["extra"]
+    corpora = {"corpora": nlp.config.interpolate()["corpora"]}
+    extra_corpus = registry.resolve(corpora)["corpora"]["extra"]
    assert isinstance(extra_corpus, Callable)


@ -89,18 +92,20 @@ def test_cat_readers(reader, additional_config):
    config["corpora"]["@readers"] = reader
    config["corpora"].update(additional_config)
    nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
-    optimizer = resolved["training"]["optimizer"]
+    T = registry.resolve(
+        nlp.config["training"].interpolate(), schema=ConfigSchemaTraining
+    )
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
+    optimizer = T["optimizer"]
    # simulate a training loop
-    nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
    for example in train_corpus(nlp):
        assert example.y.cats
        # this shouldn't fail if each training example has at least one positive label
        assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
        nlp.update([example], sgd=optimizer)
    # simulate performance benchmark on dev corpus
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
    dev_examples = list(dev_corpus(nlp))
    for example in dev_examples:
        # this shouldn't fail if each dev example has at least one positive label
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@ -599,7 +599,7 @@ def _train_tuples(train_data):
    train_examples = []
    for t in train_data:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
    for i in range(5):
        losses = {}
        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@ -0,0 +1,296 @@
+from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING
+from thinc.api import Config, fix_random_seed, set_gpu_allocator
+from thinc.api import ConfigValidationError
+from pathlib import Path
+import srsly
+import numpy
+import tarfile
+import gzip
+import zipfile
+import tqdm
+
+from .loop import create_before_to_disk_callback
+from ..lookups import Lookups
+from ..vectors import Vectors
+from ..errors import Errors
+from ..schemas import ConfigSchemaTraining
+from ..util import registry, load_model_from_config, resolve_dot_names, logger
+from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
+
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
+
+
+def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
+    """Create an nlp object from a config and initialize its components.
+
+    config (Config): The config to create the pipeline from. The config is
+        passed to load_model_from_config as-is, while an interpolated copy
+        is used to read the training settings.
+    use_gpu (int): The GPU allocator from the config is only set if this
+        value is >= 0 and an allocator is configured.
+    RETURNS (Language): The nlp object returned by the before_to_disk callback.
+    """
+    raw_config = config
+    config = raw_config.interpolate()
+    # Seed and allocator are applied before the pipeline is created
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    # Use original config here before it's resolved to functions
+    sourced_components = get_sourced_components(config)
+    nlp = load_model_from_config(raw_config, auto_fill=True)
+    logger.info("Set up nlp object from config")
+    # From here on, work with the (auto-filled) config of the nlp object
+    config = nlp.config.interpolate()
+    # Resolve all training-relevant sections using the filled nlp config
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    # NOTE(review): dev_corpus is resolved here but not used in this function
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    optimizer = T["optimizer"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Sourced components that require resume_training
+    resume_components = [p for p in sourced_components if p not in frozen_components]
+    logger.info(f"Pipeline: {nlp.pipe_names}")
+    if resume_components:
+        with nlp.select_pipes(enable=resume_components):
+            logger.info(f"Resuming training for: {resume_components}")
+            nlp.resume_training(sgd=optimizer)
+    # Everything that is neither frozen nor resumed gets initialized with
+    # the training corpus
+    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
+        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
+        logger.info("Initialized pipeline components")
+    # Verify the config after calling 'initialize' to ensure labels
+    # are properly initialized
+    verify_config(nlp)
+    nlp = before_to_disk(nlp)
+    return nlp
+
+
+def must_reinitialize(train_config: Config, init_config: Config) -> bool:
+    """Check whether an initialized pipeline is out of date for a config.
+
+    train_config (Config): The config to compare against.
+    init_config (Config): The config of the already initialized pipeline.
+    RETURNS (bool): True if the interpolated configs serialize to different
+        strings, i.e. the pipeline has to be initialized again.
+    """
+    # TODO: do this better and more fine-grained
+    # Identical configs mean the existing initialization can be reused, so
+    # re-initialization is only required if the two serializations differ.
+    return train_config.interpolate().to_str() != init_config.interpolate().to_str()
+
+
+def init_vocab(
+    nlp: "Language",
+    *,
+    data: Optional[Path] = None,
+    lookups: Optional[Lookups] = None,
+    vectors: Optional[str] = None,
+) -> None:
+    """Set up the vocab of an nlp object from lookups, lexeme data and vectors.
+
+    The nlp object is modified in place and nothing is returned.
+
+    nlp (Language): The nlp object whose vocab is updated.
+    data (Optional[Path]): Optional JSONL file of lexeme attributes. Entries
+        containing a "settings" key are skipped.
+    lookups (Optional[Lookups]): Optional lookups to assign to the vocab.
+    vectors (Optional[str]): Optional name or path that is passed on to
+        load_vectors_into_model.
+    """
+    if lookups:
+        nlp.vocab.lookups = lookups
+        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
+    data_path = ensure_path(data)
+    if data_path is not None:
+        lex_attrs = srsly.read_jsonl(data_path)
+        # Reset the rank of all existing lexemes before the attributes from
+        # the file are applied
+        for lexeme in nlp.vocab:
+            lexeme.rank = OOV_RANK
+        for attrs in lex_attrs:
+            if "settings" in attrs:
+                continue
+            lexeme = nlp.vocab[attrs["orth"]]
+            lexeme.set_attrs(**attrs)
+        # The OOV probability is set one below the lowest probability in the
+        # vocab, with a default for an empty vocab
+        if len(nlp.vocab):
+            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
+        else:
+            oov_prob = DEFAULT_OOV_PROB
+        nlp.vocab.cfg.update({"oov_prob": oov_prob})
+        logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    logger.info("Created vocabulary")
+    if vectors is not None:
+        load_vectors_into_model(nlp, vectors)
+        logger.info(f"Added vectors: {vectors}")
+
+
+def load_vectors_into_model(
+    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
+) -> None:
+    """Copy the word vectors of another pipeline into the vocab of this one.
+
+    nlp (Language): The nlp object that receives the vectors.
+    name (Union[str, Path]): Package name or path, passed to load_model.
+    add_strings (bool): Whether to also add the strings for the vector keys
+        that are known to the loaded pipeline.
+    """
+    try:
+        source_nlp = load_model(name)
+    except ConfigValidationError:
+        # NOTE(review): the caught error is not forwarded to from_error, so
+        # its details are dropped - confirm this is the intended thinc usage
+        raise ConfigValidationError.from_error(
+            config=None,
+            title=f"Config validation error for vectors {name}",
+            desc=(
+                "This typically means that there's a problem in the config.cfg included "
+                "with the packaged vectors. Make sure that the vectors package you're "
+                "loading is compatible with the current version of spaCy."
+            ),
+        ) from None
+    source_vocab = source_nlp.vocab
+    nlp.vocab.vectors = source_vocab.vectors
+    if not add_strings:
+        return
+    # Someone running e.g. a similarity query may expect the strings of the
+    # vector keys to be available, so take over the ones the source knows
+    for key in nlp.vocab.vectors.key2row:
+        if key in source_vocab.strings:
+            nlp.vocab.strings.add(source_vocab.strings[key])
+
+
+def init_tok2vec(
+    nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
+) -> bool:
+    """Load pretrained tok2vec weights (cf. CLI command 'pretrain'), if set.
+
+    nlp (Language): The nlp object holding the component to load into.
+    pretrain_config (Dict[str, Any]): Settings providing the "objective",
+        "component" and "layer" keys.
+    init_config (Dict[str, Any]): Settings providing the "init_tok2vec" and
+        "vectors" keys.
+    RETURNS (bool): True if weights were loaded, False if no weights file
+        is configured.
+    RAISES (ConfigValidationError): If the settings are inconsistent or the
+        weights file doesn't exist.
+    """
+    weights_path = ensure_path(init_config["init_tok2vec"])
+    if weights_path is None:
+        # No weights configured, so there's nothing to load
+        return False
+    uses_vectors_objective = pretrain_config["objective"].get("type") == "vectors"
+    if uses_vectors_objective and not init_config["vectors"]:
+        err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
+        errors = [{"loc": ["initialize"], "msg": err}]
+        raise ConfigValidationError(config=nlp.config, errors=errors)
+    if not weights_path.exists():
+        err = f"can't find pretrained tok2vec: {weights_path}"
+        errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}]
+        raise ConfigValidationError(config=nlp.config, errors=errors)
+    with weights_path.open("rb") as file_:
+        weights_data = file_.read()
+    component_name = pretrain_config["component"]
+    if component_name is None:
+        desc = (
+            f"To use pretrained tok2vec weights, [pretraining.component] "
+            f"needs to specify the component that should load them."
+        )
+        err = "component can't be null"
+        errors = [{"loc": ["pretraining", "component"], "msg": err}]
+        raise ConfigValidationError(
+            config=nlp.config["pretraining"], errors=errors, desc=desc
+        )
+    # Load into the component's model, or into a referenced sub-layer of it
+    target = nlp.get_pipe(component_name).model
+    if pretrain_config["layer"]:
+        target = target.get_ref(pretrain_config["layer"])
+    target.from_bytes(weights_data)
+    return True
+
+
+def verify_config(nlp: "Language") -> None:
+    """Run extra checks on the config of an nlp object.
+
+    Currently, only components created by the "textcat" factory are checked.
+
+    nlp (Language): The nlp object whose config["components"] is inspected.
+    """
+    # TODO: maybe we should validate based on the actual components, the list
+    # in config["nlp"]["pipeline"] instead?
+    for component_cfg in nlp.config["components"].values():
+        # The factory is checked, since the component name may differ from it
+        if component_cfg["factory"] == "textcat":
+            verify_textcat_config(nlp, component_cfg)
+
+
+def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
+    """Validate the 'positive_label' setting of a textcat component.
+
+    Nothing is checked if no positive label is set. Otherwise, the label
+    has to be one of the textcat labels and there have to be exactly two
+    labels.
+
+    nlp (Language): The nlp object providing the "textcat" pipe.
+    pipe_config (Dict[str, Any]): The config of the component.
+    RAISES (ValueError): If one of the two checks fails.
+    """
+    pos_label = pipe_config.get("positive_label")
+    if not pos_label:
+        return
+    # NOTE(review): the pipe is looked up by the fixed name "textcat", not by
+    # the name of the component this config belongs to - confirm
+    textcat_labels = nlp.get_pipe("textcat").labels
+    if pos_label not in textcat_labels:
+        msg = Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
+        raise ValueError(msg)
+    if len(list(textcat_labels)) != 2:
+        msg = Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
+        raise ValueError(msg)
+
+
+def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
+    """Collect the names of the sourced components in a config.
+
+    config (Union[Dict[str, Any], Config]): The original config.
+    RETURNS (List[str]): The names in the "components" section whose settings
+        have a "source" key, e.g. {"source": "en_core_web_sm"}, and no
+        "factory" key. A "factory" key is assumed to refer to a component
+        factory.
+    """
+    sourced = []
+    for name, cfg in config.get("components", {}).items():
+        if "source" in cfg and "factory" not in cfg:
+            sourced.append(name)
+    return sourced
+
+
+def convert_vectors(
+    nlp: "Language",
+    vectors_loc: Optional[Path],
+    *,
+    truncate: int,
+    prune: int,
+    name: Optional[str] = None,
+) -> None:
+    """Load vectors from a file into the vocab of an nlp object.
+
+    nlp (Language): The nlp object whose vocab and meta are updated in place.
+    vectors_loc (Optional[Path]): Location of the vectors. A file name ending
+        in ".npz" is loaded with numpy.load, everything else is read with
+        read_vectors. If not set, no vectors are loaded.
+    truncate (int): Passed to read_vectors as the number of vectors to read.
+    prune (int): If >= 1, passed to nlp.vocab.prune_vectors.
+    name (Optional[str]): Name for the vectors. If not set, a name is built
+        from the "lang" and "name" entries of nlp.meta.
+    """
+    vectors_loc = ensure_path(vectors_loc)
+    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
+        # NOTE(review): the file handle opened here is never closed explicitly
+        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
+        # Lexemes with a rank that isn't the OOV rank are mapped to the row
+        # given by their rank
+        for lex in nlp.vocab:
+            if lex.rank and lex.rank != OOV_RANK:
+                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
+    else:
+        if vectors_loc:
+            logger.info(f"Reading vectors from {vectors_loc}")
+            vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
+            logger.info(f"Loaded vectors from {vectors_loc}")
+        else:
+            vectors_data, vector_keys = (None, None)
+        if vector_keys is not None:
+            for word in vector_keys:
+                if word not in nlp.vocab:
+                    # The lookup is only done for its effect - presumably it
+                    # creates the vocab entry for the word (TODO confirm)
+                    nlp.vocab[word]
+        if vectors_data is not None:
+            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
+    # The name is set even if no vectors were loaded above
+    if name is None:
+        # TODO: Is this correct? Does this matter?
+        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
+    else:
+        nlp.vocab.vectors.name = name
+    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
+    if prune >= 1:
+        nlp.vocab.prune_vectors(prune)
+
+
+def read_vectors(vectors_loc: Path, truncate_vectors: int):
+    """Read word vectors from a text file into a numpy array.
+
+    vectors_loc (Path): Location of the vectors file, opened with open_file.
+    truncate_vectors (int): If >= 1, the number of rows of the returned array
+        and the number of lines after which reading stops.
+    RETURNS (tuple): The float array of vectors and the list of words, in the
+        order they were read.
+    RAISES (ValueError): If a line doesn't have as many values as the width
+        of the array.
+    """
+    lines = ensure_shape(open_file(vectors_loc))
+    # The first line is always the shape, ensure_shape takes care of that
+    header = next(lines)
+    shape = tuple(int(size) for size in header.split())
+    if truncate_vectors >= 1:
+        shape = (truncate_vectors, shape[1])
+    vectors_data = numpy.zeros(shape=shape, dtype="f")
+    vectors_keys = []
+    for row, raw_line in enumerate(tqdm.tqdm(lines)):
+        n_dims = vectors_data.shape[1]
+        # Split from the right, so only the values are separated from the word
+        word, *values = raw_line.rstrip().rsplit(" ", n_dims)
+        if len(values) != n_dims:
+            raise ValueError(Errors.E094.format(line_num=row, loc=vectors_loc))
+        vectors_data[row] = numpy.asarray(values, dtype="f")
+        vectors_keys.append(word)
+        if row == truncate_vectors - 1:
+            break
+    return vectors_data, vectors_keys
+
+
+def open_file(loc: Union[str, Path]) -> IO:
+    """Open a vectors file that may be a tar archive, gzipped or zipped.
+
+    loc (Union[str, Path]): Location of the file.
+    RETURNS (IO): For a tar file, the archive opened in "r:gz" mode. For file
+        names ending in "gz" or "zip", a generator of utf8-decoded lines (for
+        zip files, the lines of the first archive member). Otherwise the file
+        opened in text mode.
+    """
+    loc = ensure_path(loc)
+    # NOTE(review): this branch returns the archive object itself and not its
+    # lines, unlike the other branches - confirm what callers expect
+    if tarfile.is_tarfile(str(loc)):
+        return tarfile.open(str(loc), "r:gz")
+    filename = loc.parts[-1]
+    if filename.endswith("gz"):
+        return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
+    if filename.endswith("zip"):
+        archive = zipfile.ZipFile(str(loc))
+        first_member = archive.namelist()[0]
+        return (line.decode("utf8") for line in archive.open(first_member))
+    return loc.open("r", encoding="utf8")
+
+
+def ensure_shape(lines):
+    """Ensure that the first line of the data is the vectors shape.
+
+    If it's not, we read in the data and output the shape as the first result,
+    so that the reader doesn't have to deal with the problem.
+
+    lines (Iterable[str]): The lines of the vectors data.
+    YIELDS (str): The shape line, followed by all data lines. Nothing is
+        yielded if the input is empty.
+    """
+    # Accept any iterable, not only iterators
+    lines = iter(lines)
+    # A bare next() would raise StopIteration inside this generator for empty
+    # input, which Python turns into a RuntimeError (PEP 479)
+    first_line = next(lines, None)
+    if first_line is None:
+        return
+    try:
+        shape = tuple(int(size) for size in first_line.split())
+    except ValueError:
+        shape = None
+    if shape is not None:
+        # All good, give the data
+        yield first_line
+        yield from lines
+    else:
+        # Figure out the shape, make it the first value, and then give the
+        # rest of the data. The first entry of each line is the word, so it
+        # doesn't count towards the width.
+        width = len(first_line.split()) - 1
+        captured = [first_line] + list(lines)
+        length = len(captured)
+        yield f"{length} {width}"
+        yield from captured
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@ -0,0 +1,304 @@
+from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
+from typing import Optional, TYPE_CHECKING
+from pathlib import Path
+from timeit import default_timer as timer
+from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
+import random
+import tqdm
+from wasabi import Printer
+
+from .example import Example
+from ..schemas import ConfigSchemaTraining
+from ..errors import Errors
+from ..util import resolve_dot_names, registry
+
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
+
+
+def train(
+    nlp: "Language",
+    output_path: Optional[Path] = None,
+    *,
+    use_gpu: int = -1,
+    silent: bool = False,
+) -> None:
+    """Train a pipeline.
+
+    nlp (Language): The initialized nlp object with the full config.
+    output_path (Path): Optional output path to save trained model to. If
+        None, nothing is written to disk.
+    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
+        before calling this function.
+    silent (bool): Whether to suppress the pretty-printed output.
+    RETURNS (None): Nothing is returned. The best pipeline is written to
+        output_path / "model-best" and the final one to
+        output_path / "model-final".
+    """
+    msg = Printer(no_print=silent)
+    config = nlp.config.interpolate()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    optimizer = T["optimizer"]
+    score_weights = T["score_weights"]
+    batcher = T["batcher"]
+    train_logger = T["logger"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Create iterator, which yields out info after each optimization step.
+    training_step_iterator = train_while_improving(
+        nlp,
+        optimizer,
+        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
+        create_evaluation_callback(nlp, dev_corpus, score_weights),
+        dropout=T["dropout"],
+        accumulate_gradient=T["accumulate_gradient"],
+        patience=T["patience"],
+        max_steps=T["max_steps"],
+        eval_frequency=T["eval_frequency"],
+        exclude=frozen_components,
+    )
+    msg.info(f"Pipeline: {nlp.pipe_names}")
+    if frozen_components:
+        msg.info(f"Frozen components: {frozen_components}")
+    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
+    with nlp.select_pipes(disable=frozen_components):
+        print_row, finalize_logger = train_logger(nlp)
+    # Defined up front so the cleanup below also works if creating the
+    # progress bar itself fails.
+    progress = None
+    try:
+        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
+        progress.set_description("Epoch 1")
+        for batch, info, is_best_checkpoint in training_step_iterator:
+            progress.update(1)
+            # None means this step was not evaluated as a checkpoint.
+            if is_best_checkpoint is not None:
+                progress.close()
+                print_row(info)
+                if is_best_checkpoint and output_path is not None:
+                    with nlp.select_pipes(disable=frozen_components):
+                        update_meta(T, nlp, info)
+                    with nlp.use_params(optimizer.averages):
+                        nlp = before_to_disk(nlp)
+                        nlp.to_disk(output_path / "model-best")
+                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
+                progress.set_description(f"Epoch {info['epoch']}")
+    except Exception as e:
+        if output_path is not None:
+            # We don't want to swallow the traceback if we don't have a
+            # specific error. The model itself is saved once, in the
+            # finally block below.
+            msg.warn(
+                f"Aborting and saving the final best model. "
+                f"Encountered exception: {str(e)}"
+            )
+        raise
+    finally:
+        if progress is not None:
+            progress.close()
+        # Runs exactly once for both the success and the failure path.
+        finalize_logger()
+        if output_path is not None:
+            final_model_path = output_path / "model-final"
+            final_nlp = before_to_disk(nlp)
+            if optimizer.averages:
+                with final_nlp.use_params(optimizer.averages):
+                    final_nlp.to_disk(final_model_path)
+            else:
+                final_nlp.to_disk(final_model_path)
+            msg.good("Saved pipeline to output directory", final_model_path)
+
+
+def train_while_improving(
+    nlp: "Language",
+    optimizer: Optimizer,
+    train_data,
+    evaluate,
+    *,
+    dropout: float,
+    eval_frequency: int,
+    accumulate_gradient: int,
+    patience: int,
+    max_steps: int,
+    exclude: List[str],
+):
+    """Train until an evaluation stops improving. Works as a generator,
+    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
+    where info is a dict, and is_best_checkpoint is in [True, False, None] --
+    None indicating that the iteration was not evaluated as a checkpoint.
+    The evaluation is conducted by calling the evaluate callback.
+
+    Positional arguments:
+        nlp: The spaCy pipeline to evaluate.
+        optimizer: The optimizer callable.
+        train_data (Iterable[Batch]): A generator of `(epoch, batch)` tuples
+            with the training data. Each batch should be a
+            Sized[Tuple[Input, Annot]]. The training data iterable needs to
+            take care of iterating over the epochs and shuffling.
+        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
+            The callback should take no arguments and return a tuple
+            `(main_score, other_scores)`. The main_score should be a float where
+            higher is better. other_scores can be any object.
+
+    Keyword arguments:
+        dropout: The dropout rate. A float is turned into a constant schedule;
+            any other value is used as an iterator of rates.
+        eval_frequency: Evaluate whenever `step % eval_frequency` is 0, which
+            includes the very first step.
+        accumulate_gradient: Number of sub-batches each batch is divided into
+            before the optimizer is applied.
+        patience: Stop once this many steps have passed since the best
+            evaluated step. Ignored if falsy.
+        max_steps: Stop once the step counter reaches this value. Ignored if
+            falsy.
+        exclude: Names of components that are not updated.
+
+    Every iteration, the function yields out a tuple with:
+
+    * batch: A list of Example objects.
+    * info: A dict with various information about the last update (see below).
+    * is_best_checkpoint: A value in None, False, True, indicating whether this
+        was the best evaluation so far. You should use this to save the model
+        checkpoints during training. If None, evaluation was not conducted on
+        that iteration. False means evaluation was conducted, but a previous
+        evaluation was better.
+
+    The info dict provides the following information:
+
+        epoch (int): How many passes over the data have been completed.
+        step (int): How many steps have been completed.
+        score (float): The main score from the last evaluation.
+        other_scores: The other scores from the last evaluation.
+        losses: The losses accumulated since the last evaluation.
+        checkpoints: A list of previous results, where each result is a
+            (score, step) tuple.
+        seconds (int): Seconds elapsed since training started.
+        words (int): Sum of the lengths of all examples seen so far.
+    """
+    if isinstance(dropout, float):
+        dropouts = constant(dropout)
+    else:
+        dropouts = dropout
+    results = []
+    losses = {}
+    words_seen = 0
+    start_time = timer()
+    for step, (epoch, batch) in enumerate(train_data):
+        dropout = next(dropouts)
+        # sgd=False: only accumulate the gradients here, the optimizer is
+        # applied once per full batch below.
+        for subbatch in subdivide_batch(batch, accumulate_gradient):
+            nlp.update(
+                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
+            )
+        # TODO: refactor this so we don't have to run it separately in here
+        for name, proc in nlp.pipeline:
+            if (
+                name not in exclude
+                and hasattr(proc, "model")
+                and proc.model not in (True, False, None)
+            ):
+                proc.model.finish_update(optimizer)
+        optimizer.step_schedules()
+        if not (step % eval_frequency):
+            # Evaluate with the averaged parameters if the optimizer has any.
+            if optimizer.averages:
+                with nlp.use_params(optimizer.averages):
+                    score, other_scores = evaluate()
+            else:
+                score, other_scores = evaluate()
+            results.append((score, step))
+            # Tuples compare by score first, so this is the best score so far.
+            is_best_checkpoint = score == max(results)[0]
+        else:
+            score, other_scores = (None, None)
+            is_best_checkpoint = None
+        words_seen += sum(len(eg) for eg in batch)
+        info = {
+            "epoch": epoch,
+            "step": step,
+            "score": score,
+            "other_scores": other_scores,
+            "losses": losses,
+            "checkpoints": results,
+            "seconds": int(timer() - start_time),
+            "words": words_seen,
+        }
+        yield batch, info, is_best_checkpoint
+        # Start a fresh losses dict after each evaluated step. This happens
+        # after the yield, so the consumer still sees the accumulated values.
+        if is_best_checkpoint is not None:
+            losses = {}
+        # Stop if no improvement in `patience` updates (if specified)
+        best_score, best_step = max(results)
+        if patience and (step - best_step) >= patience:
+            break
+        # Stop if we've exhausted our max steps (if specified)
+        if max_steps and step >= max_steps:
+            break
+
+
+def subdivide_batch(batch, accumulate_gradient):
+    """Split a batch into sub-batches of similar-length examples.
+
+    batch (Iterable[Example]): The examples to divide.
+    accumulate_gradient (int): The number of equally sized sub-batches to
+        produce. Examples left over after that go into one extra sub-batch.
+    YIELDS (List[Example]): The non-empty sub-batches, ordered by the length
+        of the examples' predicted docs.
+    """
+    examples = sorted(batch, key=lambda eg: len(eg.predicted))
+    chunk_size = len(examples) // accumulate_gradient
+    offset = 0
+    for _ in range(accumulate_gradient):
+        chunk = examples[offset : offset + chunk_size]
+        if chunk:
+            yield chunk
+        offset += len(chunk)
+    leftover = examples[offset:]
+    if leftover:
+        yield leftover
+
+
+def create_evaluation_callback(
+    nlp: "Language", dev_corpus: Callable, weights: Dict[str, float]
+) -> Callable[[], Tuple[float, Dict[str, float]]]:
+    """Create the callback that scores the pipeline on the dev corpus.
+
+    nlp (Language): The pipeline to evaluate.
+    dev_corpus (Callable): Called with nlp, returns the development examples.
+    weights (Dict[str, float]): Weight of each score in the main score.
+        Entries whose weight is None are ignored.
+    RETURNS (Callable[[], Tuple[float, Dict[str, float]]]): A function that
+        takes no arguments and returns the weighted main score together with
+        all scores produced by nlp.evaluate.
+    """
+    weights = {key: value for key, value in weights.items() if value is not None}
+
+    def evaluate() -> Tuple[float, Dict[str, float]]:
+        dev_examples = list(dev_corpus(nlp))
+        scores = nlp.evaluate(dev_examples)
+        # Calculate a weighted sum based on score_weights for the main score.
+        # We can only consider scores that are ints/floats, not dicts like
+        # entity scores per type etc.
+        for key, value in scores.items():
+            if key in weights and not isinstance(value, (int, float)):
+                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
+        # Weighted scores that the evaluation didn't produce count as 0.0.
+        weighted_score = sum(
+            scores.get(name, 0.0) * weight for name, weight in weights.items()
+        )
+        return weighted_score, scores
+
+    return evaluate
+
+
+def create_train_batches(
+    iterator: Iterator[Example],
+    batcher: Callable[[Iterable[Example]], Iterable[Example]],
+    max_epochs: int,
+):
+    """Generate the training batches, one epoch after the other.
+
+    iterator (Iterator[Example]): The training examples. They are loaded into
+        memory once and reshuffled before every epoch.
+    batcher (Callable): Divides the shuffled examples into batches.
+    max_epochs (int): Number of epochs to generate. Values below 1 mean the
+        generator never stops on its own.
+    YIELDS (Tuple[int, batch]): The zero-based epoch and the batch.
+    """
+    examples = list(iterator)
+    if not examples:
+        # Raise error if no data
+        raise ValueError(Errors.E986)
+    run_forever = max_epochs < 1
+    epoch = 0
+    while run_forever or epoch != max_epochs:
+        random.shuffle(examples)
+        for batch in batcher(examples):
+            yield epoch, batch
+        epoch += 1
+
+
+def update_meta(
+    training: Union[Dict[str, Any], Config], nlp: "Language", info: Dict[str, Any]
+) -> None:
+    """Write the scores and losses of the latest evaluation to nlp.meta.
+
+    training (Union[Dict[str, Any], Config]): The training settings. Only the
+        keys of "score_weights" are read.
+    nlp (Language): The pipeline whose meta["performance"] is replaced.
+    info (Dict[str, Any]): The info dict of the training step. "other_scores"
+        and "losses" are read.
+    """
+    performance = {}
+    for metric in training["score_weights"]:
+        if metric is not None:
+            performance[metric] = info["other_scores"].get(metric, 0.0)
+    losses = info["losses"]
+    for pipe_name in nlp.pipe_names:
+        # Not every component reports a loss, so only record the ones that
+        # have an entry instead of failing with a KeyError.
+        if pipe_name in losses:
+            performance[f"{pipe_name}_loss"] = losses[pipe_name]
+    nlp.meta["performance"] = performance
+
+
+def create_before_to_disk_callback(
+    callback: Optional[Callable[["Language"], "Language"]]
+) -> Callable[["Language"], "Language"]:
+    """Wrap the optional before_to_disk callback so its result gets validated.
+
+    callback (Optional[Callable[[Language], Language]]): The user callback.
+    RETURNS (Callable[[Language], Language]): A function that returns nlp
+        unchanged if there's no callback, and otherwise returns the
+        callback's result, raising a ValueError if it isn't a Language.
+    """
+    from ..language import Language  # noqa: F811
+
+    def before_to_disk(nlp: Language) -> Language:
+        if callback:
+            result = callback(nlp)
+            if isinstance(result, Language):
+                return result
+            err = Errors.E914.format(name="before_to_disk", value=type(result))
+            raise ValueError(err)
+        return nlp
+
+    return before_to_disk
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@ -0,0 +1,266 @@
+from typing import Optional, Callable, Iterable, Union, List
+from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
+from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
+from pathlib import Path
+from functools import partial
+from collections import Counter
+import srsly
+import numpy
+import time
+import re
+from wasabi import Printer
+
+from .example import Example
+from ..tokens import Doc
+from ..attrs import ID
+from ..ml.models.multi_task import build_cloze_multi_task_model
+from ..ml.models.multi_task import build_cloze_characters_multi_task_model
+from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
+from ..util import registry, load_model_from_config, dot_to_object
+
+
+def pretrain(
+    config: Config,
+    output_dir: Path,
+    resume_path: Optional[Path] = None,
+    epoch_resume: Optional[int] = None,
+    use_gpu: int = -1,
+    silent: bool = True,
+):
+    """Pretrain the tok2vec layer of a pipeline component.
+
+    config (Config): The full config, including [training] and [pretraining].
+    output_dir (Path): Directory the weights files (model{epoch}.bin) and the
+        log.jsonl file are written to.
+    resume_path (Optional[Path]): Weights file to resume pretraining from.
+    epoch_resume (Optional[int]): The epoch to resume from. Only used together
+        with resume_path, and only if the epoch can't be read from the name
+        of the weights file.
+    use_gpu (int): The GPU allocator is only set if this is >= 0.
+    silent (bool): Whether to suppress the printed output.
+    """
+    msg = Printer(no_print=silent)
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    nlp = load_model_from_config(config)
+    _config = nlp.config.interpolate()
+    T = registry.resolve(_config["training"], schema=ConfigSchemaTraining)
+    P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
+    corpus = dot_to_object(T, P["corpus"])
+    batcher = P["batcher"]
+    model = create_pretraining_model(nlp, P)
+    optimizer = P["optimizer"]
+    # Load in pretrained weights to resume from
+    if resume_path is not None:
+        # Use the epoch the helper settled on. It may come from the file name
+        # rather than from the argument.
+        epoch_resume = _resume_model(model, resume_path, epoch_resume, silent=silent)
+    else:
+        # Without '--resume-path' the '--epoch-resume' argument is ignored
+        epoch_resume = 0
+    # TODO: move this to logger function?
+    tracker = ProgressTracker(frequency=10000)
+    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
+    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
+    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
+
+    def _save_model(epoch, is_temp=False):
+        # Writes the tok2vec weights for this epoch and appends the tracker's
+        # current state to the log file.
+        is_temp_str = ".temp" if is_temp else ""
+        with model.use_params(optimizer.averages):
+            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
+                file_.write(model.get_ref("tok2vec").to_bytes())
+            log = {
+                "nr_word": tracker.nr_word,
+                "loss": tracker.loss,
+                "epoch_loss": tracker.epoch_loss,
+                "epoch": epoch,
+            }
+            with (output_dir / "log.jsonl").open("a") as file_:
+                file_.write(srsly.json_dumps(log) + "\n")
+
+    objective = create_objective(P["objective"])
+    # TODO: I think we probably want this to look more like the
+    # 'create_train_batches' function?
+    for epoch in range(epoch_resume, P["max_epochs"]):
+        for batch_id, batch in enumerate(batcher(corpus(nlp))):
+            docs = ensure_docs(batch)
+            loss = make_update(model, docs, optimizer, objective)
+            progress = tracker.update(epoch, loss, docs)
+            if progress:
+                msg.row(progress, **row_settings)
+            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
+                _save_model(epoch, is_temp=True)
+        _save_model(epoch)
+        tracker.epoch_loss = 0.0
+
+
+def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
+    """Convert a batch of Doc and/or Example objects to a list of Docs.
+
+    examples_or_docs (Iterable[Union[Doc, Example]]): The batch.
+    RETURNS (List[Doc]): Docs are kept as they are, any other item is
+        replaced by its reference attribute.
+    """
+    return [
+        eg_or_doc if isinstance(eg_or_doc, Doc) else eg_or_doc.reference
+        for eg_or_doc in examples_or_docs
+    ]
+
+
+def _resume_model(
+    model: Model, resume_path: Path, epoch_resume: Optional[int], silent: bool = True,
+) -> int:
+    """Load tok2vec weights from disk and work out the epoch to resume from.
+
+    model (Model): The pretraining model. Its "tok2vec" ref receives the
+        weights.
+    resume_path (Path): The weights file to load.
+    epoch_resume (Optional[int]): Fallback epoch, used if the file name
+        doesn't follow the default 'model{epoch}.bin' pattern.
+    silent (bool): Whether to suppress the printed output.
+    RETURNS (int): The epoch to resume from.
+    RAISES (ValueError): If the epoch can't be read from the file name and no
+        epoch_resume was given.
+    """
+    msg = Printer(no_print=silent)
+    msg.info(f"Resume training tok2vec from: {resume_path}")
+    with resume_path.open("rb") as file_:
+        weights_data = file_.read()
+        model.get_ref("tok2vec").from_bytes(weights_data)
+    # Parse the epoch number from the given weight file
+    model_name = re.search(r"model(\d+)\.bin", str(resume_path))
+    if model_name:
+        # Default weight file name: the file holds the weights saved for that
+        # epoch, so training continues with the next one.
+        epoch_resume = int(model_name.group(1)) + 1
+    elif epoch_resume is None:
+        raise ValueError(
+            f"Can't determine the epoch to resume from: '{resume_path}' is not "
+            f"named like 'model<epoch>.bin' and no epoch_resume was provided"
+        )
+    msg.info(f"Resuming from epoch: {epoch_resume}")
+    return epoch_resume
+
+
+def make_update(
+    model: Model, docs: Iterable[Doc], optimizer: Optimizer, objective_func: Callable
+) -> float:
+    """Run one forward/backward pass over a batch and update the weights.
+
+    model (Model): The pretraining model.
+    docs (Iterable[Doc]): A batch of `Doc` objects.
+    optimizer (Optimizer): The optimizer passed to model.finish_update.
+    objective_func (Callable): Called with (ops, docs, predictions), returns
+        a (loss, gradients) tuple.
+    RETURNS (float): The loss of the batch.
+    """
+    outputs, backprop_outputs = model.begin_update(docs)
+    batch_loss, d_outputs = objective_func(model.ops, docs, outputs)
+    backprop_outputs(d_outputs)
+    model.finish_update(optimizer)
+    # Convert to a plain float so we don't return a cupy object. The gradients
+    # are modified in-place by the BERT MLM, so the loss is accurate.
+    return float(batch_loss)
+
+
+def create_objective(config: Config):
+    """Create the loss function for the configured pretraining objective.
+
+    This would ideally be a registry function, but the objective also
+    determines the choice of model, so two types are hard-coded for now:
+    "characters" (configurable via n_characters) and "vectors" (configurable
+    via loss, either "cosine" or "L2").
+
+    config (Config): The objective settings. "type" is always read, plus
+        "n_characters" or "loss" depending on the type.
+    RETURNS (Callable): The loss function with its settings bound.
+    RAISES (ValueError): If the objective type or the loss is unknown.
+    """
+    objective_type = config["type"]
+    if objective_type == "characters":
+        return partial(get_characters_loss, nr_char=config["n_characters"])
+    if objective_type != "vectors":
+        raise ValueError("Unexpected objective_type", objective_type)
+    if config["loss"] == "cosine":
+        distance_cls = CosineDistance
+    elif config["loss"] == "L2":
+        distance_cls = L2Distance
+    else:
+        raise ValueError("Unexpected loss type", config["loss"])
+    distance = distance_cls(normalize=True, ignore_zeros=True)
+    return partial(get_vectors_loss, distance=distance)
+
+
+def get_vectors_loss(ops, docs, prediction, distance):
+    """Compute the loss between the predictions and the tokens' vectors.
+
+    ops: The backend ops, used to flatten the per-doc ID arrays.
+    docs: The batch of docs. The vectors table is taken from the vocab of the
+        first doc.
+    prediction: The model output that is compared to the target vectors.
+    distance: Callable returning (gradient, loss) for (prediction, target).
+    RETURNS: A (loss, gradient) tuple.
+    """
+    # Stacking the token.vector values would be the simplest implementation,
+    # but it's a bit inefficient, especially on GPU. Looking up all rows of
+    # the vectors table at once via the token IDs prevents data copying.
+    token_ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+    vectors_table = docs[0].vocab.vectors.data
+    target = vectors_table[token_ids]
+    d_target, loss = distance(prediction, target)
+    return loss, d_target
+
+
+def get_characters_loss(ops, docs, prediction, nr_char):
+    """Compute a squared-error loss over the characters predicted per token.
+
+    ops: The backend ops, used to create the target array.
+    docs: The batch of docs the target characters are read from.
+    prediction: The model output, compared against the one-hot targets.
+    nr_char: Number of characters per token, as passed to doc.to_utf8_array.
+    RETURNS: A (loss, gradient) tuple.
+    """
+    char_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
+    flat_ids = char_ids.reshape((-1,))
+    # One class per possible byte value, then nr_char one-hot blocks per row.
+    one_hot = ops.asarray(to_categorical(flat_ids, n_classes=256), dtype="f")
+    target = one_hot.reshape((-1, 256 * nr_char))
+    diff = prediction - target
+    n_rows = float(prediction.shape[0])
+    return (diff ** 2).sum(), diff / n_rows
+
+
+def create_pretraining_model(nlp, pretrain_config):
+    """Define a network for the pretraining. We simply add an output layer onto
+    the tok2vec input model. The tok2vec input model needs to be a model that
+    takes a batch of Doc objects (as a list), and returns a list of arrays.
+    Each array in the output needs to have one row per token in the doc.
+    The actual tok2vec layer is stored as a reference, and only this bit will be
+    serialized to file and read back in when calling the 'train' command.
+
+    nlp (Language): The pipeline holding the component to pretrain.
+    pretrain_config: The resolved pretraining settings. "component",
+        "objective" and "dropout" are read, as well as the optional "layer".
+    RETURNS (Model): The initialized pretraining model.
+    RAISES (ValueError): If the objective type is not "vectors" or
+        "characters".
+    """
+    objective = pretrain_config["objective"]
+    objective_type = objective["type"]
+    if objective_type not in ("vectors", "characters"):
+        # Fail with a clear message rather than with an unbound local below.
+        raise ValueError("Unexpected objective_type", objective_type)
+    component = nlp.get_pipe(pretrain_config["component"])
+    if pretrain_config.get("layer"):
+        tok2vec = component.model.get_ref(pretrain_config["layer"])
+    else:
+        tok2vec = component.model
+
+    # TODO
+    maxout_pieces = 3
+    hidden_size = 300
+    if objective_type == "vectors":
+        model = build_cloze_multi_task_model(
+            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
+        )
+    else:
+        model = build_cloze_characters_multi_task_model(
+            nlp.vocab,
+            tok2vec,
+            hidden_size=hidden_size,
+            maxout_pieces=maxout_pieces,
+            nr_char=objective["n_characters"],
+        )
+    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
+    set_dropout_rate(model, pretrain_config["dropout"])
+    return model
+
+
+class ProgressTracker:
+    """Keep running totals of loss and word counts during pretraining and
+    produce a status row every `frequency` words.
+    """
+
+    def __init__(self, frequency=1000000):
+        # Minimum number of words between two status rows.
+        self.frequency = frequency
+        # Running totals over the whole run and over the current epoch.
+        self.loss = 0.0
+        self.epoch_loss = 0.0
+        self.nr_word = 0
+        self.words_per_epoch = Counter()
+        # State at the time of the previous status row.
+        self.prev_loss = 0.0
+        self.last_update = 0
+        self.last_time = time.time()
+
+    def update(self, epoch, loss, docs):
+        """Record a batch and return a status row if one is due.
+
+        epoch: The current epoch.
+        loss: The loss of the batch.
+        docs: The docs of the batch, used to count the words.
+        RETURNS: None, or a tuple of (epoch, total words, total loss, loss
+            since the previous row, words per second).
+        """
+        self.loss += loss
+        self.epoch_loss += loss
+        n_words = sum(len(doc) for doc in docs)
+        self.words_per_epoch[epoch] += n_words
+        self.nr_word += n_words
+        pending_words = self.nr_word - self.last_update
+        if pending_words >= self.frequency:
+            words_per_second = pending_words / (time.time() - self.last_time)
+            self.last_update = self.nr_word
+            self.last_time = time.time()
+            loss_since_row = self.loss - self.prev_loss
+            total_str = _smart_round(self.loss, width=10)
+            recent_str = _smart_round(loss_since_row, width=6)
+            row = (epoch, self.nr_word, total_str, recent_str, int(words_per_second))
+            self.prev_loss = float(self.loss)
+            return row
+        return None
+
+
+def _smart_round(
+    figure: Union[float, int], width: int = 10, max_decimal: int = 4
+) -> str:
+    """Format a number to fit a column: integers if the number is large,
+    decimals if there's room for them.
+
+    figure (Union[float, int]): The number to format.
+    width (int): The width of the column.
+    max_decimal (int): Upper limit for the number of decimal places.
+    RETURNS (str): The formatted number.
+    """
+    integer_part = int(figure)
+    # Room left for decimals after the integer digits and the decimal point.
+    room = width - (len(str(integer_part)) + 1)
+    if room <= 1:
+        return str(integer_part)
+    precision = min(room, max_decimal)
+    return format(figure, f".{precision}f")
--- a/spacy/util.py
+++ b/spacy/util.py
@ -8,6 +8,7 @@ import re
 from pathlib import Path
 import thinc
 from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
+from thinc.api import ConfigValidationError
 import functools
 import itertools
 import numpy.random
@ -56,12 +57,13 @@ if TYPE_CHECKING:


 OOV_RANK = numpy.iinfo(numpy.uint64).max
+DEFAULT_OOV_PROB = -20
 LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]

 # Default order of sections in the config.cfg. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
 # fmt: off
-CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"]
+CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
 # fmt: on


@ -97,6 +99,10 @@ class registry(thinc.registry):
    cli = catalogue.create("spacy", "cli", entry_points=True)


+# We want json loading in the registry, so manually register srsly.read_json.
+registry.readers("srsly.read_json.v0", srsly.read_json)
+
+
 class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty
@ -240,28 +246,6 @@ def get_module_path(module: ModuleType) -> Path:
    return Path(sys.modules[module.__module__].__file__).parent


-def load_vectors_into_model(
-    nlp: "Language", name: Union[str, Path], *, add_strings=True
-) -> None:
-    """Load word vectors from an installed model or path into a model instance."""
-    vectors_nlp = load_model(name)
-    nlp.vocab.vectors = vectors_nlp.vocab.vectors
-    if add_strings:
-        # I guess we should add the strings from the vectors_nlp model?
-        # E.g. if someone does a similarity query, they might expect the strings.
-        for key in nlp.vocab.vectors.key2row:
-            if key in vectors_nlp.vocab.strings:
-                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
-
-
-def load_vocab_data_into_model(
-    nlp: "Language", *, lookups: Optional["Lookups"] = None
-) -> None:
-    """Load vocab data."""
-    if lookups:
-        nlp.vocab.lookups = lookups
-
-
 def load_model(
    name: Union[str, Path],
    *,
@ -400,27 +384,39 @@ def load_model_from_config(
    return nlp


-def resolve_training_config(
-    config: Config,
-    exclude: Iterable[str] = ("nlp", "components"),
-    validate: bool = True,
-) -> Dict[str, Any]:
-    """Resolve the config sections relevant for trainig and create all objects.
-    Mostly used in the CLI to separate training config (not resolved by default
-    because not runtime-relevant – an nlp object should load fine even if it's
-    [training] block refers to functions that are not available etc.).
+def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[Any]:
+    """Resolve one or more "dot notation" names, e.g. corpora.train.
+    The paths could point anywhere into the config, so we don't know which
+    top-level section we'll be looking within.

-    config (Config): The config to resolve.
-    exclude (Iterable[str]): The config blocks to exclude. Those blocks won't
-        be available in the final resolved config.
-    validate (bool): Whether to validate the config.
-    RETURNS (Dict[str, Any]): The resolved config.
+    We resolve the whole top-level section, although we could resolve less --
+    we could find the lowest part of the tree.
+
+    config (Config): The config holding the sections the names point into.
+    dot_names (List[Optional[str]]): The names to resolve. None entries are
+        passed through as None.
+    RETURNS (Tuple[Any]): One entry per name, in the order of dot_names.
+    RAISES (ConfigValidationError): If one or more names can't be looked up
+        in the resolved sections. All invalid names are reported together.
    """
-    config = config.copy()
-    for key in exclude:
-        if key in config:
-            config.pop(key)
-    return registry.resolve(config, validate=validate)
+    # TODO: include schema?
+    # Resolved top-level sections, keyed by section name.
+    resolved = {}
+    output = []
+    errors = []
+    for name in dot_names:
+        if name is None:
+            output.append(name)
+        else:
+            section = name.split(".")[0]
+            # We want to avoid resolving the same thing twice
+            if section not in resolved:
+                if registry.is_promise(config[section]):
+                    # Otherwise we can't resolve [corpus] if it's a promise
+                    result = registry.resolve({"config": config[section]})["config"]
+                else:
+                    result = registry.resolve(config[section])
+                resolved[section] = result
+            try:
+                output.append(dot_to_object(resolved, name))
+            except KeyError:
+                # Collect the error and keep going, so that every invalid
+                # name ends up in the final exception.
+                msg = f"not a valid section reference: {name}"
+                errors.append({"loc": name.split("."), "msg": msg})
+    if errors:
+        raise ConfigValidationError(config=config, errors=errors)
+    return tuple(output)


 def load_model_from_init_py(
@ -1300,3 +1296,21 @@ def minibatch(items, size):
        if len(batch) == 0:
            break
        yield list(batch)
+
+
+def is_cython_func(func: Callable) -> bool:
+    """Slightly hacky check for whether a callable is implemented in Cython.
+    Can be used to implement slightly different behaviors, especially around
+    inspecting and parameter annotations.
+
+    func (Callable): The callable to check.
+    RETURNS (bool): Whether the callable is Cython (probably). False if the
+        object the callable belongs to can't be looked up.
+    """
+    attr = "__reduce_cython__"
+    if hasattr(func, attr):  # function or class instance
+        return True
+    # https://stackoverflow.com/a/55767059
+    qualname = getattr(func, "__qualname__", None)
+    module = sys.modules.get(getattr(func, "__module__", None))
+    if qualname is None or module is None:
+        # No qualified name, or the defining module isn't imported (or is
+        # None): there's nothing to inspect.
+        return False
+    # For a method, the first part of the qualified name is its class. Names
+    # like "<lambda>" aren't module attributes, so use a lookup that can't
+    # raise a KeyError.
+    owner = getattr(module, "__dict__", {}).get(qualname.split(".")[0])
+    return hasattr(owner, attr)
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@ -143,11 +143,10 @@ argument that connects to the shared `tok2vec` component in the pipeline.

 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build mixed representations. The features used
-are the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying
-definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from
-pretrained static vectors can also be incorporated into the concatenated
-representation.
+a feed-forward subnetwork to build mixed representations. The features used are
+the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions
+depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained
+static vectors can also be incorporated into the concatenated representation.

 | Name                      | Description                                                                                                                                                                                                       |
 | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -517,18 +516,18 @@ specific data and challenge.
 Stacked ensemble of a bag-of-words model and a neural network model. The neural
 network has an internal CNN Tok2Vec layer and uses attention.

-| Name                 | Description                                                                                                                                                                                        |
-| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes`  | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                         |
-| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~                                                                                                        |
-| `width`              | Output dimension of the feature encoding step. ~~int~~                                                                                                                                             |
-| `embed_size`         | Input dimension of the feature encoding step. ~~int~~                                                                                                                                              |
-| `conv_depth`         | Depth of the tok2vec layer. ~~int~~                                                                                                                                                                |
-| `window_size`        | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~                                                        |
-| `ngram_size`         | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~                                                |
-| `dropout`            | The dropout rate. ~~float~~                                                                                                                                                                        |
-| `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
-| **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                   |
+| Name                 | Description                                                                                                                                                                                    |
+| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes`  | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~                                                                                                    |
+| `width`              | Output dimension of the feature encoding step. ~~int~~                                                                                                                                         |
+| `embed_size`         | Input dimension of the feature encoding step. ~~int~~                                                                                                                                          |
+| `conv_depth`         | Depth of the tok2vec layer. ~~int~~                                                                                                                                                            |
+| `window_size`        | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~                                                    |
+| `ngram_size`         | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~                                           |
+| `dropout`            | The dropout rate. ~~float~~                                                                                                                                                                    |
+| `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |

 ### spacy.TextCatCNN.v1 {#TextCatCNN}

@ -555,12 +554,12 @@ A neural network model where token vectors are calculated using a CNN. The
 vectors are mean pooled and used as features in a feed-forward network. This
 architecture is usually less accurate than the ensemble, but runs faster.

-| Name                | Description                                                                                                                                                                                        |
-| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                         |
-| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                            |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                   |
+| Name                | Description                                                                                                                                                                                    |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |

 ### spacy.TextCatBOW.v1 {#TextCatBOW}

@ -575,16 +574,16 @@ architecture is usually less accurate than the ensemble, but runs faster.
 > nO = null
 > ```

-An n-gram "bag-of-words" model. This architecture should run much faster than the
-others, but may not be as accurate, especially if texts are short.
+An n-gram "bag-of-words" model. This architecture should run much faster than
+the others, but may not be as accurate, especially if texts are short.

-| Name                | Description                                                                                                                                                                                        |
-| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                         |
-| `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~                                                |
-| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                               |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                   |
+| Name                | Description                                                                                                                                                                                    |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~                                           |
+| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |

 ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}

@ -629,11 +628,11 @@ into the "real world". This requires 3 main components:
 The `EntityLinker` model architecture is a Thinc `Model` with a
 [`Linear`](https://thinc.ai/api-layers#linear) output layer.

-| Name        | Description                                                                                                                                                                                                             |
-| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`   | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                                                 |
-| `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ |
-| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                                        |
+| Name        | Description                                                                                                                                                                                                         |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`   | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                                             |
+| `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                                    |

 ### spacy.EmptyKB.v1 {#EmptyKB}

--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@ -190,8 +190,6 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                 |
 | `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                           |
 | `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                                         |
-| `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                                                           |
-| `lookups`             | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~                                                                                  |
 | `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                                                           |
 | `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                                                 |
 | `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                   |
@ -200,7 +198,6 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                             |
 | `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                           |
 | `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                                                       |
-| `vectors`             | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~                                                                          |

 ### pretraining {#config-pretraining tag="section,optional"}

@ -220,6 +217,38 @@ used when you run [`spacy pretrain`](/api/cli#pretrain).
 | `component`    | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~                              |
 | `layer`        | The layer to pretrain. If empty, the whole component model will be used. ~~str~~                       |

+### initialize {#config-initialize tag="section"}
+
+This config block lets you define resources for **initializing the pipeline**.
+It's used by [`Language.initialize`](/api/language#initialize), which is
+typically called right before training (but not at runtime). The section allows
+you to specify local file paths or custom functions to load data resources from,
+without requiring them at runtime when you load the trained pipeline back in.
+
+> #### Example
+>
+> ```ini
+> [initialize]
+> vectors = "/path/to/vectors_nlp"
+> init_tok2vec = "/path/to/pretrain.bin"
+>
+> [initialize.components]
+>
+> [initialize.components.my_component]
+> data_path = "/path/to/component_data"
+> ```
+
+<!-- TODO: -->
+
+| Name           | Description                                                                                                                                                                                                                                                                                                                                                                                                    |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `components`   | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~                                                                      |
+| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                                                                                                                                                                                                                                |
+| `lookups`      | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~                                                                                                                                                                                                                                                       |
+| `tokenizer`    | Additional arguments passed to the `initialize` method of the specified tokenizer. Can be used for languages like Chinese that depend on dictionaries or trained models for tokenization. If type annotations are available on the method, the config will be validated against them. The `initialize` method will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Any]~~ |
+| `vectors`      | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~                                                                                                                                                                                                                                               |
+| `vocab_data`   | Path to JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) to initialize vocabulary. ~~Optional[str]~~                                                                                                                                                                                                                                                                                           |
+
 ## Training data {#training}

 ### Binary training format {#binary-training new="3"}
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@ -139,31 +139,35 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## DependencyParser.begin_training {#begin_training tag="method"}
+## DependencyParser.initialize {#initialize tag="method"}

-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
+
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>

 > #### Example
 >
 > ```python
 > parser = nlp.add_pipe("parser")
-> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline)
+> parser.initialize(lambda: [], nlp=nlp)
 > ```

 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |

 ## DependencyParser.predict {#predict tag="method"}

@ -209,7 +213,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
 >
 > ```python
 > parser = nlp.add_pipe("parser")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = parser.update(examples, sgd=optimizer)
 > ```

@ -293,11 +297,10 @@ context, the original parameters are restored.
 ## DependencyParser.add_label {#add_label tag="method"}

 Add a new label to the pipe. Note that you don't have to call this method if you
-provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+provide a **representative data sample** to the [`initialize`](#initialize)
+method. In this case, all labels found in the sample will be automatically added
+to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

 > #### Example
 >
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@ -139,31 +139,35 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## EntityLinker.begin_training {#begin_training tag="method"}
+## EntityLinker.initialize {#initialize tag="method"}

-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
+
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>

 > #### Example
 >
 > ```python
-> entity_linker = nlp.add_pipe("entity_linker", last=True)
-> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
+> entity_linker = nlp.add_pipe("entity_linker")
+> entity_linker.initialize(lambda: [], nlp=nlp)
 > ```

 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |

 ## EntityLinker.predict {#predict tag="method"}

@ -211,7 +215,7 @@ pipe's entity linking model and context encoder. Delegates to
 >
 > ```python
 > entity_linker = nlp.add_pipe("entity_linker")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = entity_linker.update(examples, sgd=optimizer)
 > ```

--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@ -129,31 +129,35 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## EntityRecognizer.begin_training {#begin_training tag="method"}
+## EntityRecognizer.initialize {#initialize tag="method"}

-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
+
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>

 > #### Example
 >
 > ```python
 > ner = nlp.add_pipe("ner")
-> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline)
+> ner.initialize(lambda: [], nlp=nlp)
 > ```

 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |

 ## EntityRecognizer.predict {#predict tag="method"}

@ -199,7 +203,7 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
 >
 > ```python
 > ner = nlp.add_pipe("ner")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = ner.update(examples, sgd=optimizer)
 > ```

@ -282,11 +286,10 @@ context, the original parameters are restored.
 ## EntityRecognizer.add_label {#add_label tag="method"}

 Add a new label to the pipe. Note that you don't have to call this method if you
-provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+provide a **representative data sample** to the [`initialize`](#initialize)
+method. In this case, all labels found in the sample will be automatically added
+to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

 > #### Example
 >
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@ -202,30 +202,38 @@ more efficient than processing texts one-by-one.
 | `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~                                                                                                               |
 | **YIELDS**                                 | Documents in the order of the original text. ~~Doc~~                                                                                                                |

-## Language.begin_training {#begin_training tag="method"}
+## Language.initialize {#initialize tag="method"}

 Initialize the pipeline for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples can either be the full training data or a representative sample. They
-are used to **initialize the models** of trainable pipeline components and are
-passed each component's [`begin_training`](/api/pipe#begin_training) method, if
-available. Initialization includes validating the network,
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the
+settings defined in the [`[initialize]`](/api/data-formats#config-initialize)
+config block to set up the vocabulary, load in vectors and tok2vec weights and
+pass optional arguments to the `initialize` methods implemented by pipeline
+components or the tokenizer. This method is typically called automatically when
+you run [`spacy train`](/api/cli#train).
+
+`get_examples` should be a function that returns an iterable of
+[`Example`](/api/example) objects. The data examples can either be the full
+training data or a representative sample. They are used to **initialize the
+models** of trainable pipeline components and are passed to each component's
+[`initialize`](/api/pipe#initialize) method, if available. Initialization
+includes validating the network,
 [inferring missing shapes](/usage/layers-architectures#thinc-shape-inference)
 and setting up the label scheme based on the data.

-If no `get_examples` function is provided when calling `nlp.begin_training`, the
+If no `get_examples` function is provided when calling `nlp.initialize`, the
 pipeline components will be initialized with generic data. In this case, it is
 crucial that the output dimension of each component has already been defined
 either in the [config](/usage/training#config), or by calling
 [`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for
 the tagger or textcat).

-<Infobox variant="warning" title="Changed in v3.0">
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

-The `Language.update` method now takes a **function** that is called with no
-arguments and returns a sequence of [`Example`](/api/example) objects instead of
-tuples of `Doc` and `GoldParse` objects.
+This method was previously called `begin_training`. It now also takes a
+**function** that is called with no arguments and returns a sequence of
+[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse`
+objects.

 </Infobox>

@ -233,7 +241,7 @@ tuples of `Doc` and `GoldParse` objects.
 >
 > ```python
 > get_examples = lambda: examples
-> optimizer = nlp.begin_training(get_examples)
+> optimizer = nlp.initialize(get_examples)
 > ```

 | Name           | Description                                                                                                                                              |
@ -637,13 +645,13 @@ list will be disabled. Under the hood, this method calls into
 >
 > ```python
 > with nlp.select_pipes(disable=["tagger", "parser"]):
->    nlp.begin_training()
+>     nlp.initialize()
 >
 > with nlp.select_pipes(enable="ner"):
->     nlp.begin_training()
+>     nlp.initialize()
 >
 > disabled = nlp.select_pipes(disable=["tagger", "parser"])
-> nlp.begin_training()
+> nlp.initialize()
 > disabled.restore()
 > ```

--- a/website/docs/api/morphologizer.md
+++ b/website/docs/api/morphologizer.md
@ -117,32 +117,29 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## Morphologizer.begin_training {#begin_training tag="method"}
+## Morphologizer.initialize {#initialize tag="method"}

-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).

 > #### Example
 >
 > ```python
 > morphologizer = nlp.add_pipe("morphologizer")
-> nlp.pipeline.append(morphologizer)
-> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline)
+> morphologizer.initialize(lambda: [], nlp=nlp)
 > ```

 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |

 ## Morphologizer.predict {#predict tag="method"}

@ -189,7 +186,7 @@ Delegates to [`predict`](/api/morphologizer#predict) and
 >
 > ```python
 > morphologizer = nlp.add_pipe("morphologizer")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = morphologizer.update(examples, sgd=optimizer)
 > ```

@ -259,12 +256,11 @@ context, the original parameters are restored.
 Add a new label to the pipe. If the `Morphologizer` should set annotations for
 both `pos` and `morph`, the label should include the UPOS as the feature `POS`.
 Raises an error if the output dimension is already set, or if the model has
-already been fully [initialized](#begin_training). Note that you don't have to
-call this method if you provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+already been fully [initialized](#initialize). Note that you don't have to call
+this method if you provide a **representative data sample** to the
+[`initialize`](#initialize) method. In this case, all labels found in the sample
+will be automatically added to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

 > #### Example
 >
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@ -98,31 +98,35 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## Pipe.begin_training {#begin_training tag="method"}
+## Pipe.initialize {#initialize tag="method"}

-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
+
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>

 > #### Example
 >
 > ```python
 > pipe = nlp.add_pipe("your_custom_pipe")
-> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline)
+> pipe.initialize(lambda: [], nlp=nlp)
 > ```

 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |

 ## Pipe.predict {#predict tag="method"}

@ -180,7 +184,7 @@ predictions and gold-standard annotations, and update the component's model.
 >
 > ```python
 > pipe = nlp.add_pipe("your_custom_pipe")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = pipe.update(examples, sgd=optimizer)
 > ```

@ -296,9 +300,9 @@ context, the original parameters are restored.
 Add a new label to the pipe, to be predicted by the model. The actual
 implementation depends on the specific component, but in general `add_label`
 shouldn't be called if the output dimension is already set, or if the model has
-already been fully [initialized](#begin_training). If these conditions are
-violated, the function will raise an Error. The exception to this rule is when
-the component is [resizable](#is_resizable), in which case
+already been fully [initialized](#initialize). If these conditions are violated,
+the function will raise an Error. The exception to this rule is when the
+component is [resizable](#is_resizable), in which case
 [`set_output`](#set_output) should be called to ensure that the model is
 properly resized.

@ -314,9 +318,9 @@ This method needs to be overwritten with your own custom `add_label` method.
 | **RETURNS** | 0 if the label is already present, otherwise 1. ~~int~~ |

 Note that in general, you don't have to call `pipe.add_label` if you provide a
-representative data sample to the [`begin_training`](#begin_training) method. In
-this case, all labels found in the sample will be automatically added to the
-model, and the output dimension will be
+representative data sample to the [`initialize`](#initialize) method. In this
+case, all labels found in the sample will be automatically added to the model,
+and the output dimension will be
 [inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

 ## Pipe.is_resizable {#is_resizable tag="method"}
--- a/website/docs/api/sentencerecognizer.md
+++ b/website/docs/api/sentencerecognizer.md
@ -114,31 +114,29 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## SentenceRecognizer.begin_training {#begin_training tag="method"}
+## SentenceRecognizer.initialize {#initialize tag="method"}

-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).

 > #### Example
 >
 > ```python
 > senter = nlp.add_pipe("senter")
-> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline)
+> senter.initialize(lambda: [], nlp=nlp)
 > ```

 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |

 ## SentenceRecognizer.predict {#predict tag="method"}

@ -185,7 +183,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
 >
 > ```python
 > senter = nlp.add_pipe("senter")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = senter.update(examples, sgd=optimizer)
 > ```

--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@ -112,31 +112,35 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## Tagger.begin_training {#begin_training tag="method"}
+## Tagger.initialize {#initialize tag="method"}

-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
+
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>

 > #### Example
 >
 > ```python
 > tagger = nlp.add_pipe("tagger")
-> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
+> tagger.initialize(lambda: [], nlp=nlp)
 > ```

 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |

 ## Tagger.predict {#predict tag="method"}

@ -183,7 +187,7 @@ Delegates to [`predict`](/api/tagger#predict) and
 >
 > ```python
 > tagger = nlp.add_pipe("tagger")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = tagger.update(examples, sgd=optimizer)
 > ```

@ -289,12 +293,12 @@ context, the original parameters are restored.
 ## Tagger.add_label {#add_label tag="method"}

 Add a new label to the pipe. Raises an error if the output dimension is already
-set, or if the model has already been fully [initialized](#begin_training). Note
+set, or if the model has already been fully [initialized](#initialize). Note
 that you don't have to call this method if you provide a **representative data
-sample** to the [`begin_training`](#begin_training) method. In this case, all
-labels found in the sample will be automatically added to the model, and the
-output dimension will be
-[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
+sample** to the [`initialize`](#initialize) method. In this case, all labels
+found in the sample will be automatically added to the model, and the output
+dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
+automatically.

 > #### Example
 >
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@ -125,31 +125,35 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## TextCategorizer.begin_training {#begin_training tag="method"}
+## TextCategorizer.initialize {#initialize tag="method"}

-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
+
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>

 > #### Example
 >
 > ```python
 > textcat = nlp.add_pipe("textcat")
-> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline)
+> textcat.initialize(lambda: [], nlp=nlp)
 > ```

 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |

 ## TextCategorizer.predict {#predict tag="method"}

@ -196,14 +200,14 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
 >
 > ```python
 > textcat = nlp.add_pipe("textcat")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = textcat.update(examples, sgd=optimizer)
 > ```

 | Name              | Description                                                                                                                        |
 | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
 | `examples`        | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                  |
-| _keyword-only_    |                                                                                                                                    | 
+| _keyword-only_    |                                                                                                                                    |
 | `drop`            | The dropout rate. ~~float~~                                                                                                        |
 | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
 | `sgd`             | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                      |
@ -227,7 +231,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
 | Name           | Description                                                                                                              |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                        |
-| _keyword-only_ |                                                                                                                          | 
+| _keyword-only_ |                                                                                                                          |
 | `drop`         | The dropout rate. ~~float~~                                                                                              |
 | `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~            |
 | `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
@ -303,12 +307,12 @@ Modify the pipe's model to use the given parameter values.
 ## TextCategorizer.add_label {#add_label tag="method"}

 Add a new label to the pipe. Raises an error if the output dimension is already
-set, or if the model has already been fully [initialized](#begin_training). Note
+set, or if the model has already been fully [initialized](#initialize). Note
 that you don't have to call this method if you provide a **representative data
-sample** to the [`begin_training`](#begin_training) method. In this case, all
-labels found in the sample will be automatically added to the model, and the
-output dimension will be
-[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
+sample** to the [`initialize`](#initialize) method. In this case, all labels
+found in the sample will be automatically added to the model, and the output
+dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
+automatically.

 > #### Example
 >
--- a/website/docs/api/tok2vec.md
+++ b/website/docs/api/tok2vec.md
@ -123,7 +123,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## Tok2Vec.begin_training {#begin_training tag="method"}
+## Tok2Vec.initialize {#initialize tag="method"}

 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -132,22 +132,21 @@ examples are used to **initialize the model** of the component and can either be
 the full training data or a representative sample. Initialization includes
 validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).

 > #### Example
 >
 > ```python
 > tok2vec = nlp.add_pipe("tok2vec")
-> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline)
+> tok2vec.initialize(lambda: [], nlp=nlp)
 > ```

 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |

 ## Tok2Vec.predict {#predict tag="method"}

@ -193,7 +192,7 @@ Delegates to [`predict`](/api/tok2vec#predict).
 >
 > ```python
 > tok2vec = nlp.add_pipe("tok2vec")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = tok2vec.update(examples, sgd=optimizer)
 > ```

--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@ -158,7 +158,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |

-## Transformer.begin_training {#begin_training tag="method"}
+## Transformer.initialize {#initialize tag="method"}

 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -167,22 +167,21 @@ examples are used to **initialize the model** of the component and can either be
 the full training data or a representative sample. Initialization includes
 validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).

 > #### Example
 >
 > ```python
 > trf = nlp.add_pipe("transformer")
-> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline)
+> trf.initialize(lambda: [], nlp=nlp)
 > ```

 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |

 ## Transformer.predict {#predict tag="method"}

@ -241,7 +240,7 @@ and call the optimizer, while the others simply increment the gradients.
 >
 > ```python
 > trf = nlp.add_pipe("transformer")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = trf.update(examples, sgd=optimizer)
 > ```

--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@ -460,8 +460,8 @@ The built-in [pipeline components](/usage/processing-pipelines) in spaCy ensure
 that their internal models are **always initialized** with appropriate sample
 data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a
 ~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This
-functionality is triggered when
-[`nlp.begin_training`](/api/language#begin_training) is called.
+functionality is triggered when [`nlp.initialize`](/api/language#initialize) is
+called.

 ### Dropout and normalization in Thinc {#thinc-dropout-norm}

@ -491,7 +491,7 @@ with Model.define_operators({">>": chain}):

 <!-- TODO: write trainable component section
 - Interaction with `predict`, `get_loss` and `set_annotations`
-- Initialization life-cycle with `begin_training`, correlation with add_label
+- Initialization life-cycle with `initialize`, correlation with add_label
 Example: relation extraction component (implemented as project template)
 Avoid duplication with usage/processing-pipelines#trainable-components ?
 -->
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@ -1126,12 +1126,12 @@ For some use cases, it makes sense to also overwrite additional methods to
 customize how the model is updated from examples, how it's initialized, how the
 loss is calculated and to add evaluation scores to the training output.

-| Name                                         | Description                                                                                                                                                                                                                                                                                                        |
-| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| [`update`](/api/pipe#update)                 | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                |
-| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided.                                                                                                                 |
-| [`get_loss`](/api/pipe#get_loss)             | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                      |
-| [`score`](/api/pipe#score)                   | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_socre_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
+| Name                                 | Description                                                                                                                                                                                                                                                                                                        |
+| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [`update`](/api/pipe#update)         | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                |
+| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided.                                                                                                                 |
+| [`get_loss`](/api/pipe#get_loss)     | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                      |
+| [`score`](/api/pipe#score)           | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |

 <Infobox title="Custom trainable components and models" emoji="📖">

--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -1056,8 +1056,8 @@ of being dropped.

 > - [`nlp`](/api/language): The `nlp` object with the pipeline components and
 >   their models.
-> - [`nlp.begin_training`](/api/language#begin_training): Start the training and
->   return an optimizer to update the component model weights.
+> - [`nlp.initialize`](/api/language#initialize): Initialize the pipeline and
+>   return an optimizer to update the component model weights.
 > - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds
 >   state between updates.
 > - [`nlp.update`](/api/language#update): Update component models with examples.
@ -1068,7 +1068,7 @@ of being dropped.

 ```python
 ### Example training loop
-optimizer = nlp.begin_training()
+optimizer = nlp.initialize()
 for itn in range(100):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@ -526,10 +526,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
  [`Pipe.update`](/api/pipe#update) methods now all take batches of
  [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
  raw text and a dictionary of annotations.
-  [`Language.begin_training`](/api/language#begin_training) and
-  [`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
-  returns a sequence of `Example` objects to initialize the model instead of a
-  list of tuples.
+  [`Language.initialize`](/api/language#initialize) and
+  [`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
+  sequence of `Example` objects to initialize the model instead of a list of
+  tuples.
+- The `begin_training` methods have been renamed to `initialize`.
 - [`Matcher.add`](/api/matcher#add) and
  [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
  patterns as the second argument (instead of a variable number of arguments).
@ -555,6 +556,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | Removed                                                                                      | Replacement                                                                                                                                                                                                              |
 | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `Language.disable_pipes`                                                                     | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe)                                                                                                             |
+| `Language.begin_training`, `Pipe.begin_training`, ...                                        | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ...                                                                                                                        |
 | `Doc.is_tagged`, `Doc.is_parsed`, ...                                                        | [`Doc.has_annotation`](/api/doc#has_annotation)                                                                                                                                                                          |
 | `GoldParse`                                                                                  | [`Example`](/api/example)                                                                                                                                                                                                |
 | `GoldCorpus`                                                                                 | [`Corpus`](/api/corpus)                                                                                                                                                                                                  |
@ -936,7 +938,7 @@ TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London.", {"entities": [(7, 13, "LOC")]}),
 ]
-nlp.begin_training()
+nlp.initialize()
 for i in range(20):
    random.shuffle(TRAIN_DATA)
    for batch in minibatch(TRAIN_DATA):
@ -946,17 +948,18 @@ for i in range(20):
        nlp.update(examples)
 ```

-[`Language.begin_training`](/api/language#begin_training) and
-[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
-returns a sequence of `Example` objects to initialize the model instead of a
-list of tuples. The data examples are used to **initialize the models** of
+`Language.begin_training` and `Pipe.begin_training` have been renamed to
+[`Language.initialize`](/api/language#initialize) and
+[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function
+that returns a sequence of `Example` objects to initialize the model instead of
+a list of tuples. The data examples are used to **initialize the models** of
 trainable pipeline components, which includes validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme.

 ```diff
-- nlp.begin_training(examples)
-+ nlp.begin_training(lambda: examples)
+- nlp.initialize(examples)
++ nlp.initialize(lambda: examples)
 ```

 #### Packaging trained pipelines {#migrating-training-packaging}