mirror of https://github.com/explosion/spaCy.git (synced 2025-01-11 17:56:30 +03:00)

Refactor CLI

commit 822ea4ef61 (parent a89e0ff7cb)
@@ -15,8 +15,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
-#from .init_model import init_model  # noqa: F401
-from .init_pipeline import init_pipeline  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
@@ -10,13 +10,12 @@ from click import NoSuchOption
 from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config, ConfigValidationError, require_gpu
 from configparser import InterpolationError
 import os

 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import ensure_path

 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -276,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
         msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)


-def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
-    """RETURNS (List[str]): All sourced components in the original config,
-    e.g. {"source": "en_core_web_sm"}. If the config contains a key
-    "factory", we assume it refers to a component factory.
-    """
-    return [
-        name
-        for name, cfg in config.get("components", {}).items()
-        if "factory" not in cfg and "source" in cfg
-    ]
-
-
 def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
     """Upload a file.
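The helper moves verbatim into the new spacy/training/initialize.py (last hunk below). As a quick illustration of what it computes (component names here are made up):

```python
# A toy config: "ner" is sourced from a packaged pipeline, while "tagger"
# is built from a factory, so only "ner" counts as a sourced component.
config = {
    "components": {
        "ner": {"source": "en_core_web_sm"},
        "tagger": {"factory": "tagger"},
    }
}
sourced = [
    name
    for name, cfg in config.get("components", {}).items()
    if "factory" not in cfg and "source" in cfg
]
assert sourced == ["ner"]
```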
@@ -459,3 +446,23 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
             p = int(p)
         result.append(p)
     return result
+
+
+class CliLogger:
+    """Helper mocking up the most commonly used logger methods. Can be passed
+    into functions like train() to make them output pretty-printed messages
+    on the CLI and regular logging if used from within Python.
+    """
+
+    debug = msg.text
+    info = msg.info
+    warn = msg.warn
+    error = msg.fail
+
+
+def setup_gpu(use_gpu: int):
+    if use_gpu >= 0:
+        msg.info(f"Using GPU: {use_gpu}")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
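CliLogger exists so that functions like train() can report progress through an injected logger-like object: the CLI passes this shim (backed by wasabi's msg) for pretty-printed output, while library callers pass a regular logger. A minimal sketch of the duck-typing involved; train_loop is a hypothetical consumer:

```python
import logging

def train_loop(logger=logging.getLogger(__name__)):
    # Only logger.debug/info/warn/error are called, so any object with
    # those four attributes works -- a stdlib logger or CliLogger alike.
    logger.info("Starting training")

train_loop()                    # plain logging when used from Python
# train_loop(logger=CliLogger)  # pretty CLI output when run as a command
```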
@@ -1,7 +1,7 @@
 from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config
 from thinc.config import VARIABLE_RE
 import typer

@@ -52,10 +52,8 @@ def debug_config(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
         nlp = util.load_model_from_config(config)
-        # Use the resolved config here in case user has one function returning
-        # a dict of corpora etc.
-        resolved = util.resolve_training_config(nlp.config)
-        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
+        dot_names = ["training.dev_corpus", "training.train_corpus"]
+        util.resolve_dot_names(nlp.config, dot_names)
     msg.good("Config is valid")
     if show_vars:
         variables = get_variables(config)
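resolve_dot_names subsumes the check_section_refs helper removed in the next hunk: it both validates that each dot-path exists and instantiates whatever registered function it points to. A sketch of the new call, mirroring the test added further down:

```python
from spacy.util import resolve_dot_names

config = {
    "training": {"optimizer": {"@optimizers": "Adam.v1"}},
    "foo": {"bar": "training.optimizer"},
}
# "foo.bar" holds the dot-path string "training.optimizer"; resolve_dot_names
# follows it and instantiates that section via the registry (here: an Adam
# optimizer). A dangling reference raises ConfigValidationError instead.
(optimizer,) = resolve_dot_names(config, ["foo.bar"])
```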
@@ -97,23 +95,3 @@ def get_variables(config: Config) -> Dict[str, Any]:
         value = util.dot_to_object(config, path)
         result[variable] = repr(value)
     return result
-
-
-def check_section_refs(config: Config, fields: List[str]) -> None:
-    """Validate fields in the config that refer to other sections or values
-    (e.g. in the corpora) and make sure that those references exist.
-    """
-    errors = []
-    for field in fields:
-        # If the field doesn't exist in the config, we ignore it
-        try:
-            value = util.dot_to_object(config, field)
-        except KeyError:
-            continue
-        try:
-            util.dot_to_object(config, value)
-        except KeyError:
-            msg = f"not a valid section reference: {value}"
-            errors.append({"loc": field.split("."), "msg": msg})
-    if errors:
-        raise ConfigValidationError(config=config, errors=errors)
@@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
 import typer

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, get_sourced_components
+from ._util import import_code, debug_cli
 from ..training import Corpus, Example
+from ..training.initialize import get_sourced_components
+from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
+from ..util import registry
 from .. import util

@@ -94,26 +97,13 @@ def debug_data(
     with show_validation_error(config_path):
         cfg = util.load_config(config_path, overrides=config_overrides)
         nlp = util.load_model_from_config(cfg)
-        C = util.resolve_training_config(nlp.config)
+        T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
     # Use original config here, not resolved version
     sourced_components = get_sourced_components(cfg)
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]
     resume_components = [p for p in sourced_components if p not in frozen_components]
     pipeline = nlp.pipe_names
     factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
-    tag_map_path = util.ensure_path(C["training"]["tag_map"])
-    tag_map = {}
-    if tag_map_path is not None:
-        tag_map = srsly.read_json(tag_map_path)
-    morph_rules_path = util.ensure_path(C["training"]["morph_rules"])
-    morph_rules = {}
-    if morph_rules_path is not None:
-        morph_rules = srsly.read_json(morph_rules_path)
-    # Replace tag map with provided mapping
-    nlp.vocab.morphology.load_tag_map(tag_map)
-    # Load morph rules
-    nlp.vocab.morphology.load_morph_exceptions(morph_rules)
-
     msg.divider("Data file validation")

     # Create the gold corpus to be able to better analyze data
@@ -145,10 +135,10 @@ def debug_data(

     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]

     msg.divider("Training stats")
-    msg.text(f"Language: {C['nlp']['lang']}")
+    msg.text(f"Language: {nlp.lang}")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
     if resume_components:
         msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
@@ -355,6 +345,7 @@ def debug_data(
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
+        # TODO: does this need to be updated?
         tag_map = nlp.vocab.morphology.tag_map
         msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
         labels_with_counts = _format_labels(
@@ -4,12 +4,14 @@ from pathlib import Path
 from spacy.training import Example
 from spacy.util import dot_to_object
 from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+from thinc.api import fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation, set_gpu_allocator
 import typer

 from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list
+from ._util import parse_config_overrides, string_to_list, setup_gpu
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util

@@ -37,11 +39,7 @@ def debug_model_cli(

     DOCS: https://nightly.spacy.io/api/cli#debug-model
     """
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     layers = string_to_list(layers, intify=True)
     print_settings = {
         "dimensions": dimensions,
@@ -65,8 +63,8 @@ def debug_model_cli(
         set_gpu_allocator(allocator)
     with show_validation_error(config_path):
         nlp = util.load_model_from_config(raw_config)
-    C = util.resolve_training_config(nlp.config)
-    seed = C["training"]["seed"]
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    seed = T["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
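This is the pattern that replaces util.resolve_training_config throughout the commit: rather than resolving the whole config, each command resolves just the section it needs against its schema. A self-contained sketch (a blank pipeline's config already carries the default [training] block):

```python
import spacy
from spacy.schemas import ConfigSchemaTraining
from spacy.util import registry

nlp = spacy.blank("en")
# Resolve only the [training] section; T is a plain dict of validated,
# instantiated values, so T["seed"], T["optimizer"] etc. are ready to use.
T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
print(T["seed"], type(T["optimizer"]))
```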
@@ -77,7 +75,7 @@ def debug_model_cli(
             exits=1,
         )
     model = pipe.model
-    debug_model(C, nlp, model, print_settings=print_settings)
+    debug_model(T, nlp, model, print_settings=print_settings)


 def debug_model(
@@ -3,11 +3,11 @@ from wasabi import Printer
 from pathlib import Path
 import re
 import srsly
-from thinc.api import require_gpu, fix_random_seed
+from thinc.api import fix_random_seed

 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, setup_gpu
 from ..scorer import Scorer
 from .. import util
 from .. import displacy
@@ -61,8 +61,7 @@ def evaluate(
 ) -> Scorer:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
-    if use_gpu >= 0:
-        require_gpu(use_gpu)
+    setup_gpu(use_gpu)
     data_path = util.ensure_path(data_path)
     output_path = util.ensure_path(output)
     displacy_path = util.ensure_path(displacy_path)
@@ -1,22 +1,13 @@
-from typing import Optional, Dict, Callable, Any
+from typing import Optional
 import logging
 from pathlib import Path
 from wasabi import msg
 import typer
-from thinc.api import Config, fix_random_seed, set_gpu_allocator
-import srsly

 from .. import util
-from ..util import registry, resolve_dot_names, OOV_RANK
-from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain, ConfigSchemaInit
-from ..language import Language
-from ..lookups import Lookups
-from ..errors import Errors
+from ..training.initialize import init_nlp
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
-
-
-DEFAULT_OOV_PROB = -20
+from ._util import import_code, CliLogger, setup_gpu


 @init_cli.command(
@@ -31,178 +22,16 @@ def init_pipeline_cli(
     output_path: Path = Arg(..., help="Output directory for the prepared data"),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
     util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
+    setup_gpu(use_gpu)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
-    nlp = init_pipeline(config)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_success=msg.good)
     nlp.to_disk(output_path)
     msg.good(f"Saved initialized pipeline to {output_path}")
-
-
-def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
-    raw_config = config
-    config = raw_config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    # Use original config here before it's resolved to functions
-    sourced_components = get_sourced_components(config)
-    with show_validation_error():
-        nlp = util.load_model_from_config(raw_config, auto_fill=True)
-    msg.good("Set up nlp object from config")
-    config = nlp.config.interpolate()
-    # Resolve all training-relevant sections using the filled nlp config
-    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"]]
-    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    V = I["vocab"]
-    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
-    optimizer = T["optimizer"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T["frozen_components"]
-    # Sourced components that require resume_training
-    resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if resume_components:
-        with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
-            nlp.resume_training(sgd=optimizer)
-    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
-    msg.good(f"Initialized pipeline components")
-    # Verify the config after calling 'begin_training' to ensure labels
-    # are properly initialized
-    verify_config(nlp)
-    if "pretraining" in config and config["pretraining"]:
-        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        add_tok2vec_weights(nlp, P, I)
-    # TODO: this should be handled better?
-    nlp = before_to_disk(nlp)
-    return nlp
-
-
-def init_vocab(
-    nlp: Language,
-    *,
-    data: Optional[Path] = None,
-    lookups: Optional[Lookups] = None,
-    vectors: Optional[str] = None,
-) -> Language:
-    if lookups:
-        nlp.vocab.lookups = lookups
-        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
-    data_path = util.ensure_path(data)
-    if data_path is not None:
-        lex_attrs = srsly.read_jsonl(data_path)
-        for lexeme in nlp.vocab:
-            lexeme.rank = OOV_RANK
-        for attrs in lex_attrs:
-            if "settings" in attrs:
-                continue
-            lexeme = nlp.vocab[attrs["orth"]]
-            lexeme.set_attrs(**attrs)
-        if len(nlp.vocab):
-            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
-        else:
-            oov_prob = DEFAULT_OOV_PROB
-        nlp.vocab.cfg.update({"oov_prob": oov_prob})
-        msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    msg.good("Created vocabulary")
-    if vectors is not None:
-        add_vectors(nlp, vectors)
-        msg.good(f"Added vectors: {vectors}")
-
-
-def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
-) -> None:
-    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
-    P = pretrain_config
-    V = vocab_config
-    weights_data = None
-    init_tok2vec = util.ensure_path(V["init_tok2vec"])
-    if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not V["vectors"]:
-            err = "Need initialize.vectors if pretraining.objective.type is vectors"
-            msg.fail(err, exits=1)
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    if weights_data is not None:
-        tok2vec_component = P["component"]
-        if tok2vec_component is None:
-            msg.fail(
-                f"To use pretrained tok2vec weights, [pretraining.component] "
-                f"needs to specify the component that should load them.",
-                exits=1,
-            )
-        layer = nlp.get_pipe(tok2vec_component).model
-        if P["layer"]:
-            layer = layer.get_ref(P["layer"])
-        layer.from_bytes(weights_data)
-        msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'")
-
-
-def add_vectors(nlp: Language, vectors: str) -> None:
-    title = f"Config validation error for vectors {vectors}"
-    desc = (
-        "This typically means that there's a problem in the config.cfg included "
-        "with the packaged vectors. Make sure that the vectors package you're "
-        "loading is compatible with the current version of spaCy."
-    )
-    with show_validation_error(
-        title=title, desc=desc, hint_fill=False, show_config=False
-    ):
-        util.load_vectors_into_model(nlp, vectors)
-    msg(f"Added {len(nlp.vocab.vectors)} vectors from {vectors}")
-
-
-def verify_config(nlp: Language) -> None:
-    """Perform additional checks based on the config, loaded nlp object and training data."""
-    # TODO: maybe we should validate based on the actual components, the list
-    # in config["nlp"]["pipeline"] instead?
-    for pipe_config in nlp.config["components"].values():
-        # We can't assume that the component name == the factory
-        factory = pipe_config["factory"]
-        if factory == "textcat":
-            verify_textcat_config(nlp, pipe_config)
-
-
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
-    # if 'positive_label' is provided: double check whether it's in the data and
-    # the task is binary
-    if pipe_config.get("positive_label"):
-        textcat_labels = nlp.get_pipe("textcat").labels
-        pos_label = pipe_config.get("positive_label")
-        if pos_label not in textcat_labels:
-            raise ValueError(
-                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
-            )
-        if len(list(textcat_labels)) != 2:
-            raise ValueError(
-                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
-            )
-
-
-def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
-    def before_to_disk(nlp: Language) -> Language:
-        if not callback:
-            return nlp
-        modified_nlp = callback(nlp)
-        if not isinstance(modified_nlp, Language):
-            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
-            raise ValueError(err)
-        return modified_nlp
-
-    return before_to_disk
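With the body moved to spacy/training/initialize.py, initialization is now callable from Python without going through the CLI; a rough sketch against the API introduced in this commit (paths are illustrative):

```python
from spacy.training.initialize import init_nlp
from spacy.util import load_config

config = load_config("config.cfg", interpolate=False)
# logger and on_success default to the regular spaCy logger and a no-op,
# so the same code path serves the CLI (CliLogger, msg.good) and scripts.
nlp = init_nlp(config, use_gpu=-1)
nlp.to_disk("output/model-initial")
```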
@@ -1,25 +1,13 @@
 from typing import Optional
-import numpy
-import time
-import re
-from collections import Counter
 from pathlib import Path
-from thinc.api import require_gpu, set_gpu_allocator
-from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
-from thinc.api import Config, CosineDistance, L2Distance
 from wasabi import msg
-import srsly
-from functools import partial
 import typer
+import re

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
-from ..tokens import Doc
-from ..attrs import ID
-from .. import util
-from ..util import dot_to_object
+from ._util import import_code, setup_gpu, CliLogger
+from ..training.pretrain import pretrain
+from ..util import load_config


 @app.command(
@@ -61,15 +49,11 @@ def pretrain_cli(
     config_overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")

     with show_validation_error(config_path):
-        raw_config = util.load_config(
+        raw_config = load_config(
             config_path, overrides=config_overrides, interpolate=False
         )
     config = raw_config.interpolate()
@@ -89,250 +73,11 @@ def pretrain_cli(
         resume_path=resume_path,
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
+        logger=CliLogger,
     )


-def pretrain(
-    config: Config,
-    output_dir: Path,
-    resume_path: Optional[Path] = None,
-    epoch_resume: Optional[int] = None,
-    use_gpu: int = -1,
-):
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    nlp = util.load_model_from_config(config)
-    C = util.resolve_training_config(nlp.config)
-    P_cfg = C["pretraining"]
-    corpus = dot_to_object(C, P_cfg["corpus"])
-    batcher = P_cfg["batcher"]
-    model = create_pretraining_model(nlp, C["pretraining"])
-    optimizer = C["pretraining"]["optimizer"]
-    # Load in pretrained weights to resume from
-    if resume_path is not None:
-        _resume_model(model, resume_path, epoch_resume)
-    else:
-        # Without '--resume-path' the '--epoch-resume' argument is ignored
-        epoch_resume = 0
-
-    tracker = ProgressTracker(frequency=10000)
-    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
-    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
-    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
-
-    def _save_model(epoch, is_temp=False):
-        is_temp_str = ".temp" if is_temp else ""
-        with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
-                file_.write(model.get_ref("tok2vec").to_bytes())
-            log = {
-                "nr_word": tracker.nr_word,
-                "loss": tracker.loss,
-                "epoch_loss": tracker.epoch_loss,
-                "epoch": epoch,
-            }
-            with (output_dir / "log.jsonl").open("a") as file_:
-                file_.write(srsly.json_dumps(log) + "\n")
-
-    objective = create_objective(P_cfg["objective"])
-    # TODO: I think we probably want this to look more like the
-    # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P_cfg["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
-                _save_model(epoch, is_temp=True)
-        _save_model(epoch)
-        tracker.epoch_loss = 0.0
-    msg.good("Successfully finished pretrain")
-
-
-def ensure_docs(examples_or_docs):
-    docs = []
-    for eg_or_doc in examples_or_docs:
-        if isinstance(eg_or_doc, Doc):
-            docs.append(eg_or_doc)
-        else:
-            docs.append(eg_or_doc.reference)
-    return docs
-
-
-def _resume_model(model, resume_path, epoch_resume):
-    msg.info(f"Resume training tok2vec from: {resume_path}")
-    with resume_path.open("rb") as file_:
-        weights_data = file_.read()
-        model.get_ref("tok2vec").from_bytes(weights_data)
-    # Parse the epoch number from the given weight file
-    model_name = re.search(r"model\d+\.bin", str(resume_path))
-    if model_name:
-        # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
-        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-    else:
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-
-
-def make_update(model, docs, optimizer, objective_func):
-    """Perform an update over a single batch of documents.
-
-    docs (iterable): A batch of `Doc` objects.
-    optimizer (callable): An optimizer.
-    RETURNS loss: A float for the loss.
-    """
-    predictions, backprop = model.begin_update(docs)
-    loss, gradients = objective_func(model.ops, docs, predictions)
-    backprop(gradients)
-    model.finish_update(optimizer)
-    # Don't want to return a cupy object here
-    # The gradients are modified in-place by the BERT MLM,
-    # so we get an accurate loss
-    return float(loss)
-
-
-def create_objective(config):
-    """Create the objective for pretraining.
-
-    We'd like to replace this with a registry function but it's tricky because
-    we're also making a model choice based on this. For now we hard-code support
-    for two types (characters, vectors). For characters you can specify
-    n_characters, for vectors you can specify the loss.
-
-    Bleh.
-    """
-    objective_type = config["type"]
-    if objective_type == "characters":
-        return partial(get_characters_loss, nr_char=config["n_characters"])
-    elif objective_type == "vectors":
-        if config["loss"] == "cosine":
-            return partial(
-                get_vectors_loss,
-                distance=CosineDistance(normalize=True, ignore_zeros=True),
-            )
-        elif config["loss"] == "L2":
-            return partial(
-                get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
-            )
-        else:
-            raise ValueError("Unexpected loss type", config["loss"])
-    else:
-        raise ValueError("Unexpected objective_type", objective_type)
-
-
-def get_vectors_loss(ops, docs, prediction, distance):
-    """Compute a loss based on a distance between the documents' vectors and
-    the prediction.
-    """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    d_target, loss = distance(prediction, target)
-    return loss, d_target
-
-
-def get_characters_loss(ops, docs, prediction, nr_char):
-    """Compute a loss based on a number of characters predicted from the docs."""
-    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
-    target_ids = target_ids.reshape((-1,))
-    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
-    target = target.reshape((-1, 256 * nr_char))
-    diff = prediction - target
-    loss = (diff ** 2).sum()
-    d_target = diff / float(prediction.shape[0])
-    return loss, d_target
-
-
-def create_pretraining_model(nlp, pretrain_config):
-    """Define a network for the pretraining. We simply add an output layer onto
-    the tok2vec input model. The tok2vec input model needs to be a model that
-    takes a batch of Doc objects (as a list), and returns a list of arrays.
-    Each array in the output needs to have one row per token in the doc.
-    The actual tok2vec layer is stored as a reference, and only this bit will be
-    serialized to file and read back in when calling the 'train' command.
-    """
-    component = nlp.get_pipe(pretrain_config["component"])
-    if pretrain_config.get("layer"):
-        tok2vec = component.model.get_ref(pretrain_config["layer"])
-    else:
-        tok2vec = component.model
-
-    # TODO
-    maxout_pieces = 3
-    hidden_size = 300
-    if pretrain_config["objective"]["type"] == "vectors":
-        model = build_cloze_multi_task_model(
-            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
-        )
-    elif pretrain_config["objective"]["type"] == "characters":
-        model = build_cloze_characters_multi_task_model(
-            nlp.vocab,
-            tok2vec,
-            hidden_size=hidden_size,
-            maxout_pieces=maxout_pieces,
-            nr_char=pretrain_config["objective"]["n_characters"],
-        )
-    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
-    set_dropout_rate(model, pretrain_config["dropout"])
-    return model
-
-
-class ProgressTracker:
-    def __init__(self, frequency=1000000):
-        self.loss = 0.0
-        self.prev_loss = 0.0
-        self.nr_word = 0
-        self.words_per_epoch = Counter()
-        self.frequency = frequency
-        self.last_time = time.time()
-        self.last_update = 0
-        self.epoch_loss = 0.0
-
-    def update(self, epoch, loss, docs):
-        self.loss += loss
-        self.epoch_loss += loss
-        words_in_batch = sum(len(doc) for doc in docs)
-        self.words_per_epoch[epoch] += words_in_batch
-        self.nr_word += words_in_batch
-        words_since_update = self.nr_word - self.last_update
-        if words_since_update >= self.frequency:
-            wps = words_since_update / (time.time() - self.last_time)
-            self.last_update = self.nr_word
-            self.last_time = time.time()
-            loss_per_word = self.loss - self.prev_loss
-            status = (
-                epoch,
-                self.nr_word,
-                _smart_round(self.loss, width=10),
-                _smart_round(loss_per_word, width=6),
-                int(wps),
-            )
-            self.prev_loss = float(self.loss)
-            return status
-        else:
-            return None
-
-
-def _smart_round(figure, width=10, max_decimal=4):
-    """Round large numbers as integers, smaller numbers as decimals."""
-    n_digits = len(str(int(figure)))
-    n_decimal = width - (n_digits + 1)
-    if n_decimal <= 1:
-        return str(int(figure))
-    else:
-        n_decimal = min(n_decimal, max_decimal)
-        format_str = "%." + str(n_decimal) + "f"
-        return format_str % figure
-
-
 def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
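As a worked example of the row formatting used above: _smart_round keeps each numeric column at a fixed width by trading decimal places for integer digits.

```python
def _smart_round(figure, width=10, max_decimal=4):
    """Round large numbers as integers, smaller numbers as decimals."""
    n_digits = len(str(int(figure)))
    n_decimal = width - (n_digits + 1)
    if n_decimal <= 1:
        return str(int(figure))
    n_decimal = min(n_decimal, max_decimal)
    return ("%." + str(n_decimal) + "f") % figure

assert _smart_round(123456789.123) == "123456789"  # 9 digits leave no room
assert _smart_round(12.345) == "12.3450"           # capped at max_decimal
assert _smart_round(12.345, width=6) == "12.345"   # width 6 -> 3 decimals
```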
@@ -1,24 +1,16 @@
-from typing import Optional, Dict, Any, Tuple, Union, Callable, List
-from timeit import default_timer as timer
-import tqdm
+from typing import Optional
 from pathlib import Path
 from wasabi import msg
-import thinc
-import thinc.schedules
-from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
-import random
+from thinc.api import Config
 import typer
 import logging

-from .init_pipeline import init_pipeline
-from .init_pipeline import create_before_to_disk_callback
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
+from ._util import import_code, CliLogger, setup_gpu
 from ..language import Language
+from ..training.loop import train
+from ..training.initialize import init_nlp, must_reinitialize
 from .. import util
-from ..errors import Errors
-from ..util import resolve_dot_names, registry
-from ..schemas import ConfigSchemaTraining


 @app.command(
@@ -52,31 +44,33 @@ def train_cli(
     verify_cli_args(config_path, output_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
-    if use_gpu >= 0:
-        msg.info(f"Using GPU: {use_gpu}")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
-    config = util.load_config(config_path, overrides=overrides, interpolate=False)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
-    nlp = init_nlp(config, output_path)
+    nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
     msg.divider("Training pipeline")
-    train(nlp, output_path, use_gpu=use_gpu)
+    final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger)
+    if final_path:
+        msg.good(f"Saved pipeline to output directory", final_path)


-def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
+def init_pipeline(
+    config: Config, output_path: Optional[Path], *, use_gpu: int = -1
+) -> Language:
+    init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good}
     if output_path is not None:
         init_path = output_path / "model-initial"
         if not init_path.exists():
             msg.info(f"Initializing the pipeline in {init_path}")
-            nlp = init_pipeline(config)
+            nlp = init_nlp(config, **init_kwargs)
             nlp.to_disk(init_path)
             msg.good(f"Saved initialized pipeline to {init_path}")
         else:
             nlp = util.load_model(init_path)
             if must_reinitialize(config, nlp.config):
                 msg.warn("Config has changed: need to re-initialize pipeline")
-                nlp = init_pipeline(config)
+                nlp = init_nlp(config, **init_kwargs)
                 nlp.to_disk(init_path)
                 msg.good(f"Re-initialized pipeline in {init_path}")
             else:
@@ -88,279 +82,7 @@ def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
         "the vocabulary, vectors and label scheme. To take advantage of this, "
         "provide an output directory."
     )
-    return init_pipeline(config)
-
-
-def train(
-    nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1
-) -> None:
-    # Create iterator, which yields out info after each optimization step.
-    config = nlp.config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"]]
-    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    optimizer = T["optimizer"]
-    score_weights = T["score_weights"]
-    batcher = T["batcher"]
-    train_logger = T["logger"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T["frozen_components"]
-    # Create iterator, which yields out info after each optimization step.
-    training_step_iterator = train_while_improving(
-        nlp,
-        optimizer,
-        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
-        create_evaluation_callback(nlp, dev_corpus, score_weights),
-        dropout=T["dropout"],
-        accumulate_gradient=T["accumulate_gradient"],
-        patience=T["patience"],
-        max_steps=T["max_steps"],
-        eval_frequency=T["eval_frequency"],
-        exclude=frozen_components,
-    )
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if frozen_components:
-        msg.info(f"Frozen components: {frozen_components}")
-    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
-    with nlp.select_pipes(disable=frozen_components):
-        print_row, finalize_logger = train_logger(nlp)
-
-    try:
-        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-        progress.set_description(f"Epoch 1")
-        for batch, info, is_best_checkpoint in training_step_iterator:
-            progress.update(1)
-            if is_best_checkpoint is not None:
-                progress.close()
-                print_row(info)
-                if is_best_checkpoint and output_path is not None:
-                    with nlp.select_pipes(disable=frozen_components):
-                        update_meta(T, nlp, info)
-                    with nlp.use_params(optimizer.averages):
-                        nlp = before_to_disk(nlp)
-                        nlp.to_disk(output_path / "model-best")
-                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-                progress.set_description(f"Epoch {info['epoch']}")
-    except Exception as e:
-        finalize_logger()
-        if output_path is not None:
-            # We don't want to swallow the traceback if we don't have a
-            # specific error.
-            msg.warn(
-                f"Aborting and saving the final best model. "
-                f"Encountered exception: {str(e)}"
-            )
-            nlp = before_to_disk(nlp)
-            nlp.to_disk(output_path / "model-final")
-        raise e
-    finally:
-        finalize_logger()
-    if output_path is not None:
-        final_model_path = output_path / "model-final"
-        if optimizer.averages:
-            with nlp.use_params(optimizer.averages):
-                nlp.to_disk(final_model_path)
-        else:
-            nlp.to_disk(final_model_path)
-        msg.good(f"Saved pipeline to output directory {final_model_path}")
-
-
-def must_reinitialize(train_config: Config, init_config: Config) -> bool:
-    # TODO: do this better and more fine-grained
-    return train_config.interpolate().to_str() == init_config.interpolate().to_str()
-
-
-def add_vectors(nlp: Language, vectors: str) -> None:
-    title = f"Config validation error for vectors {vectors}"
-    desc = (
-        "This typically means that there's a problem in the config.cfg included "
-        "with the packaged vectors. Make sure that the vectors package you're "
-        "loading is compatible with the current version of spaCy."
-    )
-    with show_validation_error(
-        title=title, desc=desc, hint_fill=False, show_config=False
-    ):
-        util.load_vectors_into_model(nlp, vectors)
-
-
-def create_train_batches(iterator, batcher, max_epochs: int):
-    epoch = 0
-    examples = list(iterator)
-    if not examples:
-        # Raise error if no data
-        raise ValueError(Errors.E986)
-    while max_epochs < 1 or epoch != max_epochs:
-        random.shuffle(examples)
-        for batch in batcher(examples):
-            yield epoch, batch
-        epoch += 1
-
-
-def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
-) -> Callable[[], Tuple[float, Dict[str, float]]]:
-    weights = {key: value for key, value in weights.items() if value is not None}
-
-    def evaluate() -> Tuple[float, Dict[str, float]]:
-        dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
-        # Calculate a weighted sum based on score_weights for the main score.
-        # We can only consider scores that are ints/floats, not dicts like
-        # entity scores per type etc.
-        for key, value in scores.items():
-            if key in weights and not isinstance(value, (int, float)):
-                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
-        try:
-            weighted_score = sum(
-                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
-            )
-        except KeyError as e:
-            keys = list(scores.keys())
-            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
-            raise KeyError(err) from None
-        return weighted_score, scores
-
-    return evaluate
-
-
-def train_while_improving(
-    nlp: Language,
-    optimizer: Optimizer,
-    train_data,
-    evaluate,
-    *,
-    dropout: float,
-    eval_frequency: int,
-    accumulate_gradient: int,
-    patience: int,
-    max_steps: int,
-    exclude: List[str],
-):
-    """Train until an evaluation stops improving. Works as a generator,
-    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
-    where info is a dict, and is_best_checkpoint is in [True, False, None] --
-    None indicating that the iteration was not evaluated as a checkpoint.
-    The evaluation is conducted by calling the evaluate callback.
-
-    Positional arguments:
-    nlp: The spaCy pipeline to evaluate.
-    optimizer: The optimizer callable.
-    train_data (Iterable[Batch]): A generator of batches, with the training
-        data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
-        data iterable needs to take care of iterating over the epochs and
-        shuffling.
-    evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
-        The callback should take no arguments and return a tuple
-        `(main_score, other_scores)`. The main_score should be a float where
-        higher is better. other_scores can be any object.
-
-    Every iteration, the function yields out a tuple with:
-
-    * batch: A list of Example objects.
-    * info: A dict with various information about the last update (see below).
-    * is_best_checkpoint: A value in None, False, True, indicating whether this
-        was the best evaluation so far. You should use this to save the model
-        checkpoints during training. If None, evaluation was not conducted on
-        that iteration. False means evaluation was conducted, but a previous
-        evaluation was better.
-
-    The info dict provides the following information:
-
-    epoch (int): How many passes over the data have been completed.
-    step (int): How many steps have been completed.
-    score (float): The main score from the last evaluation.
-    other_scores: The other scores from the last evaluation.
-    losses: The accumulated losses throughout training.
-    checkpoints: A list of previous results, where each result is a
-        (score, step, epoch) tuple.
-    """
-    if isinstance(dropout, float):
-        dropouts = thinc.schedules.constant(dropout)
-    else:
-        dropouts = dropout
-    results = []
-    losses = {}
-    words_seen = 0
-    start_time = timer()
-    for step, (epoch, batch) in enumerate(train_data):
-        dropout = next(dropouts)
-        for subbatch in subdivide_batch(batch, accumulate_gradient):
-            nlp.update(
-                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
-            )
-        # TODO: refactor this so we don't have to run it separately in here
-        for name, proc in nlp.pipeline:
-            if (
-                name not in exclude
-                and hasattr(proc, "model")
-                and proc.model not in (True, False, None)
-            ):
-                proc.model.finish_update(optimizer)
-        optimizer.step_schedules()
-        if not (step % eval_frequency):
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    score, other_scores = evaluate()
-            else:
-                score, other_scores = evaluate()
-            results.append((score, step))
-            is_best_checkpoint = score == max(results)[0]
-        else:
-            score, other_scores = (None, None)
-            is_best_checkpoint = None
-        words_seen += sum(len(eg) for eg in batch)
-        info = {
-            "epoch": epoch,
-            "step": step,
-            "score": score,
-            "other_scores": other_scores,
-            "losses": losses,
-            "checkpoints": results,
-            "seconds": int(timer() - start_time),
-            "words": words_seen,
-        }
-        yield batch, info, is_best_checkpoint
-        if is_best_checkpoint is not None:
-            losses = {}
-        # Stop if no improvement in `patience` updates (if specified)
-        best_score, best_step = max(results)
-        if patience and (step - best_step) >= patience:
-            break
-        # Stop if we've exhausted our max steps (if specified)
-        if max_steps and step >= max_steps:
-            break
-
-
-def subdivide_batch(batch, accumulate_gradient):
-    batch = list(batch)
-    batch.sort(key=lambda eg: len(eg.predicted))
-    sub_len = len(batch) // accumulate_gradient
-    start = 0
-    for i in range(accumulate_gradient):
-        subbatch = batch[start : start + sub_len]
-        if subbatch:
-            yield subbatch
-        start += len(subbatch)
-    subbatch = batch[start:]
-    if subbatch:
-        yield subbatch
-
-
-def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
-) -> None:
-    nlp.meta["performance"] = {}
-    for metric in training["score_weights"]:
-        if metric is not None:
-            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
-    for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
+    return init_nlp(config, **init_kwargs)


 def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
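The docstring above fixes the generator's contract; a toy stand-in showing how a caller is expected to drive it (the scores are fabricated):

```python
def fake_training_steps():
    best = 0.0
    for step in range(6):
        if step % 2 == 0:  # pretend every second step is an evaluation
            score = 0.5 + step / 100
            is_best = score > best
            best = max(best, score)
        else:
            score, is_best = None, None  # not evaluated this step
        yield ["<batch>"], {"step": step, "score": score}, is_best

for batch, info, is_best_checkpoint in fake_training_steps():
    if is_best_checkpoint:  # save checkpoints only on new best evaluations
        print("new best at step", info["step"], "->", info["score"])
```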
@@ -371,17 +93,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
         if not output_path.exists():
             output_path.mkdir()
             msg.good(f"Created output directory: {output_path}")
-
-
-# TODO: this is currently imported by the ray extension and not used otherwise
-def load_from_paths(
-    config: Config,
-) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
-    weights_data = None
-    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
-    if init_tok2vec is not None:
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    return None, {}, {}, weights_data
@@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer
 from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
+from spacy.training import Example
+from spacy.training.initialize import verify_textcat_config

 from ..util import make_tempdir
-from ...cli.train import verify_textcat_config
-from ...training import Example


 TRAIN_DATA = [
@@ -7,7 +7,6 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
-from spacy.cli.debug_config import check_section_refs
 from thinc.api import ConfigValidationError, Config
 import srsly
 import os
@@ -414,15 +413,3 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
     assert string_to_list(value, intify=False) == ["1", "2", "3"]
     assert string_to_list(value, intify=True) == [1, 2, 3]
-
-
-def test_check_section_refs():
-    config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
-    config = Config(config)
-    # Valid section reference
-    check_section_refs(config, ["a.b.c"])
-    # Section that doesn't exist in this config
-    check_section_refs(config, ["x.y.z"])
-    # Invalid section reference
-    with pytest.raises(ConfigValidationError):
-        check_section_refs(config, ["a.b.c", "f.g"])
@@ -7,7 +7,6 @@ from spacy import util
 from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from thinc.api import Optimizer


 @pytest.fixture
@@ -158,16 +157,3 @@ def test_dot_to_dict(dot_notation, expected):
     result = util.dot_to_dict(dot_notation)
     assert result == expected
     assert util.dict_to_dot(result) == dot_notation
-
-
-def test_resolve_training_config():
-    config = {
-        "nlp": {"lang": "en", "disabled": []},
-        "training": {"dropout": 0.1, "optimizer": {"@optimizers": "Adam.v1"}},
-        "corpora": {},
-    }
-    resolved = util.resolve_training_config(config)
-    assert resolved["training"]["dropout"] == 0.1
-    assert isinstance(resolved["training"]["optimizer"], Optimizer)
-    assert resolved["corpora"] == {}
-    assert "nlp" not in resolved
@@ -1,14 +1,15 @@
 import pytest

-from .util import get_random_doc
-
 from spacy import util
 from spacy.util import dot_to_object, SimpleFrozenList
-from thinc.api import Config, Optimizer
+from thinc.api import Config, Optimizer, ConfigValidationError
 from spacy.training.batchers import minibatch_by_words
-from ..lang.en import English
-from ..lang.nl import Dutch
-from ..language import DEFAULT_CONFIG_PATH
+from spacy.lang.en import English
+from spacy.lang.nl import Dutch
+from spacy.language import DEFAULT_CONFIG_PATH
+from spacy.schemas import ConfigSchemaTraining
+
+from .util import get_random_doc


 @pytest.mark.parametrize(
@@ -101,8 +102,8 @@ def test_util_dot_section():
         dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_nlp.config, "nlp.unknownattribute")
-    resolved = util.resolve_training_config(nl_nlp.config)
-    assert isinstance(dot_to_object(resolved, "training.optimizer"), Optimizer)
+    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
+    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)


 def test_simple_frozen_list():
@@ -120,3 +121,17 @@ def test_simple_frozen_list():
     t = SimpleFrozenList(["foo", "bar"], error="Error!")
     with pytest.raises(NotImplementedError):
         t.append("baz")
+
+
+def test_resolve_dot_names():
+    config = {
+        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
+        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
+    }
+    result = util.resolve_dot_names(config, ["foo.bar"])
+    assert isinstance(result[0], Optimizer)
+    with pytest.raises(ConfigValidationError) as e:
+        util.resolve_dot_names(config, ["foo.baz", "foo.bar"])
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ["training", "xyz"]
@@ -2,8 +2,8 @@ from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config
 from spacy import Language
-from spacy.util import load_model_from_config, registry, dot_to_object
-from spacy.util import resolve_training_config
+from spacy.util import load_model_from_config, registry, resolve_dot_names
+from spacy.schemas import ConfigSchemaTraining
 from spacy.training import Example

@@ -39,21 +39,21 @@ def test_readers():

     config = Config().from_str(config_string)
     nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
+    dot_names = ["training.train_corpus", "training.dev_corpus"]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
     assert isinstance(train_corpus, Callable)
-    optimizer = resolved["training"]["optimizer"]
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
         nlp.update([example], sgd=optimizer)
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
     scores = nlp.evaluate(list(dev_corpus(nlp)))
     assert scores["cats_score"]
     # ensure the pipeline runs
     doc = nlp("Quick test")
     assert doc.cats
-    extra_corpus = resolved["corpora"]["extra"]
+    extra_corpus = registry.resolve(nlp.config["corpora"])["extra"]
     assert isinstance(extra_corpus, Callable)

@@ -89,9 +89,10 @@ def test_cat_readers(reader, additional_config):
     config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
-    optimizer = resolved["training"]["optimizer"]
+    dot_names = ["training.train_corpus", "training.dev_corpus"]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
@@ -100,7 +101,6 @@ def test_cat_readers(reader, additional_config):
         assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
         nlp.update([example], sgd=optimizer)
     # simulate performance benchmark on dev corpus
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
     dev_examples = list(dev_corpus(nlp))
     for example in dev_examples:
         # this shouldn't fail if each dev example has at least one positive label
@ -0,0 +1,205 @@
from typing import Union, Dict, Optional, Any, List, Callable
from thinc.api import Config, fix_random_seed, set_gpu_allocator
from thinc.api import ConfigValidationError
from pathlib import Path
import srsly

from .loop import create_before_to_disk_callback
from ..language import Language
from ..lookups import Lookups
from ..errors import Errors
from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
from ..util import registry, load_model_from_config, resolve_dot_names
from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB


def init_nlp(
    config: Config,
    *,
    use_gpu: int = -1,
    logger: Callable[[Any], Any] = logger,
    on_success: Callable[[str], None] = lambda x: None,
) -> Language:
    raw_config = config
    config = raw_config.interpolate()
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    # Use original config here before it's resolved to functions
    sourced_components = get_sourced_components(config)
    nlp = load_model_from_config(raw_config, auto_fill=True)
    on_success("Set up nlp object from config")
    config = nlp.config.interpolate()
    # Resolve all training-relevant sections using the filled nlp config
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
    V = I["vocab"]
    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
    optimizer = T["optimizer"]
    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Sourced components that require resume_training
    resume_components = [p for p in sourced_components if p not in frozen_components]
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
            logger.info(f"Resuming training for: {resume_components}")
            nlp.resume_training(sgd=optimizer)
    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
    on_success(f"Initialized pipeline components")
    # Verify the config after calling 'begin_training' to ensure labels
    # are properly initialized
    verify_config(nlp)
    if "pretraining" in config and config["pretraining"]:
        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
        loaded = add_tok2vec_weights(nlp, P, I)
        if loaded and P["component"]:
            on_success(f"Loaded pretrained weights into component '{P['component']}'")
    nlp = before_to_disk(nlp)
    return nlp

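For orientation, this is roughly how init_nlp() is meant to be driven, e.g. from the refactored init_pipeline CLI command. A minimal sketch only: the config path, the assumed module location and the use of print as the on_success callback are illustrative, not part of this diff.

from spacy import util
from spacy.training.initialize import init_nlp  # assumed final module path

config = util.load_config("config.cfg", overrides={"training.seed": 0})
nlp = init_nlp(config, use_gpu=-1, on_success=print)
nlp.to_disk("output/initialized")
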
def must_reinitialize(train_config: Config, init_config: Config) -> bool:
    # TODO: do this better and more fine-grained
    return train_config.interpolate().to_str() == init_config.interpolate().to_str()

def init_vocab(
    nlp: Language,
    *,
    data: Optional[Path] = None,
    lookups: Optional[Lookups] = None,
    vectors: Optional[str] = None,
    on_success: Callable[[str], None] = lambda x: None,
) -> Language:
    if lookups:
        nlp.vocab.lookups = lookups
        on_success(f"Added vocab lookups: {', '.join(lookups.tables)}")
    data_path = ensure_path(data)
    if data_path is not None:
        lex_attrs = srsly.read_jsonl(data_path)
        for lexeme in nlp.vocab:
            lexeme.rank = OOV_RANK
        for attrs in lex_attrs:
            if "settings" in attrs:
                continue
            lexeme = nlp.vocab[attrs["orth"]]
            lexeme.set_attrs(**attrs)
        if len(nlp.vocab):
            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
        else:
            oov_prob = DEFAULT_OOV_PROB
        nlp.vocab.cfg.update({"oov_prob": oov_prob})
        on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab")
    on_success("Created vocabulary")
    if vectors is not None:
        load_vectors_into_model(nlp, vectors)
        on_success(f"Added vectors: {vectors}")

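The data argument is read with srsly.read_jsonl: one JSON object per line, where rows with a "settings" key are skipped, "orth" selects the lexeme, and the remaining keys are passed to lexeme.set_attrs(). A sketch of such a file and call (attribute values invented):

# vocab_data.jsonl -- one object per line, e.g.:
# {"orth": "the", "prob": -3.53}
# {"orth": "apple", "prob": -10.58}
init_vocab(nlp, data=Path("vocab_data.jsonl"), on_success=print)
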
def load_vectors_into_model(
    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
) -> None:
    """Load word vectors from an installed model or path into a model instance."""
    try:
        vectors_nlp = load_model(name)
    except ConfigValidationError as e:
        title = f"Config validation error for vectors {name}"
        desc = (
            "This typically means that there's a problem in the config.cfg included "
            "with the packaged vectors. Make sure that the vectors package you're "
            "loading is compatible with the current version of spaCy."
        )
        err = ConfigValidationError.from_error(e, title=title, desc=desc)
        raise err from None
    nlp.vocab.vectors = vectors_nlp.vocab.vectors
    if add_strings:
        # I guess we should add the strings from the vectors_nlp model?
        # E.g. if someone does a similarity query, they might expect the strings.
        for key in nlp.vocab.vectors.key2row:
            if key in vectors_nlp.vocab.strings:
                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])

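Usage is a one-liner; the donor can be any installed package or saved pipeline directory that ships vectors. The package name below is only an example:

import spacy

nlp = spacy.blank("en")
# copies the vectors table and, with add_strings=True, the matching strings
load_vectors_into_model(nlp, "en_core_web_md")
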
def add_tok2vec_weights(
    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
) -> bool:
    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
    P = pretrain_config
    V = vocab_config
    weights_data = None
    init_tok2vec = ensure_path(V["init_tok2vec"])
    if init_tok2vec is not None:
        if P["objective"].get("type") == "vectors" and not V["vectors"]:
            err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
            errors = [{"loc": ["initialize", "vectors"], "msg": err}]
            raise ConfigValidationError(config=nlp.config, errors=errors)
        if not init_tok2vec.exists():
            err = f"can't find pretrained tok2vec: {init_tok2vec}"
            errors = [{"loc": ["initialize", "vectors", "init_tok2vec"], "msg": err}]
            raise ConfigValidationError(config=nlp.config, errors=errors)
        with init_tok2vec.open("rb") as file_:
            weights_data = file_.read()
    if weights_data is not None:
        tok2vec_component = P["component"]
        if tok2vec_component is None:
            desc = (
                f"To use pretrained tok2vec weights, [pretraining.component] "
                f"needs to specify the component that should load them."
            )
            err = "component can't be null"
            errors = [{"loc": ["pretraining", "component"], "msg": err}]
            raise ConfigValidationError(
                config=nlp.config["pretraining"], errors=errors, desc=desc
            )
        layer = nlp.get_pipe(tok2vec_component).model
        if P["layer"]:
            layer = layer.get_ref(P["layer"])
        layer.from_bytes(weights_data)
        return True
    return False

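The inputs mirror the resolved [pretraining] and [initialize] config sections. A sketch with plain dicts standing in for the resolved blocks; the keys match the lookups above, the paths and names are invented:

P = {
    "component": "tok2vec",
    "layer": "",
    "objective": {"type": "characters", "n_characters": 4},
}
I = {"vectors": "en_core_web_md", "init_tok2vec": "pretrain-output/model9.bin"}
loaded = add_tok2vec_weights(nlp, P, I)  # True if weights were read and applied
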
def verify_config(nlp: Language) -> None:
    """Perform additional checks based on the config, loaded nlp object and training data."""
    # TODO: maybe we should validate based on the actual components, the list
    # in config["nlp"]["pipeline"] instead?
    for pipe_config in nlp.config["components"].values():
        # We can't assume that the component name == the factory
        factory = pipe_config["factory"]
        if factory == "textcat":
            verify_textcat_config(nlp, pipe_config)


def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
    # if 'positive_label' is provided: double check whether it's in the data and
    # the task is binary
    if pipe_config.get("positive_label"):
        textcat_labels = nlp.get_pipe("textcat").labels
        pos_label = pipe_config.get("positive_label")
        if pos_label not in textcat_labels:
            raise ValueError(
                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
            )
        if len(list(textcat_labels)) != 2:
            raise ValueError(
                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
            )

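In plain terms: if positive_label is set, it must be one of the component's labels and the task must be binary. A minimal mirror of the rule (labels invented):

pipe_config = {"factory": "textcat", "positive_label": "POS"}
textcat_labels = ("POS", "NEG")
pos_label = pipe_config.get("positive_label")
assert pos_label in textcat_labels  # otherwise E920
assert len(textcat_labels) == 2     # otherwise E919
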
def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
    """RETURNS (List[str]): All sourced components in the original config,
    e.g. {"source": "en_core_web_sm"}. If the config contains a key
    "factory", we assume it refers to a component factory.
    """
    return [
        name
        for name, cfg in config.get("components", {}).items()
        if "factory" not in cfg and "source" in cfg
    ]

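For example, given a config where one component is built from a factory and one is copied from an installed pipeline (package name illustrative):

config = {
    "components": {
        "tagger": {"factory": "tagger"},
        "ner": {"source": "en_core_web_sm"},
    }
}
assert get_sourced_components(config) == ["ner"]
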
301  spacy/training/loop.py  Normal file

@ -0,0 +1,301 @@
from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
from typing import Optional
from pathlib import Path
from timeit import default_timer as timer
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
import random
import tqdm

from .example import Example
from ..schemas import ConfigSchemaTraining
from ..language import Language
from ..errors import Errors
from ..util import resolve_dot_names, registry, logger


def train(
    nlp: Language,
    output_path: Optional[Path] = None,
    *,
    use_gpu: int = -1,
    logger: Callable[[Any], Any] = logger,
) -> Optional[Path]:
    """Train a pipeline.

    nlp (Language): The initialized nlp object with the full config.
    output_path (Path): Optional output path to save trained model to.
    use_gpu (int): GPU ID, or -1 for CPU. Make sure to call require_gpu
        before calling this function.
    logger (Callable[[Any], Any]): Optional logger exposing the methods info,
        error, debug and warn. Defaults to the regular spaCy logger but can be
        swapped for the CLI logger.
    RETURNS (Path / None): The path to the final exported model.
    """
    config = nlp.config.interpolate()
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    dot_names = [T["train_corpus"], T["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = T["optimizer"]
    score_weights = T["score_weights"]
    batcher = T["batcher"]
    train_logger = T["logger"]
    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
    # Create iterator, which yields out info after each optimization step.
    training_step_iterator = train_while_improving(
        nlp,
        optimizer,
        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
        create_evaluation_callback(nlp, dev_corpus, score_weights),
        dropout=T["dropout"],
        accumulate_gradient=T["accumulate_gradient"],
        patience=T["patience"],
        max_steps=T["max_steps"],
        eval_frequency=T["eval_frequency"],
        exclude=frozen_components,
    )
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if frozen_components:
        logger.info(f"Frozen components: {frozen_components}")
    logger.info(f"Initial learn rate: {optimizer.learn_rate}")
    with nlp.select_pipes(disable=frozen_components):
        print_row, finalize_logger = train_logger(nlp)
    try:
        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
        progress.set_description(f"Epoch 1")
        for batch, info, is_best_checkpoint in training_step_iterator:
            progress.update(1)
            if is_best_checkpoint is not None:
                progress.close()
                print_row(info)
                if is_best_checkpoint and output_path is not None:
                    with nlp.select_pipes(disable=frozen_components):
                        update_meta(T, nlp, info)
                    with nlp.use_params(optimizer.averages):
                        nlp = before_to_disk(nlp)
                        nlp.to_disk(output_path / "model-best")
                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
                progress.set_description(f"Epoch {info['epoch']}")
    except Exception as e:
        finalize_logger()
        if output_path is not None:
            # We don't want to swallow the traceback if we don't have a
            # specific error.
            logger.warn(
                f"Aborting and saving the final best model. "
                f"Encountered exception: {str(e)}"
            )
            nlp = before_to_disk(nlp)
            nlp.to_disk(output_path / "model-final")
        raise e
    finally:
        finalize_logger()
    if output_path is not None:
        final_model_path = output_path / "model-final"
        if optimizer.averages:
            with nlp.use_params(optimizer.averages):
                nlp.to_disk(final_model_path)
        else:
            nlp.to_disk(final_model_path)
        return final_model_path

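Together with init_nlp() above, the intended flow from the CLI is roughly the following sketch; the module paths and file names are assumptions about where this refactor puts things:

from pathlib import Path
from spacy import util
from spacy.training.initialize import init_nlp  # assumed module path
from spacy.training.loop import train           # assumed module path

config = util.load_config("config.cfg")
nlp = init_nlp(config, use_gpu=-1)
final_path = train(nlp, output_path=Path("output"), use_gpu=-1)
# leaves output/model-best and output/model-final on disk
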
def train_while_improving(
    nlp: Language,
    optimizer: Optimizer,
    train_data,
    evaluate,
    *,
    dropout: float,
    eval_frequency: int,
    accumulate_gradient: int,
    patience: int,
    max_steps: int,
    exclude: List[str],
):
    """Train until an evaluation stops improving. Works as a generator,
    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
    where info is a dict, and is_best_checkpoint is in [True, False, None] --
    None indicating that the iteration was not evaluated as a checkpoint.
    The evaluation is conducted by calling the evaluate callback.

    Positional arguments:
    nlp: The spaCy pipeline to train.
    optimizer: The optimizer callable.
    train_data (Iterable[Batch]): A generator of batches, with the training
        data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
        data iterable needs to take care of iterating over the epochs and
        shuffling.
    evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
        The callback should take no arguments and return a tuple
        `(main_score, other_scores)`. The main_score should be a float where
        higher is better. other_scores can be any object.

    Every iteration, the function yields out a tuple with:

    * batch: A list of Example objects.
    * info: A dict with various information about the last update (see below).
    * is_best_checkpoint: A value in None, False, True, indicating whether this
        was the best evaluation so far. You should use this to save the model
        checkpoints during training. If None, evaluation was not conducted on
        that iteration. False means evaluation was conducted, but a previous
        evaluation was better.

    The info dict provides the following information:

    epoch (int): How many passes over the data have been completed.
    step (int): How many steps have been completed.
    score (float): The main score from the last evaluation.
    other_scores: The other scores from the last evaluation.
    losses: The accumulated losses throughout training.
    checkpoints: A list of previous results, where each result is a
        (score, step, epoch) tuple.
    """
    if isinstance(dropout, float):
        dropouts = constant(dropout)
    else:
        dropouts = dropout
    results = []
    losses = {}
    words_seen = 0
    start_time = timer()
    for step, (epoch, batch) in enumerate(train_data):
        dropout = next(dropouts)
        for subbatch in subdivide_batch(batch, accumulate_gradient):
            nlp.update(
                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
            )
        # TODO: refactor this so we don't have to run it separately in here
        for name, proc in nlp.pipeline:
            if (
                name not in exclude
                and hasattr(proc, "model")
                and proc.model not in (True, False, None)
            ):
                proc.model.finish_update(optimizer)
        optimizer.step_schedules()
        if not (step % eval_frequency):
            if optimizer.averages:
                with nlp.use_params(optimizer.averages):
                    score, other_scores = evaluate()
            else:
                score, other_scores = evaluate()
            results.append((score, step))
            is_best_checkpoint = score == max(results)[0]
        else:
            score, other_scores = (None, None)
            is_best_checkpoint = None
        words_seen += sum(len(eg) for eg in batch)
        info = {
            "epoch": epoch,
            "step": step,
            "score": score,
            "other_scores": other_scores,
            "losses": losses,
            "checkpoints": results,
            "seconds": int(timer() - start_time),
            "words": words_seen,
        }
        yield batch, info, is_best_checkpoint
        if is_best_checkpoint is not None:
            losses = {}
        # Stop if no improvement in `patience` updates (if specified)
        best_score, best_step = max(results)
        if patience and (step - best_step) >= patience:
            break
        # Stop if we've exhausted our max steps (if specified)
        if max_steps and step >= max_steps:
            break

def subdivide_batch(batch, accumulate_gradient):
    batch = list(batch)
    batch.sort(key=lambda eg: len(eg.predicted))
    sub_len = len(batch) // accumulate_gradient
    start = 0
    for i in range(accumulate_gradient):
        subbatch = batch[start : start + sub_len]
        if subbatch:
            yield subbatch
        start += len(subbatch)
    subbatch = batch[start:]
    if subbatch:
        yield subbatch

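For example, ten examples with accumulate_gradient=3 yield sub-batches of sizes 3, 3, 3 and 1 (the remainder), after sorting by the length of each predicted doc. A self-contained check with a stand-in Example type:

class FakeEg:  # stand-in: only needs a sized .predicted attribute
    def __init__(self, n):
        self.predicted = "x" * n

batch = [FakeEg(n) for n in (5, 1, 3, 2, 4, 6, 8, 7, 9, 10)]
sizes = [len(sub) for sub in subdivide_batch(batch, 3)]
assert sizes == [3, 3, 3, 1]
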
def create_evaluation_callback(
    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
) -> Callable[[], Tuple[float, Dict[str, float]]]:
    weights = {key: value for key, value in weights.items() if value is not None}

    def evaluate() -> Tuple[float, Dict[str, float]]:
        dev_examples = list(dev_corpus(nlp))
        scores = nlp.evaluate(dev_examples)
        # Calculate a weighted sum based on score_weights for the main score.
        # We can only consider scores that are ints/floats, not dicts like
        # entity scores per type etc.
        for key, value in scores.items():
            if key in weights and not isinstance(value, (int, float)):
                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
        try:
            weighted_score = sum(
                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
            )
        except KeyError as e:
            keys = list(scores.keys())
            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
            raise KeyError(err) from None
        return weighted_score, scores

    return evaluate

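The weighted main score is a plain dot product between score_weights and the numeric scores, with missing entries counted as 0.0. A worked example with invented numbers:

weights = {"dep_uas": 0.5, "dep_las": 0.5}
scores = {"dep_uas": 0.75, "dep_las": 0.25, "ents_f": 0.7}  # ents_f has no weight
weighted = sum(scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights)
assert weighted == 0.5
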
def create_train_batches(
    iterator: Iterator[Example],
    batcher: Callable[[Iterable[Example]], Iterable[Example]],
    max_epochs: int,
):
    epoch = 0
    examples = list(iterator)
    if not examples:
        # Raise error if no data
        raise ValueError(Errors.E986)
    while max_epochs < 1 or epoch != max_epochs:
        random.shuffle(examples)
        for batch in batcher(examples):
            yield epoch, batch
        epoch += 1

def update_meta(
    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
) -> None:
    nlp.meta["performance"] = {}
    for metric in training["score_weights"]:
        if metric is not None:
            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
    for pipe_name in nlp.pipe_names:
        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]

def create_before_to_disk_callback(
    callback: Optional[Callable[[Language], Language]]
) -> Callable[[Language], Language]:
    def before_to_disk(nlp: Language) -> Language:
        if not callback:
            return nlp
        modified_nlp = callback(nlp)
        if not isinstance(modified_nlp, Language):
            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
            raise ValueError(err)
        return modified_nlp

    return before_to_disk

267  spacy/training/pretrain.py  Normal file

@ -0,0 +1,267 @@
from typing import Optional, Callable, Any, Iterable, Union, List
from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
from pathlib import Path
from functools import partial
from collections import Counter
import srsly
import numpy
import time
import re
from wasabi import msg

from .example import Example
from ..tokens import Doc
from ..attrs import ID
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
from ..util import registry, load_model_from_config, dot_to_object, logger


def pretrain(
    config: Config,
    output_dir: Path,
    resume_path: Optional[Path] = None,
    epoch_resume: Optional[int] = None,
    use_gpu: int = -1,
    logger: Callable[[Any], Any] = logger,
):
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    nlp = load_model_from_config(config)
    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
    P = registry.resolve(nlp.config["pretraining"], schema=ConfigSchemaPretrain)
    corpus = dot_to_object(T, P["corpus"])
    batcher = P["batcher"]
    model = create_pretraining_model(nlp, P)
    optimizer = P["optimizer"]
    # Load in pretrained weights to resume from
    if resume_path is not None:
        _resume_model(model, resume_path, epoch_resume)
    else:
        # Without '--resume-path' the '--epoch-resume' argument is ignored
        epoch_resume = 0

    # TODO: move this to logger function?
    tracker = ProgressTracker(frequency=10000)
    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
                file_.write(model.get_ref("tok2vec").to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    objective = create_objective(P["objective"])
    # TODO: I think we probably want this to look more like the
    # 'create_train_batches' function?
    for epoch in range(epoch_resume, P["max_epochs"]):
        for batch_id, batch in enumerate(batcher(corpus(nlp))):
            docs = ensure_docs(batch)
            loss = make_update(model, docs, optimizer, objective)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0

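A sketch of driving pretrain(), e.g. from the 'spacy pretrain' CLI; the paths are illustrative and the config needs a filled [pretraining] block:

from pathlib import Path
from spacy import util

config = util.load_config("config.cfg", interpolate=True)
pretrain(config, output_dir=Path("pretrain-output"))
# resuming: _resume_model() below parses the epoch from default names like 'model4.bin'
pretrain(config, output_dir=Path("pretrain-output"),
         resume_path=Path("pretrain-output/model4.bin"))
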
def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
    docs = []
    for eg_or_doc in examples_or_docs:
        if isinstance(eg_or_doc, Doc):
            docs.append(eg_or_doc)
        else:
            docs.append(eg_or_doc.reference)
    return docs

def _resume_model(
    model: Model,
    resume_path: Path,
    epoch_resume: int,
    logger: Callable[[Any], Any] = logger,
) -> None:
    logger.info(f"Resume training tok2vec from: {resume_path}")
    with resume_path.open("rb") as file_:
        weights_data = file_.read()
        model.get_ref("tok2vec").from_bytes(weights_data)
    # Parse the epoch number from the given weight file
    model_name = re.search(r"model\d+\.bin", str(resume_path))
    if model_name:
        # Default weight file name, so read the epoch to start from by
        # cutting off 'model' and '.bin'
        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
        logger.info(f"Resuming from epoch: {epoch_resume}")
    else:
        logger.info(f"Resuming from epoch: {epoch_resume}")

def make_update(
    model: Model, docs: Iterable[Doc], optimizer: Optimizer, objective_func: Callable
) -> float:
    """Perform an update over a single batch of documents.

    model (Model): The pretraining model.
    docs (iterable): A batch of `Doc` objects.
    optimizer (callable): An optimizer.
    objective_func (callable): Callable computing the loss and the gradients.
    RETURNS loss: A float for the loss.
    """
    predictions, backprop = model.begin_update(docs)
    loss, gradients = objective_func(model.ops, docs, predictions)
    backprop(gradients)
    model.finish_update(optimizer)
    # Don't want to return a cupy object here
    # The gradients are modified in-place by the BERT MLM,
    # so we get an accurate loss
    return float(loss)

def create_objective(config: Config):
    """Create the objective for pretraining.

    We'd like to replace this with a registry function but it's tricky because
    we're also making a model choice based on this. For now we hard-code support
    for two types (characters, vectors). For characters you can specify
    n_characters, for vectors you can specify the loss.

    Bleh.
    """
    objective_type = config["type"]
    if objective_type == "characters":
        return partial(get_characters_loss, nr_char=config["n_characters"])
    elif objective_type == "vectors":
        if config["loss"] == "cosine":
            distance = CosineDistance(normalize=True, ignore_zeros=True)
            return partial(get_vectors_loss, distance=distance)
        elif config["loss"] == "L2":
            distance = L2Distance(normalize=True, ignore_zeros=True)
            return partial(get_vectors_loss, distance=distance)
        else:
            raise ValueError("Unexpected loss type", config["loss"])
    else:
        raise ValueError("Unexpected objective_type", objective_type)

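Concretely, the two objective shapes this accepts, with plain dicts standing in for the resolved [pretraining.objective] section:

chars = create_objective({"type": "characters", "n_characters": 4})
# -> partial(get_characters_loss, nr_char=4)
vecs = create_objective({"type": "vectors", "loss": "cosine"})  # or "L2"
# -> partial(get_vectors_loss, distance=CosineDistance(...))
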
def get_vectors_loss(ops, docs, prediction, distance):
    """Compute a loss based on a distance between the documents' vectors and
    the prediction.
    """
    # The simplest way to implement this would be to vstack the
    # token.vector values, but that's a bit inefficient, especially on GPU.
    # Instead we fetch the index into the vectors table for each of our tokens,
    # and look them up all at once. This prevents data copying.
    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
    target = docs[0].vocab.vectors.data[ids]
    d_target, loss = distance(prediction, target)
    return loss, d_target

def get_characters_loss(ops, docs, prediction, nr_char):
    """Compute a loss based on a number of characters predicted from the docs."""
    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
    target_ids = target_ids.reshape((-1,))
    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
    target = target.reshape((-1, 256 * nr_char))
    diff = prediction - target
    loss = (diff ** 2).sum()
    d_target = diff / float(prediction.shape[0])
    return loss, d_target

def create_pretraining_model(nlp, pretrain_config):
    """Define a network for the pretraining. We simply add an output layer onto
    the tok2vec input model. The tok2vec input model needs to be a model that
    takes a batch of Doc objects (as a list), and returns a list of arrays.
    Each array in the output needs to have one row per token in the doc.
    The actual tok2vec layer is stored as a reference, and only this bit will be
    serialized to file and read back in when calling the 'train' command.
    """
    component = nlp.get_pipe(pretrain_config["component"])
    if pretrain_config.get("layer"):
        tok2vec = component.model.get_ref(pretrain_config["layer"])
    else:
        tok2vec = component.model

    # TODO
    maxout_pieces = 3
    hidden_size = 300
    if pretrain_config["objective"]["type"] == "vectors":
        model = build_cloze_multi_task_model(
            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
        )
    elif pretrain_config["objective"]["type"] == "characters":
        model = build_cloze_characters_multi_task_model(
            nlp.vocab,
            tok2vec,
            hidden_size=hidden_size,
            maxout_pieces=maxout_pieces,
            nr_char=pretrain_config["objective"]["n_characters"],
        )
    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
    set_dropout_rate(model, pretrain_config["dropout"])
    return model

class ProgressTracker:
    def __init__(self, frequency=1000000):
        self.loss = 0.0
        self.prev_loss = 0.0
        self.nr_word = 0
        self.words_per_epoch = Counter()
        self.frequency = frequency
        self.last_time = time.time()
        self.last_update = 0
        self.epoch_loss = 0.0

    def update(self, epoch, loss, docs):
        self.loss += loss
        self.epoch_loss += loss
        words_in_batch = sum(len(doc) for doc in docs)
        self.words_per_epoch[epoch] += words_in_batch
        self.nr_word += words_in_batch
        words_since_update = self.nr_word - self.last_update
        if words_since_update >= self.frequency:
            wps = words_since_update / (time.time() - self.last_time)
            self.last_update = self.nr_word
            self.last_time = time.time()
            loss_per_word = self.loss - self.prev_loss
            status = (
                epoch,
                self.nr_word,
                _smart_round(self.loss, width=10),
                _smart_round(loss_per_word, width=6),
                int(wps),
            )
            self.prev_loss = float(self.loss)
            return status
        else:
            return None

def _smart_round(
    figure: Union[float, int], width: int = 10, max_decimal: int = 4
) -> str:
    """Round large numbers as integers, smaller numbers as decimals."""
    n_digits = len(str(int(figure)))
    n_decimal = width - (n_digits + 1)
    if n_decimal <= 1:
        return str(int(figure))
    else:
        n_decimal = min(n_decimal, max_decimal)
        format_str = "%." + str(n_decimal) + "f"
        return format_str % figure

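Worked examples of the rounding rule: the decimal budget is whatever is left of width after the integer digits and the decimal point, capped at max_decimal:

assert _smart_round(3.14159) == "3.1416"        # 8 decimals would fit, capped at 4
assert _smart_round(123456.789) == "123456.789" # 10 - (6 + 1) = 3 decimals
assert _smart_round(123456789.5) == "123456789" # no room left: printed as an integer
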
@ -8,6 +8,7 @@ import re
from pathlib import Path
import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
from thinc.api import ConfigValidationError
import functools
import itertools
import numpy.random

@ -56,6 +57,7 @@ if TYPE_CHECKING:

OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]

# Default order of sections in the config.cfg. Not all sections need to exist,

@ -239,20 +241,6 @@ def get_module_path(module: ModuleType) -> Path:
    return Path(sys.modules[module.__module__].__file__).parent


def load_vectors_into_model(
    nlp: "Language", name: Union[str, Path], *, add_strings=True
) -> None:
    """Load word vectors from an installed model or path into a model instance."""
    vectors_nlp = load_model(name)
    nlp.vocab.vectors = vectors_nlp.vocab.vectors
    if add_strings:
        # I guess we should add the strings from the vectors_nlp model?
        # E.g. if someone does a similarity query, they might expect the strings.
        for key in nlp.vocab.vectors.key2row:
            if key in vectors_nlp.vocab.strings:
                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])


def load_model(
    name: Union[str, Path],
    *,

@ -391,32 +379,9 @@ def load_model_from_config(
    return nlp


def resolve_training_config(
    config: Config,
    exclude: Iterable[str] = ("nlp", "components"),
    validate: bool = True,
) -> Dict[str, Any]:
    """Resolve the config sections relevant for training and create all objects.
    Mostly used in the CLI to separate training config (not resolved by default
    because not runtime-relevant – an nlp object should load fine even if its
    [training] block refers to functions that are not available etc.).

    config (Config): The config to resolve.
    exclude (Iterable[str]): The config blocks to exclude. Those blocks won't
        be available in the final resolved config.
    validate (bool): Whether to validate the config.
    RETURNS (Dict[str, Any]): The resolved config.
    """
    config = config.copy()
    for key in exclude:
        if key in config:
            config.pop(key)
    return registry.resolve(config, validate=validate)

def resolve_dot_names(
    config: Config, dot_names: List[Optional[str]]
) -> List[Optional[Callable]]:
) -> Tuple[Any]:
    """Resolve one or more "dot notation" names, e.g. corpora.train.
    The paths could point anywhere into the config, so we don't know which
    top-level section we'll be looking within.

@ -424,18 +389,42 @@ def resolve_dot_names(
    We resolve the whole top-level section, although we could resolve less --
    we could find the lowest part of the tree.
    """
    # TODO: include schema?
    # TODO: clean this up and avoid duplication
    resolved = {}
    output = []
    errors = []
    for name in dot_names:
        if name is None:
            output.append(name)
        else:
            section = name.split(".")[0]
            # We want to avoid resolving the same thing twice.
            # We want to avoid resolving the same thing twice
            if section not in resolved:
                resolved[section] = registry.resolve(config[section])
            output.append(dot_to_object(resolved, name))
    return output
            try:
                output.append(dot_to_object(resolved, name))
            except KeyError:
                msg = f"not a valid section reference: {name}"
                errors.append({"loc": name.split("."), "msg": msg})
    objects = []
    for ref in output:
        if not isinstance(ref, str):
            msg = f"not a valid section reference: {ref} ({type(ref)})"
            errors.append({"loc": ref.split("."), "msg": msg})
            continue
        section = ref.split(".")[0]
        # We want to avoid resolving the same thing twice
        if section not in resolved:
            resolved[section] = registry.resolve(config[section])
        try:
            objects.append(dot_to_object(resolved, ref))
        except KeyError:
            msg = f"not a valid section reference: {ref}"
            errors.append({"loc": ref.split("."), "msg": msg})
    if errors:
        raise ConfigValidationError(config=config, errors=errors)
    return tuple(objects)

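Typical call, matching how the new training loop uses it; the dot names come from config values like training.train_corpus = "corpora.train":

dot_names = ["corpora.train", "corpora.dev"]
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
# each result is the resolved object at that path (here: corpus callables);
# invalid references are collected and raised as one ConfigValidationError
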
def load_model_from_init_py(