mirror of https://github.com/explosion/spaCy.git
	Refactor CLI
This commit is contained in:
		
parent a89e0ff7cb
commit 822ea4ef61
spacy/cli/__init__.py
@@ -15,8 +15,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
-# from .init_model import init_model  # noqa: F401
-from .init_pipeline import init_pipeline  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
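
Note on the hunk above: the CLI package root swaps its init exports. The commented-out init_model import and the init_pipeline function are gone; what gets re-exported instead is the Typer command init_pipeline_cli. Judging by the later hunks in this commit, code that imported init_pipeline from spacy.cli is expected to use spacy.training.initialize.init_nlp instead.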
spacy/cli/_util.py
@@ -10,13 +10,12 @@ from click import NoSuchOption
 from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config, ConfigValidationError, require_gpu
 from configparser import InterpolationError
 import os

 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import ensure_path

 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -276,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
             msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)


-def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
-    """RETURNS (List[str]): All sourced components in the original config,
-        e.g. {"source": "en_core_web_sm"}. If the config contains a key
-        "factory", we assume it refers to a component factory.
-    """
-    return [
-        name
-        for name, cfg in config.get("components", {}).items()
-        if "factory" not in cfg and "source" in cfg
-    ]
-
-
 def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
     """Upload a file.

@@ -459,3 +446,23 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
             p = int(p)
         result.append(p)
     return result
+
+
+class CliLogger:
+    """Helper mocking up the most commonly used logger methods. Can be passed
+    into functions like train() to make them output pretty-printed messages
+    on the CLI and regular logging if used from within Python.
+    """
+
+    debug = msg.text
+    info = msg.info
+    warn = msg.info
+    error = msg.fail
+
+
+def setup_gpu(use_gpu: int):
+    if use_gpu >= 0:
+        msg.info(f"Using GPU: {use_gpu}")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
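
The two helpers added to _util.py deduplicate patterns repeated across the commands below: setup_gpu replaces the per-command GPU/CPU branches, and CliLogger lets shared training code emit wasabi-styled output when driven from the CLI while accepting a standard logging.Logger when called from Python. A minimal sketch of the consumer side (report_status is a hypothetical function, not part of the commit; wasabi is already a spaCy dependency):

```python
import logging

from wasabi import msg


class CliLogger:
    # Logger-shaped shims over wasabi's printer, as in the hunk above.
    debug = msg.text
    info = msg.info
    warn = msg.info
    error = msg.fail


def report_status(logger=CliLogger) -> None:
    # Hypothetical consumer: it only relies on the logger-style interface,
    # so a standard logging.Logger can be swapped in from library code.
    logger.info("loading corpus")
    logger.error("no parsed dev examples found")


report_status()                             # wasabi-styled CLI output
report_status(logging.getLogger("spacy"))   # plain logging from Python
```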
spacy/cli/debug_config.py
@@ -1,7 +1,7 @@
 from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config
 from thinc.config import VARIABLE_RE
 import typer

@@ -52,10 +52,8 @@ def debug_config(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
         nlp = util.load_model_from_config(config)
-        # Use the resolved config here in case user has one function returning
-        # a dict of corpora etc.
-        resolved = util.resolve_training_config(nlp.config)
-        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
+        dot_names = ["training.dev_corpus", "training.train_corpus"]
+        util.resolve_dot_names(nlp.config, dot_names)
     msg.good("Config is valid")
     if show_vars:
         variables = get_variables(config)
@@ -97,23 +95,3 @@ def get_variables(config: Config) -> Dict[str, Any]:
         value = util.dot_to_object(config, path)
         result[variable] = repr(value)
     return result
-
-
-def check_section_refs(config: Config, fields: List[str]) -> None:
-    """Validate fields in the config that refer to other sections or values
-    (e.g. in the corpora) and make sure that those references exist.
-    """
-    errors = []
-    for field in fields:
-        # If the field doesn't exist in the config, we ignore it
-        try:
-            value = util.dot_to_object(config, field)
-        except KeyError:
-            continue
-        try:
-            util.dot_to_object(config, value)
-        except KeyError:
-            msg = f"not a valid section reference: {value}"
-            errors.append({"loc": field.split("."), "msg": msg})
-    if errors:
-        raise ConfigValidationError(config=config, errors=errors)
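
debug_config drops the hand-rolled check_section_refs in favour of util.resolve_dot_names, which performs the same dangling-reference check as a side effect of resolving the dotted paths. The underlying mechanism is a dotted-path lookup; a toy illustration in plain Python (no spaCy required; the dot_to_object below is a simplified stand-in for spacy.util.dot_to_object):

```python
from functools import reduce

config = {
    "training": {"dev_corpus": "corpora.dev"},
    "corpora": {"dev": {"path": "dev.spacy"}},
}


def dot_to_object(cfg, path):
    # Walk nested dicts along a dotted path like "training.dev_corpus".
    return reduce(lambda section, key: section[key], path.split("."), cfg)


ref = dot_to_object(config, "training.dev_corpus")   # -> "corpora.dev"
print(dot_to_object(config, ref))                    # -> {'path': 'dev.spacy'}
try:
    dot_to_object(config, "corpora.train")           # dangling reference
except KeyError:
    print("not a valid section reference: corpora.train")
```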
spacy/cli/debug_data.py
@@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
 import typer

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, get_sourced_components
+from ._util import import_code, debug_cli
 from ..training import Corpus, Example
+from ..training.initialize import get_sourced_components
+from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
+from ..util import registry
 from .. import util


@@ -94,26 +97,13 @@ def debug_data(
     with show_validation_error(config_path):
         cfg = util.load_config(config_path, overrides=config_overrides)
         nlp = util.load_model_from_config(cfg)
-        C = util.resolve_training_config(nlp.config)
+        T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
     # Use original config here, not resolved version
     sourced_components = get_sourced_components(cfg)
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]
     resume_components = [p for p in sourced_components if p not in frozen_components]
     pipeline = nlp.pipe_names
     factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
-    tag_map_path = util.ensure_path(C["training"]["tag_map"])
-    tag_map = {}
-    if tag_map_path is not None:
-        tag_map = srsly.read_json(tag_map_path)
-    morph_rules_path = util.ensure_path(C["training"]["morph_rules"])
-    morph_rules = {}
-    if morph_rules_path is not None:
-        morph_rules = srsly.read_json(morph_rules_path)
-    # Replace tag map with provided mapping
-    nlp.vocab.morphology.load_tag_map(tag_map)
-    # Load morph rules
-    nlp.vocab.morphology.load_morph_exceptions(morph_rules)
-
     msg.divider("Data file validation")

     # Create the gold corpus to be able to better analyze data
@@ -145,10 +135,10 @@ def debug_data(

     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]

     msg.divider("Training stats")
-    msg.text(f"Language: {C['nlp']['lang']}")
+    msg.text(f"Language: {nlp.lang}")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
     if resume_components:
         msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
@@ -355,6 +345,7 @@ def debug_data(
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
+        # TODO: does this need to be updated?
         tag_map = nlp.vocab.morphology.tag_map
         msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
         labels_with_counts = _format_labels(
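
Two things happen in debug_data: the monolithic util.resolve_training_config gives way to resolving just the [training] block against its schema (so C["training"][...] lookups become T[...]), and the tag_map/morph_rules loading disappears, presumably because those settings are being moved out of the [training] section in this refactor. The new resolution pattern, sketched standalone (assumes a spaCy v3 nightly install; "config.cfg" is a placeholder path):

```python
from spacy import util
from spacy.schemas import ConfigSchemaTraining

config = util.load_config("config.cfg")  # placeholder path
nlp = util.load_model_from_config(config)
# Resolve and validate only the [training] block instead of the whole
# config, which is what the old resolve_training_config helper did:
T = util.registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
print(T["seed"], T["frozen_components"])
```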
spacy/cli/debug_model.py
@@ -4,12 +4,14 @@ from pathlib import Path
 from spacy.training import Example
 from spacy.util import dot_to_object
 from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+from thinc.api import fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation, set_gpu_allocator
 import typer

 from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list
+from ._util import parse_config_overrides, string_to_list, setup_gpu
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util


@@ -37,11 +39,7 @@ def debug_model_cli(

     DOCS: https://nightly.spacy.io/api/cli#debug-model
     """
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     layers = string_to_list(layers, intify=True)
     print_settings = {
         "dimensions": dimensions,
@@ -65,8 +63,8 @@ def debug_model_cli(
         set_gpu_allocator(allocator)
     with show_validation_error(config_path):
         nlp = util.load_model_from_config(raw_config)
-        C = util.resolve_training_config(nlp.config)
-    seed = C["training"]["seed"]
+        T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    seed = T["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
@@ -77,7 +75,7 @@ def debug_model_cli(
             exits=1,
         )
     model = pipe.model
-    debug_model(C, nlp, model, print_settings=print_settings)
+    debug_model(T, nlp, model, print_settings=print_settings)


 def debug_model(
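
Same pattern as in debug_data above: the hand-rolled GPU branch in debug_model_cli collapses into setup_gpu, and only the [training] section is resolved, so the resolved dict T (rather than a fully resolved config C) is what gets passed down into debug_model().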
spacy/cli/evaluate.py
@@ -3,11 +3,11 @@ from wasabi import Printer
 from pathlib import Path
 import re
 import srsly
-from thinc.api import require_gpu, fix_random_seed
+from thinc.api import fix_random_seed

 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, setup_gpu
 from ..scorer import Scorer
 from .. import util
 from .. import displacy
@@ -61,8 +61,7 @@ def evaluate(
 ) -> Scorer:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
-    if use_gpu >= 0:
-        require_gpu(use_gpu)
+    setup_gpu(use_gpu)
     data_path = util.ensure_path(data_path)
     output_path = util.ensure_path(output)
     displacy_path = util.ensure_path(displacy_path)
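
A small behavioural change hides in this cleanup: the old evaluate() called require_gpu() without printing anything, while setup_gpu() also emits the "Using GPU/CPU" info line, so evaluate now reports its device like the other commands.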
spacy/cli/init_pipeline.py
@@ -1,22 +1,13 @@
-from typing import Optional, Dict, Callable, Any
+from typing import Optional
 import logging
 from pathlib import Path
 from wasabi import msg
 import typer
-from thinc.api import Config, fix_random_seed, set_gpu_allocator
-import srsly

 from .. import util
-from ..util import registry, resolve_dot_names, OOV_RANK
-from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain, ConfigSchemaInit
-from ..language import Language
-from ..lookups import Lookups
-from ..errors import Errors
+from ..training.initialize import init_nlp
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
-
-
-DEFAULT_OOV_PROB = -20
+from ._util import import_code, CliLogger, setup_gpu


 @init_cli.command(
@@ -31,178 +22,16 @@ def init_pipeline_cli(
     output_path: Path = Arg(..., help="Output directory for the prepared data"),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
     util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
+    setup_gpu(use_gpu)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
-    nlp = init_pipeline(config)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_success=msg.good)
     nlp.to_disk(output_path)
     msg.good(f"Saved initialized pipeline to {output_path}")
-
-
-def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
-    raw_config = config
-    config = raw_config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    # Use original config here before it's resolved to functions
-    sourced_components = get_sourced_components(config)
-    with show_validation_error():
-        nlp = util.load_model_from_config(raw_config, auto_fill=True)
-    msg.good("Set up nlp object from config")
-    config = nlp.config.interpolate()
-    # Resolve all training-relevant sections using the filled nlp config
-    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"]]
-    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    V = I["vocab"]
-    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
-    optimizer = T["optimizer"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T["frozen_components"]
-    # Sourced components that require resume_training
-    resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if resume_components:
-        with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
-            nlp.resume_training(sgd=optimizer)
-    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
-        msg.good(f"Initialized pipeline components")
-    # Verify the config after calling 'begin_training' to ensure labels
-    # are properly initialized
-    verify_config(nlp)
-    if "pretraining" in config and config["pretraining"]:
-        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        add_tok2vec_weights(nlp, P, I)
-    # TODO: this should be handled better?
-    nlp = before_to_disk(nlp)
-    return nlp
-
-
-def init_vocab(
-    nlp: Language,
-    *,
-    data: Optional[Path] = None,
-    lookups: Optional[Lookups] = None,
-    vectors: Optional[str] = None,
-) -> Language:
-    if lookups:
-        nlp.vocab.lookups = lookups
-        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
-    data_path = util.ensure_path(data)
-    if data_path is not None:
-        lex_attrs = srsly.read_jsonl(data_path)
-        for lexeme in nlp.vocab:
-            lexeme.rank = OOV_RANK
-        for attrs in lex_attrs:
-            if "settings" in attrs:
-                continue
-            lexeme = nlp.vocab[attrs["orth"]]
-            lexeme.set_attrs(**attrs)
-        if len(nlp.vocab):
-            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
-        else:
-            oov_prob = DEFAULT_OOV_PROB
-        nlp.vocab.cfg.update({"oov_prob": oov_prob})
-        msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    msg.good("Created vocabulary")
-    if vectors is not None:
-        add_vectors(nlp, vectors)
-        msg.good(f"Added vectors: {vectors}")
-
-
-def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
-) -> None:
-    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
-    P = pretrain_config
-    V = vocab_config
-    weights_data = None
-    init_tok2vec = util.ensure_path(V["init_tok2vec"])
-    if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not V["vectors"]:
-            err = "Need initialize.vectors if pretraining.objective.type is vectors"
-            msg.fail(err, exits=1)
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    if weights_data is not None:
-        tok2vec_component = P["component"]
-        if tok2vec_component is None:
-            msg.fail(
-                f"To use pretrained tok2vec weights, [pretraining.component] "
-                f"needs to specify the component that should load them.",
-                exits=1,
-            )
-        layer = nlp.get_pipe(tok2vec_component).model
-        if P["layer"]:
-            layer = layer.get_ref(P["layer"])
-        layer.from_bytes(weights_data)
-        msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'")
-
-
-def add_vectors(nlp: Language, vectors: str) -> None:
-    title = f"Config validation error for vectors {vectors}"
-    desc = (
-        "This typically means that there's a problem in the config.cfg included "
-        "with the packaged vectors. Make sure that the vectors package you're "
-        "loading is compatible with the current version of spaCy."
-    )
-    with show_validation_error(
-        title=title, desc=desc, hint_fill=False, show_config=False
-    ):
-        util.load_vectors_into_model(nlp, vectors)
-        msg(f"Added {len(nlp.vocab.vectors)} vectors from {vectors}")
-
-
-def verify_config(nlp: Language) -> None:
-    """Perform additional checks based on the config, loaded nlp object and training data."""
-    # TODO: maybe we should validate based on the actual components, the list
-    # in config["nlp"]["pipeline"] instead?
-    for pipe_config in nlp.config["components"].values():
-        # We can't assume that the component name == the factory
-        factory = pipe_config["factory"]
-        if factory == "textcat":
-            verify_textcat_config(nlp, pipe_config)
-
-
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
-    # if 'positive_label' is provided: double check whether it's in the data and
-    # the task is binary
-    if pipe_config.get("positive_label"):
-        textcat_labels = nlp.get_pipe("textcat").labels
-        pos_label = pipe_config.get("positive_label")
-        if pos_label not in textcat_labels:
-            raise ValueError(
-                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
-            )
-        if len(list(textcat_labels)) != 2:
-            raise ValueError(
-                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
-            )
-
-
-def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
-    def before_to_disk(nlp: Language) -> Language:
-        if not callback:
-            return nlp
-        modified_nlp = callback(nlp)
-        if not isinstance(modified_nlp, Language):
-            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
-            raise ValueError(err)
-        return modified_nlp
-
-    return before_to_disk
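
This is the heart of the commit: init_pipeline and all its helpers (init_vocab, add_tok2vec_weights, add_vectors, verify_config, verify_textcat_config, create_before_to_disk_callback) leave the CLI layer. Judging by the new import, the logic now lives in spacy/training/initialize.py as init_nlp, and the command shrinks to a thin wrapper that parses overrides, sets up the GPU and hands CliLogger plus msg.good in as output hooks. The --gpu-id option is also new here; the old init_pipeline accepted a use_gpu argument, but the command never exposed it.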
spacy/cli/pretrain.py
@@ -1,25 +1,13 @@
 from typing import Optional
-import numpy
-import time
-import re
-from collections import Counter
 from pathlib import Path
-from thinc.api import require_gpu, set_gpu_allocator
-from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
-from thinc.api import Config, CosineDistance, L2Distance
 from wasabi import msg
-import srsly
-from functools import partial
 import typer
+import re

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
-from ..tokens import Doc
-from ..attrs import ID
-from .. import util
-from ..util import dot_to_object
+from ._util import import_code, setup_gpu, CliLogger
+from ..training.pretrain import pretrain
+from ..util import load_config


 @app.command(
@@ -61,15 +49,11 @@ def pretrain_cli(
     config_overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")

     with show_validation_error(config_path):
-        raw_config = util.load_config(
+        raw_config = load_config(
             config_path, overrides=config_overrides, interpolate=False
         )
     config = raw_config.interpolate()
@@ -89,250 +73,11 @@ def pretrain_cli(
         resume_path=resume_path,
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
+        logger=CliLogger,
     )
-
-
-def pretrain(
-    config: Config,
-    output_dir: Path,
-    resume_path: Optional[Path] = None,
-    epoch_resume: Optional[int] = None,
-    use_gpu: int = -1,
-):
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    nlp = util.load_model_from_config(config)
-    C = util.resolve_training_config(nlp.config)
-    P_cfg = C["pretraining"]
-    corpus = dot_to_object(C, P_cfg["corpus"])
-    batcher = P_cfg["batcher"]
-    model = create_pretraining_model(nlp, C["pretraining"])
-    optimizer = C["pretraining"]["optimizer"]
-    # Load in pretrained weights to resume from
-    if resume_path is not None:
-        _resume_model(model, resume_path, epoch_resume)
-    else:
-        # Without '--resume-path' the '--epoch-resume' argument is ignored
-        epoch_resume = 0
-
-    tracker = ProgressTracker(frequency=10000)
-    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
-    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
-    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
-
-    def _save_model(epoch, is_temp=False):
-        is_temp_str = ".temp" if is_temp else ""
-        with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
-                file_.write(model.get_ref("tok2vec").to_bytes())
-            log = {
-                "nr_word": tracker.nr_word,
-                "loss": tracker.loss,
-                "epoch_loss": tracker.epoch_loss,
-                "epoch": epoch,
-            }
-            with (output_dir / "log.jsonl").open("a") as file_:
-                file_.write(srsly.json_dumps(log) + "\n")
-
-    objective = create_objective(P_cfg["objective"])
-    # TODO: I think we probably want this to look more like the
-    # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P_cfg["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
-                _save_model(epoch, is_temp=True)
-        _save_model(epoch)
-        tracker.epoch_loss = 0.0
     msg.good("Successfully finished pretrain")


-def ensure_docs(examples_or_docs):
-    docs = []
-    for eg_or_doc in examples_or_docs:
-        if isinstance(eg_or_doc, Doc):
-            docs.append(eg_or_doc)
-        else:
-            docs.append(eg_or_doc.reference)
-    return docs
-
-
-def _resume_model(model, resume_path, epoch_resume):
-    msg.info(f"Resume training tok2vec from: {resume_path}")
-    with resume_path.open("rb") as file_:
-        weights_data = file_.read()
-        model.get_ref("tok2vec").from_bytes(weights_data)
-    # Parse the epoch number from the given weight file
-    model_name = re.search(r"model\d+\.bin", str(resume_path))
-    if model_name:
-        # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
-        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-    else:
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-
-
-def make_update(model, docs, optimizer, objective_func):
-    """Perform an update over a single batch of documents.
-
-    docs (iterable): A batch of `Doc` objects.
-    optimizer (callable): An optimizer.
-    RETURNS loss: A float for the loss.
-    """
-    predictions, backprop = model.begin_update(docs)
-    loss, gradients = objective_func(model.ops, docs, predictions)
-    backprop(gradients)
-    model.finish_update(optimizer)
-    # Don't want to return a cupy object here
-    # The gradients are modified in-place by the BERT MLM,
-    # so we get an accurate loss
-    return float(loss)
-
-
-def create_objective(config):
-    """Create the objective for pretraining.
-
-    We'd like to replace this with a registry function but it's tricky because
-    we're also making a model choice based on this. For now we hard-code support
-    for two types (characters, vectors). For characters you can specify
-    n_characters, for vectors you can specify the loss.
-
-    Bleh.
-    """
-    objective_type = config["type"]
-    if objective_type == "characters":
-        return partial(get_characters_loss, nr_char=config["n_characters"])
-    elif objective_type == "vectors":
-        if config["loss"] == "cosine":
-            return partial(
-                get_vectors_loss,
-                distance=CosineDistance(normalize=True, ignore_zeros=True),
-            )
-        elif config["loss"] == "L2":
-            return partial(
-                get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
-            )
-        else:
-            raise ValueError("Unexpected loss type", config["loss"])
-    else:
-        raise ValueError("Unexpected objective_type", objective_type)
-
-
-def get_vectors_loss(ops, docs, prediction, distance):
-    """Compute a loss based on a distance between the documents' vectors and
-    the prediction.
-    """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    d_target, loss = distance(prediction, target)
-    return loss, d_target
-
-
-def get_characters_loss(ops, docs, prediction, nr_char):
-    """Compute a loss based on a number of characters predicted from the docs."""
-    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
-    target_ids = target_ids.reshape((-1,))
-    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
-    target = target.reshape((-1, 256 * nr_char))
-    diff = prediction - target
-    loss = (diff ** 2).sum()
-    d_target = diff / float(prediction.shape[0])
-    return loss, d_target
-
-
-def create_pretraining_model(nlp, pretrain_config):
-    """Define a network for the pretraining. We simply add an output layer onto
-    the tok2vec input model. The tok2vec input model needs to be a model that
-    takes a batch of Doc objects (as a list), and returns a list of arrays.
-    Each array in the output needs to have one row per token in the doc.
-    The actual tok2vec layer is stored as a reference, and only this bit will be
-    serialized to file and read back in when calling the 'train' command.
-    """
-    component = nlp.get_pipe(pretrain_config["component"])
-    if pretrain_config.get("layer"):
-        tok2vec = component.model.get_ref(pretrain_config["layer"])
-    else:
-        tok2vec = component.model
-
-    # TODO
-    maxout_pieces = 3
-    hidden_size = 300
-    if pretrain_config["objective"]["type"] == "vectors":
-        model = build_cloze_multi_task_model(
-            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
-        )
-    elif pretrain_config["objective"]["type"] == "characters":
-        model = build_cloze_characters_multi_task_model(
-            nlp.vocab,
-            tok2vec,
-            hidden_size=hidden_size,
-            maxout_pieces=maxout_pieces,
-            nr_char=pretrain_config["objective"]["n_characters"],
-        )
-    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
-    set_dropout_rate(model, pretrain_config["dropout"])
-    return model
-
-
-class ProgressTracker:
-    def __init__(self, frequency=1000000):
-        self.loss = 0.0
-        self.prev_loss = 0.0
-        self.nr_word = 0
-        self.words_per_epoch = Counter()
-        self.frequency = frequency
-        self.last_time = time.time()
-        self.last_update = 0
-        self.epoch_loss = 0.0
-
-    def update(self, epoch, loss, docs):
-        self.loss += loss
-        self.epoch_loss += loss
-        words_in_batch = sum(len(doc) for doc in docs)
-        self.words_per_epoch[epoch] += words_in_batch
-        self.nr_word += words_in_batch
-        words_since_update = self.nr_word - self.last_update
-        if words_since_update >= self.frequency:
-            wps = words_since_update / (time.time() - self.last_time)
-            self.last_update = self.nr_word
-            self.last_time = time.time()
-            loss_per_word = self.loss - self.prev_loss
-            status = (
-                epoch,
-                self.nr_word,
-                _smart_round(self.loss, width=10),
-                _smart_round(loss_per_word, width=6),
-                int(wps),
-            )
-            self.prev_loss = float(self.loss)
-            return status
-        else:
-            return None
-
-
-def _smart_round(figure, width=10, max_decimal=4):
-    """Round large numbers as integers, smaller numbers as decimals."""
-    n_digits = len(str(int(figure)))
-    n_decimal = width - (n_digits + 1)
-    if n_decimal <= 1:
-        return str(int(figure))
-    else:
-        n_decimal = min(n_decimal, max_decimal)
-        format_str = "%." + str(n_decimal) + "f"
-        return format_str % figure
-
-
 def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
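
The same move for pretraining: the training loop, the character and vector objectives, create_pretraining_model and ProgressTracker all migrate out of the CLI, per the new `from ..training.pretrain import pretrain` import. What remains here is argument handling, verify_cli_args and the call into the library, now with logger=CliLogger for terminal output.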
					@ -1,24 +1,16 @@
 | 
				
			||||||
from typing import Optional, Dict, Any, Tuple, Union, Callable, List
 | 
					from typing import Optional
 | 
				
			||||||
from timeit import default_timer as timer
 | 
					 | 
				
			||||||
import tqdm
 | 
					 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from wasabi import msg
 | 
					from wasabi import msg
 | 
				
			||||||
import thinc
 | 
					from thinc.api import Config
 | 
				
			||||||
import thinc.schedules
 | 
					 | 
				
			||||||
from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
 | 
					 | 
				
			||||||
import random
 | 
					 | 
				
			||||||
import typer
 | 
					import typer
 | 
				
			||||||
import logging
 | 
					import logging
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .init_pipeline import init_pipeline
 | 
					 | 
				
			||||||
from .init_pipeline import create_before_to_disk_callback
 | 
					 | 
				
			||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
					from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 | 
				
			||||||
from ._util import import_code
 | 
					from ._util import import_code, CliLogger, setup_gpu
 | 
				
			||||||
from ..language import Language
 | 
					from ..language import Language
 | 
				
			||||||
 | 
					from ..training.loop import train
 | 
				
			||||||
 | 
					from ..training.initialize import init_nlp, must_reinitialize
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from ..errors import Errors
 | 
					 | 
				
			||||||
from ..util import resolve_dot_names, registry
 | 
					 | 
				
			||||||
from ..schemas import ConfigSchemaTraining
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@app.command(
 | 
					@app.command(
 | 
				
			||||||
@@ -52,31 +44,33 @@ def train_cli(
     verify_cli_args(config_path, output_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
-    if use_gpu >= 0:
-        msg.info(f"Using GPU: {use_gpu}")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
-    nlp = init_nlp(config, output_path)
+    nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
     msg.divider("Training pipeline")
-    train(nlp, output_path, use_gpu=use_gpu)
+    final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger)
+    if final_path:
+        msg.good(f"Saved pipeline to output directory", final_path)
 
 
-def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
+def init_pipeline(
+    config: Config, output_path: Optional[Path], *, use_gpu: int = -1
+) -> Language:
+    init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good}
     if output_path is not None:
         init_path = output_path / "model-initial"
         if not init_path.exists():
             msg.info(f"Initializing the pipeline in {init_path}")
-            nlp = init_pipeline(config)
+            nlp = init_nlp(config, **init_kwargs)
             nlp.to_disk(init_path)
             msg.good(f"Saved initialized pipeline to {init_path}")
         else:
             nlp = util.load_model(init_path)
             if must_reinitialize(config, nlp.config):
                 msg.warn("Config has changed: need to re-initialize pipeline")
-                nlp = init_pipeline(config)
+                nlp = init_nlp(config, **init_kwargs)
                 nlp.to_disk(init_path)
                 msg.good(f"Re-initialized pipeline in {init_path}")
             else:
@@ -88,279 +82,7 @@ def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
         "the vocabulary, vectors and label scheme. To take advantage of this, "
         "provide an output directory."
     )
-    return init_pipeline(config)
+    return init_nlp(config, **init_kwargs)
-
-
-def train(
-    nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1
-) -> None:
-    # Create iterator, which yields out info after each optimization step.
-    config = nlp.config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"]]
-    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    optimizer = T["optimizer"]
-    score_weights = T["score_weights"]
-    batcher = T["batcher"]
-    train_logger = T["logger"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T["frozen_components"]
-    # Create iterator, which yields out info after each optimization step.
-    training_step_iterator = train_while_improving(
-        nlp,
-        optimizer,
-        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
-        create_evaluation_callback(nlp, dev_corpus, score_weights),
-        dropout=T["dropout"],
-        accumulate_gradient=T["accumulate_gradient"],
-        patience=T["patience"],
-        max_steps=T["max_steps"],
-        eval_frequency=T["eval_frequency"],
-        exclude=frozen_components,
-    )
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if frozen_components:
-        msg.info(f"Frozen components: {frozen_components}")
-    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
-    with nlp.select_pipes(disable=frozen_components):
-        print_row, finalize_logger = train_logger(nlp)
-
-    try:
-        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-        progress.set_description(f"Epoch 1")
-        for batch, info, is_best_checkpoint in training_step_iterator:
-            progress.update(1)
-            if is_best_checkpoint is not None:
-                progress.close()
-                print_row(info)
-                if is_best_checkpoint and output_path is not None:
-                    with nlp.select_pipes(disable=frozen_components):
-                        update_meta(T, nlp, info)
-                    with nlp.use_params(optimizer.averages):
-                        nlp = before_to_disk(nlp)
-                        nlp.to_disk(output_path / "model-best")
-                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-                progress.set_description(f"Epoch {info['epoch']}")
-    except Exception as e:
-        finalize_logger()
-        if output_path is not None:
-            # We don't want to swallow the traceback if we don't have a
-            # specific error.
-            msg.warn(
-                f"Aborting and saving the final best model. "
-                f"Encountered exception: {str(e)}"
-            )
-            nlp = before_to_disk(nlp)
-            nlp.to_disk(output_path / "model-final")
-        raise e
-    finally:
-        finalize_logger()
-        if output_path is not None:
-            final_model_path = output_path / "model-final"
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    nlp.to_disk(final_model_path)
-            else:
-                nlp.to_disk(final_model_path)
-            msg.good(f"Saved pipeline to output directory {final_model_path}")
-
-
-def must_reinitialize(train_config: Config, init_config: Config) -> bool:
-    # TODO: do this better and more fine-grained
-    return train_config.interpolate().to_str() == init_config.interpolate().to_str()
-
-
-def add_vectors(nlp: Language, vectors: str) -> None:
-    title = f"Config validation error for vectors {vectors}"
-    desc = (
-        "This typically means that there's a problem in the config.cfg included "
-        "with the packaged vectors. Make sure that the vectors package you're "
-        "loading is compatible with the current version of spaCy."
-    )
-    with show_validation_error(
-        title=title, desc=desc, hint_fill=False, show_config=False
-    ):
-        util.load_vectors_into_model(nlp, vectors)
-
-
-def create_train_batches(iterator, batcher, max_epochs: int):
-    epoch = 0
-    examples = list(iterator)
-    if not examples:
-        # Raise error if no data
-        raise ValueError(Errors.E986)
-    while max_epochs < 1 or epoch != max_epochs:
-        random.shuffle(examples)
-        for batch in batcher(examples):
-            yield epoch, batch
-        epoch += 1
-
-
-def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
-) -> Callable[[], Tuple[float, Dict[str, float]]]:
-    weights = {key: value for key, value in weights.items() if value is not None}
-
-    def evaluate() -> Tuple[float, Dict[str, float]]:
-        dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
-        # Calculate a weighted sum based on score_weights for the main score.
-        # We can only consider scores that are ints/floats, not dicts like
-        # entity scores per type etc.
-        for key, value in scores.items():
-            if key in weights and not isinstance(value, (int, float)):
-                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
-        try:
-            weighted_score = sum(
-                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
-            )
-        except KeyError as e:
-            keys = list(scores.keys())
-            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
-            raise KeyError(err) from None
-        return weighted_score, scores
-
-    return evaluate
-
-
-def train_while_improving(
-    nlp: Language,
-    optimizer: Optimizer,
-    train_data,
-    evaluate,
-    *,
-    dropout: float,
-    eval_frequency: int,
-    accumulate_gradient: int,
-    patience: int,
-    max_steps: int,
-    exclude: List[str],
-):
-    """Train until an evaluation stops improving. Works as a generator,
-    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
-    where info is a dict, and is_best_checkpoint is in [True, False, None] --
-    None indicating that the iteration was not evaluated as a checkpoint.
-    The evaluation is conducted by calling the evaluate callback.
-
-    Positional arguments:
-        nlp: The spaCy pipeline to evaluate.
-        optimizer: The optimizer callable.
-        train_data (Iterable[Batch]): A generator of batches, with the training
-            data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
-            data iterable needs to take care of iterating over the epochs and
-            shuffling.
-        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
-            The callback should take no arguments and return a tuple
-            `(main_score, other_scores)`. The main_score should be a float where
-            higher is better. other_scores can be any object.
-
-    Every iteration, the function yields out a tuple with:
-
-    * batch: A list of Example objects.
-    * info: A dict with various information about the last update (see below).
-    * is_best_checkpoint: A value in None, False, True, indicating whether this
-        was the best evaluation so far. You should use this to save the model
-        checkpoints during training. If None, evaluation was not conducted on
-        that iteration. False means evaluation was conducted, but a previous
-        evaluation was better.
-
-    The info dict provides the following information:
-
-        epoch (int): How many passes over the data have been completed.
-        step (int): How many steps have been completed.
-        score (float): The main score from the last evaluation.
-        other_scores: The other scores from the last evaluation.
-        losses: The accumulated losses throughout training.
-        checkpoints: A list of previous results, where each result is a
-            (score, step, epoch) tuple.
-    """
-    if isinstance(dropout, float):
-        dropouts = thinc.schedules.constant(dropout)
-    else:
-        dropouts = dropout
-    results = []
-    losses = {}
-    words_seen = 0
-    start_time = timer()
-    for step, (epoch, batch) in enumerate(train_data):
-        dropout = next(dropouts)
-        for subbatch in subdivide_batch(batch, accumulate_gradient):
-            nlp.update(
-                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
-            )
-        # TODO: refactor this so we don't have to run it separately in here
-        for name, proc in nlp.pipeline:
-            if (
-                name not in exclude
-                and hasattr(proc, "model")
-                and proc.model not in (True, False, None)
-            ):
-                proc.model.finish_update(optimizer)
-        optimizer.step_schedules()
-        if not (step % eval_frequency):
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    score, other_scores = evaluate()
-            else:
-                score, other_scores = evaluate()
-            results.append((score, step))
-            is_best_checkpoint = score == max(results)[0]
-        else:
-            score, other_scores = (None, None)
-            is_best_checkpoint = None
-        words_seen += sum(len(eg) for eg in batch)
-        info = {
-            "epoch": epoch,
-            "step": step,
-            "score": score,
-            "other_scores": other_scores,
-            "losses": losses,
-            "checkpoints": results,
-            "seconds": int(timer() - start_time),
-            "words": words_seen,
-        }
-        yield batch, info, is_best_checkpoint
-        if is_best_checkpoint is not None:
-            losses = {}
-        # Stop if no improvement in `patience` updates (if specified)
-        best_score, best_step = max(results)
-        if patience and (step - best_step) >= patience:
-            break
-        # Stop if we've exhausted our max steps (if specified)
-        if max_steps and step >= max_steps:
-            break
-
-
-def subdivide_batch(batch, accumulate_gradient):
-    batch = list(batch)
-    batch.sort(key=lambda eg: len(eg.predicted))
-    sub_len = len(batch) // accumulate_gradient
-    start = 0
-    for i in range(accumulate_gradient):
-        subbatch = batch[start : start + sub_len]
-        if subbatch:
-            yield subbatch
-        start += len(subbatch)
-    subbatch = batch[start:]
-    if subbatch:
-        yield subbatch
-
-
-def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
-) -> None:
-    nlp.meta["performance"] = {}
-    for metric in training["score_weights"]:
-        if metric is not None:
-            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
-    for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 
 
 def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
@@ -371,17 +93,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
         if not output_path.exists():
             output_path.mkdir()
             msg.good(f"Created output directory: {output_path}")
-
-
-# TODO: this is currently imported by the ray extension and not used otherwise
-def load_from_paths(
-    config: Config,
-) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
-    weights_data = None
-    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
-    if init_tok2vec is not None:
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    return None, {}, {}, weights_data
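
With this change, spacy/cli/train.py is reduced to argument handling: setup_gpu()
picks the device, init_pipeline() builds or reuses the cached "model-initial"
artifact, and the actual loop lives in spacy.training. The same flow can now be
driven without the CLI; a minimal sketch, assuming a valid config file (the
paths here are hypothetical):

    # Sketch only: programmatic equivalent of the refactored train_cli.
    from pathlib import Path
    from spacy import util
    from spacy.training.initialize import init_nlp
    from spacy.training.loop import train

    config = util.load_config(Path("config.cfg"), interpolate=False)  # hypothetical path
    nlp = init_nlp(config)                    # seed, vocab, vectors, component init
    final_path = train(nlp, Path("output"))   # path of the exported model, or None
    print(final_path)
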
@@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer
 from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
+from spacy.training import Example
+from spacy.training.initialize import verify_textcat_config
+
 from ..util import make_tempdir
-from ...cli.train import verify_textcat_config
-from ...training import Example
 
 
 TRAIN_DATA = [
@@ -7,7 +7,6 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
-from spacy.cli.debug_config import check_section_refs
 from thinc.api import ConfigValidationError, Config
 import srsly
 import os

@@ -414,15 +413,3 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
     assert string_to_list(value, intify=False) == ["1", "2", "3"]
     assert string_to_list(value, intify=True) == [1, 2, 3]
-
-
-def test_check_section_refs():
-    config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
-    config = Config(config)
-    # Valid section reference
-    check_section_refs(config, ["a.b.c"])
-    # Section that doesn't exist in this config
-    check_section_refs(config, ["x.y.z"])
-    # Invalid section reference
-    with pytest.raises(ConfigValidationError):
-        check_section_refs(config, ["a.b.c", "f.g"])
@@ -7,7 +7,6 @@ from spacy import util
 from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from thinc.api import Optimizer
 
 
 @pytest.fixture

@@ -158,16 +157,3 @@ def test_dot_to_dict(dot_notation, expected):
     result = util.dot_to_dict(dot_notation)
     assert result == expected
     assert util.dict_to_dot(result) == dot_notation
-
-
-def test_resolve_training_config():
-    config = {
-        "nlp": {"lang": "en", "disabled": []},
-        "training": {"dropout": 0.1, "optimizer": {"@optimizers": "Adam.v1"}},
-        "corpora": {},
-    }
-    resolved = util.resolve_training_config(config)
-    assert resolved["training"]["dropout"] == 0.1
-    assert isinstance(resolved["training"]["optimizer"], Optimizer)
-    assert resolved["corpora"] == {}
-    assert "nlp" not in resolved
@@ -1,14 +1,15 @@
 import pytest
 
-from .util import get_random_doc
-
 from spacy import util
 from spacy.util import dot_to_object, SimpleFrozenList
-from thinc.api import Config, Optimizer
+from thinc.api import Config, Optimizer, ConfigValidationError
 from spacy.training.batchers import minibatch_by_words
-from ..lang.en import English
-from ..lang.nl import Dutch
-from ..language import DEFAULT_CONFIG_PATH
+from spacy.lang.en import English
+from spacy.lang.nl import Dutch
+from spacy.language import DEFAULT_CONFIG_PATH
+from spacy.schemas import ConfigSchemaTraining
+
+from .util import get_random_doc
+
 
 @pytest.mark.parametrize(

@@ -101,8 +102,8 @@ def test_util_dot_section():
         dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_nlp.config, "nlp.unknownattribute")
-    resolved = util.resolve_training_config(nl_nlp.config)
-    assert isinstance(dot_to_object(resolved, "training.optimizer"), Optimizer)
+    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
+    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
 
 
 def test_simple_frozen_list():

@@ -120,3 +121,17 @@ def test_simple_frozen_list():
     t = SimpleFrozenList(["foo", "bar"], error="Error!")
     with pytest.raises(NotImplementedError):
         t.append("baz")
+
+
+def test_resolve_dot_names():
+    config = {
+        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
+        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
+    }
+    result = util.resolve_dot_names(config, ["foo.bar"])
+    assert isinstance(result[0], Optimizer)
+    with pytest.raises(ConfigValidationError) as e:
+        util.resolve_dot_names(config, ["foo.baz", "foo.bar"])
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ["training", "xyz"]
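
The new test_resolve_dot_names test pins down the indirection the other test
changes rely on: a dot name like "foo.bar" is looked up in the config, the
string found there ("training.optimizer") is treated as a dotted path to
another config block, and that block is resolved via the registry. Roughly, as
a hypothetical paraphrase (this is not the real implementation in spacy.util):

    # Hypothetical paraphrase of what resolve_dot_names does per name.
    def resolve_dot_name_sketch(config, dot_name):
        section_ref = util.dot_to_object(config, dot_name)  # e.g. "training.optimizer"
        block = util.dot_to_object(config, section_ref)     # e.g. {"@optimizers": "Adam.v1"}
        return util.registry.resolve({"x": block})["x"]     # e.g. an Optimizer instance

The error case in the test follows from this: "foo.baz" points at
"training.xyz", which doesn't exist, so validation fails with
["training", "xyz"] as the reported location.
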
@@ -2,8 +2,8 @@ from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config
 from spacy import Language
-from spacy.util import load_model_from_config, registry, dot_to_object
-from spacy.util import resolve_training_config
+from spacy.util import load_model_from_config, registry, resolve_dot_names
+from spacy.schemas import ConfigSchemaTraining
 from spacy.training import Example
 
 

@@ -39,21 +39,21 @@ def test_readers():
 
     config = Config().from_str(config_string)
     nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
+    dot_names = ["training.train_corpus", "training.dev_corpus"]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
     assert isinstance(train_corpus, Callable)
-    optimizer = resolved["training"]["optimizer"]
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
         nlp.update([example], sgd=optimizer)
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
     scores = nlp.evaluate(list(dev_corpus(nlp)))
     assert scores["cats_score"]
     # ensure the pipeline runs
     doc = nlp("Quick test")
     assert doc.cats
-    extra_corpus = resolved["corpora"]["extra"]
+    extra_corpus = registry.resolve(nlp.config["corpora"])["extra"]
     assert isinstance(extra_corpus, Callable)

@@ -89,9 +89,10 @@ def test_cat_readers(reader, additional_config):
     config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
-    optimizer = resolved["training"]["optimizer"]
+    dot_names = ["training.train_corpus", "training.dev_corpus"]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):

@@ -100,7 +101,6 @@ def test_cat_readers(reader, additional_config):
         assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
         nlp.update([example], sgd=optimizer)
     # simulate performance benchmark on dev corpus
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
     dev_examples = list(dev_corpus(nlp))
     for example in dev_examples:
         # this shouldn't fail if each dev example has at least one positive label
@@ -0,0 +1,205 @@
+from typing import Union, Dict, Optional, Any, List, Callable
+from thinc.api import Config, fix_random_seed, set_gpu_allocator
+from thinc.api import ConfigValidationError
+from pathlib import Path
+import srsly
+
+from .loop import create_before_to_disk_callback
+from ..language import Language
+from ..lookups import Lookups
+from ..errors import Errors
+from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
+from ..util import registry, load_model_from_config, resolve_dot_names
+from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB
+
+
+def init_nlp(
+    config: Config,
+    *,
+    use_gpu: int = -1,
+    logger: Callable[[Any], Any] = logger,
+    on_success: Callable[[str], None] = lambda x: None,
+) -> Language:
+    raw_config = config
+    config = raw_config.interpolate()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    # Use original config here before it's resolved to functions
+    sourced_components = get_sourced_components(config)
+    nlp = load_model_from_config(raw_config, auto_fill=True)
+    on_success("Set up nlp object from config")
+    config = nlp.config.interpolate()
+    # Resolve all training-relevant sections using the filled nlp config
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+    V = I["vocab"]
+    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
+    optimizer = T["optimizer"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Sourced components that require resume_training
+    resume_components = [p for p in sourced_components if p not in frozen_components]
+    logger.info(f"Pipeline: {nlp.pipe_names}")
+    if resume_components:
+        with nlp.select_pipes(enable=resume_components):
+            logger.info(f"Resuming training for: {resume_components}")
+            nlp.resume_training(sgd=optimizer)
+    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
+        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+        on_success(f"Initialized pipeline components")
+    # Verify the config after calling 'begin_training' to ensure labels
+    # are properly initialized
+    verify_config(nlp)
+    if "pretraining" in config and config["pretraining"]:
+        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
+        loaded = add_tok2vec_weights(nlp, P, I)
+        if loaded and P["component"]:
+            on_success(f"Loaded pretrained weights into component '{P['component']}'")
+    nlp = before_to_disk(nlp)
+    return nlp
+
+
+def must_reinitialize(train_config: Config, init_config: Config) -> bool:
+    # TODO: do this better and more fine-grained
+    return train_config.interpolate().to_str() == init_config.interpolate().to_str()
+
+
+def init_vocab(
+    nlp: Language,
+    *,
+    data: Optional[Path] = None,
+    lookups: Optional[Lookups] = None,
+    vectors: Optional[str] = None,
+    on_success: Callable[[str], None] = lambda x: None,
+) -> Language:
+    if lookups:
+        nlp.vocab.lookups = lookups
+        on_success(f"Added vocab lookups: {', '.join(lookups.tables)}")
+    data_path = ensure_path(data)
+    if data_path is not None:
+        lex_attrs = srsly.read_jsonl(data_path)
+        for lexeme in nlp.vocab:
+            lexeme.rank = OOV_RANK
+        for attrs in lex_attrs:
+            if "settings" in attrs:
+                continue
+            lexeme = nlp.vocab[attrs["orth"]]
+            lexeme.set_attrs(**attrs)
+        if len(nlp.vocab):
+            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
+        else:
+            oov_prob = DEFAULT_OOV_PROB
+        nlp.vocab.cfg.update({"oov_prob": oov_prob})
+        on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    on_success("Created vocabulary")
+    if vectors is not None:
+        load_vectors_into_model(nlp, vectors)
+        on_success(f"Added vectors: {vectors}")
+
+
+def load_vectors_into_model(
+    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
+) -> None:
+    """Load word vectors from an installed model or path into a model instance."""
+    try:
+        vectors_nlp = load_model(name)
+    except ConfigValidationError as e:
+        title = f"Config validation error for vectors {name}"
+        desc = (
+            "This typically means that there's a problem in the config.cfg included "
+            "with the packaged vectors. Make sure that the vectors package you're "
+            "loading is compatible with the current version of spaCy."
+        )
+        err = ConfigValidationError.from_error(config=None, title=title, desc=desc)
+        raise err from None
+    nlp.vocab.vectors = vectors_nlp.vocab.vectors
+    if add_strings:
+        # I guess we should add the strings from the vectors_nlp model?
+        # E.g. if someone does a similarity query, they might expect the strings.
+        for key in nlp.vocab.vectors.key2row:
+            if key in vectors_nlp.vocab.strings:
+                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
+
+
+def add_tok2vec_weights(
+    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+) -> bool:
+    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
+    P = pretrain_config
+    V = vocab_config
+    weights_data = None
+    init_tok2vec = ensure_path(V["init_tok2vec"])
+    if init_tok2vec is not None:
+        if P["objective"].get("type") == "vectors" and not V["vectors"]:
+            err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
+            errors = [{"loc": ["initialize", "vectors"], "msg": err}]
+            raise ConfigValidationError(config=nlp.config, errors=errors)
+        if not init_tok2vec.exists():
+            err = f"can't find pretrained tok2vec: {init_tok2vec}"
+            errors = [{"loc": ["initialize", "vectors", "init_tok2vec"], "msg": err}]
+            raise ConfigValidationError(config=nlp.config, errors=errors)
+        with init_tok2vec.open("rb") as file_:
+            weights_data = file_.read()
+    if weights_data is not None:
+        tok2vec_component = P["component"]
+        if tok2vec_component is None:
+            desc = (
+                f"To use pretrained tok2vec weights, [pretraining.component] "
+                f"needs to specify the component that should load them."
+            )
+            err = "component can't be null"
+            errors = [{"loc": ["pretraining", "component"], "msg": err}]
+            raise ConfigValidationError(
+                config=nlp.config["pretraining"], errors=errors, desc=desc
+            )
+        layer = nlp.get_pipe(tok2vec_component).model
+        if P["layer"]:
+            layer = layer.get_ref(P["layer"])
+        layer.from_bytes(weights_data)
+        return True
+    return False
+
+
+def verify_config(nlp: Language) -> None:
+    """Perform additional checks based on the config, loaded nlp object and training data."""
+    # TODO: maybe we should validate based on the actual components, the list
+    # in config["nlp"]["pipeline"] instead?
+    for pipe_config in nlp.config["components"].values():
+        # We can't assume that the component name == the factory
+        factory = pipe_config["factory"]
+        if factory == "textcat":
+            verify_textcat_config(nlp, pipe_config)
+
+
+def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
+    # if 'positive_label' is provided: double check whether it's in the data and
+    # the task is binary
+    if pipe_config.get("positive_label"):
+        textcat_labels = nlp.get_pipe("textcat").labels
+        pos_label = pipe_config.get("positive_label")
+        if pos_label not in textcat_labels:
+            raise ValueError(
+                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
+            )
+        if len(list(textcat_labels)) != 2:
+            raise ValueError(
+                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
+            )
+
+
+def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
+    """RETURNS (List[str]): All sourced components in the original config,
+        e.g. {"source": "en_core_web_sm"}. If the config contains a key
+        "factory", we assume it refers to a component factory.
+    """
+    return [
+        name
+        for name, cfg in config.get("components", {}).items()
+        if "factory" not in cfg and "source" in cfg
+    ]
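
spacy/training/initialize.py now owns everything that happens before the first
optimization step: seeding, vocab and vector setup, begin_training for regular
components, resume_training for sourced ones, and loading pretrained tok2vec
weights. One line worth a second look: must_reinitialize() returns True when
the two interpolated configs are *equal*, which reads inverted relative to its
call site in train_cli (the warning there says "Config has changed"); if the
comparison is intentional, a comment would help. The sourced-component split is
easy to check directly; a small illustration of get_sourced_components() (the
config dict here is made up):

    # Illustration: components with "source" and no "factory" count as sourced.
    config = {
        "components": {
            "ner": {"source": "en_core_web_sm"},   # sourced -> resume_training
            "tagger": {"factory": "tagger"},       # factory -> begin_training
        }
    }
    assert get_sourced_components(config) == ["ner"]
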

spacy/training/loop.py (new file, 301 lines)
@@ -0,0 +1,301 @@
+from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
+from typing import Optional
+from pathlib import Path
+from timeit import default_timer as timer
+from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
+import random
+import tqdm
+
+from .example import Example
+from ..schemas import ConfigSchemaTraining
+from ..language import Language
+from ..errors import Errors
+from ..util import resolve_dot_names, registry, logger
+
+
+def train(
+    nlp: Language,
+    output_path: Optional[Path] = None,
+    *,
+    use_gpu: int = -1,
+    logger: Callable[[Any], Any] = logger,
+) -> Optional[Path]:
+    """Train a pipeline.
+
+    nlp (Language): The initialized nlp object with the full config.
+    output_path (Path): Optional output path to save trained model to.
+    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
+        before calling this function.
+    logger (Callable[[Any], Any]): Optional logger exposing the methods info,
+        error, debug and warn. Defaults to regular spaCy logger but can be
+        swapped for CLI logger.
+    RETURNS (Path / None): The path to the final exported model.
+    """
+
+    # Create iterator, which yields out info after each optimization step.
+    config = nlp.config.interpolate()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    optimizer = T["optimizer"]
+    score_weights = T["score_weights"]
+    batcher = T["batcher"]
+    train_logger = T["logger"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Create iterator, which yields out info after each optimization step.
+    training_step_iterator = train_while_improving(
+        nlp,
+        optimizer,
+        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
+        create_evaluation_callback(nlp, dev_corpus, score_weights),
+        dropout=T["dropout"],
+        accumulate_gradient=T["accumulate_gradient"],
+        patience=T["patience"],
+        max_steps=T["max_steps"],
+        eval_frequency=T["eval_frequency"],
+        exclude=frozen_components,
+    )
+    logger.info(f"Pipeline: {nlp.pipe_names}")
+    if frozen_components:
+        logger.info(f"Frozen components: {frozen_components}")
+    logger.info(f"Initial learn rate: {optimizer.learn_rate}")
 | 
				
			||||||
 | 
					    with nlp.select_pipes(disable=frozen_components):
 | 
				
			||||||
 | 
					        print_row, finalize_logger = train_logger(nlp)
 | 
				
			||||||
 | 
					    try:
 | 
				
			||||||
 | 
					        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
 | 
				
			||||||
 | 
					        progress.set_description(f"Epoch 1")
 | 
				
			||||||
 | 
					        for batch, info, is_best_checkpoint in training_step_iterator:
 | 
				
			||||||
 | 
					            progress.update(1)
 | 
				
			||||||
 | 
					            if is_best_checkpoint is not None:
 | 
				
			||||||
 | 
					                progress.close()
 | 
				
			||||||
 | 
					                print_row(info)
 | 
				
			||||||
 | 
					                if is_best_checkpoint and output_path is not None:
 | 
				
			||||||
 | 
					                    with nlp.select_pipes(disable=frozen_components):
 | 
				
			||||||
 | 
					                        update_meta(T, nlp, info)
 | 
				
			||||||
 | 
					                    with nlp.use_params(optimizer.averages):
 | 
				
			||||||
 | 
					                        nlp = before_to_disk(nlp)
 | 
				
			||||||
 | 
					                        nlp.to_disk(output_path / "model-best")
 | 
				
			||||||
 | 
					                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
 | 
				
			||||||
 | 
					                progress.set_description(f"Epoch {info['epoch']}")
 | 
				
			||||||
 | 
					    except Exception as e:
 | 
				
			||||||
 | 
					        finalize_logger()
 | 
				
			||||||
 | 
					        if output_path is not None:
 | 
				
			||||||
 | 
					            # We don't want to swallow the traceback if we don't have a
 | 
				
			||||||
 | 
					            # specific error.
 | 
				
			||||||
 | 
					            logger.warn(
 | 
				
			||||||
 | 
					                f"Aborting and saving the final best model. "
 | 
				
			||||||
 | 
					                f"Encountered exception: {str(e)}"
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            nlp = before_to_disk(nlp)
 | 
				
			||||||
 | 
					            nlp.to_disk(output_path / "model-final")
 | 
				
			||||||
 | 
					        raise e
 | 
				
			||||||
 | 
					    finally:
 | 
				
			||||||
 | 
					        finalize_logger()
 | 
				
			||||||
 | 
					        if output_path is not None:
 | 
				
			||||||
 | 
					            final_model_path = output_path / "model-final"
 | 
				
			||||||
 | 
					            if optimizer.averages:
 | 
				
			||||||
 | 
					                with nlp.use_params(optimizer.averages):
 | 
				
			||||||
 | 
					                    nlp.to_disk(final_model_path)
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                nlp.to_disk(final_model_path)
 | 
				
			||||||
 | 
					            return final_model_path
 | 
				
			||||||
 | 
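For orientation, a minimal sketch of driving this entry point outside the CLI. The config path and output directory are placeholders, and the pipeline is assumed to already be initialized before training starts:

from pathlib import Path
from thinc.api import Config
from spacy.util import load_model_from_config
from spacy.training.loop import train  # this module

config = Config().from_disk("config.cfg")  # hypothetical path
nlp = load_model_from_config(config)       # assumes weights are initialized elsewhere
final_path = train(nlp, output_path=Path("training"), use_gpu=-1)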
					

def train_while_improving(
    nlp: Language,
    optimizer: Optimizer,
    train_data,
    evaluate,
    *,
    dropout: float,
    eval_frequency: int,
    accumulate_gradient: int,
    patience: int,
    max_steps: int,
    exclude: List[str],
):
    """Train until an evaluation stops improving. Works as a generator,
    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
    where info is a dict, and is_best_checkpoint is in [True, False, None] --
    None indicating that the iteration was not evaluated as a checkpoint.
    The evaluation is conducted by calling the evaluate callback.

    Positional arguments:
        nlp: The spaCy pipeline to evaluate.
        optimizer: The optimizer callable.
        train_data (Iterable[Batch]): A generator of batches, with the training
            data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
            data iterable needs to take care of iterating over the epochs and
            shuffling.
        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
            The callback should take no arguments and return a tuple
            `(main_score, other_scores)`. The main_score should be a float where
            higher is better. other_scores can be any object.

    Every iteration, the function yields out a tuple with:

    * batch: A list of Example objects.
    * info: A dict with various information about the last update (see below).
    * is_best_checkpoint: A value in None, False, True, indicating whether this
        was the best evaluation so far. You should use this to save the model
        checkpoints during training. If None, evaluation was not conducted on
        that iteration. False means evaluation was conducted, but a previous
        evaluation was better.

    The info dict provides the following information:

        epoch (int): How many passes over the data have been completed.
        step (int): How many steps have been completed.
        score (float): The main score from the last evaluation.
        other_scores: The other scores from the last evaluation.
        losses: The accumulated losses throughout training.
        checkpoints: A list of previous results, where each result is a
            (score, step) tuple.
    """
    if isinstance(dropout, float):
        dropouts = constant(dropout)
    else:
        dropouts = dropout
    results = []
    losses = {}
    words_seen = 0
    start_time = timer()
    for step, (epoch, batch) in enumerate(train_data):
        dropout = next(dropouts)
        for subbatch in subdivide_batch(batch, accumulate_gradient):
            nlp.update(
                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
            )
        # TODO: refactor this so we don't have to run it separately in here
        for name, proc in nlp.pipeline:
            if (
                name not in exclude
                and hasattr(proc, "model")
                and proc.model not in (True, False, None)
            ):
                proc.model.finish_update(optimizer)
        optimizer.step_schedules()
        if not (step % eval_frequency):
            if optimizer.averages:
                with nlp.use_params(optimizer.averages):
                    score, other_scores = evaluate()
            else:
                score, other_scores = evaluate()
            results.append((score, step))
            is_best_checkpoint = score == max(results)[0]
        else:
            score, other_scores = (None, None)
            is_best_checkpoint = None
        words_seen += sum(len(eg) for eg in batch)
        info = {
            "epoch": epoch,
            "step": step,
            "score": score,
            "other_scores": other_scores,
            "losses": losses,
            "checkpoints": results,
            "seconds": int(timer() - start_time),
            "words": words_seen,
        }
        yield batch, info, is_best_checkpoint
        if is_best_checkpoint is not None:
            losses = {}
        # Stop if no improvement in `patience` updates (if specified)
        best_score, best_step = max(results)
        if patience and (step - best_step) >= patience:
            break
        # Stop if we've exhausted our max steps (if specified)
        if max_steps and step >= max_steps:
            break
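A sketch of consuming the generator contract described above; training_step_iterator and save_checkpoint are placeholders:

for batch, info, is_best_checkpoint in training_step_iterator:
    if is_best_checkpoint:              # evaluated this step and improved
        save_checkpoint(info["step"])   # hypothetical helper
    elif is_best_checkpoint is False:   # evaluated, but a previous result was better
        pass
    # None means this step wasn't an evaluation step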

def subdivide_batch(batch, accumulate_gradient):
    batch = list(batch)
    batch.sort(key=lambda eg: len(eg.predicted))
    sub_len = len(batch) // accumulate_gradient
    start = 0
    for i in range(accumulate_gradient):
        subbatch = batch[start : start + sub_len]
        if subbatch:
            yield subbatch
        start += len(subbatch)
    subbatch = batch[start:]
    if subbatch:
        yield subbatch
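A quick illustration of the splitting behaviour, using a stand-in for Example since only len(eg.predicted) is consulted:

from dataclasses import dataclass

@dataclass
class FakeExample:  # stand-in for Example, for illustration only
    predicted: str

batch = [FakeExample("x" * n) for n in (5, 1, 3, 2, 4, 6, 7, 8)]
print([len(sub) for sub in subdivide_batch(batch, accumulate_gradient=3)])
# [2, 2, 2, 2] -- three slices of len(batch) // 3 == 2, plus the remainder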

def create_evaluation_callback(
    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
) -> Callable[[], Tuple[float, Dict[str, float]]]:
    weights = {key: value for key, value in weights.items() if value is not None}

    def evaluate() -> Tuple[float, Dict[str, float]]:
        dev_examples = list(dev_corpus(nlp))
        scores = nlp.evaluate(dev_examples)
        # Calculate a weighted sum based on score_weights for the main score.
        # We can only consider scores that are ints/floats, not dicts like
        # entity scores per type etc.
        for key, value in scores.items():
            if key in weights and not isinstance(value, (int, float)):
                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
        try:
            weighted_score = sum(
                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
            )
        except KeyError as e:
            keys = list(scores.keys())
            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
            raise KeyError(err) from None
        return weighted_score, scores

    return evaluate
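The main score is a plain weighted sum over the flat scores dict, for example:

scores = {"tag_acc": 0.92, "dep_uas": 0.88, "ents_per_type": {}}  # dict value ignored: not in weights
weights = {"tag_acc": 0.5, "dep_uas": 0.5}
print(sum(scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights))  # 0.9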

def create_train_batches(
    iterator: Iterator[Example],
    batcher: Callable[[Iterable[Example]], Iterable[Example]],
    max_epochs: int,
):
    epoch = 0
    examples = list(iterator)
    if not examples:
        # Raise error if no data
        raise ValueError(Errors.E986)
    while max_epochs < 1 or epoch != max_epochs:
        random.shuffle(examples)
        for batch in batcher(examples):
            yield epoch, batch
        epoch += 1
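A tiny illustration of the epoch counter, with ints standing in for Example objects and a trivial batcher (shuffling makes the batch contents vary per epoch):

import random

def pair_batcher(items):  # hypothetical batcher: fixed pairs
    items = list(items)
    for i in range(0, len(items), 2):
        yield items[i : i + 2]

for epoch, batch in create_train_batches(iter([1, 2, 3, 4]), pair_batcher, max_epochs=2):
    print(epoch, batch)  # epoch 0 twice, then epoch 1 twice
# Note: max_epochs < 1 streams epochs forever; stopping is left to the caller.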

def update_meta(
    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
) -> None:
    nlp.meta["performance"] = {}
    for metric in training["score_weights"]:
        if metric is not None:
            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
    for pipe_name in nlp.pipe_names:
        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]


def create_before_to_disk_callback(
    callback: Optional[Callable[[Language], Language]]
) -> Callable[[Language], Language]:
    def before_to_disk(nlp: Language) -> Language:
        if not callback:
            return nlp
        modified_nlp = callback(nlp)
        if not isinstance(modified_nlp, Language):
            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
            raise ValueError(err)
        return modified_nlp

    return before_to_disk
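A sketch of a before_to_disk callback: it must return the (possibly modified) Language object, otherwise E914 is raised. The cleanup shown is hypothetical:

def strip_debug_meta(nlp):
    nlp.meta.pop("debug", None)  # hypothetical cleanup before serialization
    return nlp

before_to_disk = create_before_to_disk_callback(strip_debug_meta)
nlp = before_to_disk(nlp)  # nlp assumed to be a loaded Language; validated on return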
							
								
								
									
267  spacy/training/pretrain.py  Normal file
@@ -0,0 +1,267 @@
from typing import Optional, Callable, Any, Iterable, Union, List
from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
from pathlib import Path
from functools import partial
from collections import Counter
import srsly
import numpy
import time
import re
from wasabi import msg

from .example import Example
from ..tokens import Doc
from ..attrs import ID
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
from ..util import registry, load_model_from_config, dot_to_object, logger


def pretrain(
    config: Config,
    output_dir: Path,
    resume_path: Optional[Path] = None,
    epoch_resume: Optional[int] = None,
    use_gpu: int = -1,
    logger: Callable[[Any], Any] = logger,
):
    if config["training"]["seed"] is not None:
        fix_random_seed(config["training"]["seed"])
    allocator = config["training"]["gpu_allocator"]
    if use_gpu >= 0 and allocator:
        set_gpu_allocator(allocator)
    nlp = load_model_from_config(config)
    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
    P = registry.resolve(nlp.config["pretraining"], schema=ConfigSchemaPretrain)
    corpus = dot_to_object(T, P["corpus"])
    batcher = P["batcher"]
    model = create_pretraining_model(nlp, P)
    optimizer = P["optimizer"]
    # Load in pretrained weights to resume from
    if resume_path is not None:
        _resume_model(model, resume_path, epoch_resume)
    else:
        # Without '--resume-path' the '--epoch-resume' argument is ignored
        epoch_resume = 0

    # TODO: move this to logger function?
    tracker = ProgressTracker(frequency=10000)
    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
                file_.write(model.get_ref("tok2vec").to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    objective = create_objective(P["objective"])
    # TODO: I think we probably want this to look more like the
    # 'create_train_batches' function?
    for epoch in range(epoch_resume, P["max_epochs"]):
        for batch_id, batch in enumerate(batcher(corpus(nlp))):
            docs = ensure_docs(batch)
            loss = make_update(model, docs, optimizer, objective)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0

def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
    docs = []
    for eg_or_doc in examples_or_docs:
        if isinstance(eg_or_doc, Doc):
            docs.append(eg_or_doc)
        else:
            docs.append(eg_or_doc.reference)
    return docs

def _resume_model(
    model: Model,
    resume_path: Path,
    epoch_resume: int,
    logger: Callable[[Any], Any] = logger,
) -> None:
    logger.info(f"Resume training tok2vec from: {resume_path}")
    with resume_path.open("rb") as file_:
        weights_data = file_.read()
        model.get_ref("tok2vec").from_bytes(weights_data)
    # Parse the epoch number from the given weight file
    model_name = re.search(r"model\d+\.bin", str(resume_path))
    if model_name:
        # Default weight file name, so read the starting epoch from it by
        # cutting off 'model' and '.bin'
        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
    logger.info(f"Resuming from epoch: {epoch_resume}")
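The filename convention assumed by the epoch parsing above is "model<epoch>.bin":

import re

m = re.search(r"model\d+\.bin", "training/model7.bin")
print(int(m.group(0)[5:][:-4]) + 1)  # 8 -> resume with the next epoch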

def make_update(
    model: Model, docs: Iterable[Doc], optimizer: Optimizer, objective_func: Callable
) -> float:
    """Perform an update over a single batch of documents.

    docs (iterable): A batch of `Doc` objects.
    optimizer (callable): An optimizer.
    RETURNS loss: A float for the loss.
    """
    predictions, backprop = model.begin_update(docs)
    loss, gradients = objective_func(model.ops, docs, predictions)
    backprop(gradients)
    model.finish_update(optimizer)
    # We don't want to return a cupy object here. The gradients are modified
    # in-place by the BERT MLM, so we get an accurate loss.
    return float(loss)

def create_objective(config: Config):
    """Create the objective for pretraining.

    We'd like to replace this with a registry function but it's tricky because
    we're also making a model choice based on this. For now we hard-code support
    for two types (characters, vectors). For characters you can specify
    n_characters, for vectors you can specify the loss.
    """
    objective_type = config["type"]
    if objective_type == "characters":
        return partial(get_characters_loss, nr_char=config["n_characters"])
    elif objective_type == "vectors":
        if config["loss"] == "cosine":
            distance = CosineDistance(normalize=True, ignore_zeros=True)
            return partial(get_vectors_loss, distance=distance)
        elif config["loss"] == "L2":
            distance = L2Distance(normalize=True, ignore_zeros=True)
            return partial(get_vectors_loss, distance=distance)
        else:
            raise ValueError("Unexpected loss type", config["loss"])
    else:
        raise ValueError("Unexpected objective_type", objective_type)
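Examples of the two objective blocks this function accepts, with the shapes taken from the code above:

char_cfg = {"type": "characters", "n_characters": 4}
vec_cfg = {"type": "vectors", "loss": "cosine"}  # or "L2"
loss_fn = create_objective(vec_cfg)  # -> partial(get_vectors_loss, distance=...)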

def get_vectors_loss(ops, docs, prediction, distance):
    """Compute a loss based on a distance between the documents' vectors and
    the prediction.
    """
    # The simplest way to implement this would be to vstack the
    # token.vector values, but that's a bit inefficient, especially on GPU.
    # Instead we fetch the index into the vectors table for each of our tokens,
    # and look them up all at once. This prevents data copying.
    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
    target = docs[0].vocab.vectors.data[ids]
    d_target, loss = distance(prediction, target)
    return loss, d_target

def get_characters_loss(ops, docs, prediction, nr_char):
    """Compute a loss based on a number of characters predicted from the docs."""
    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
    target_ids = target_ids.reshape((-1,))
    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
    target = target.reshape((-1, 256 * nr_char))
    diff = prediction - target
    loss = (diff ** 2).sum()
    d_target = diff / float(prediction.shape[0])
    return loss, d_target
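A shape sanity check on the squared-error maths above, with random arrays in place of real predictions (nr_char and the token count are arbitrary):

import numpy

nr_char, n_tokens = 4, 3
prediction = numpy.random.rand(n_tokens, 256 * nr_char).astype("f")
target = numpy.random.rand(n_tokens, 256 * nr_char).astype("f")
diff = prediction - target
print((diff ** 2).sum())                          # scalar squared-error loss
print((diff / float(prediction.shape[0])).shape)  # (3, 1024), same shape as the prediction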

def create_pretraining_model(nlp, pretrain_config):
    """Define a network for the pretraining. We simply add an output layer onto
    the tok2vec input model. The tok2vec input model needs to be a model that
    takes a batch of Doc objects (as a list), and returns a list of arrays.
    Each array in the output needs to have one row per token in the doc.
    The actual tok2vec layer is stored as a reference, and only this bit will be
    serialized to file and read back in when calling the 'train' command.
    """
    component = nlp.get_pipe(pretrain_config["component"])
    if pretrain_config.get("layer"):
        tok2vec = component.model.get_ref(pretrain_config["layer"])
    else:
        tok2vec = component.model

    # TODO
    maxout_pieces = 3
    hidden_size = 300
    if pretrain_config["objective"]["type"] == "vectors":
        model = build_cloze_multi_task_model(
            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
        )
    elif pretrain_config["objective"]["type"] == "characters":
        model = build_cloze_characters_multi_task_model(
            nlp.vocab,
            tok2vec,
            hidden_size=hidden_size,
            maxout_pieces=maxout_pieces,
            nr_char=pretrain_config["objective"]["n_characters"],
        )
    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
    set_dropout_rate(model, pretrain_config["dropout"])
    return model

class ProgressTracker:
    def __init__(self, frequency=1000000):
        self.loss = 0.0
        self.prev_loss = 0.0
        self.nr_word = 0
        self.words_per_epoch = Counter()
        self.frequency = frequency
        self.last_time = time.time()
        self.last_update = 0
        self.epoch_loss = 0.0

    def update(self, epoch, loss, docs):
        self.loss += loss
        self.epoch_loss += loss
        words_in_batch = sum(len(doc) for doc in docs)
        self.words_per_epoch[epoch] += words_in_batch
        self.nr_word += words_in_batch
        words_since_update = self.nr_word - self.last_update
        if words_since_update >= self.frequency:
            wps = words_since_update / (time.time() - self.last_time)
            self.last_update = self.nr_word
            self.last_time = time.time()
            loss_per_word = self.loss - self.prev_loss
            status = (
                epoch,
                self.nr_word,
                _smart_round(self.loss, width=10),
                _smart_round(loss_per_word, width=6),
                int(wps),
            )
            self.prev_loss = float(self.loss)
            return status
        else:
            return None

def _smart_round(
    figure: Union[float, int], width: int = 10, max_decimal: int = 4
) -> str:
    """Round large numbers as integers, smaller numbers as decimals."""
    n_digits = len(str(int(figure)))
    n_decimal = width - (n_digits + 1)
    if n_decimal <= 1:
        return str(int(figure))
    else:
        n_decimal = min(n_decimal, max_decimal)
        format_str = "%." + str(n_decimal) + "f"
        return format_str % figure
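Two examples of the rounding behaviour, using the defaults above:

print(_smart_round(123456.789))  # '123456.789' -- 3 decimals still fit width 10
print(_smart_round(0.123456))    # '0.1235' -- capped at max_decimal=4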

spacy/util.py
@@ -8,6 +8,7 @@ import re
 from pathlib import Path
 import thinc
 from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
+from thinc.api import ConfigValidationError
 import functools
 import itertools
 import numpy.random
@@ -56,6 +57,7 @@ if TYPE_CHECKING:
 
 
 OOV_RANK = numpy.iinfo(numpy.uint64).max
+DEFAULT_OOV_PROB = -20
 LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
 
 # Default order of sections in the config.cfg. Not all sections needs to exist,
@@ -239,20 +241,6 @@ def get_module_path(module: ModuleType) -> Path:
     return Path(sys.modules[module.__module__].__file__).parent
 
 
-def load_vectors_into_model(
-    nlp: "Language", name: Union[str, Path], *, add_strings=True
-) -> None:
-    """Load word vectors from an installed model or path into a model instance."""
-    vectors_nlp = load_model(name)
-    nlp.vocab.vectors = vectors_nlp.vocab.vectors
-    if add_strings:
-        # I guess we should add the strings from the vectors_nlp model?
-        # E.g. if someone does a similarity query, they might expect the strings.
-        for key in nlp.vocab.vectors.key2row:
-            if key in vectors_nlp.vocab.strings:
-                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
-
-
 def load_model(
     name: Union[str, Path],
     *,
@@ -391,32 +379,9 @@ def load_model_from_config(
     return nlp
 
 
-def resolve_training_config(
-    config: Config,
-    exclude: Iterable[str] = ("nlp", "components"),
-    validate: bool = True,
-) -> Dict[str, Any]:
-    """Resolve the config sections relevant for training and create all objects.
-    Mostly used in the CLI to separate training config (not resolved by default
-    because not runtime-relevant – an nlp object should load fine even if its
-    [training] block refers to functions that are not available etc.).
-
-    config (Config): The config to resolve.
-    exclude (Iterable[str]): The config blocks to exclude. Those blocks won't
-        be available in the final resolved config.
-    validate (bool): Whether to validate the config.
-    RETURNS (Dict[str, Any]): The resolved config.
-    """
-    config = config.copy()
-    for key in exclude:
-        if key in config:
-            config.pop(key)
-    return registry.resolve(config, validate=validate)
-
-
 def resolve_dot_names(
     config: Config, dot_names: List[Optional[str]]
-) -> List[Optional[Callable]]:
+) -> Tuple[Any]:
     """Resolve one or more "dot notation" names, e.g. corpora.train.
     The paths could point anywhere into the config, so we don't know which
     top-level section we'll be looking within.
@@ -424,18 +389,42 @@ def resolve_dot_names(
     We resolve the whole top-level section, although we could resolve less --
     we could find the lowest part of the tree.
     """
+    # TODO: include schema?
+    # TODO: clean this up and avoid duplication
     resolved = {}
     output = []
+    errors = []
     for name in dot_names:
         if name is None:
             output.append(name)
         else:
             section = name.split(".")[0]
-            # We want to avoid resolving the same thing twice.
+            # We want to avoid resolving the same thing twice
             if section not in resolved:
                 resolved[section] = registry.resolve(config[section])
-            output.append(dot_to_object(resolved, name))
-    return output
+            try:
+                output.append(dot_to_object(resolved, name))
+            except KeyError:
+                msg = f"not a valid section reference: {name}"
+                errors.append({"loc": name.split("."), "msg": msg})
+    objects = []
+    for ref in output:
+        if not isinstance(ref, str):
+            msg = f"not a valid section reference: {ref} ({type(ref)})"
+            errors.append({"loc": ref.split("."), "msg": msg})
+            continue
+        section = ref.split(".")[0]
+        # We want to avoid resolving the same thing twice
+        if section not in resolved:
+            resolved[section] = registry.resolve(config[section])
+        try:
+            objects.append(dot_to_object(resolved, ref))
+        except KeyError:
+            msg = f"not a valid section reference: {ref}"
+            errors.append({"loc": ref.split("."), "msg": msg})
+    if errors:
+        raise ConfigValidationError(config=config, errors=errors)
+    return tuple(objects)
 
 
 def load_model_from_init_py(
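A sketch of the call pattern for the new resolve_dot_names, matching its use in spacy/training/loop.py above (config is a placeholder for a full interpolated Config):

dot_names = ["corpora.train", "corpora.dev"]
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
# Invalid references are collected and raised together as a ConfigValidationError.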