Merge remote-tracking branch 'origin/develop' into rliaw-develop

Richard Liaw 2020-07-11 19:47:28 -07:00
commit 3bccf8b954
138 changed files with 5114 additions and 3110 deletions


@@ -5,16 +5,16 @@
 # data is passed in sentence-by-sentence via some prior preprocessing.
 gold_preproc = false
 # Limitations on training document length or number of examples.
-max_length = 5000
+max_length = 3000
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
-dropout = 0.2
+dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
-patience = 1600
+patience = 100000
 max_epochs = 0
-max_steps = 20000
-eval_frequency = 500
+max_steps = 0
+eval_frequency = 1000
 # Other settings
 seed = 0
 accumulate_gradient = 1
@@ -26,6 +26,7 @@ score_weights = {"ents_f": 1.0}
 init_tok2vec = null
 discard_oversize = false
 omit_extra_lookups = false
+batch_by = "words"
 [training.batch_size]
 @schedules = "compounding.v1"
@@ -37,19 +38,13 @@ compound = 1.001
 @optimizers = "Adam.v1"
 beta1 = 0.9
 beta2 = 0.999
-L2_is_weight_decay = false
-L2 = 1e-6
+L2_is_weight_decay = true
+L2 = 0.01
 grad_clip = 1.0
 use_averages = true
 eps = 1e-8
 learn_rate = 0.001
-#[optimizer.learn_rate]
-#@schedules = "warmup_linear.v1"
-#warmup_steps = 250
-#total_steps = 20000
-#initial_rate = 0.001
 [nlp]
 lang = "en"
 vectors = null
@@ -58,8 +53,6 @@ vectors = null
 factory = "ner"
 learn_tokens = false
 min_action_freq = 1
-beam_width = 1
-beam_update_prob = 1.0
 [nlp.pipeline.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
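The optimizer changes above swap classic L2 regularization (L2 = 1e-6) for decoupled, AdamW-style weight decay (L2_is_weight_decay = true, L2 = 0.01). A minimal sketch of the optimizer these config values resolve to, assuming thinc v8's Adam keyword names:

from thinc.api import Adam

optimizer = Adam(
    learn_rate=0.001,
    beta1=0.9,
    beta2=0.999,
    eps=1e-8,
    L2=0.01,
    L2_is_weight_decay=True,  # decay weights directly rather than adding an L2 loss term
    grad_clip=1.0,
    use_averages=True,
)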

spacy/about.py

@@ -1,8 +1,7 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a2"
+__version__ = "3.0.0a4"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
-__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
 __projects__ = "https://github.com/explosion/spacy-boilerplates"

spacy/cli/__init__.py

@@ -11,12 +11,15 @@ from .profile import profile # noqa: F401
 from .train import train_cli # noqa: F401
 from .pretrain import pretrain # noqa: F401
 from .debug_data import debug_data # noqa: F401
+from .debug_model import debug_model # noqa: F401
 from .evaluate import evaluate # noqa: F401
 from .convert import convert # noqa: F401
 from .init_model import init_model # noqa: F401
 from .validate import validate # noqa: F401
-from .project import project_clone, project_assets, project_run # noqa: F401
-from .project import project_run_all # noqa: F401
+from .project.clone import project_clone # noqa: F401
+from .project.assets import project_assets # noqa: F401
+from .project.run import project_run # noqa: F401
+from .project.dvc import project_update_dvc # noqa: F401
 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

spacy/cli/_app.py

@@ -8,9 +8,16 @@ HELP = """spaCy Command-line Interface
 DOCS: https://spacy.io/api/cli
 """
+PROJECT_HELP = f"""Command-line interface for spaCy projects and working with
+project templates. You'd typically start by cloning a project template to a local
+directory and fetching its assets like datasets etc. See the project's
+project.yml for the available commands.
+"""
 app = typer.Typer(name=NAME, help=HELP)
+project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
+app.add_typer(project_cli)
 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.

spacy/cli/convert.py

@@ -120,8 +120,12 @@ def convert(
         no_print=silent,
         ner_map=ner_map,
     )
+    if file_type == "json":
+        data = [docs_to_json(docs)]
+    else:
+        data = DocBin(docs=docs, store_user_data=True).to_bytes()
     if output_dir == "-":
-        _print_docs_to_stdout(docs, file_type)
+        _print_docs_to_stdout(data, file_type)
     else:
         if input_loc != input_path:
             subpath = input_loc.relative_to(input_path)
@@ -129,24 +133,23 @@ def convert(
         else:
             output_file = Path(output_dir) / input_loc.parts[-1]
         output_file = output_file.with_suffix(f".{file_type}")
-        _write_docs_to_file(docs, output_file, file_type)
+        _write_docs_to_file(data, output_file, file_type)
         msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
-def _print_docs_to_stdout(docs, output_type):
+def _print_docs_to_stdout(data, output_type):
     if output_type == "json":
-        srsly.write_json("-", [docs_to_json(docs)])
+        srsly.write_json("-", data)
     else:
-        sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
+        sys.stdout.buffer.write(data)
-def _write_docs_to_file(docs, output_file, output_type):
+def _write_docs_to_file(data, output_file, output_type):
     if not output_file.parent.exists():
         output_file.parent.mkdir(parents=True)
     if output_type == "json":
-        srsly.write_json(output_file, [docs_to_json(docs)])
+        srsly.write_json(output_file, data)
     else:
-        data = DocBin(docs=docs, store_user_data=True).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)
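Since non-JSON output is now a serialized DocBin, a converted file can be read back roughly like this (a sketch; the .spacy path and the English vocab are assumptions):

from pathlib import Path
from spacy.tokens import DocBin
from spacy.lang.en import English

doc_bin = DocBin().from_bytes(Path("corpus/train.spacy").read_bytes())
docs = list(doc_bin.get_docs(English().vocab))  # rehydrate the Doc objects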

spacy/cli/debug_model.py (new file, 168 lines)

@@ -0,0 +1,168 @@
from typing import List
from pathlib import Path
from wasabi import msg
from ._app import app, Arg, Opt
from .. import util
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from ..lang.en import English
@app.command("debug-model")
def debug_model_cli(
# fmt: off
config_path: Path = Arg(..., help="Path to config file", exists=True),
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of pipeline components to train"),
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"),
P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"),
P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
seed: int = Opt(None, "--seed", "-s", help="Random seed"),
# fmt: on
):
"""
Analyze a Thinc ML model - internal structure and activations during training
"""
print_settings = {
"dimensions": dimensions,
"parameters": parameters,
"gradients": gradients,
"attributes": attributes,
"layers": [int(x.strip()) for x in layers.split(",")] if layers else [],
"print_before_training": P0,
"print_after_init": P1,
"print_after_training": P2,
"print_prediction": P3,
}
if seed is not None:
msg.info(f"Fixing random seed: {seed}")
fix_random_seed(seed)
if use_gpu >= 0:
msg.info(f"Using GPU: {use_gpu}")
require_gpu(use_gpu)
else:
msg.info(f"Using CPU")
debug_model(
config_path,
print_settings=print_settings,
)
def debug_model(
config_path: Path,
*,
print_settings=None
):
if print_settings is None:
print_settings = {}
model = util.load_config(config_path, create_objects=True)["model"]
# STEP 0: Printing before training
msg.info(f"Analysing model with ID {model.id}")
if print_settings.get("print_before_training"):
msg.info(f"Before training:")
_print_model(model, print_settings)
# STEP 1: Initializing the model and printing again
model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
if print_settings.get("print_after_init"):
msg.info(f"After initialization:")
_print_model(model, print_settings)
# STEP 2: Updating the model and printing again
optimizer = Adam(0.001)
set_dropout_rate(model, 0.2)
for e in range(3):
Y, get_dX = model.begin_update(_get_docs())
dY = get_gradient(model, Y)
_ = get_dX(dY)
model.finish_update(optimizer)
if print_settings.get("print_after_training"):
msg.info(f"After training:")
_print_model(model, print_settings)
# STEP 3: the final prediction
prediction = model.predict(_get_docs())
if print_settings.get("print_prediction"):
msg.info(f"Prediction:", str(prediction))
def get_gradient(model, Y):
goldY = _get_output(model.ops.xp)
return Y - goldY
def _sentences():
return [
"Apple is looking at buying U.K. startup for $1 billion",
"Autonomous cars shift insurance liability toward manufacturers",
"San Francisco considers banning sidewalk delivery robots",
"London is a big city in the United Kingdom.",
]
def _get_docs():
nlp = English()
return list(nlp.pipe(_sentences()))
def _get_output(xp):
return xp.asarray([xp.asarray([i+10, i+20, i+30], dtype="float32") for i, _ in enumerate(_get_docs())])
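# For the four docs above this yields [[10, 20, 30], [11, 21, 31], [12, 22, 32], [13, 23, 33]]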
def _print_model(model, print_settings):
layers = print_settings.get("layers", "")
parameters = print_settings.get("parameters", False)
dimensions = print_settings.get("dimensions", False)
gradients = print_settings.get("gradients", False)
attributes = print_settings.get("attributes", False)
for i, node in enumerate(model.walk()):
if not layers or i in layers:
msg.info(f"Layer {i}: model ID {node.id}: '{node.name}'")
if dimensions:
for name in node.dim_names:
if node.has_dim(name):
msg.info(f" - dim {name}: {node.get_dim(name)}")
else:
msg.info(f" - dim {name}: {node.has_dim(name)}")
if parameters:
for name in node.param_names:
if node.has_param(name):
print_value = _print_matrix(node.get_param(name))
msg.info(f" - param {name}: {print_value}")
else:
msg.info(f" - param {name}: {node.has_param(name)}")
if gradients:
for name in node.param_names:
if node.has_grad(name):
print_value = _print_matrix(node.get_grad(name))
msg.info(f" - grad {name}: {print_value}")
else:
msg.info(f" - grad {name}: {node.has_grad(name)}")
if attributes:
attrs = node.attrs
for name, value in attrs.items():
msg.info(f" - attr {name}: {value}")
def _print_matrix(value):
if value is None or isinstance(value, bool):
return value
result = str(value.shape) + " - sample: "
sample_matrix = value
for d in range(value.ndim-1):
sample_matrix = sample_matrix[0]
sample_matrix = sample_matrix[0:5]
result = result + str(sample_matrix)
return result
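A typical invocation of the new command might look like this (the config path is hypothetical):

python -m spacy debug-model ./config.cfg --dimensions --parameters -P0 -P1

which prints the layer tree with dimensions and parameter samples before training and again after initialization.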

spacy/cli/download.py

@@ -1,4 +1,4 @@
-from typing import Optional, Sequence, Union
+from typing import Optional, Sequence
 import requests
 import sys
 from wasabi import msg
@@ -8,6 +8,23 @@ from ._app import app, Arg, Opt
 from .. import about
 from ..util import is_package, get_base_version, run_command
+# These are the old shortcuts we previously supported in spacy download. As of
+# v3, shortcuts are deprecated so we're not expecting to add anything to this
+# list. It only exists to show users warnings.
+OLD_SHORTCUTS = {
+    "en": "en_core_web_sm",
+    "de": "de_core_news_sm",
+    "es": "es_core_news_sm",
+    "pt": "pt_core_news_sm",
+    "fr": "fr_core_news_sm",
+    "it": "it_core_news_sm",
+    "nl": "nl_core_news_sm",
+    "el": "el_core_news_sm",
+    "nb": "nb_core_news_sm",
+    "lt": "lt_core_news_sm",
+    "xx": "xx_ent_wiki_sm",
+}
 @app.command(
     "download",
@@ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
         version = components[-1]
         download_model(dl_tpl.format(m=model_name, v=version), pip_args)
     else:
-        shortcuts = get_json(about.__shortcuts__, "available shortcuts")
-        model_name = shortcuts.get(model, model)
+        model_name = model
+        if model in OLD_SHORTCUTS:
+            msg.warn(
+                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. "
+                f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead."
+            )
+            model_name = OLD_SHORTCUTS[model]
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
         download_model(dl_tpl.format(m=model_name, v=version), pip_args)
@@ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
     )
-def get_json(url: str, desc: str) -> Union[dict, list]:
-    r = requests.get(url)
+def get_compatibility() -> dict:
+    version = get_base_version(about.__version__)
+    r = requests.get(about.__compatibility__)
     if r.status_code != 200:
         msg.fail(
             f"Server error ({r.status_code})",
-            f"Couldn't fetch {desc}. Please find a model for your spaCy "
+            f"Couldn't fetch compatibility table. Please find a model for your spaCy "
             f"installation (v{about.__version__}), and download it manually. "
             f"For more details, see the documentation: "
             f"https://spacy.io/usage/models",
             exits=1,
         )
-    return r.json()
-def get_compatibility() -> dict:
-    version = get_base_version(about.__version__)
-    comp_table = get_json(about.__compatibility__, "compatibility table")
+    comp_table = r.json()
     comp = comp_table["spacy"]
     if version not in comp:
         msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)

spacy/cli/project.py (deleted, 708 lines)

@@ -1,708 +0,0 @@
from typing import List, Dict, Any, Optional, Sequence
import typer
import srsly
from pathlib import Path
from wasabi import msg
import subprocess
import os
import re
import shutil
import sys
import requests
import tqdm
from ._app import app, Arg, Opt, COMMAND, NAME
from .. import about
from ..schemas import ProjectConfigSchema, validate
from ..util import ensure_path, run_command, make_tempdir, working_dir
from ..util import get_hash, get_checksum, split_command
CONFIG_FILE = "project.yml"
DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
DIRS = [
"assets",
"metas",
"configs",
"packages",
"metrics",
"scripts",
"notebooks",
"training",
"corpus",
]
CACHES = [
Path.home() / ".torch",
Path.home() / ".caches" / "torch",
os.environ.get("TORCH_HOME"),
Path.home() / ".keras",
]
DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit
# it directly; edit the project.yml instead and re-run the project."""
CLI_HELP = f"""Command-line interface for spaCy projects and working with project
templates. You'd typically start by cloning a project template to a local
directory and fetching its assets like datasets etc. See the project's
{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
Version Control) to manage input and output files and to ensure steps are only
re-run if their inputs change.
"""
project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True)
@project_cli.callback(invoke_without_command=True)
def callback(ctx: typer.Context):
"""This runs before every project command and ensures DVC is installed."""
ensure_dvc()
################
# CLI COMMANDS #
################
@project_cli.command("clone")
def project_clone_cli(
# fmt: off
name: str = Arg(..., help="The name of the template to fetch"),
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
# fmt: on
):
"""Clone a project template from a repository. Calls into "git" and will
only download the files from the given subdirectory. The GitHub repo
defaults to the official spaCy template repo, but can be customized
(including using a private repo). Setting the --git flag will also
initialize the project directory as a Git repo. If the project is intended
to be a Git repo, it should be initialized with Git first, before
initializing DVC (Data Version Control). This allows DVC to integrate with
Git.
"""
if dest == Path.cwd():
dest = dest / name
project_clone(name, dest, repo=repo, git=git, no_init=no_init)
@project_cli.command("init")
def project_init_cli(
# fmt: off
path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
force: bool = Opt(False, "--force", "-F", help="Force initialization"),
# fmt: on
):
"""Initialize a project directory with DVC and optionally Git. This should
typically be taken care of automatically when you run the "project clone"
command, but you can also run it separately. If the project is intended to
be a Git repo, it should be initialized with Git first, before initializing
DVC. This allows DVC to integrate with Git.
"""
project_init(path, git=git, force=force, silent=True)
@project_cli.command("assets")
def project_assets_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Use DVC (Data Version Control) to fetch project assets. Assets are
defined in the "assets" section of the project config. If possible, DVC
will try to track the files so you can pull changes from upstream. It will
also try and store the checksum so the assets are versioned. If the file
can't be tracked or checked, it will be downloaded without DVC. If a checksum
is provided in the project config, the file is only downloaded if no local
file with the same checksum exists.
"""
project_assets(project_dir)
@project_cli.command(
"run-all",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_all_cli(
# fmt: off
ctx: typer.Context,
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on
):
"""Run all commands defined in the project. This command will use DVC and
the defined outputs and dependencies in the project config to determine
which steps need to be re-run and where to start. This means you're only
re-generating data if the inputs have changed.
This command calls into "dvc repro" and all additional arguments are passed
to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
"""
if show_help:
print_run_help(project_dir)
else:
project_run_all(project_dir, *ctx.args)
@project_cli.command(
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_cli(
# fmt: off
ctx: typer.Context,
subcommand: str = Arg(None, help="Name of command defined in project config"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on
):
"""Run a named script defined in the project config. If the command is
part of the default pipeline defined in the "run" section, DVC is used to
determine whether the step should re-run if its inputs have changed, or
whether everything is up to date. If the script is not part of the default
pipeline, it will be called separately without DVC.
If DVC is used, the command calls into "dvc repro" and all additional
arguments are passed to the "dvc repro" command:
https://dvc.org/doc/command-reference/repro
"""
if show_help or not subcommand:
print_run_help(project_dir, subcommand)
else:
project_run(project_dir, subcommand, *ctx.args)
@project_cli.command("exec", hidden=True)
def project_exec_cli(
# fmt: off
subcommand: str = Arg(..., help="Name of command defined in project config"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Execute a command defined in the project config. This CLI command is
only called internally in auto-generated DVC pipelines, as a shortcut for
multi-step commands in the project config. You typically shouldn't have to
call it yourself. To run a command, call "run" or "run-all".
"""
project_exec(project_dir, subcommand)
@project_cli.command("update-dvc")
def project_update_dvc_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
"""Update the auto-generated DVC config file. Uses the steps defined in the
"run" section of the project config. This typically happens automatically
when running a command, but can also be triggered manually if needed.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
if updated:
msg.good(f"Updated DVC config from {CONFIG_FILE}")
else:
msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
app.add_typer(project_cli, name="project")
#################
# CLI FUNCTIONS #
#################
def project_clone(
name: str,
dest: Path,
*,
repo: str = about.__projects__,
git: bool = False,
no_init: bool = False,
) -> None:
"""Clone a project template from a repository.
name (str): Name of subdirectory to clone.
dest (Path): Destination path of cloned project.
repo (str): URL of Git repo containing project templates.
git (bool): Initialize project as Git repo. Should be set to True if project
is intended as a repo, since it will allow DVC to integrate with Git.
no_init (bool): Don't initialize DVC and Git automatically. If True, the
"init" command or "git init" and "dvc init" need to be run manually.
"""
dest = ensure_path(dest)
check_clone(name, dest, repo)
project_dir = dest.resolve()
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
try:
run_command(cmd)
except SystemExit:
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
msg.fail(err)
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
f.write(name)
try:
run_command(["git", "-C", str(tmp_dir), "fetch"])
run_command(["git", "-C", str(tmp_dir), "checkout"])
except SystemExit:
err = f"Could not clone '{name}' in the repo '{repo}'."
msg.fail(err)
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
for sub_dir in DIRS:
dir_path = project_dir / sub_dir
if not dir_path.exists():
dir_path.mkdir(parents=True)
if not no_init:
project_init(project_dir, git=git, force=True, silent=True)
msg.good(f"Your project is now ready!", dest)
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
def project_init(
project_dir: Path,
*,
git: bool = False,
force: bool = False,
silent: bool = False,
analytics: bool = False,
):
"""Initialize a project as a DVC and (optionally) as a Git repo.
project_dir (Path): Path to project directory.
git (bool): Also call "git init" to initialize directory as a Git repo.
silent (bool): Don't print any output (via DVC).
analytics (bool): Opt-in to DVC analytics (defaults to False).
"""
with working_dir(project_dir) as cwd:
if git:
run_command(["git", "init"])
init_cmd = ["dvc", "init"]
if silent:
init_cmd.append("--quiet")
if not git:
init_cmd.append("--no-scm")
if force:
init_cmd.append("--force")
run_command(init_cmd)
# We don't want to have analytics on by default; our users should
# opt-in explicitly. If they want it, they can always enable it.
if not analytics:
run_command(["dvc", "config", "core.analytics", "false"])
# Remove unused and confusing plot templates from .dvc directory
# TODO: maybe we shouldn't do this, but it's otherwise super confusing
# once you commit your changes via Git and it creates a bunch of files
# that have no purpose
plots_dir = cwd / DVC_DIR / "plots"
if plots_dir.exists():
shutil.rmtree(str(plots_dir))
config = load_project_config(cwd)
setup_check_dvc(cwd, config)
def project_assets(project_dir: Path) -> None:
"""Fetch assets for a project using DVC if possible.
project_dir (Path): Path to project directory.
"""
project_path = ensure_path(project_dir)
config = load_project_config(project_path)
setup_check_dvc(project_path, config)
assets = config.get("assets", {})
if not assets:
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
msg.info(f"Fetching {len(assets)} asset(s)")
variables = config.get("variables", {})
fetched_assets = []
for asset in assets:
url = asset["url"].format(**variables)
dest = asset["dest"].format(**variables)
fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
if fetched_path:
fetched_assets.append(str(fetched_path))
if fetched_assets:
with working_dir(project_path):
run_command(["dvc", "add", *fetched_assets, "--external"])
def fetch_asset(
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> Optional[Path]:
"""Fetch an asset from a given URL or path. Will try to import the file
using DVC's import-url if possible (fully tracked and versioned) and falls
back to get-url (versioned) and a non-DVC download if necessary. If a
checksum is provided and a local file exists, it's only re-downloaded if the
checksum doesn't match.
project_path (Path): Path to project directory.
url (str): URL or path to asset.
checksum (Optional[str]): Optional expected checksum of local file.
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
the asset failed.
"""
url = convert_asset_url(url)
dest_path = (project_path / dest).resolve()
if dest_path.exists() and checksum:
# If there's already a file, check for checksum
# TODO: add support for caches (dvc import-url with local path)
if checksum == get_checksum(dest_path):
msg.good(f"Skipping download with matching checksum: {dest}")
return dest_path
with working_dir(project_path):
try:
# If these fail, we don't want to output an error or info message.
# Try with tracking the source first, then just downloading with
# DVC, then a regular non-DVC download.
try:
dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
except subprocess.CalledProcessError:
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
except subprocess.CalledProcessError:
try:
download_file(url, dest_path)
except requests.exceptions.HTTPError as e:
msg.fail(f"Download failed: {dest}", e)
return None
if checksum and checksum != get_checksum(dest_path):
msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
msg.good(f"Fetched asset {dest}")
return dest_path
def project_run_all(project_dir: Path, *dvc_args) -> None:
"""Run all commands defined in the project using DVC.
project_dir (Path): Path to project directory.
*dvc_args: Other arguments passed to "dvc repro".
"""
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
dvc_cmd = ["dvc", "repro", *dvc_args]
with working_dir(project_dir):
run_command(dvc_cmd)
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
"""Simulate a CLI help prompt using the info available in the project config.
project_dir (Path): The project directory.
subcommand (Optional[str]): The subcommand or None. If a subcommand is
provided, the subcommand help is shown. Otherwise, the top-level help
and a list of available commands is printed.
"""
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
config_commands = config.get("commands", [])
commands = {cmd["name"]: cmd for cmd in config_commands}
if subcommand:
validate_subcommand(commands.keys(), subcommand)
print(f"Usage: {COMMAND} project run {subcommand} {project_dir}")
help_text = commands[subcommand].get("help")
if help_text:
msg.text(f"\n{help_text}\n")
else:
print(f"\nAvailable commands in {CONFIG_FILE}")
print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
msg.text("Run all commands defined in the 'run' block of the project config:")
print(f"{COMMAND} project run-all {project_dir}")
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
"""Run a named script defined in the project config. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to
execute the command, so it can determine whether to rerun it. It then
calls into "exec" to execute it.
project_dir (Path): Path to project directory.
subcommand (str): Name of command to run.
*dvc_args: Other arguments passed to "dvc repro".
"""
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
config_commands = config.get("commands", [])
variables = config.get("variables", {})
commands = {cmd["name"]: cmd for cmd in config_commands}
validate_subcommand(commands.keys(), subcommand)
if subcommand in config.get("run", []):
# This is one of the pipeline commands tracked in DVC
dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
with working_dir(project_dir):
run_command(dvc_cmd)
else:
cmd = commands[subcommand]
# Deps in non-DVC commands aren't tracked, but if they're defined,
# make sure they exist before running the command
for dep in cmd.get("deps", []):
if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}"
msg.fail(err, exits=1)
with working_dir(project_dir):
run_commands(cmd["script"], variables)
def project_exec(project_dir: Path, subcommand: str):
"""Execute a command defined in the project config.
project_dir (Path): Path to project directory.
subcommand (str): Name of command to run.
"""
config = load_project_config(project_dir)
config_commands = config.get("commands", [])
variables = config.get("variables", {})
commands = {cmd["name"]: cmd for cmd in config_commands}
with working_dir(project_dir):
run_commands(commands[subcommand]["script"], variables)
###########
# HELPERS #
###########
def load_project_config(path: Path) -> Dict[str, Any]:
"""Load the project config file from a directory and validate it.
path (Path): The path to the project directory.
RETURNS (Dict[str, Any]): The loaded project config.
"""
config_path = path / CONFIG_FILE
if not config_path.exists():
msg.fail("Can't find project config", config_path, exits=1)
invalid_err = f"Invalid project config in {CONFIG_FILE}"
try:
config = srsly.read_yaml(config_path)
except ValueError as e:
msg.fail(invalid_err, e, exits=1)
errors = validate(ProjectConfigSchema, config)
if errors:
msg.fail(invalid_err, "\n".join(errors), exits=1)
return config
def update_dvc_config(
path: Path,
config: Dict[str, Any],
verbose: bool = False,
silent: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
project directory. The file is auto-generated based on the config. The
first line of the auto-generated file specifies the hash of the config
dict, so if any of the config values change, the DVC config is regenerated.
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project config.
verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
config_hash = get_hash(config)
path = path.resolve()
dvc_config_path = path / DVC_CONFIG
if dvc_config_path.exists():
# Check if the file was generated using the current config, if not, redo
with dvc_config_path.open("r", encoding="utf8") as f:
ref_hash = f.readline().strip().replace("# ", "")
if ref_hash == config_hash and not force:
return False # Nothing has changed in project config, don't need to update
dvc_config_path.unlink()
variables = config.get("variables", {})
commands = []
# We only want to include commands that are part of the main list of "run"
# commands in project.yml and should be run in sequence
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
for name in config.get("run", []):
validate_subcommand(config_commands.keys(), name)
command = config_commands[name]
deps = command.get("deps", [])
outputs = command.get("outputs", [])
outputs_no_cache = command.get("outputs_no_cache", [])
if not deps and not outputs and not outputs_no_cache:
continue
# Default to the working dir as the project path since dvc.yaml is auto-generated
# and we don't want arbitrary paths in there
project_cmd = ["python", "-m", NAME, "project", "exec", name]
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
if verbose:
dvc_cmd.append("--verbose")
if silent:
dvc_cmd.append("--quiet")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
commands.append(" ".join(full_cmd))
with working_dir(path):
run_commands(commands, variables, silent=True)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
return True
def ensure_dvc() -> None:
"""Ensure that the "dvc" command is available and show an error if not."""
try:
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
"spaCy projects require DVC (Data Version Control) and the 'dvc' command",
"You can install the Python package from pip (pip install dvc) or "
"conda (conda install -c conda-forge dvc). For more details, see the "
"documentation: https://dvc.org/doc/install",
exits=1,
)
def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
"""Check that the project is set up correctly with DVC and update its
config if needed. Will raise an error if the project is not an initialized
DVC project.
project_dir (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project config.
"""
if not project_dir.exists():
msg.fail(f"Can't find project directory: {project_dir}")
if not (project_dir / ".dvc").exists():
msg.fail(
"Project not initialized as a DVC project.",
f"Make sure that the project template was cloned correctly. To "
f"initialize the project directory manually, you can run: "
f"{COMMAND} project init {project_dir}",
exits=1,
)
with msg.loading("Updating DVC config..."):
updated = update_dvc_config(project_dir, config, silent=True)
if updated:
msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
def run_commands(
commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
) -> None:
"""Run a sequence of commands in a subprocess, in order.
commands (List[str]): The string commands.
variables (Dict[str, str]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
silent (bool): Don't print the commands.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
command = split_command(command)
# Not sure if this is needed or a good idea. Motivation: users may often
# use commands in their config that reference "python" and we want to
# make sure that it's always executing the same Python that spaCy is
# executed with and the pip in the same env, not some other Python/pip.
# Also ensures cross-compatibility if user 1 writes "python3" (because
# that's how it's set up on their system), and user 2 without the
# shortcut tries to re-run the command.
if len(command) and command[0] in ("python", "python3"):
command[0] = sys.executable
elif len(command) and command[0] in ("pip", "pip3"):
command = [sys.executable, "-m", "pip", *command[1:]]
if not silent:
print(f"Running command: {' '.join(command)}")
run_command(command)
def convert_asset_url(url: str) -> str:
"""Check and convert the asset URL if needed.
url (str): The asset URL.
RETURNS (str): The converted URL.
"""
# If the asset URL is a regular GitHub URL it's likely a mistake
if re.match("(http(s?)):\/\/github.com", url):
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn(
"Downloading from a regular GitHub URL. This will only download "
"the source of the page, not the actual file. Converting the URL "
"to a raw URL.",
converted,
)
return converted
return url
def check_clone(name: str, dest: Path, repo: str) -> None:
"""Check and validate that the destination path can be used to clone. Will
check that Git is available and that the destination path is suitable.
name (str): Name of the directory to clone from the repo.
dest (Path): Local destination of cloned directory.
repo (str): URL of the repo to clone from.
"""
try:
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
f"Cloning spaCy project templates requires Git and the 'git' command. ",
f"To clone a project without Git, copy the files from the '{name}' "
f"directory in the {repo} to {dest} manually and then run:",
f"{COMMAND} project init {dest}",
exits=1,
)
if not dest:
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
if dest.exists():
# Directory already exists (not allowed, clone needs to create it)
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
if not dest.parent.exists():
# We're not creating parents, parent dir should exist
msg.fail(
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
exits=1,
)
def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
"""Check that a subcommand is valid and defined. Raises an error otherwise.
commands (Sequence[str]): The available commands.
subcommand (str): The subcommand.
"""
if subcommand not in commands:
msg.fail(
f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
f"Available commands: {', '.join(commands)}",
exits=1,
)
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
"""Download a file using requests.
url (str): The URL of the file.
dest (Path): The destination path.
chunk_size (int): The size of chunks to read/write.
"""
response = requests.get(url, stream=True)
response.raise_for_status()
total = int(response.headers.get("content-length", 0))
progress_settings = {
"total": total,
"unit": "iB",
"unit_scale": True,
"unit_divisor": chunk_size,
"leave": False,
}
with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
for data in response.iter_content(chunk_size=chunk_size):
size = f.write(data)
bar.update(size)


spacy/cli/project/assets.py (new file, 158 lines)

@@ -0,0 +1,158 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import requests
import tqdm
import re
import shutil
from ...util import ensure_path, working_dir
from .._app import project_cli, Arg
from .util import PROJECT_FILE, load_project_config, get_checksum
# TODO: find a solution for caches
# CACHES = [
# Path.home() / ".torch",
# Path.home() / ".caches" / "torch",
# os.environ.get("TORCH_HOME"),
# Path.home() / ".keras",
# ]
@project_cli.command("assets")
def project_assets_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Fetch project assets like datasets and pretrained weights. Assets are
defined in the "assets" section of the project.yml. If a checksum is
provided in the project.yml, the file is only downloaded if no local file
with the same checksum exists.
"""
project_assets(project_dir)
def project_assets(project_dir: Path) -> None:
"""Fetch assets for a project using DVC if possible.
project_dir (Path): Path to project directory.
"""
project_path = ensure_path(project_dir)
config = load_project_config(project_path)
assets = config.get("assets", {})
if not assets:
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
msg.info(f"Fetching {len(assets)} asset(s)")
variables = config.get("variables", {})
for asset in assets:
dest = asset["dest"].format(**variables)
url = asset.get("url")
checksum = asset.get("checksum")
if not url:
# project.yml defines asset without URL that the user has to place
check_private_asset(dest, checksum)
continue
url = url.format(**variables)
fetch_asset(project_path, url, dest, checksum)
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
"""Check and validate assets without a URL (private assets that the user
has to provide themselves) and give feedback about the checksum.
dest (Path): Destination path of the asset.
checksum (Optional[str]): Optional checksum of the expected file.
"""
if not Path(dest).exists():
err = f"No URL provided for asset. You need to add this file yourself: {dest}"
msg.warn(err)
else:
if checksum and checksum == get_checksum(dest):
msg.good(f"Asset exists with matching checksum: {dest}")
else:
msg.fail(f"Asset available but with incorrect checksum: {dest}")
def fetch_asset(
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> Optional[Path]:
"""Fetch an asset from a given URL or path. If a checksum is provided and a
local file exists, it's only re-downloaded if the checksum doesn't match.
project_path (Path): Path to project directory.
url (str): URL or path to asset.
checksum (Optional[str]): Optional expected checksum of local file.
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
the asset failed.
"""
# TODO: add support for caches
dest_path = (project_path / dest).resolve()
if dest_path.exists() and checksum:
# If there's already a file, check for checksum
if checksum == get_checksum(dest_path):
msg.good(f"Skipping download with matching checksum: {dest}")
return dest_path
# We might as well support the user here and create parent directories in
# case the asset dir isn't listed as a dir to create in the project.yml
if not dest_path.parent.exists():
dest_path.parent.mkdir(parents=True)
with working_dir(project_path):
url = convert_asset_url(url)
try:
download_file(url, dest_path)
msg.good(f"Downloaded asset {dest}")
except requests.exceptions.RequestException as e:
if Path(url).exists() and Path(url).is_file():
# If it's a local file, copy to destination
shutil.copy(url, str(dest_path))
msg.good(f"Copied local asset {dest}")
else:
msg.fail(f"Download failed: {dest}", e)
return
if checksum and checksum != get_checksum(dest_path):
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
def convert_asset_url(url: str) -> str:
"""Check and convert the asset URL if needed.
url (str): The asset URL.
RETURNS (str): The converted URL.
"""
# If the asset URL is a regular GitHub URL it's likely a mistake
if re.match(r"(http(s?)):\/\/github.com", url):
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn(
"Downloading from a regular GitHub URL. This will only download "
"the source of the page, not the actual file. Converting the URL "
"to a raw URL.",
converted,
)
return converted
return url
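# Example with a hypothetical URL: "https://github.com/user/repo/blob/master/data.json"
# is rewritten to "https://raw.githubusercontent.com/user/repo/master/data.json".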
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
"""Download a file using requests.
url (str): The URL of the file.
dest (Path): The destination path.
chunk_size (int): The size of chunks to read/write.
"""
response = requests.get(url, stream=True)
response.raise_for_status()
total = int(response.headers.get("content-length", 0))
progress_settings = {
"total": total,
"unit": "iB",
"unit_scale": True,
"unit_divisor": chunk_size,
"leave": False,
}
with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
for data in response.iter_content(chunk_size=chunk_size):
size = f.write(data)
bar.update(size)
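A minimal usage sketch for the new module (the project directory name is hypothetical):

from pathlib import Path
from spacy.cli.project.assets import project_assets

project_assets(Path("my_project"))  # fetches every asset listed in my_project/project.yml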

spacy/cli/project/clone.py (new file, 97 lines)

@@ -0,0 +1,97 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import subprocess
import shutil
import re
from ... import about
from ...util import ensure_path, run_command, make_tempdir
from .._app import project_cli, Arg, Opt, COMMAND
from .util import PROJECT_FILE
@project_cli.command("clone")
def project_clone_cli(
# fmt: off
name: str = Arg(..., help="The name of the template to clone"),
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
# fmt: on
):
"""Clone a project template from a repository. Calls into "git" and will
only download the files from the given subdirectory. The GitHub repo
defaults to the official spaCy template repo, but can be customized
(including using a private repo).
"""
if dest is None:
dest = Path.cwd() / name
project_clone(name, dest, repo=repo)
def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
"""Clone a project template from a repository.
name (str): Name of subdirectory to clone.
dest (Path): Destination path of cloned project.
repo (str): URL of Git repo containing project templates.
"""
dest = ensure_path(dest)
check_clone(name, dest, repo)
project_dir = dest.resolve()
repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
try:
run_command(cmd)
except subprocess.CalledProcessError:
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
msg.fail(err)
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
f.write(name)
try:
run_command(["git", "-C", str(tmp_dir), "fetch"])
run_command(["git", "-C", str(tmp_dir), "checkout"])
except subprocess.CalledProcessError:
err = f"Could not clone '{name}' from repo '{repo_name}'"
msg.fail(err)
# We need Path(name) to make sure we also support subdirectories
shutil.move(str(tmp_dir / Path(name)), str(project_dir))
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
if not (project_dir / PROJECT_FILE).exists():
msg.warn(f"No {PROJECT_FILE} found in directory")
else:
msg.good(f"Your project is now ready!")
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
def check_clone(name: str, dest: Path, repo: str) -> None:
"""Check and validate that the destination path can be used to clone. Will
check that Git is available and that the destination path is suitable.
name (str): Name of the directory to clone from the repo.
dest (Path): Local destination of cloned directory.
repo (str): URL of the repo to clone from.
"""
try:
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
f"Cloning spaCy project templates requires Git and the 'git' command. ",
f"To clone a project without Git, copy the files from the '{name}' "
f"directory in the {repo} to {dest} manually and then run:",
f"{COMMAND} project init {dest}",
exits=1,
)
if not dest:
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
if dest.exists():
# Directory already exists (not allowed, clone needs to create it)
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
if not dest.parent.exists():
# We're not creating parents, parent dir should exist
msg.fail(
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
exits=1,
)
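And a sketch of cloning programmatically, with a hypothetical template name:

from pathlib import Path
from spacy.cli.project.clone import project_clone

project_clone("some_template", Path.cwd() / "some_template")  # uses the default template repo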

spacy/cli/project/dvc.py (new file, 208 lines)

@@ -0,0 +1,208 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Control (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional
import subprocess
from pathlib import Path
from wasabi import msg
from .util import PROJECT_FILE, load_project_config, get_hash
from .._app import project_cli, Arg, Opt, NAME, COMMAND
from ...util import working_dir, split_command, join_command, run_command
DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
UPDATE_COMMAND = "dvc"
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} project {UPDATE_COMMAND}"""
@project_cli.command(UPDATE_COMMAND)
def project_update_dvc_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
"""Auto-generate Data Version Control (DVC) config. A DVC
project can only define one pipeline, so you need to specify one workflow
defined in the project.yml. If no workflow is specified, the first defined
workflow is used. The DVC config will only be updated if the project.yml changed.
"""
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
def project_update_dvc(
project_dir: Path,
workflow: Optional[str] = None,
*,
verbose: bool = False,
force: bool = False,
) -> None:
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
project can only define one pipeline, so you need to specify one workflow
defined in the project.yml. Will only update the file if the checksum changed.
project_dir (Path): The project directory.
workflow (Optional[str]): Optional name of workflow defined in project.yml.
If not set, the first workflow will be used.
verbose (bool): Print more info.
force (bool): Force update DVC config.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(
project_dir, config, workflow, verbose=verbose, force=force
)
help_msg = "To execute the workflow with DVC, run: dvc repro"
if updated:
msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
else:
msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
def update_dvc_config(
path: Path,
config: Dict[str, Any],
workflow: Optional[str] = None,
verbose: bool = False,
silent: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
project directory. The file is auto-generated based on the config. The
first line of the auto-generated file specifies the hash of the config
dict, so if any of the config values change, the DVC config is regenerated.
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project.yml.
verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
ensure_dvc(path)
workflows = config.get("workflows", {})
workflow_names = list(workflows.keys())
check_workflows(workflow_names, workflow)
if not workflow:
workflow = workflow_names[0]
config_hash = get_hash(config)
path = path.resolve()
dvc_config_path = path / DVC_CONFIG
if dvc_config_path.exists():
# Check if the file was generated using the current config, if not, redo
with dvc_config_path.open("r", encoding="utf8") as f:
ref_hash = f.readline().strip().replace("# ", "")
if ref_hash == config_hash and not force:
return False # Nothing has changed in project.yml, don't need to update
dvc_config_path.unlink()
variables = config.get("variables", {})
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
for name in workflows[workflow]:
command = config_commands[name]
deps = command.get("deps", [])
outputs = command.get("outputs", [])
outputs_no_cache = command.get("outputs_no_cache", [])
if not deps and not outputs and not outputs_no_cache:
continue
# Default to the working dir as the project path since dvc.yaml is auto-generated
# and we don't want arbitrary paths in there
project_cmd = ["python", "-m", NAME, "project", "run", name]
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
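# Each list above interleaves flag and value, e.g. deps_cmd == ["-d", dep1, "-d", dep2, ...]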
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
if command.get("no_skip"):
dvc_cmd.append("--always-changed")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
dvc_commands.append(join_command(full_cmd))
with working_dir(path):
dvc_flags = {"--verbose": verbose, "--quiet": silent}
run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
return True
def run_dvc_commands(
commands: List[str] = tuple(),
variables: Dict[str, str] = {},
flags: Dict[str, bool] = {},
) -> None:
"""Run a sequence of DVC commands in a subprocess, in order.
commands (List[str]): The string commands without the leading "dvc".
variables (Dict[str, str]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
easier to pass flags like --quiet that depend on a variable or
command-line setting while avoiding lots of nested conditionals.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
command = split_command(command)
dvc_command = ["dvc", *command]
# Add the flags if they are set to True
for flag, is_active in flags.items():
if is_active:
dvc_command.append(flag)
run_command(dvc_command)
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
"""Validate workflows provided in project.yml and check that a given
workflow can be used to generate a DVC config.
workflows (List[str]): Names of the available workflows.
workflow (Optional[str]): The name of the workflow to convert.
"""
if not workflows:
msg.fail(
f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
f"define at least one list of commands.",
exits=1,
)
if workflow is not None and workflow not in workflows:
msg.fail(
f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
f"Available workflows: {', '.join(workflows)}",
exits=1,
)
if not workflow:
msg.warn(
f"No workflow specified for DVC pipeline. Using the first workflow "
f"defined in {PROJECT_FILE}: '{workflows[0]}'"
)
def ensure_dvc(project_dir: Path) -> None:
"""Ensure that the "dvc" command is available and that the current project
directory is an initialized DVC project.
"""
try:
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
"To use spaCy projects with DVC (Data Version Control), DVC needs "
"to be installed and the 'dvc' command needs to be available",
"You can install the Python package from pip (pip install dvc) or "
"conda (conda install -c conda-forge dvc). For more details, see the "
"documentation: https://dvc.org/doc/install",
exits=1,
)
if not (project_dir / ".dvc").exists():
msg.fail(
"Project not initialized as a DVC project",
"To initialize a DVC project, you can run 'dvc init' in the project "
"directory. For more details, see the documentation: "
"https://dvc.org/doc/command-reference/init",
exits=1,
)
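The DVC config can also be regenerated from Python, under the same assumptions as the CLI wrapper above (the project directory is hypothetical):

from pathlib import Path
from spacy.cli.project.dvc import project_update_dvc

# Uses the first workflow defined in project.yml; execute it afterwards with "dvc repro"
project_update_dvc(Path("my_project"), workflow=None, force=True)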

spacy/cli/project/run.py (new file, 266 lines)

@@ -0,0 +1,266 @@
from typing import Optional, List, Dict, Sequence, Any
from pathlib import Path
from wasabi import msg
import sys
import srsly
from ...util import working_dir, run_command, split_command, is_cwd, join_command
from .._app import project_cli, Arg, Opt, COMMAND
from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .util import get_checksum
@project_cli.command("run")
def project_run_cli(
# fmt: off
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on
):
"""Run a named command or workflow defined in the project.yml. If a workflow
name is specified, all commands in the workflow are run, in order. If
commands define dependencies and/or outputs, they will only be re-run if
state has changed.
"""
if show_help or not subcommand:
print_run_help(project_dir, subcommand)
else:
project_run(project_dir, subcommand, force=force, dry=dry)
def project_run(
project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False
) -> None:
"""Run a named script defined in the project.yml. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to
execute the command, so it can determine whether to rerun it. It then
calls into "exec" to execute it.
project_dir (Path): Path to project directory.
subcommand (str): Name of command to run.
force (bool): Force re-running, even if nothing changed.
dry (bool): Perform a dry run and don't execute commands.
"""
config = load_project_config(project_dir)
variables = config.get("variables", {})
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
workflows = config.get("workflows", {})
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
if subcommand in workflows:
msg.info(f"Running workflow '{subcommand}'")
for cmd in workflows[subcommand]:
project_run(project_dir, cmd, force=force, dry=dry)
else:
cmd = commands[subcommand]
for dep in cmd.get("deps", []):
dep = dep.format(**variables)
if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}"
err_kwargs = {"exits": 1} if not dry else {}
msg.fail(err, **err_kwargs)
with working_dir(project_dir) as current_dir:
rerun = check_rerun(current_dir, cmd, variables)
if not rerun and not force:
msg.info(f"Skipping '{cmd['name']}': nothing changed")
else:
msg.divider(subcommand)
run_commands(cmd["script"], variables, dry=dry)
if not dry:
update_lockfile(current_dir, cmd, variables)
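A minimal project.yml that exercises this logic might look as follows (all names and paths here are illustrative, not part of any actual template):

    variables:
      lang: "en"
    commands:
      - name: "preprocess"
        help: "Convert the raw corpus to spaCy's format"
        script:
          - "python scripts/convert.py assets/raw.json corpus/{lang}.spacy"
        deps:
          - "assets/raw.json"
        outputs:
          - "corpus/{lang}.spacy"
    workflows:
      all:
        - "preprocess"

Running "project run all" executes the workflow's commands in order, while "project run preprocess" runs the single command and skips it on later runs if its dependencies and outputs are unchanged.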
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
"""Simulate a CLI help prompt using the info available in the project.yml.
project_dir (Path): The project directory.
subcommand (Optional[str]): The subcommand or None. If a subcommand is
provided, the subcommand help is shown. Otherwise, the top-level help
and a list of available commands is printed.
"""
config = load_project_config(project_dir)
config_commands = config.get("commands", [])
commands = {cmd["name"]: cmd for cmd in config_commands}
workflows = config.get("workflows", {})
project_loc = "" if is_cwd(project_dir) else project_dir
if subcommand:
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
if subcommand in commands:
help_text = commands[subcommand].get("help")
if help_text:
print(f"\n{help_text}\n")
elif subcommand in workflows:
steps = workflows[subcommand]
print(f"\nWorkflow consisting of {len(steps)} commands:")
steps_data = [
(f"{i + 1}. {step}", commands[step].get("help", ""))
for i, step in enumerate(steps)
]
msg.table(steps_data)
help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
print(f"For command details, run: {help_cmd}")
else:
print("")
if config_commands:
print(f"Available commands in {PROJECT_FILE}")
print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
if workflows:
print(f"Available workflows in {PROJECT_FILE}")
print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
def run_commands(
commands: List[str] = tuple(),
variables: Dict[str, Any] = {},
silent: bool = False,
dry: bool = False,
) -> None:
"""Run a sequence of commands in a subprocess, in order.
commands (List[str]): The string commands.
variables (Dict[str, Any]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
silent (bool): Don't print the commands.
dry (bool): Perform a dry run and don't execute anything.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
command = split_command(command)
# Not sure if this is needed or a good idea. Motivation: users may often
# use commands in their config that reference "python" and we want to
# make sure that it's always executing the same Python that spaCy is
# executed with and the pip in the same env, not some other Python/pip.
# Also ensures cross-compatibility if user 1 writes "python3" (because
# that's how it's set up on their system), and user 2 without the
# shortcut tries to re-run the command.
if len(command) and command[0] in ("python", "python3"):
command[0] = sys.executable
elif len(command) and command[0] in ("pip", "pip3"):
command = [sys.executable, "-m", "pip", *command[1:]]
if not silent:
print(f"Running command: {join_command(command)}")
if not dry:
run_command(command)
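A rough sketch of the substitution and interpreter normalization performed above, with plain str.split standing in for the split_command helper (illustrative only):

    import sys

    variables = {"NAME": "my_model"}
    command = "python scripts/train.py {NAME}".format(**variables)
    parts = command.split()  # simplified stand-in for split_command
    if parts and parts[0] in ("python", "python3"):
        parts[0] = sys.executable  # same interpreter that spaCy runs under
    elif parts and parts[0] in ("pip", "pip3"):
        parts = [sys.executable, "-m", "pip", *parts[1:]]
    print(" ".join(parts))  # e.g. /usr/bin/python3 scripts/train.py my_model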
def validate_subcommand(
commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
"""Check that a subcommand is valid and defined. Raises an error otherwise.
commands (Sequence[str]): The available commands.
workflows (Sequence[str]): The available workflows.
subcommand (str): The subcommand.
"""
if not commands and not workflows:
msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
if subcommand not in commands and subcommand not in workflows:
help_msg = []
if commands:
help_msg.append(f"Available commands: {', '.join(commands)}")
if workflows:
help_msg.append(f"Available workflows: {', '.join(workflows)}")
msg.fail(
f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
". ".join(help_msg),
exits=1,
)
def check_rerun(
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> bool:
"""Check if a command should be rerun because its settings or inputs/outputs
changed.
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
variables (Dict[str, Any]): The variables defined in the project.yml.
RETURNS (bool): Whether to re-run the command.
"""
lock_path = project_dir / PROJECT_LOCK
if not lock_path.exists(): # We don't have a lockfile, run command
return True
data = srsly.read_yaml(lock_path)
if command["name"] not in data: # We don't have info about this command
return True
entry = data[command["name"]]
# Always run commands with no outputs (otherwise they'd always be skipped)
if not entry.get("outs", []):
return True
# If the entry in the lockfile matches the lockfile entry that would be
# generated from the current command, we don't rerun because it means that
# all inputs/outputs, hashes and scripts are the same and nothing changed
return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry)
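For reference, a lockfile entry (as produced by get_lock_entry below and written by update_lockfile) has roughly this shape, with hashes shortened for illustration:

    preprocess:
      cmd: "python -m spacy run preprocess"
      script:
        - "python scripts/convert.py assets/raw.json corpus/en.spacy"
      deps:
        - path: "assets/raw.json"
          md5: "3b1fd2..."
      outs:
        - path: "corpus/en.spacy"
          md5: "9c2ea1..."

check_rerun simply hashes the freshly computed entry and compares it against the hash of the stored one.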
def update_lockfile(
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> None:
"""Update the lockfile after running a command. Will create a lockfile if
it doesn't yet exist and will add an entry for the current command, its
script and dependencies/outputs.
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
variables (Dict[str, Any]): The variables defined in the project.yml.
"""
lock_path = project_dir / PROJECT_LOCK
if not lock_path.exists():
srsly.write_yaml(lock_path, {})
data = {}
else:
data = srsly.read_yaml(lock_path)
data[command["name"]] = get_lock_entry(project_dir, command, variables)
srsly.write_yaml(lock_path, data)
def get_lock_entry(
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> Dict[str, Any]:
"""Get a lockfile entry for a given command. An entry includes the command,
the script (command steps) and a list of dependencies and outputs with
their paths and file hashes, if available. The format is based on the
dvc.lock files, to keep things consistent.
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
variables (Dict[str, Any]): The variables defined in the project.yml.
RETURNS (Dict[str, Any]): The lockfile entry.
"""
deps = get_fileinfo(project_dir, command.get("deps", []), variables)
outs = get_fileinfo(project_dir, command.get("outputs", []), variables)
outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables)
return {
"cmd": f"{COMMAND} run {command['name']}",
"script": command["script"],
"deps": deps,
"outs": [*outs, *outs_nc],
}
def get_fileinfo(
project_dir: Path, paths: List[str], variables: Dict[str, Any]
) -> List[Dict[str, str]]:
"""Generate the file information for a list of paths (dependencies, outputs).
Includes the file path and the file's checksum.
project_dir (Path): The current project directory.
paths (List[str]): The file paths.
variables (Dict[str, Any]): The variables defined in the project.yml.
RETURNS (List[Dict[str, str]]): The file information for each path, as
stored in the lockfile.
"""
data = []
for path in paths:
path = path.format(**variables)
file_path = project_dir / path
md5 = get_checksum(file_path) if file_path.exists() else None
data.append({"path": path, "md5": md5})
return data
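So for paths ["assets/raw.json", "corpus/{lang}.spacy"] and variables {"lang": "en"}, the result would look something like this (hash shortened, missing files get a None checksum):

    [
        {"path": "assets/raw.json", "md5": "3b1fd2..."},
        {"path": "corpus/en.spacy", "md5": None},
    ]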

spacy/cli/project/util.py Normal file
View File

@ -0,0 +1,93 @@
from typing import Dict, Any, Union
from pathlib import Path
from wasabi import msg
import srsly
import hashlib
from ...schemas import ProjectConfigSchema, validate
PROJECT_FILE = "project.yml"
PROJECT_LOCK = "project.lock"
def load_project_config(path: Path) -> Dict[str, Any]:
"""Load the project.yml file from a directory and validate it. Also make
sure that all directories defined in the config exist.
path (Path): The path to the project directory.
RETURNS (Dict[str, Any]): The loaded project.yml.
"""
config_path = path / PROJECT_FILE
if not config_path.exists():
msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
try:
config = srsly.read_yaml(config_path)
except ValueError as e:
msg.fail(invalid_err, e, exits=1)
errors = validate(ProjectConfigSchema, config)
if errors:
msg.fail(invalid_err, "\n".join(errors), exits=1)
validate_project_commands(config)
# Make sure directories defined in config exist
for subdir in config.get("directories", []):
dir_path = path / subdir
if not dir_path.exists():
dir_path.mkdir(parents=True)
return config
def validate_project_commands(config: Dict[str, Any]) -> None:
"""Check that project commands and workflows are valid, don't contain
duplicates, don't clash and only refer to commands that exist.
config (Dict[str, Any]): The loaded config.
"""
command_names = [cmd["name"] for cmd in config.get("commands", [])]
workflows = config.get("workflows", {})
duplicates = {cmd for cmd in command_names if command_names.count(cmd) > 1}
if duplicates:
err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
msg.fail(err, exits=1)
for workflow_name, workflow_steps in workflows.items():
if workflow_name in command_names:
err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
msg.fail(err, exits=1)
for step in workflow_steps:
if step not in command_names:
msg.fail(
f"Unknown command specified in workflow '{workflow_name}': {step}",
f"Workflows can only refer to commands defined in the 'commands' "
f"section of the {PROJECT_FILE}.",
exits=1,
)
def get_hash(data) -> str:
"""Get the hash for a JSON-serializable object.
data: The data to hash.
RETURNS (str): The hash.
"""
data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
return hashlib.md5(data_str).hexdigest()
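Because the JSON dump is sorted by key, logically equal objects hash identically regardless of key order:

    assert get_hash({"b": 1, "a": 2}) == get_hash({"a": 2, "b": 1})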
def get_checksum(path: Union[Path, str]) -> str:
"""Get the checksum for a file or directory given its file path. If a
directory path is provided, this uses all files in that directory.
path (Union[Path, str]): The file or directory path.
RETURNS (str): The checksum.
"""
path = Path(path)
if path.is_file():
return hashlib.md5(path.read_bytes()).hexdigest()
if path.is_dir():
# TODO: this is currently pretty slow
dir_checksum = hashlib.md5()
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
dir_checksum.update(sub_file.read_bytes())
return dir_checksum.hexdigest()
raise ValueError(f"Can't get checksum for {path}: not a file or directory")

View File

@ -121,14 +121,14 @@ class ConfigSchema(BaseModel):
@app.command("train")
def train_cli(
# fmt: off
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
train_path: Path = Arg(..., help="Location of training data", exists=True),
dev_path: Path = Arg(..., help="Location of development data", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True),
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
num_workers: int = Opt(None, "-j", help="Parallel Workers"),
strategy: str = Opt("allreduce", "--strategy", help="Distributed training strategy (requires spacy_ray)"),
@ -155,6 +155,7 @@ def train_cli(
if init_tok2vec is not None:
with init_tok2vec.open("rb") as file_:
weights_data = file_.read()
train_args = dict(
config_path=config_path,
data_paths={"train": train_path, "dev": dev_path},
@ -170,7 +171,7 @@ def train_cli(
distributed_setup_and_train(use_gpu, num_workers, strategy, ray_address, train_args)
else:
if use_gpu >= 0:
msg.info(f"Using GPU: {str(use_gpu)}")
msg.info(f"Using GPU: {use_gpu}")
require_gpu(use_gpu)
else:
msg.info("Using CPU")
@ -191,7 +192,8 @@ def train(
msg.info(f"Loading config from: {config_path}")
# Read the config first without creating objects, to get to the original nlp_config
config = util.load_config(config_path, create_objects=False)
fix_random_seed(config["training"]["seed"])
if config["training"].get("seed"):
fix_random_seed(config["training"]["seed"])
if config["training"].get("use_pytorch_for_gpu_memory"):
# It feels kind of weird to not have a default for this.
use_pytorch_for_gpu_memory()
@ -216,7 +218,10 @@ def train(
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
train_examples = list(
corpus.train_dataset(
nlp, shuffle=False, gold_preproc=training["gold_preproc"]
nlp,
shuffle=False,
gold_preproc=training["gold_preproc"],
max_length=training["max_length"],
)
)
nlp.begin_training(lambda: train_examples)
@ -315,6 +320,7 @@ def create_train_batches(nlp, corpus, cfg, randomization_index):
)
epoch = 0
batch_strategy = cfg.get("batch_by", "sequences")
while True:
if len(train_examples) == 0:
raise ValueError(Errors.E988)
@ -324,11 +330,22 @@ def create_train_batches(nlp, corpus, cfg, randomization_index):
random.random()
random.shuffle(train_examples)
epoch += 1
batches = util.minibatch_by_words(
train_examples,
size=cfg["batch_size"],
discard_oversize=cfg["discard_oversize"],
)
if batch_strategy == "padded":
batches = util.minibatch_by_padded_size(
train_examples,
size=cfg["batch_size"],
buffer=256,
discard_oversize=cfg["discard_oversize"],
)
elif batch_strategy == "words":
batches = util.minibatch_by_words(
train_examples,
size=cfg["batch_size"],
discard_oversize=cfg["discard_oversize"],
)
else:
batches = util.minibatch(train_examples, size=cfg["batch_size"])
# make sure the batching result is not empty, or we'll have an infinite training loop
try:
first = next(batches)
@ -440,7 +457,9 @@ def train_while_improving(
if raw_text:
random.shuffle(raw_text)
raw_examples = [Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text]
raw_examples = [
Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
]
raw_batches = util.minibatch(raw_examples, size=8)
for step, (epoch, batch) in enumerate(train_data):

View File

@ -69,6 +69,9 @@ class Warnings(object):
W027 = ("Found a large training file of {size} bytes. Note that it may "
"be more efficient to split your training data into multiple "
"smaller JSON files instead.")
W028 = ("Doc.from_array was called with a vector of type '{type}', "
"but is expecting one of type 'uint64' instead. This may result "
"in problems with the vocab further on in the pipeline.")
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
"entities \"{entities}\". Use "
"`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
@ -477,15 +480,14 @@ class Errors(object):
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
# TODO: fix numbering after merging develop into master
E969 = ("Expected string values for field '{field}', but received {types} instead. ")
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
"array and {doc_length} for the Doc itself.")
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
E973 = ("Unexpected type for NER data")
E974 = ("Unknown {obj} attribute: {key}")
E975 = ("The method 'Example.from_dict' expects a Doc as first argument, "
"but got {type}")
E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
"but received None.")
E977 = ("Can not compare a MorphAnalysis with a string object. "
"This is likely a bug in spaCy, so feel free to open an issue.")

View File

@ -28,7 +28,6 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
cdef class Example:
def __init__(self, Doc predicted, Doc reference, *, alignment=None):
""" Doc can either be text, or an actual Doc """
if predicted is None:
raise TypeError(Errors.E972.format(arg="predicted"))
if reference is None:
@ -37,6 +36,9 @@ cdef class Example:
self.y = reference
self._alignment = alignment
def __len__(self):
return len(self.predicted)
property predicted:
def __get__(self):
return self.x
@ -59,17 +61,15 @@ cdef class Example:
@classmethod
def from_dict(cls, Doc predicted, dict example_dict):
if predicted is None:
raise ValueError(Errors.E976.format(n="first", type="Doc"))
if example_dict is None:
raise ValueError(Errors.E976)
if not isinstance(predicted, Doc):
raise TypeError(Errors.E975.format(type=type(predicted)))
raise ValueError(Errors.E976.format(n="second", type="dict"))
example_dict = _fix_legacy_dict_data(example_dict)
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
if "ORTH" not in tok_dict:
tok_dict["ORTH"] = [tok.text for tok in predicted]
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
if not _has_field(tok_dict, "SPACY"):
spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
return Example(
predicted,
annotations2doc(predicted.vocab, tok_dict, doc_dict)
@ -257,7 +257,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
values.append([vocab.morphology.add(v) for v in value])
else:
attrs.append(key)
values.append([vocab.strings.add(v) for v in value])
try:
values.append([vocab.strings.add(v) for v in value])
except TypeError:
types = {type(v) for v in value}
raise TypeError(Errors.E969.format(field=key, types=types))
array = numpy.asarray(values, dtype="uint64")
return attrs, array.T
@ -325,8 +329,8 @@ def _fix_legacy_dict_data(example_dict):
for key, value in old_token_dict.items():
if key in ("text", "ids", "brackets"):
pass
elif key in remapping:
token_dict[remapping[key]] = value
elif key.lower() in remapping:
token_dict[remapping[key.lower()]] = value
else:
raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
text = example_dict.get("text", example_dict.get("raw"))

View File

@ -513,20 +513,23 @@ class Language(object):
):
"""Update the models in the pipeline.
examples (iterable): A batch of `Example` objects.
examples (Iterable[Example]): A batch of examples
dummy: Should not be set - serves to catch backwards-incompatible scripts.
drop (float): The dropout rate.
sgd (callable): An optimizer.
losses (dict): Dictionary to update with the loss, keyed by component.
component_cfg (dict): Config parameters for specific pipeline
sgd (Optimizer): An optimizer.
losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
components, keyed by component name.
RETURNS (Dict[str, float]): The updated losses dictionary
DOCS: https://spacy.io/api/language#update
"""
if dummy is not None:
raise ValueError(Errors.E989)
if losses is None:
losses = {}
if len(examples) == 0:
return
return losses
if not isinstance(examples, Iterable):
raise TypeError(Errors.E978.format(name="language", method="update", types=type(examples)))
wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)])
@ -540,22 +543,19 @@ class Language(object):
if component_cfg is None:
component_cfg = {}
component_deps = count_pipeline_interdependencies(self.pipeline)
# Determine whether component should set annotations. In theory I guess
# we should do this by inspecting the meta? Or we could just always
# say "yes"
for i, (name, proc) in enumerate(self.pipeline):
component_cfg.setdefault(name, {})
component_cfg[name].setdefault("drop", drop)
component_cfg[name]["set_annotations"] = bool(component_deps[i])
component_cfg[name].setdefault("set_annotations", False)
for name, proc in self.pipeline:
if not hasattr(proc, "update"):
continue
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
if sgd is not False:
if sgd not in (None, False):
for name, proc in self.pipeline:
if hasattr(proc, "model"):
proc.model.finish_update(sgd)
return losses
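A minimal usage sketch for the updated signature (the training example here is illustrative):

    from spacy.lang.en import English
    from spacy.gold import Example

    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("ORG")
    nlp.add_pipe(ner)
    optimizer = nlp.begin_training()
    examples = [Example.from_dict(nlp.make_doc("I work at spaCy"),
                                  {"entities": [(10, 15, "ORG")]})]
    losses = nlp.update(examples, sgd=optimizer, losses={})  # returns the losses dict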
def rehearse(self, examples, sgd=None, losses=None, config=None):
"""Make a "rehearsal" update to the models in the pipeline, to prevent
@ -761,18 +761,17 @@ class Language(object):
):
"""Process texts as a stream, and yield `Doc` objects in order.
texts (iterator): A sequence of texts to process.
texts (Iterable[str]): A sequence of texts to process.
as_tuples (bool): If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False.
batch_size (int): The number of texts to buffer.
disable (list): Names of the pipeline components to disable.
disable (List[str]): Names of the pipeline components to disable.
cleanup (bool): If True, unneeded strings are freed to control memory
use. Experimental.
component_cfg (dict): An optional dictionary with extra keyword
component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
arguments for specific components.
n_process (int): Number of processors to process texts, only supported
in Python3. If -1, set `multiprocessing.cpu_count()`.
n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.
YIELDS (Doc): Documents in the order of the original text.
DOCS: https://spacy.io/api/language#pipe

View File

@ -1,13 +1,14 @@
from thinc.api import Model, normal_init
def PrecomputableAffine(nO, nI, nF, nP):
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
model = Model(
"precomputable_affine",
forward,
init=init,
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
params={"W": None, "b": None, "pad": None},
attrs={"dropout_rate": dropout}
)
return model
@ -48,17 +49,14 @@ def forward(model, X, is_train):
model.inc_grad("b", dY.sum(axis=0))
dY = dY.reshape((dY.shape[0], nO * nP))
Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3)))
Wopfi = W.transpose((1, 2, 0, 3))
Wopfi = Wopfi.reshape((nO * nP, nF * nI))
dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
# Reuse the buffer
dWopfi = Wopfi
dWopfi.fill(0.0)
model.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
dWopfi = model.ops.gemm(dY, Xf, trans1=True)
dWopfi = dWopfi.reshape((nO, nP, nF, nI))
# (o, p, f, i) --> (f, o, p, i)
dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3)))
dWopfi = dWopfi.transpose((2, 0, 1, 3))
model.inc_grad("W", dWopfi)
return dXf.reshape((dXf.shape[0], nF, nI))

View File

@ -87,16 +87,16 @@ def build_text_classifier(
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
lower = HashEmbed(
nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout
nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
)
prefix = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout
nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11
)
suffix = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout
nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12
)
shape = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout
nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13
)
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])

View File

@ -154,16 +154,16 @@ def LayerNormalizedMaxout(width, maxout_pieces):
def MultiHashEmbed(
columns, width, rows, use_subwords, pretrained_vectors, mix, dropout
):
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=6)
if use_subwords:
prefix = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout
nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout, seed=7
)
suffix = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout
nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout, seed=8
)
shape = HashEmbed(
nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout
nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout, seed=9
)
if pretrained_vectors:
@ -192,7 +192,7 @@ def MultiHashEmbed(
@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(columns, width, rows, nM, nC, features, dropout):
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout)
norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=5)
chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC)
with Model.define_operators({">>": chain, "|": concatenate}):
embed_layer = chr_embed | features >> with_array(norm)
@ -263,20 +263,20 @@ def build_Tok2Vec_model(
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
norm = HashEmbed(
nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout,
nO=width, nV=embed_size, column=cols.index(NORM), dropout=None,
seed=0
)
if subword_features:
prefix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout,
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=None,
seed=1
)
suffix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout,
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=None,
seed=2
)
shape = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout,
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=None,
seed=3
)
else:
@ -296,7 +296,7 @@ def build_Tok2Vec_model(
>> Maxout(
nO=width,
nI=width * columns,
nP=maxout_pieces,
nP=3,
dropout=0.0,
normalize=True,
),
@ -309,7 +309,7 @@ def build_Tok2Vec_model(
>> Maxout(
nO=width,
nI=width * columns,
nP=maxout_pieces,
nP=3,
dropout=0.0,
normalize=True,
),
@ -322,7 +322,7 @@ def build_Tok2Vec_model(
>> Maxout(
nO=width,
nI=width * columns,
nP=maxout_pieces,
nP=3,
dropout=0.0,
normalize=True,
),
@ -335,7 +335,7 @@ def build_Tok2Vec_model(
reduce_dimensions = Maxout(
nO=width,
nI=nM * nC + width,
nP=maxout_pieces,
nP=3,
dropout=0.0,
normalize=True,
)

View File

@ -2,7 +2,7 @@ from thinc.api import Model, noop, use_ops, Linear
from ..syntax._parser_model import ParserStepModel
def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()):
"""Set up a stepwise transition-based model"""
if upper is None:
has_upper = False

View File

@ -272,7 +272,7 @@ cdef class Morphology:
@staticmethod
def feats_to_dict(feats):
if not feats:
if not feats or feats == Morphology.EMPTY_MORPH:
return {}
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
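For example, assuming the standard UD separators ("|" between features, "=" between field and values, "," between values):

    Morphology.feats_to_dict("Case=Nom|Number=Sing")
    # {'Case': 'Nom', 'Number': 'Sing'}
    Morphology.feats_to_dict("")   # {}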

View File

@ -3,7 +3,7 @@ cimport numpy as np
import numpy
import srsly
from thinc.api import to_categorical
from thinc.api import SequenceCategoricalCrossentropy
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
@ -85,13 +85,10 @@ class Morphologizer(Tagger):
doc.is_morphed = True
def get_loss(self, examples, scores):
scores = self.model.ops.flatten(scores)
tag_index = {tag: i for i, tag in enumerate(self.labels)}
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype="i")
guesses = scores.argmax(axis=1)
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
truths = []
for eg in examples:
eg_truths = []
pos_tags = eg.get_aligned("POS", as_string=True)
morphs = eg.get_aligned("MORPH", as_string=True)
for i in range(len(morphs)):
@ -104,20 +101,11 @@ class Morphologizer(Tagger):
morph = self.vocab.strings[self.vocab.morphology.add(feats)]
if morph == "":
morph = Morphology.EMPTY_MORPH
if morph is None:
correct[idx] = guesses[idx]
elif morph in tag_index:
correct[idx] = tag_index[morph]
else:
correct[idx] = 0
known_labels[idx] = 0.
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
d_scores *= self.model.ops.asarray(known_labels)
loss = (d_scores**2).sum()
docs = [eg.predicted for eg in examples]
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
eg_truths.append(morph)
truths.append(eg_truths)
d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
def to_bytes(self, exclude=tuple()):

View File

@ -58,12 +58,8 @@ class Pipe(object):
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
predictions = self.predict([doc])
if isinstance(predictions, tuple) and len(predictions) == 2:
scores, tensors = predictions
self.set_annotations([doc], scores, tensors=tensors)
else:
self.set_annotations([doc], predictions)
scores = self.predict([doc])
self.set_annotations([doc], scores)
return doc
def pipe(self, stream, batch_size=128):
@ -73,12 +69,8 @@ class Pipe(object):
and `set_annotations()` methods.
"""
for docs in util.minibatch(stream, size=batch_size):
predictions = self.predict(docs)
if isinstance(predictions, tuple) and len(tuple) == 2:
scores, tensors = predictions
self.set_annotations(docs, scores, tensors=tensors)
else:
self.set_annotations(docs, predictions)
scores = self.predict(docs)
self.set_annotations(docs, scores)
yield from docs
def predict(self, docs):
@ -87,7 +79,7 @@ class Pipe(object):
"""
raise NotImplementedError
def set_annotations(self, docs, scores, tensors=None):
def set_annotations(self, docs, scores):
"""Modify a batch of documents, using pre-computed scores."""
raise NotImplementedError
@ -281,9 +273,10 @@ class Tagger(Pipe):
idx += 1
doc.is_tagged = True
def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
try:
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
@ -303,11 +296,11 @@ class Tagger(Pipe):
if sgd not in (None, False):
self.model.finish_update(sgd)
if losses is not None:
losses[self.name] += loss
losses[self.name] += loss
if set_annotations:
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, self._scores2guesses(tag_scores))
return losses
def rehearse(self, examples, drop=0., sgd=None, losses=None):
"""Perform a 'rehearsal' update, where we try to match the output of
@ -334,7 +327,7 @@ class Tagger(Pipe):
losses[self.name] += (gradient**2).sum()
def get_loss(self, examples, scores):
loss_func = SequenceCategoricalCrossentropy(names=self.labels)
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss):
@ -521,29 +514,23 @@ class SentenceRecognizer(Tagger):
doc.c[j].sent_start = -1
def get_loss(self, examples, scores):
scores = self.model.ops.flatten(scores)
tag_index = range(len(self.labels))
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype="i")
guesses = scores.argmax(axis=1)
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
labels = self.labels
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
truths = []
for eg in examples:
sent_starts = eg.get_aligned("sent_start")
for sent_start in sent_starts:
if sent_start is None:
correct[idx] = guesses[idx]
elif sent_start in tag_index:
correct[idx] = sent_start
eg_truth = []
for x in eg.get_aligned("sent_start"):
if x is None:
eg_truth.append(None)
elif x == 1:
eg_truth.append(labels[1])
else:
correct[idx] = 0
known_labels[idx] = 0.
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
d_scores *= self.model.ops.asarray(known_labels)
loss = (d_scores**2).sum()
docs = [eg.predicted for eg in examples]
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
# anything other than 1: 0, -1, or -1 as uint64
eg_truth.append(labels[0])
truths.append(eg_truth)
d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
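In other words, for an aligned sent_start sequence like [1, 0, -1, None], the truth values passed to the loss become [labels[1], labels[0], labels[0], None].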
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
@ -641,7 +628,7 @@ class MultitaskObjective(Tagger):
def labels(self, value):
self.cfg["labels"] = value
def set_annotations(self, docs, dep_ids, tensors=None):
def set_annotations(self, docs, dep_ids):
pass
def begin_training(self, get_examples=lambda: [], pipeline=None,
@ -738,7 +725,7 @@ class ClozeMultitask(Pipe):
self.cfg = cfg
self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config
def set_annotations(self, docs, dep_ids, tensors=None):
def set_annotations(self, docs, dep_ids):
pass
def begin_training(self, get_examples=lambda: [], pipeline=None,
@ -767,7 +754,7 @@ class ClozeMultitask(Pipe):
loss = self.distance.get_loss(prediction, target)
return loss, gradient
def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
pass
def rehearse(self, examples, drop=0., sgd=None, losses=None):
@ -815,8 +802,8 @@ class TextCategorizer(Pipe):
def pipe(self, stream, batch_size=128):
for docs in util.minibatch(stream, size=batch_size):
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensors=tensors)
scores = self.predict(docs)
self.set_annotations(docs, scores)
yield from docs
def predict(self, docs):
@ -826,22 +813,25 @@ class TextCategorizer(Pipe):
# Handle cases where there are no tokens in any docs.
xp = get_array_module(tensors)
scores = xp.zeros((len(docs), len(self.labels)))
return scores, tensors
return scores
scores = self.model.predict(docs)
scores = self.model.ops.asarray(scores)
return scores, tensors
return scores
def set_annotations(self, docs, scores, tensors=None):
def set_annotations(self, docs, scores):
for i, doc in enumerate(docs):
for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j])
def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
try:
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
# Handle cases where there are no tokens in any docs.
return
return losses
except AttributeError:
types = set([type(eg) for eg in examples])
raise TypeError(Errors.E978.format(name="TextCategorizer", method="update", types=types))
@ -853,12 +843,11 @@ class TextCategorizer(Pipe):
bp_scores(d_scores)
if sgd is not None:
self.model.finish_update(sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
losses[self.name] += loss
if set_annotations:
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, scores=scores)
return losses
def rehearse(self, examples, drop=0., sgd=None, losses=None):
if self._rehearsal_model is None:
@ -1082,12 +1071,13 @@ class EntityLinker(Pipe):
sgd = self.create_optimizer()
return sgd
def update(self, examples, state=None, set_annotations=False, drop=0.0, sgd=None, losses=None):
def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None):
self.require_kb()
if losses is not None:
losses.setdefault(self.name, 0.0)
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
if not examples:
return 0
return losses
sentence_docs = []
try:
docs = [eg.predicted for eg in examples]
@ -1130,20 +1120,19 @@ class EntityLinker(Pipe):
return 0.0
sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
loss, d_scores = self.get_similarity_loss(
scores=sentence_encodings,
sentence_encodings=sentence_encodings,
examples=examples
)
bp_context(d_scores)
if sgd is not None:
self.model.finish_update(sgd)
if losses is not None:
losses[self.name] += loss
losses[self.name] += loss
if set_annotations:
self.set_annotations(docs, predictions)
return loss
return losses
def get_similarity_loss(self, examples, scores):
def get_similarity_loss(self, examples, sentence_encodings):
entity_encodings = []
for eg in examples:
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
@ -1155,41 +1144,23 @@ class EntityLinker(Pipe):
entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
if scores.shape != entity_encodings.shape:
if sentence_encodings.shape != entity_encodings.shape:
raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up"))
gradients = self.distance.get_grad(scores, entity_encodings)
loss = self.distance.get_loss(scores, entity_encodings)
gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
loss = self.distance.get_loss(sentence_encodings, entity_encodings)
loss = loss / len(entity_encodings)
return loss, gradients
def get_loss(self, examples, scores):
cats = []
for eg in examples:
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
for ent in eg.predicted.ents:
kb_id = kb_ids[ent.start]
if kb_id:
cats.append([1.0])
cats = self.model.ops.asarray(cats, dtype="float32")
if len(scores) != len(cats):
raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up"))
d_scores = (scores - cats)
loss = (d_scores ** 2).sum()
loss = loss / len(cats)
return loss, d_scores
def __call__(self, doc):
kb_ids, tensors = self.predict([doc])
self.set_annotations([doc], kb_ids, tensors=tensors)
kb_ids = self.predict([doc])
self.set_annotations([doc], kb_ids)
return doc
def pipe(self, stream, batch_size=128):
for docs in util.minibatch(stream, size=batch_size):
kb_ids, tensors = self.predict(docs)
self.set_annotations(docs, kb_ids, tensors=tensors)
kb_ids = self.predict(docs)
self.set_annotations(docs, kb_ids)
yield from docs
def predict(self, docs):
@ -1197,10 +1168,9 @@ class EntityLinker(Pipe):
self.require_kb()
entity_count = 0
final_kb_ids = []
final_tensors = []
if not docs:
return final_kb_ids, final_tensors
return final_kb_ids
if isinstance(docs, Doc):
docs = [docs]
@ -1234,21 +1204,18 @@ class EntityLinker(Pipe):
if to_discard and ent.label_ in to_discard:
# ignoring this entity - setting to NIL
final_kb_ids.append(self.NIL)
final_tensors.append(sentence_encoding)
else:
candidates = self.kb.get_candidates(ent.text)
if not candidates:
# no prediction possible for this entity - setting to NIL
final_kb_ids.append(self.NIL)
final_tensors.append(sentence_encoding)
elif len(candidates) == 1:
# shortcut for efficiency reasons: take the 1 candidate
# TODO: thresholding
final_kb_ids.append(candidates[0].entity_)
final_tensors.append(sentence_encoding)
else:
random.shuffle(candidates)
@ -1277,14 +1244,13 @@ class EntityLinker(Pipe):
best_index = scores.argmax().item()
best_candidate = candidates[best_index]
final_kb_ids.append(best_candidate.entity_)
final_tensors.append(sentence_encoding)
if not (len(final_tensors) == len(final_kb_ids) == entity_count):
if len(final_kb_ids) != entity_count:
raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length"))
return final_kb_ids, final_tensors
return final_kb_ids
def set_annotations(self, docs, kb_ids, tensors=None):
def set_annotations(self, docs, kb_ids):
count_ents = len([ent for doc in docs for ent in doc.ents])
if count_ents != len(kb_ids):
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
@ -1400,11 +1366,7 @@ class Sentencizer(Pipe):
def pipe(self, stream, batch_size=128):
for docs in util.minibatch(stream, size=batch_size):
predictions = self.predict(docs)
if isinstance(predictions, tuple) and len(tuple) == 2:
scores, tensors = predictions
self.set_annotations(docs, scores, tensors=tensors)
else:
self.set_annotations(docs, predictions)
self.set_annotations(docs, predictions)
yield from docs
def predict(self, docs):
@ -1435,7 +1397,7 @@ class Sentencizer(Pipe):
guesses.append(doc_guesses)
return guesses
def set_annotations(self, docs, batch_tag_ids, tensors=None):
def set_annotations(self, docs, batch_tag_ids):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc

View File

@ -57,7 +57,7 @@ class SimpleNER(Pipe):
scores = self.model.predict(docs)
return scores
def set_annotations(self, docs: List[Doc], scores: List[Floats2d], tensors=None):
def set_annotations(self, docs: List[Doc], scores: List[Floats2d]):
"""Set entities on a batch of documents from a batch of scores."""
tag_names = self.get_tag_names()
for i, doc in enumerate(docs):
@ -67,9 +67,12 @@ class SimpleNER(Pipe):
tags = iob_to_biluo(tags)
doc.ents = spans_from_biluo_tags(doc, tags)
def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None):
if losses is None:
losses = {}
losses.setdefault("ner", 0.0)
if not any(_has_ner(eg) for eg in examples):
return 0
return losses
docs = [eg.predicted for eg in examples]
set_dropout_rate(self.model, drop)
scores, bp_scores = self.model.begin_update(docs)
@ -79,10 +82,8 @@ class SimpleNER(Pipe):
self.set_annotations(docs, scores)
if sgd is not None:
self.model.finish_update(sgd)
if losses is not None:
losses.setdefault("ner", 0.0)
losses["ner"] += loss
return loss
losses["ner"] += loss
return losses
def get_loss(self, examples, scores):
loss = 0

View File

@ -83,12 +83,14 @@ class Tok2Vec(Pipe):
assert tokvecs.shape[0] == len(doc)
doc.tensor = tokvecs
def update(self, examples, drop=0.0, sgd=None, losses=None, set_annotations=False):
def update(self, examples, *, drop=0.0, sgd=None, losses=None, set_annotations=False):
"""Update the model.
examples (iterable): A batch of examples
examples (Iterable[Example]): A batch of examples
drop (float): The dropout rate.
sgd (callable): An optimizer.
RETURNS (dict): Results from the update.
sgd (Optimizer): An optimizer.
losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
set_annotations (bool): Whether or not to update the examples with the predictions.
RETURNS (Dict[str, float]): The updated losses dictionary
"""
if losses is None:
losses = {}
@ -124,6 +126,7 @@ class Tok2Vec(Pipe):
self.listeners[-1].receive(batch_id, tokvecs, backprop)
if set_annotations:
self.set_annotations(docs, tokvecs)
return losses
def get_loss(self, docs, golds, scores):
pass

View File

@ -222,7 +222,7 @@ class TrainingSchema(BaseModel):
class ProjectConfigAsset(BaseModel):
# fmt: off
dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: StrictStr = Field(..., title="URL of asset")
url: Optional[StrictStr] = Field(None, title="URL of asset")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
# fmt: on
@ -232,9 +232,10 @@ class ProjectConfigCommand(BaseModel):
name: StrictStr = Field(..., title="Name of command")
help: Optional[StrictStr] = Field(None, title="Command description")
script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
deps: List[StrictStr] = Field([], title="Data Version Control dependencies")
outputs: List[StrictStr] = Field([], title="Data Version Control outputs")
outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)")
deps: List[StrictStr] = Field([], title="File dependencies required by this command")
outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")
# fmt: on
class Config:
@ -246,7 +247,7 @@ class ProjectConfigSchema(BaseModel):
# fmt: off
variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
assets: List[ProjectConfigAsset] = Field([], title="Data assets")
run: List[StrictStr] = Field([], title="Names of project commands to execute, in order")
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
# fmt: on

View File

@ -219,9 +219,11 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
class ParserStepModel(Model):
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True):
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
dropout=0.1):
Model.__init__(self, name="parser_step_model", forward=step_forward)
self.attrs["has_upper"] = has_upper
self.attrs["dropout_rate"] = dropout
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
if layers[1].get_dim("nP") >= 2:
activation = "maxout"
@ -243,6 +245,13 @@ class ParserStepModel(Model):
for class_ in unseen_classes:
self._class_mask[class_] = 0.
def clear_memory(self):
del self.tokvecs
del self.bp_tokvecs
del self.state2vec
del self.backprops
del self._class_mask
@property
def nO(self):
if self.attrs["has_upper"]:
@ -271,6 +280,19 @@ class ParserStepModel(Model):
c_ids += ids.shape[1]
return ids
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
if isinstance(self.state2vec.ops, CupyOps) \
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
self.backprops.append((
util.get_async(self.cuda_stream, token_ids),
util.get_async(self.cuda_stream, d_vector),
get_d_tokvecs
))
else:
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
def finish_steps(self, golds):
# Add a padding vector to the d_tokvecs gradient, so that missing
# values don't affect the real gradient.
@ -289,11 +311,17 @@ class ParserStepModel(Model):
self.bp_tokvecs(d_tokvecs[:-1])
return d_tokvecs
NUMPY_OPS = NumpyOps()
def step_forward(model: ParserStepModel, states, is_train):
token_ids = model.get_token_ids(states)
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
mask = None
if model.attrs["has_upper"]:
dropout_rate = model.attrs["dropout_rate"]
if is_train and dropout_rate > 0:
mask = NUMPY_OPS.get_dropout_mask(vector.shape, dropout_rate)
vector *= mask
scores, get_d_vector = model.vec2scores(vector, is_train)
else:
scores = NumpyOps().asarray(vector)
@ -305,16 +333,9 @@ def step_forward(model: ParserStepModel, states, is_train):
# Zero vectors for unseen classes
d_scores *= model._class_mask
d_vector = get_d_vector(d_scores)
if isinstance(model.state2vec.ops, CupyOps) \
and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
model.backprops.append((
util.get_async(model.cuda_stream, token_ids),
util.get_async(model.cuda_stream, d_vector),
get_d_tokvecs
))
else:
model.backprops.append((token_ids, d_vector, get_d_tokvecs))
if mask is not None:
d_vector *= mask
model.backprop_step(token_ids, d_vector, get_d_tokvecs)
return None
return scores, backprop_parser_step
@ -437,7 +458,7 @@ cdef class precompute_hiddens:
sum_state_features(<float*>state_vector.data,
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector = state_vector + self.bias
state_vector += self.bias
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
def backward(d_state_vector_ids):

View File

@ -65,7 +65,6 @@ cdef class Parser:
self.set_output(self.moves.n_moves)
self.cfg = dict(cfg)
self.cfg.setdefault("update_with_oracle_cut_size", 100)
self.cfg.setdefault("normalize_gradients_with_batch_size", True)
self._multitasks = []
for multitask in cfg.get("multitasks", []):
self.add_multitask_objective(multitask)
@ -154,7 +153,7 @@ cdef class Parser:
doc (Doc): The document to be processed.
"""
states = self.predict([doc])
self.set_annotations([doc], states, tensors=None)
self.set_annotations([doc], states)
return doc
def pipe(self, docs, int batch_size=256):
@ -171,7 +170,7 @@ cdef class Parser:
for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)):
subbatch = list(subbatch)
parse_states = self.predict(subbatch)
self.set_annotations(subbatch, parse_states, tensors=None)
self.set_annotations(subbatch, parse_states)
yield from batch_in_order
def predict(self, docs):
@ -201,6 +200,8 @@ cdef class Parser:
with nogil:
self._parseC(&states[0],
weights, sizes)
model.clear_memory()
del model
return batch
cdef void _parseC(self, StateC** states,
@ -223,7 +224,7 @@ cdef class Parser:
unfinished.clear()
free_activations(&activations)
def set_annotations(self, docs, states, tensors=None):
def set_annotations(self, docs, states):
cdef StateClass state
cdef Doc doc
for i, (state, doc) in enumerate(zip(states, docs)):
@ -264,7 +265,7 @@ cdef class Parser:
states[i].push_hist(guess)
free(is_valid)
def update(self, examples, drop=0., set_annotations=False, sgd=None, losses=None):
def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None):
cdef StateClass state
if losses is None:
losses = {}
@ -280,11 +281,12 @@ cdef class Parser:
[eg.predicted for eg in examples])
if self.cfg["update_with_oracle_cut_size"] >= 1:
# Chop sequences into lengths of this many transitions, to make the
# batch uniform length. We randomize this to overfit less.
# batch uniform length.
# We used to randomize this, but it's not clear that actually helps?
cut_size = self.cfg["update_with_oracle_cut_size"]
states, golds, max_steps = self._init_gold_batch(
examples,
max_length=numpy.random.choice(range(5, cut_size))
max_length=cut_size
)
else:
states, golds, _ = self.moves.init_gold_batch(examples)
@ -292,24 +294,15 @@ cdef class Parser:
if not states:
return losses
all_states = list(states)
states_golds = zip(states, golds)
for _ in range(max_steps):
if not states_golds:
break
states_golds = list(zip(states, golds))
while states_golds:
states, golds = zip(*states_golds)
scores, backprop = model.begin_update(states)
d_scores = self.get_batch_loss(states, golds, scores, losses)
if self.cfg["normalize_gradients_with_batch_size"]:
# We have to be very careful how we do this, because of the way we
# cut up the batch. We subdivide long sequences. If we normalize
# naively, we end up normalizing by sequence length, which
# is bad: that would mean that states in long sequences
# consistently get smaller gradients. Imagine if we have two
# sequences, one length 1000, one length 20. If we cut up
# the 1k sequence so that we have a "batch" of 50 subsequences,
# we don't want the gradients to get 50 times smaller!
d_scores /= n_examples
# Note that the gradient isn't normalized by the batch size
# here, because our "samples" are really the states...But we
# can't normalize by the number of states either, as then we'd
# be getting smaller gradients for states in long sequences.
backprop(d_scores)
# Follow the predicted action
self.transition_states(states, scores)
@ -321,6 +314,13 @@ cdef class Parser:
if set_annotations:
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, all_states)
# Ugh, this is annoying. If we're working on GPU, we want to free the
# memory ASAP. It seems that Python doesn't necessarily get around to
# removing these in time if we don't explicitly delete? It's confusing.
del backprop
del backprop_tok2vec
model.clear_memory()
del model
return losses
def rehearse(self, examples, sgd=None, losses=None, **cfg):
@ -344,7 +344,7 @@ cdef class Parser:
set_dropout_rate(self._rehearsal_model, 0.0)
set_dropout_rate(self.model, 0.0)
tutor, _ = self._rehearsal_model.begin_update(docs)
model, finish_update = self.model.begin_update(docs)
model, backprop_tok2vec = self.model.begin_update(docs)
n_scores = 0.
loss = 0.
while states:
@ -360,10 +360,16 @@ cdef class Parser:
states = [state for state in states if not state.is_final()]
n_scores += d_scores.size
# Do the backprop
finish_update(docs)
backprop_tok2vec(docs)
if sgd is not None:
self.model.finish_update(sgd)
losses[self.name] += loss / n_scores
del backprop
del backprop_tok2vec
model.clear_memory()
tutor.clear_memory()
del model
del tutor
return losses
def get_gradients(self):
@ -407,6 +413,7 @@ cdef class Parser:
cpu_log_loss(c_d_scores,
costs, is_valid, &scores[i, 0], d_scores.shape[1])
c_d_scores += d_scores.shape[1]
# Note that we don't normalize this. See comment in update() for why.
if losses is not None:
losses.setdefault(self.name, 0.)
losses[self.name] += (d_scores**2).sum()
@ -525,21 +532,25 @@ cdef class Parser:
StateClass state
Transition action
all_states = self.moves.init_batch([eg.predicted for eg in examples])
states = []
golds = []
kept = []
max_length_seen = 0
for state, eg in zip(all_states, examples):
if self.moves.has_gold(eg) and not state.is_final():
gold = self.moves.init_gold(state, eg)
oracle_actions = self.moves.get_oracle_sequence_from_state(
state.copy(), gold)
kept.append((eg, state, gold, oracle_actions))
min_length = min(min_length, len(oracle_actions))
max_length_seen = max(max_length, len(oracle_actions))
if len(eg.x) < max_length:
states.append(state)
golds.append(gold)
else:
oracle_actions = self.moves.get_oracle_sequence_from_state(
state.copy(), gold)
kept.append((eg, state, gold, oracle_actions))
min_length = min(min_length, len(oracle_actions))
max_length_seen = max(max_length, len(oracle_actions))
if not kept:
return [], [], 0
return states, golds, 0
max_length = max(min_length, min(max_length, max_length_seen))
states = []
golds = []
cdef int clas
max_moves = 0
for eg, state, gold, oracle_actions in kept:

View File

@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
assert contains_cycle(tree) is None
assert contains_cycle(cyclic_tree) == set([3, 4, 5])
assert contains_cycle(cyclic_tree) == {3, 4, 5}
assert contains_cycle(partial_tree) is None
assert contains_cycle(multirooted_tree) is None

View File

@ -198,10 +198,10 @@ def test_overfitting_IO():
nlp.add_pipe(parser)
optimizer = nlp.begin_training()
for i in range(50):
for i in range(100):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["parser"] < 0.00001
assert losses["parser"] < 0.0001
# test the trained model
test_text = "I like securities."

View File

@ -38,6 +38,11 @@ def test_overfitting_IO():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# add some cases where SENT_START == -1
train_examples[0].reference[10].is_sent_start = False
train_examples[1].reference[1].is_sent_start = False
train_examples[1].reference[11].is_sent_start = False
nlp.add_pipe(senter)
optimizer = nlp.begin_training()

View File

@ -84,7 +84,7 @@ def test_overfitting_IO():
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
fix_random_seed(0)
nlp = English()
textcat = nlp.create_pipe("textcat")
textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True})
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))

View File

@ -23,6 +23,7 @@ def test_issue2070():
assert len(doc) == 11
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2179():
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
nlp = Italian()
@ -134,6 +135,7 @@ def test_issue2464(en_vocab):
assert len(matches) == 3
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2482():
"""Test we can serialize and deserialize a blank NER or parser model."""
nlp = Italian()

View File

@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls):
assert doc[0].like_num
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2800():
"""Test issue that arises when too many labels are added to NER model.
Used to cause segfault.
"""
nlp = English()
train_data = []
train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})])
train_data.extend(
[Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
)
entity_types = [str(i) for i in range(1000)]
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)

View File

@ -88,6 +88,7 @@ def test_issue3199():
assert list(doc[0:3].noun_chunks) == []
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3209():
"""Test issue that occurred in spaCy nightly where NER labels were being
mapped to classes incorrectly after loading the model, when the labels

View File

@ -0,0 +1,472 @@
import pytest
from spacy.language import Language
from spacy.vocab import Vocab
from spacy.pipeline import EntityRuler, DependencyParser
from spacy.pipeline.defaults import default_parser
from spacy import displacy, load
from spacy.displacy import parse_deps
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher, PhraseMatcher
from spacy.errors import MatchPatternError
from spacy.util import minibatch
from spacy.gold import Example
from spacy.lang.hi import Hindi
from spacy.lang.es import Spanish
from spacy.lang.en import English
from spacy.attrs import IS_ALPHA
from thinc.api import compounding
import spacy
import srsly
import numpy
from ..util import make_tempdir, get_doc
@pytest.mark.parametrize("word", ["don't", "dont", "I'd", "Id"])
def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop
def test_issue_3526_1(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
ruler_bytes = ruler.to_bytes()
assert len(ruler) == len(patterns)
assert len(ruler.labels) == 4
assert ruler.overwrite
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(ruler_bytes)
assert len(new_ruler) == len(ruler)
assert len(new_ruler.labels) == 4
assert new_ruler.overwrite == ruler.overwrite
assert new_ruler.ent_id_sep == ruler.ent_id_sep
def test_issue_3526_2(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(bytes_old_style)
assert len(new_ruler) == len(ruler)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert new_ruler.overwrite is not ruler.overwrite
def test_issue_3526_3(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
with make_tempdir() as tmpdir:
out_file = tmpdir / "entity_ruler"
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
new_ruler = EntityRuler(nlp).from_disk(out_file)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert len(new_ruler) == len(ruler)
assert new_ruler.overwrite is not ruler.overwrite
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue_3526_4(en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.add_pipe(ruler)
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir)
ruler = nlp.get_pipe("entity_ruler")
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert ruler.overwrite is True
nlp2 = load(tmpdir)
new_ruler = nlp2.get_pipe("entity_ruler")
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert new_ruler.overwrite is True
def test_issue3531():
"""Test that displaCy renderer doesn't require "settings" key."""
example_dep = {
"words": [
{"text": "But", "tag": "CCONJ"},
{"text": "Google", "tag": "PROPN"},
{"text": "is", "tag": "VERB"},
{"text": "starting", "tag": "VERB"},
{"text": "from", "tag": "ADP"},
{"text": "behind.", "tag": "ADV"},
],
"arcs": [
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
],
}
example_ent = {
"text": "But Google is starting from behind.",
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
}
dep_html = displacy.render(example_dep, style="dep", manual=True)
assert dep_html
ent_html = displacy.render(example_ent, style="ent", manual=True)
assert ent_html
def test_issue3540(en_vocab):
words = ["I", "live", "in", "NewYork", "right", "now"]
tensor = numpy.asarray(
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
dtype="f",
)
doc = Doc(en_vocab, words=words)
doc.tensor = tensor
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_1 = [token.vector for token in doc]
assert len(vectors_1) == len(doc)
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_2 = [token.vector for token in doc]
assert len(vectors_2) == len(doc)
assert vectors_1[0].tolist() == vectors_2[0].tolist()
assert vectors_1[1].tolist() == vectors_2[1].tolist()
assert vectors_1[2].tolist() == vectors_2[2].tolist()
assert vectors_1[4].tolist() == vectors_2[5].tolist()
assert vectors_1[5].tolist() == vectors_2[6].tolist()
def test_issue3549(en_vocab):
"""Test that match pattern validation doesn't raise on empty errors."""
matcher = Matcher(en_vocab, validate=True)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("GOOD", [pattern])
with pytest.raises(MatchPatternError):
matcher.add("BAD", [[{"X": "Y"}]])
@pytest.mark.xfail
def test_issue3555(en_vocab):
"""Test that custom extensions with default None don't break matcher."""
Token.set_extension("issue3555", default=None)
matcher = Matcher(en_vocab)
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["have", "apple"])
matcher(doc)
def test_issue3611():
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training(X=x_train, Y=y_train)
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
def test_issue3625():
"""Test that default punctuation rules applies to hindi unicode characters"""
nlp = Hindi()
doc = nlp("hi. how हुए. होटल, होटल")
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
assert [token.text for token in doc] == expected
def test_issue3803():
"""Test that spanish num-like tokens have True for like_num attribute."""
nlp = Spanish()
text = "2 dos 1000 mil 12 doce"
doc = nlp(text)
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" not in parser.labels
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
"learn_tokens": True,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" in parser.labels
def test_issue3839(en_vocab):
"""Test that match IDs returned by the matcher are correct, are in the string """
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
matcher = Matcher(en_vocab)
match_id = "PATTERN"
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
matcher.add(match_id, [pattern1])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
matcher = Matcher(en_vocab)
matcher.add(match_id, [pattern2])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
@pytest.mark.parametrize(
"sentence",
[
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
],
)
def test_issue3869(sentence):
"""Test that the Doc's count_by function works consistently"""
nlp = English()
doc = nlp(sentence)
count = 0
for token in doc:
count += token.is_alpha
assert count == doc.count_by(IS_ALPHA).get(1, 0)
def test_issue3879(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
assert len(doc) == 5
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
matcher = Matcher(en_vocab)
matcher.add("TEST", [pattern])
assert len(matcher(doc)) == 2 # matches twice because of a false-positive match on 'is a test'
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
"""Test that `nlp.pipe()` works when an empty string ends the batch.
Fixed in v7.0.5 of Thinc.
"""
texts = ["hello", "world", "", ""]
nlp = English()
nlp.add_pipe(nlp.create_pipe("parser"))
nlp.add_pipe(nlp.create_pipe("ner"))
nlp.add_pipe(nlp.create_pipe("tagger"))
nlp.get_pipe("parser").add_label("dep")
nlp.get_pipe("ner").add_label("PERSON")
nlp.get_pipe("tagger").add_label("NN")
nlp.begin_training()
for doc in nlp.pipe(texts):
pass
def test_issue3882(en_vocab):
"""Test that displaCy doesn't serialize the doc.user_data when making a
copy of the Doc.
"""
doc = Doc(en_vocab, words=["Hello", "world"])
doc.is_parsed = True
doc.user_data["test"] = set()
parse_deps(doc)
def test_issue3951(en_vocab):
"""Test that combinations of optional rules are matched correctly."""
matcher = Matcher(en_vocab)
pattern = [
{"LOWER": "hello"},
{"LOWER": "this", "OP": "?"},
{"OP": "?"},
{"LOWER": "world"},
]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
matches = matcher(doc)
assert len(matches) == 0
def test_issue3959():
""" Ensure that a modified pos attribute is serialized correctly."""
nlp = English()
doc = nlp(
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
)
assert doc[0].pos_ == ""
doc[0].pos_ = "NOUN"
assert doc[0].pos_ == "NOUN"
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
with make_tempdir() as tmp_dir:
file_path = tmp_dir / "my_doc"
doc.to_disk(file_path)
doc2 = nlp("")
doc2.from_disk(file_path)
assert doc2[0].pos_ == "NOUN"
def test_issue3962(en_vocab):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
# fmt: off
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
# fmt: on
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
span2 = doc[1:5] # "jests at scars ,"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
# head set to itself, being the new artificial root
assert doc2[0].head.text == "jests"
assert doc2[0].dep_ == "dep"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests" # head set to the new artificial root
assert doc2[3].dep_ == "dep"
# We should still have 1 sentence
assert len(list(doc2.sents)) == 1
span3 = doc[6:9] # "never felt a"
doc3 = span3.as_doc()
doc3_json = doc3.to_json()
assert doc3_json
assert doc3[0].head.text == "felt"
assert doc3[0].dep_ == "neg"
assert doc3[1].head.text == "felt"
assert doc3[1].dep_ == "ROOT"
assert doc3[2].head.text == "felt" # head set to ancestor
assert doc3[2].dep_ == "dep"
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
assert len(list(doc3.sents)) == 1
def test_issue3962_long(en_vocab):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
# fmt: off
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
# fmt: on
two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
span2 = two_sent_doc[1:7] # "jests at scars. They never"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
# head set to itself, being the new artificial root (in sentence 1)
assert doc2[0].head.text == "jests"
assert doc2[0].dep_ == "ROOT"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests"
assert doc2[3].dep_ == "punct"
# head set to itself, being the new artificial root (in sentence 2)
assert doc2[4].head.text == "They"
assert doc2[4].dep_ == "dep"
# head set to the new artificial head (in sentence 2)
assert doc2[4].head.text == "They"
assert doc2[4].dep_ == "dep"
# We should still have 2 sentences
sents = list(doc2.sents)
assert len(sents) == 2
assert sents[0].text == "jests at scars ."
assert sents[1].text == "They never"
def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
"""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
matches = matcher(doc)
assert len(matches) == 2
# We should have a match for each of the two rules
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
assert "A" in found_ids
assert "B" in found_ids

View File

@ -1,8 +0,0 @@
import pytest
@pytest.mark.parametrize("word", ["don't", "dont", "I'd", "Id"])
def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop

View File

@ -1,85 +0,0 @@
import pytest
from spacy.tokens import Span
from spacy.language import Language
from spacy.pipeline import EntityRuler
from spacy import load
import srsly
from ..util import make_tempdir
@pytest.fixture
def patterns():
return [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
@pytest.fixture
def add_ent():
def add_ent_component(doc):
doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
return doc
return add_ent_component
def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
ruler_bytes = ruler.to_bytes()
assert len(ruler) == len(patterns)
assert len(ruler.labels) == 4
assert ruler.overwrite
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(ruler_bytes)
assert len(new_ruler) == len(ruler)
assert len(new_ruler.labels) == 4
assert new_ruler.overwrite == ruler.overwrite
assert new_ruler.ent_id_sep == ruler.ent_id_sep
def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(bytes_old_style)
assert len(new_ruler) == len(ruler)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert new_ruler.overwrite is not ruler.overwrite
def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
with make_tempdir() as tmpdir:
out_file = tmpdir / "entity_ruler"
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
new_ruler = EntityRuler(nlp).from_disk(out_file)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert len(new_ruler) == len(ruler)
assert new_ruler.overwrite is not ruler.overwrite
def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.add_pipe(ruler)
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir)
ruler = nlp.get_pipe("entity_ruler")
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert ruler.overwrite is True
nlp2 = load(tmpdir)
new_ruler = nlp2.get_pipe("entity_ruler")
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert new_ruler.overwrite is True

View File

@ -1,30 +0,0 @@
from spacy import displacy
def test_issue3531():
"""Test that displaCy renderer doesn't require "settings" key."""
example_dep = {
"words": [
{"text": "But", "tag": "CCONJ"},
{"text": "Google", "tag": "PROPN"},
{"text": "is", "tag": "VERB"},
{"text": "starting", "tag": "VERB"},
{"text": "from", "tag": "ADP"},
{"text": "behind.", "tag": "ADV"},
],
"arcs": [
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
],
}
example_ent = {
"text": "But Google is starting from behind.",
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
}
dep_html = displacy.render(example_dep, style="dep", manual=True)
assert dep_html
ent_html = displacy.render(example_ent, style="ent", manual=True)
assert ent_html

View File

@ -1,44 +0,0 @@
from spacy.tokens import Doc
import numpy as np
def test_issue3540(en_vocab):
words = ["I", "live", "in", "NewYork", "right", "now"]
tensor = np.asarray(
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
dtype="f",
)
doc = Doc(en_vocab, words=words)
doc.tensor = tensor
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_1 = [token.vector for token in doc]
assert len(vectors_1) == len(doc)
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_2 = [token.vector for token in doc]
assert len(vectors_2) == len(doc)
assert vectors_1[0].tolist() == vectors_2[0].tolist()
assert vectors_1[1].tolist() == vectors_2[1].tolist()
assert vectors_1[2].tolist() == vectors_2[2].tolist()
assert vectors_1[4].tolist() == vectors_2[5].tolist()
assert vectors_1[5].tolist() == vectors_2[6].tolist()

View File

@ -1,12 +0,0 @@
import pytest
from spacy.matcher import Matcher
from spacy.errors import MatchPatternError
def test_issue3549(en_vocab):
"""Test that match pattern validation doesn't raise on empty errors."""
matcher = Matcher(en_vocab, validate=True)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("GOOD", [pattern])
with pytest.raises(MatchPatternError):
matcher.add("BAD", [[{"X": "Y"}]])

View File

@ -1,14 +0,0 @@
import pytest
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher
@pytest.mark.xfail
def test_issue3555(en_vocab):
"""Test that custom extensions with default None don't break matcher."""
Token.set_extension("issue3555", default=None)
matcher = Matcher(en_vocab)
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["have", "apple"])
matcher(doc)

View File

@ -1,45 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example
def test_issue3611():
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training(X=x_train, Y=y_train)
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)

View File

@ -1,9 +0,0 @@
from spacy.lang.hi import Hindi
def test_issue3625():
"""Test that default punctuation rules applies to hindi unicode characters"""
nlp = Hindi()
doc = nlp("hi. how हुए. होटल, होटल")
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
assert [token.text for token in doc] == expected

View File

@ -1,10 +0,0 @@
from spacy.lang.es import Spanish
def test_issue3803():
"""Test that spanish num-like tokens have True for like_num attribute."""
nlp = Spanish()
text = "2 dos 1000 mil 12 doce"
doc = nlp(text)
assert [t.like_num for t in doc] == [True, True, True, True, True, True]

View File

@ -1,34 +0,0 @@
from spacy.pipeline.pipes import DependencyParser
from spacy.vocab import Vocab
from spacy.pipeline.defaults import default_parser
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" not in parser.labels
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
"learn_tokens": True,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" in parser.labels

View File

@ -1,18 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue3839(en_vocab):
"""Test that match IDs returned by the matcher are correct, are in the string """
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
matcher = Matcher(en_vocab)
match_id = "PATTERN"
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
matcher.add(match_id, [pattern1])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
matcher = Matcher(en_vocab)
matcher.add(match_id, [pattern2])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]

View File

@ -1,25 +0,0 @@
import pytest
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
@pytest.mark.parametrize(
"sentence",
[
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
],
)
def test_issue3869(sentence):
"""Test that the Doc's count_by function works consistently"""
nlp = English()
doc = nlp(sentence)
count = 0
for token in doc:
count += token.is_alpha
assert count == doc.count_by(IS_ALPHA).get(1, 0)

View File

@ -1,11 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue3879(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
assert len(doc) == 5
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
matcher = Matcher(en_vocab)
matcher.add("TEST", [pattern])
assert len(matcher(doc)) == 2 # matches twice because of a false-positive match on 'is a test'

View File

@ -1,21 +0,0 @@
from spacy.lang.en import English
import pytest
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
"""Test that `nlp.pipe()` works when an empty string ends the batch.
Fixed in v7.0.5 of Thinc.
"""
texts = ["hello", "world", "", ""]
nlp = English()
nlp.add_pipe(nlp.create_pipe("parser"))
nlp.add_pipe(nlp.create_pipe("ner"))
nlp.add_pipe(nlp.create_pipe("tagger"))
nlp.get_pipe("parser").add_label("dep")
nlp.get_pipe("ner").add_label("PERSON")
nlp.get_pipe("tagger").add_label("NN")
nlp.begin_training()
for doc in nlp.pipe(texts):
pass

View File

@ -1,12 +0,0 @@
from spacy.displacy import parse_deps
from spacy.tokens import Doc
def test_issue3882(en_vocab):
"""Test that displaCy doesn't serialize the doc.user_data when making a
copy of the Doc.
"""
doc = Doc(en_vocab, words=["Hello", "world"])
doc.is_parsed = True
doc.user_data["test"] = set()
parse_deps(doc)

View File

@ -1,17 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue3951(en_vocab):
"""Test that combinations of optional rules are matched correctly."""
matcher = Matcher(en_vocab)
pattern = [
{"LOWER": "hello"},
{"LOWER": "this", "OP": "?"},
{"OP": "?"},
{"LOWER": "world"},
]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
matches = matcher(doc)
assert len(matches) == 0

View File

@ -1,26 +0,0 @@
from spacy.lang.en import English
from ..util import make_tempdir
def test_issue3959():
""" Ensure that a modified pos attribute is serialized correctly."""
nlp = English()
doc = nlp(
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
)
assert doc[0].pos_ == ""
doc[0].pos_ = "NOUN"
assert doc[0].pos_ == "NOUN"
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
with make_tempdir() as tmp_dir:
file_path = tmp_dir / "my_doc"
doc.to_disk(file_path)
doc2 = nlp("")
doc2.from_disk(file_path)
assert doc2[0].pos_ == "NOUN"

View File

@ -1,117 +0,0 @@
import pytest
from ..util import get_doc
@pytest.fixture
def doc(en_tokenizer):
text = "He jests at scars, that never felt a wound."
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
deps = [
"nsubj",
"ccomp",
"prep",
"pobj",
"punct",
"nsubj",
"neg",
"ROOT",
"det",
"dobj",
"punct",
]
tokens = en_tokenizer(text)
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
def test_issue3962(doc):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
span2 = doc[1:5] # "jests at scars ,"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
assert (
doc2[0].head.text == "jests"
) # head set to itself, being the new artificial root
assert doc2[0].dep_ == "dep"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests" # head set to the new artificial root
assert doc2[3].dep_ == "dep"
# We should still have 1 sentence
assert len(list(doc2.sents)) == 1
span3 = doc[6:9] # "never felt a"
doc3 = span3.as_doc()
doc3_json = doc3.to_json()
assert doc3_json
assert doc3[0].head.text == "felt"
assert doc3[0].dep_ == "neg"
assert doc3[1].head.text == "felt"
assert doc3[1].dep_ == "ROOT"
assert doc3[2].head.text == "felt" # head set to ancestor
assert doc3[2].dep_ == "dep"
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
assert len(list(doc3.sents)) == 1
@pytest.fixture
def two_sent_doc(en_tokenizer):
text = "He jests at scars. They never felt a wound."
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
deps = [
"nsubj",
"ROOT",
"prep",
"pobj",
"punct",
"nsubj",
"neg",
"ROOT",
"det",
"dobj",
"punct",
]
tokens = en_tokenizer(text)
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
def test_issue3962_long(two_sent_doc):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
span2 = two_sent_doc[1:7] # "jests at scars. They never"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
assert (
doc2[0].head.text == "jests"
) # head set to itself, being the new artificial root (in sentence 1)
assert doc2[0].dep_ == "ROOT"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests"
assert doc2[3].dep_ == "punct"
assert (
doc2[4].head.text == "They"
) # head set to itself, being the new artificial root (in sentence 2)
assert doc2[4].dep_ == "dep"
assert (
doc2[4].head.text == "They"
) # head set to the new artificial head (in sentence 2)
assert doc2[4].dep_ == "dep"
# We should still have 2 sentences
sents = list(doc2.sents)
assert len(sents) == 2
assert sents[0].text == "jests at scars ."
assert sents[1].text == "They never"

View File

@ -1,19 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
"""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
matches = matcher(doc)
assert len(matches) == 2
# We should have a match for each of the two rules
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
assert "A" in found_ids
assert "B" in found_ids

View File

@ -0,0 +1,469 @@
import pytest
from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe
from spacy.pipeline.defaults import default_ner
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example, Corpus
from spacy.gold.converters import json2docs
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from spacy.tokenizer import Tokenizer
from spacy.lang.el import Greek
from spacy.language import Language
import spacy
from thinc.api import compounding
from collections import defaultdict
from ..util import make_tempdir
def test_issue4002(en_vocab):
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
"""
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern1 = Doc(en_vocab, words=["c", "d"])
assert [t.norm_ for t in pattern1] == ["c", "d"]
matcher.add("TEST", [pattern1])
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
matches = matcher(doc)
assert len(matches) == 1
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern2 = Doc(en_vocab, words=["1", "2"])
pattern2[0].norm_ = "c"
pattern2[1].norm_ = "d"
assert [t.norm_ for t in pattern2] == ["c", "d"]
matcher.add("TEST", [pattern2])
matches = matcher(doc)
assert len(matches) == 1
def test_issue4030():
""" Test whether textcat works fine with empty doc """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
# processing of an empty doc should result in 0.0 for all categories
doc = nlp("")
assert doc.cats["offensive"] == 0.0
assert doc.cats["inoffensive"] == 0.0
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042():
"""Test that serialization of an EntityRuler before NER works fine."""
nlp = English()
# add ner pipe
ner = nlp.create_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.add_pipe(ner)
nlp.begin_training()
# Add entity ruler
ruler = EntityRuler(nlp)
patterns = [
{"label": "MY_ORG", "pattern": "Apple"},
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler, before="ner") # works fine with "after"
doc1 = nlp("What do you think about Apple ?")
assert doc1.ents[0].label_ == "MY_ORG"
with make_tempdir() as d:
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
nlp2 = load_model(output_dir)
doc2 = nlp2("What do you think about Apple ?")
assert doc2.ents[0].label_ == "MY_ORG"
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042_bug2():
"""
Test that serialization of an NER works fine when new labels were added.
This is the second bug of two bugs underlying the issue 4042.
"""
nlp1 = English()
vocab = nlp1.vocab
# add ner pipe
ner1 = nlp1.create_pipe("ner")
ner1.add_label("SOME_LABEL")
nlp1.add_pipe(ner1)
nlp1.begin_training()
# add a new label to the doc
doc1 = nlp1("What do you think about Apple ?")
assert len(ner1.labels) == 1
assert "SOME_LABEL" in ner1.labels
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
doc1.ents = list(doc1.ents) + [apple_ent]
# reapply the NER - at this point it should resize itself
ner1(doc1)
assert len(ner1.labels) == 2
assert "SOME_LABEL" in ner1.labels
assert "MY_ORG" in ner1.labels
with make_tempdir() as d:
# assert IO goes fine
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(vocab, default_ner(), **config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2
def test_issue4054(en_vocab):
"""Test that a new blank model can be made with a vocab from file,
and that serialization does not drop the language at any point."""
nlp1 = English()
vocab1 = nlp1.vocab
with make_tempdir() as d:
vocab_dir = ensure_path(d / "vocab")
if not vocab_dir.exists():
vocab_dir.mkdir()
vocab1.to_disk(vocab_dir)
vocab2 = Vocab().from_disk(vocab_dir)
print("lang", vocab2.lang)
nlp2 = spacy.blank("en", vocab=vocab2)
nlp_dir = ensure_path(d / "nlp")
if not nlp_dir.exists():
nlp_dir.mkdir()
nlp2.to_disk(nlp_dir)
nlp3 = load_model(nlp_dir)
assert nlp3.lang == "en"
def test_issue4120(en_vocab):
"""Test that matches without a final {OP: ?} token are returned."""
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
doc1 = Doc(en_vocab, words=["a"])
assert len(matcher(doc1)) == 1 # works
doc2 = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc2)) == 2 # fixed
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc3)) == 2 # works
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc4)) == 3 # fixed
def test_issue4133(en_vocab):
nlp = English()
vocab_bytes = nlp.vocab.to_bytes()
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
doc = Doc(en_vocab, words=words)
for i, token in enumerate(doc):
token.pos_ = pos[i]
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
doc_bytes = doc.to_bytes()
vocab = Vocab()
vocab = vocab.from_bytes(vocab_bytes)
doc = Doc(vocab).from_bytes(doc_bytes)
actual = []
for token in doc:
actual.append(token.pos_)
assert actual == pos
def test_issue4190():
def customize_tokenizer(nlp):
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = compile_infix_regex(nlp.Defaults.infixes)
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
exceptions = {
k: v
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
if not (len(k) == 2 and k[1] == ".")
}
new_tokenizer = Tokenizer(
nlp.vocab,
exceptions,
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=nlp.tokenizer.token_match,
)
nlp.tokenizer = new_tokenizer
test_string = "Test c."
# Load default language
nlp_1 = English()
doc_1a = nlp_1(test_string)
result_1a = [token.text for token in doc_1a] # noqa: F841
# Modify tokenizer
customize_tokenizer(nlp_1)
doc_1b = nlp_1(test_string)
result_1b = [token.text for token in doc_1b]
# Save and Reload
with make_tempdir() as model_dir:
nlp_1.to_disk(model_dir)
nlp_2 = load_model(model_dir)
# This should be the modified tokenizer
doc_2 = nlp_2(test_string)
result_2 = [token.text for token in doc_2]
assert result_1b == result_2
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4267():
""" Test that running an entity_ruler after ner gives consistent results"""
nlp = English()
ner = nlp.create_pipe("ner")
ner.add_label("PEOPLE")
nlp.add_pipe(ner)
nlp.begin_training()
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")
assert doc1.is_nered
for token in doc1:
assert token.ent_iob == 2
# add entity ruler and run again
ruler = EntityRuler(nlp)
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
assert "entity_ruler" in nlp.pipe_names
assert "ner" in nlp.pipe_names
# assert that we still have correct IOB annotations
doc2 = nlp("hi")
assert doc2.is_nered
for token in doc2:
assert token.ent_iob == 2
def test_issue4272():
"""Test that lookup table can be accessed from Token.lemma if no POS tags
are available."""
nlp = Greek()
doc = nlp("Χθες")
assert doc[0].lemma_
def test_multiple_predictions():
class DummyPipe(Pipe):
def __init__(self):
self.model = "dummy_model"
def predict(self, docs):
return ([1, 2, 3], [4, 5, 6])
def set_annotations(self, docs, scores):
return docs
nlp = Language()
doc = nlp.make_doc("foo")
dummy_pipe = DummyPipe()
dummy_pipe(doc)
@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
def test_issue4313():
""" This should not crash or exit with some strange error code """
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
ner.add_label("SOME_LABEL")
ner.begin_training([])
nlp.add_pipe(ner)
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1
assert "SOME_LABEL" in ner.labels
apple_ent = Span(doc, 5, 6, label="MY_ORG")
doc.ents = list(doc.ents) + [apple_ent]
# ensure the beam_parse still works with the new label
docs = [doc]
beams = nlp.entity.beam_parse(
docs, beam_width=beam_width, beam_density=beam_density
)
for doc, beam in zip(docs, beams):
entity_scores = defaultdict(float)
for score, ents in nlp.entity.moves.get_beam_parses(beam):
for start, end, label in ents:
entity_scores[(start, end, label)] += score
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
"""Test that training the tagger with empty data, doesn't throw errors"""
nlp = English()
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
TRAIN_DATA = [example, example]
tagger = nlp.create_pipe("tagger")
nlp.add_pipe(tagger)
optimizer = nlp.begin_training()
for i in range(5):
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)
def test_issue4367():
"""Test that docbin init goes well"""
DocBin()
DocBin(attrs=["LEMMA"])
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
def test_issue4373():
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
matcher = Matcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
matcher = PhraseMatcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
def test_issue4402():
json_data = {
"id": 0,
"paragraphs": [
{
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "How", "ner": "O"},
{"id": 1, "orth": "should", "ner": "O"},
{"id": 2, "orth": "I", "ner": "O"},
{"id": 3, "orth": "cook", "ner": "O"},
{"id": 4, "orth": "bacon", "ner": "O"},
{"id": 5, "orth": "in", "ner": "O"},
{"id": 6, "orth": "an", "ner": "O"},
{"id": 7, "orth": "oven", "ner": "O"},
{"id": 8, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{
"tokens": [
{"id": 9, "orth": "\n", "ner": "O"},
{"id": 10, "orth": "I", "ner": "O"},
{"id": 11, "orth": "'ve", "ner": "O"},
{"id": 12, "orth": "heard", "ner": "O"},
{"id": 13, "orth": "of", "ner": "O"},
{"id": 14, "orth": "people", "ner": "O"},
{"id": 15, "orth": "cooking", "ner": "O"},
{"id": 16, "orth": "bacon", "ner": "O"},
{"id": 17, "orth": "in", "ner": "O"},
{"id": 18, "orth": "an", "ner": "O"},
{"id": 19, "orth": "oven", "ner": "O"},
{"id": 20, "orth": ".", "ner": "O"},
],
"brackets": [],
},
],
"cats": [
{"label": "baking", "value": 1.0},
{"label": "not_baking", "value": 0.0},
],
},
{
"raw": "What is the difference between white and brown eggs?\n",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "What", "ner": "O"},
{"id": 1, "orth": "is", "ner": "O"},
{"id": 2, "orth": "the", "ner": "O"},
{"id": 3, "orth": "difference", "ner": "O"},
{"id": 4, "orth": "between", "ner": "O"},
{"id": 5, "orth": "white", "ner": "O"},
{"id": 6, "orth": "and", "ner": "O"},
{"id": 7, "orth": "brown", "ner": "O"},
{"id": 8, "orth": "eggs", "ner": "O"},
{"id": 9, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
],
"cats": [
{"label": "baking", "value": 0.0},
{"label": "not_baking", "value": 1.0},
],
},
],
}
nlp = English()
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
with make_tempdir() as tmpdir:
output_file = tmpdir / "test4402.spacy"
docs = json2docs([json_data])
data = DocBin(docs=docs, attrs=attrs).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
train_data = list(corpus.train_dataset(nlp))
assert len(train_data) == 2
split_train_data = []
for eg in train_data:
split_train_data.extend(eg.split_sents())
assert len(split_train_data) == 4
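
test_issue4402 doubles as a compact tour of the nightly training-data pipeline: v2-style JSON is converted to Doc objects, packed into a DocBin, written out as a .spacy file, and streamed back through Corpus. A trimmed sketch of that path (same APIs as the test; the JSON document here is a hypothetical minimal example):

from spacy.gold import Corpus
from spacy.gold.converters import json2docs
from spacy.lang.en import English
from spacy.tokens import DocBin

json_data = {  # hypothetical two-token document in the v2 JSON format
    "id": 0,
    "paragraphs": [{
        "raw": "Hi there",
        "sentences": [{
            "tokens": [
                {"id": 0, "orth": "Hi", "ner": "O"},
                {"id": 1, "orth": "there", "ner": "O"},
            ],
            "brackets": [],
        }],
        "cats": [],
    }],
}
docs = json2docs([json_data])                     # JSON -> Doc objects
data = DocBin(docs=docs, attrs=["ORTH", "SENT_START"]).to_bytes()
with open("train.spacy", "wb") as f_:             # DocBin bytes == .spacy file
    f_.write(data)
corpus = Corpus(train_loc="train.spacy", dev_loc="train.spacy")
examples = list(corpus.train_dataset(English()))  # Example stream for training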

View File

@ -1,23 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
def test_issue4002(en_vocab):
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
"""
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern1 = Doc(en_vocab, words=["c", "d"])
assert [t.norm_ for t in pattern1] == ["c", "d"]
matcher.add("TEST", [pattern1])
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
matches = matcher(doc)
assert len(matches) == 1
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern2 = Doc(en_vocab, words=["1", "2"])
pattern2[0].norm_ = "c"
pattern2[1].norm_ = "d"
assert [t.norm_ for t in pattern2] == ["c", "d"]
matcher.add("TEST", [pattern2])
matches = matcher(doc)
assert len(matches) == 1

View File

@ -1,50 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example
def test_issue4030():
""" Test whether textcat works fine with empty doc """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
# processing of an empty doc should result in 0.0 for all categories
doc = nlp("")
assert doc.cats["offensive"] == 0.0
assert doc.cats["inoffensive"] == 0.0

View File

@ -1,85 +0,0 @@
import spacy
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.util import ensure_path
from spacy.pipeline.defaults import default_ner
from ..util import make_tempdir
def test_issue4042():
"""Test that serialization of an EntityRuler before NER works fine."""
nlp = English()
# add ner pipe
ner = nlp.create_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.add_pipe(ner)
nlp.begin_training()
# Add entity ruler
ruler = EntityRuler(nlp)
patterns = [
{"label": "MY_ORG", "pattern": "Apple"},
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler, before="ner") # works fine with "after"
doc1 = nlp("What do you think about Apple ?")
assert doc1.ents[0].label_ == "MY_ORG"
with make_tempdir() as d:
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
nlp2 = spacy.load(output_dir)
doc2 = nlp2("What do you think about Apple ?")
assert doc2.ents[0].label_ == "MY_ORG"
def test_issue4042_bug2():
"""
Test that serialization of an NER works fine when new labels were added.
This is the second bug of two bugs underlying the issue 4042.
"""
nlp1 = English()
vocab = nlp1.vocab
# add ner pipe
ner1 = nlp1.create_pipe("ner")
ner1.add_label("SOME_LABEL")
nlp1.add_pipe(ner1)
nlp1.begin_training()
# add a new label to the doc
doc1 = nlp1("What do you think about Apple ?")
assert len(ner1.labels) == 1
assert "SOME_LABEL" in ner1.labels
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
doc1.ents = list(doc1.ents) + [apple_ent]
# reapply the NER - at this point it should resize itself
ner1(doc1)
assert len(ner1.labels) == 2
assert "SOME_LABEL" in ner1.labels
assert "MY_ORG" in ner1.labels
with make_tempdir() as d:
# assert IO goes fine
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(vocab, default_ner(), **config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2

View File

@ -1,30 +0,0 @@
from spacy.vocab import Vocab
import spacy
from spacy.lang.en import English
from spacy.util import ensure_path
from ..util import make_tempdir
def test_issue4054(en_vocab):
"""Test that a new blank model can be made with a vocab from file,
and that serialization does not drop the language at any point."""
nlp1 = English()
vocab1 = nlp1.vocab
with make_tempdir() as d:
vocab_dir = ensure_path(d / "vocab")
if not vocab_dir.exists():
vocab_dir.mkdir()
vocab1.to_disk(vocab_dir)
vocab2 = Vocab().from_disk(vocab_dir)
print("lang", vocab2.lang)
nlp2 = spacy.blank("en", vocab=vocab2)
nlp_dir = ensure_path(d / "nlp")
if not nlp_dir.exists():
nlp_dir.mkdir()
nlp2.to_disk(nlp_dir)
nlp3 = spacy.load(nlp_dir)
assert nlp3.lang == "en"

View File

@ -1,23 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue4120(en_vocab):
"""Test that matches without a final {OP: ?} token are returned."""
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
doc1 = Doc(en_vocab, words=["a"])
assert len(matcher(doc1)) == 1 # works
doc2 = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc2)) == 2 # fixed
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc3)) == 2 # works
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc4)) == 3 # fixed

View File

@ -1,28 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.vocab import Vocab
def test_issue4133(en_vocab):
nlp = English()
vocab_bytes = nlp.vocab.to_bytes()
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
doc = Doc(en_vocab, words=words)
for i, token in enumerate(doc):
token.pos_ = pos[i]
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
doc_bytes = doc.to_bytes()
vocab = Vocab()
vocab = vocab.from_bytes(vocab_bytes)
doc = Doc(vocab).from_bytes(doc_bytes)
actual = []
for token in doc:
actual.append(token.pos_)
assert actual == pos

View File

@ -1,46 +0,0 @@
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy import util
from ..util import make_tempdir
def test_issue4190():
test_string = "Test c."
# Load default language
nlp_1 = English()
doc_1a = nlp_1(test_string)
result_1a = [token.text for token in doc_1a] # noqa: F841
# Modify tokenizer
customize_tokenizer(nlp_1)
doc_1b = nlp_1(test_string)
result_1b = [token.text for token in doc_1b]
# Save and Reload
with make_tempdir() as model_dir:
nlp_1.to_disk(model_dir)
nlp_2 = util.load_model(model_dir)
# This should be the modified tokenizer
doc_2 = nlp_2(test_string)
result_2 = [token.text for token in doc_2]
assert result_1b == result_2
def customize_tokenizer(nlp):
prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
exceptions = {
k: v
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
if not (len(k) == 2 and k[1] == ".")
}
new_tokenizer = Tokenizer(
nlp.vocab,
exceptions,
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=nlp.tokenizer.token_match,
)
nlp.tokenizer = new_tokenizer

View File

@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
def test_issue4267():
""" Test that running an entity_ruler after ner gives consistent results"""
nlp = English()
ner = nlp.create_pipe("ner")
ner.add_label("PEOPLE")
nlp.add_pipe(ner)
nlp.begin_training()
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")
assert doc1.is_nered
for token in doc1:
assert token.ent_iob == 2
# add entity ruler and run again
ruler = EntityRuler(nlp)
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
assert "entity_ruler" in nlp.pipe_names
assert "ner" in nlp.pipe_names
# assert that we still have correct IOB annotations
doc2 = nlp("hi")
assert doc2.is_nered
for token in doc2:
assert token.ent_iob == 2

View File

@ -1,9 +0,0 @@
from spacy.lang.el import Greek
def test_issue4272():
"""Test that lookup table can be accessed from Token.lemma if no POS tags
are available."""
nlp = Greek()
doc = nlp("Χθες")
assert doc[0].lemma_

View File

@ -1,25 +0,0 @@
import pytest
from spacy.language import Language
from spacy.pipeline import Pipe
class DummyPipe(Pipe):
def __init__(self):
self.model = "dummy_model"
def predict(self, docs):
return ([1, 2, 3], [4, 5, 6])
def set_annotations(self, docs, scores, tensors=None):
return docs
@pytest.fixture
def nlp():
return Language()
def test_multiple_predictions(nlp):
doc = nlp.make_doc("foo")
dummy_pipe = DummyPipe()
dummy_pipe(doc)

View File

@ -1,47 +0,0 @@
from collections import defaultdict
import pytest
from spacy.pipeline.defaults import default_ner
from spacy.pipeline import EntityRecognizer
from spacy.lang.en import English
from spacy.tokens import Span
# skipped after removing Beam stuff during the Example/GoldParse refactor
@pytest.mark.skip
def test_issue4313():
""" This should not crash or exit with some strange error code """
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
ner.add_label("SOME_LABEL")
ner.begin_training([])
nlp.add_pipe(ner)
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1
assert "SOME_LABEL" in ner.labels
apple_ent = Span(doc, 5, 6, label="MY_ORG")
doc.ents = list(doc.ents) + [apple_ent]
# ensure the beam_parse still works with the new label
docs = [doc]
beams = nlp.entity.beam_parse(
docs, beam_width=beam_width, beam_density=beam_density
)
for doc, beam in zip(docs, beams):
entity_scores = defaultdict(float)
for score, ents in nlp.entity.moves.get_beam_parses(beam):
for start, end, label in ents:
entity_scores[(start, end, label)] += score

View File

@ -1,24 +0,0 @@
from spacy.gold import Example
from spacy.lang.en import English
from spacy.util import minibatch
from thinc.api import compounding
import pytest
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
"""Test that training the tagger with empty data, doesn't throw errors"""
nlp = English()
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
TRAIN_DATA = [example, example]
tagger = nlp.create_pipe("tagger")
nlp.add_pipe(tagger)
optimizer = nlp.begin_training()
for i in range(5):
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)

View File

@ -1,8 +0,0 @@
from spacy.tokens import DocBin
def test_issue4367():
"""Test that docbin init goes well"""
DocBin()
DocBin(attrs=["LEMMA"])
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])

View File

@ -1,10 +0,0 @@
from spacy.matcher import Matcher, PhraseMatcher
from spacy.vocab import Vocab
def test_issue4373():
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
matcher = Matcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
matcher = PhraseMatcher(Vocab())
assert isinstance(matcher.vocab, Vocab)

View File

@ -1,98 +0,0 @@
from spacy.gold import Corpus
from spacy.lang.en import English
from ..util import make_tempdir
from ...gold.converters import json2docs
from ...tokens import DocBin
def test_issue4402():
nlp = English()
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
with make_tempdir() as tmpdir:
output_file = tmpdir / "test4402.spacy"
docs = json2docs([json_data])
data = DocBin(docs=docs, attrs=attrs).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
train_data = list(corpus.train_dataset(nlp))
assert len(train_data) == 2
split_train_data = []
for eg in train_data:
split_train_data.extend(eg.split_sents())
assert len(split_train_data) == 4
json_data = {
"id": 0,
"paragraphs": [
{
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "How", "ner": "O"},
{"id": 1, "orth": "should", "ner": "O"},
{"id": 2, "orth": "I", "ner": "O"},
{"id": 3, "orth": "cook", "ner": "O"},
{"id": 4, "orth": "bacon", "ner": "O"},
{"id": 5, "orth": "in", "ner": "O"},
{"id": 6, "orth": "an", "ner": "O"},
{"id": 7, "orth": "oven", "ner": "O"},
{"id": 8, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{
"tokens": [
{"id": 9, "orth": "\n", "ner": "O"},
{"id": 10, "orth": "I", "ner": "O"},
{"id": 11, "orth": "'ve", "ner": "O"},
{"id": 12, "orth": "heard", "ner": "O"},
{"id": 13, "orth": "of", "ner": "O"},
{"id": 14, "orth": "people", "ner": "O"},
{"id": 15, "orth": "cooking", "ner": "O"},
{"id": 16, "orth": "bacon", "ner": "O"},
{"id": 17, "orth": "in", "ner": "O"},
{"id": 18, "orth": "an", "ner": "O"},
{"id": 19, "orth": "oven", "ner": "O"},
{"id": 20, "orth": ".", "ner": "O"},
],
"brackets": [],
},
],
"cats": [
{"label": "baking", "value": 1.0},
{"label": "not_baking", "value": 0.0},
],
},
{
"raw": "What is the difference between white and brown eggs?\n",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "What", "ner": "O"},
{"id": 1, "orth": "is", "ner": "O"},
{"id": 2, "orth": "the", "ner": "O"},
{"id": 3, "orth": "difference", "ner": "O"},
{"id": 4, "orth": "between", "ner": "O"},
{"id": 5, "orth": "white", "ner": "O"},
{"id": 6, "orth": "and", "ner": "O"},
{"id": 7, "orth": "brown", "ner": "O"},
{"id": 8, "orth": "eggs", "ner": "O"},
{"id": 9, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
],
"cats": [
{"label": "baking", "value": 0.0},
{"label": "not_baking", "value": 1.0},
],
},
],
}

View File

@ -0,0 +1,288 @@
import pytest
from mock import Mock
from spacy.pipeline import EntityRuler
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example
from spacy.gold.converters.conllu2docs import conllu2docs
from spacy.lang.en import English
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.util import ensure_path, load_model_from_path
import numpy
import pickle
from ..util import get_doc, make_tempdir
def test_issue4528(en_vocab):
"""Test that user_data is correctly serialized in DocBin."""
doc = Doc(en_vocab, words=["hello", "world"])
doc.user_data["foo"] = "bar"
# This is how extension attribute values are stored in the user data
doc.user_data[("._.", "foo", None, None)] = "bar"
doc_bin = DocBin(store_user_data=True)
doc_bin.add(doc)
doc_bin_bytes = doc_bin.to_bytes()
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
assert new_doc.user_data["foo"] == "bar"
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
@pytest.mark.parametrize(
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
doc = en_tokenizer(text)
Example.from_dict(doc, {"words": words})
def test_issue4590(en_vocab):
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
pattern = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
]
on_match = Mock()
matcher = DependencyMatcher(en_vocab)
matcher.add("pattern", on_match, pattern)
text = "The quick brown fox jumped over the lazy fox"
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
matches = matcher(doc)
on_match_args = on_match.call_args
assert on_match_args[0][3] == matches
def test_issue4651_with_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded
def test_issue4651_without_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
not specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp)
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded
def test_issue4665():
"""
conllu2docs should not raise an exception if the HEAD column contains an
underscore
"""
input_data = """
1 [ _ PUNCT -LRB- _ _ punct _ _
2 This _ DET DT _ _ det _ _
3 killing _ NOUN NN _ _ nsubj _ _
4 of _ ADP IN _ _ case _ _
5 a _ DET DT _ _ det _ _
6 respected _ ADJ JJ _ _ amod _ _
7 cleric _ NOUN NN _ _ nmod _ _
8 will _ AUX MD _ _ aux _ _
9 be _ AUX VB _ _ aux _ _
10 causing _ VERB VBG _ _ root _ _
11 us _ PRON PRP _ _ iobj _ _
12 trouble _ NOUN NN _ _ dobj _ _
13 for _ ADP IN _ _ case _ _
14 years _ NOUN NNS _ _ nmod _ _
15 to _ PART TO _ _ mark _ _
16 come _ VERB VB _ _ acl _ _
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
conllu2docs(input_data)
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
with pytest.warns(UserWarning):
kb.set_entities(
entity_list=["Q1", "Q1"],
freq_list=[32, 111],
vector_list=[vector1, vector2],
)
assert kb.get_size_entities() == 1
# dumping to file & loading back in
with make_tempdir() as d:
dir_path = ensure_path(d)
if not dir_path.exists():
dir_path.mkdir()
file_path = dir_path / "kb"
kb.dump(str(file_path))
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb2.load_bulk(str(file_path))
assert kb2.get_size_entities() == 1
def test_issue4707():
"""Tests that disabled component names are also excluded from nlp.from_disk
by default when loading a model.
"""
nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
exclude = ["tokenizer", "sentencizer"]
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir, exclude=exclude)
new_nlp = load_model_from_path(tmpdir, disable=exclude)
assert "sentencizer" not in new_nlp.pipe_names
assert "entity_ruler" in new_nlp.pipe_names
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_1():
""" Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector")
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
with make_tempdir() as tmp_path:
with (tmp_path / "ner.pkl").open("wb") as file_:
pickle.dump(ner, file_)
assert ner.cfg["min_action_freq"] == 342
with (tmp_path / "ner.pkl").open("rb") as file_:
ner2 = pickle.load(file_)
assert ner2.cfg["min_action_freq"] == 342
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_2():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
vocab = Vocab(vectors_name="test_vocab_add_vector")
data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0
data[1] = 2.0
vocab.set_vector("cat", data[0])
vocab.set_vector("dog", data[1])
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
nlp.begin_training()
docs = ["Kurt is in London."] * 10
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
pass
def test_issue4849():
nlp = English()
ruler = EntityRuler(
nlp,
patterns=[
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
],
phrase_matcher_attr="LOWER",
)
nlp.add_pipe(ruler)
text = """
The left is starting to take aim at Democratic front-runner Joe Biden.
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
"""
# USING 1 PROCESS
count_ents = 0
for doc in nlp.pipe([text], n_process=1):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2
# USING 2 PROCESSES
count_ents = 0
for doc in nlp.pipe([text], n_process=2):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2
class CustomPipe:
name = "my_pipe"
def __init__(self):
Span.set_extension("my_ext", getter=self._get_my_ext)
Doc.set_extension("my_ext", default=None)
def __call__(self, doc):
gathered_ext = []
for sent in doc.sents:
sent_ext = self._get_my_ext(sent)
sent._.set("my_ext", sent_ext)
gathered_ext.append(sent_ext)
doc._.set("my_ext", "\n".join(gathered_ext))
return doc
@staticmethod
def _get_my_ext(span):
return str(span.end)
def test_issue4903():
"""Ensure that this runs correctly and doesn't hang or crash on Windows /
macOS."""
nlp = English()
custom_component = CustomPipe()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(custom_component, after="sentencizer")
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
docs = list(nlp.pipe(text, n_process=2))
assert docs[0].text == "I like bananas."
assert docs[1].text == "Do you like them?"
assert docs[2].text == "No, I prefer wasabi."
def test_issue4924():
nlp = Language()
example = Example.from_dict(nlp.make_doc(""), {})
nlp.evaluate([example])

View File

@ -1,16 +0,0 @@
from spacy.tokens import Doc, DocBin
def test_issue4528(en_vocab):
"""Test that user_data is correctly serialized in DocBin."""
doc = Doc(en_vocab, words=["hello", "world"])
doc.user_data["foo"] = "bar"
# This is how extension attribute values are stored in the user data
doc.user_data[("._.", "foo", None, None)] = "bar"
doc_bin = DocBin(store_user_data=True)
doc_bin.add(doc)
doc_bin_bytes = doc_bin.to_bytes()
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
assert new_doc.user_data["foo"] == "bar"
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"

View File

@ -1,11 +0,0 @@
import pytest
from spacy.gold import Example
@pytest.mark.parametrize(
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
doc = en_tokenizer(text)
Example.from_dict(doc, {"words": words})

View File

@ -1,35 +0,0 @@
from mock import Mock
from spacy.matcher import DependencyMatcher
from ..util import get_doc
def test_issue4590(en_vocab):
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
pattern = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
]
on_match = Mock()
matcher = DependencyMatcher(en_vocab)
matcher.add("pattern", on_match, pattern)
text = "The quick brown fox jumped over the lazy fox"
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
matches = matcher(doc)
on_match_args = on_match.call_args
assert on_match_args[0][3] == matches

View File

@ -1,62 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from ..util import make_tempdir
def test_issue4651_with_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded
def test_issue4651_without_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
not specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp)
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded

View File

@ -1,35 +0,0 @@
import pytest
# TODO
# from spacy.gold.converters.conllu2docs import conllu2docs
input_data = """
1 [ _ PUNCT -LRB- _ _ punct _ _
2 This _ DET DT _ _ det _ _
3 killing _ NOUN NN _ _ nsubj _ _
4 of _ ADP IN _ _ case _ _
5 a _ DET DT _ _ det _ _
6 respected _ ADJ JJ _ _ amod _ _
7 cleric _ NOUN NN _ _ nmod _ _
8 will _ AUX MD _ _ aux _ _
9 be _ AUX VB _ _ aux _ _
10 causing _ VERB VBG _ _ root _ _
11 us _ PRON PRP _ _ iobj _ _
12 trouble _ NOUN NN _ _ dobj _ _
13 for _ ADP IN _ _ case _ _
14 years _ NOUN NNS _ _ nmod _ _
15 to _ PART TO _ _ mark _ _
16 come _ VERB VB _ _ acl _ _
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
@pytest.mark.xfail
def test_issue4665():
"""
conllu2json should not raise an exception if the HEAD column contains an
underscore
"""
pass
# conllu2json(input_data)

View File

@ -1,36 +0,0 @@
import pytest
from spacy.kb import KnowledgeBase
from spacy.util import ensure_path
from spacy.lang.en import English
from ..util import make_tempdir
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
with pytest.warns(UserWarning):
kb.set_entities(
entity_list=["Q1", "Q1"],
freq_list=[32, 111],
vector_list=[vector1, vector2],
)
assert kb.get_size_entities() == 1
# dumping to file & loading back in
with make_tempdir() as d:
dir_path = ensure_path(d)
if not dir_path.exists():
dir_path.mkdir()
file_path = dir_path / "kb"
kb.dump(str(file_path))
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb2.load_bulk(str(file_path))
assert kb2.get_size_entities() == 1

View File

@ -1,20 +0,0 @@
from spacy.util import load_model_from_path
from spacy.lang.en import English
from ..util import make_tempdir
def test_issue4707():
"""Tests that disabled component names are also excluded from nlp.from_disk
by default when loading a model.
"""
nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
exclude = ["tokenizer", "sentencizer"]
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir, exclude=exclude)
new_nlp = load_model_from_path(tmpdir, disable=exclude)
assert "sentencizer" not in new_nlp.pipe_names
assert "entity_ruler" in new_nlp.pipe_names

View File

@ -1,41 +0,0 @@
import pickle
import numpy
from spacy.lang.en import English
from spacy.vocab import Vocab
from spacy.tests.util import make_tempdir
def test_pickle_ner():
""" Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector")
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
with make_tempdir() as tmp_path:
with (tmp_path / "ner.pkl").open("wb") as file_:
pickle.dump(ner, file_)
assert ner.cfg["min_action_freq"] == 342
with (tmp_path / "ner.pkl").open("rb") as file_:
ner2 = pickle.load(file_)
assert ner2.cfg["min_action_freq"] == 342
def test_issue4725():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
vocab = Vocab(vectors_name="test_vocab_add_vector")
data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0
data[1] = 2.0
vocab.set_vector("cat", data[0])
vocab.set_vector("dog", data[1])
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
nlp.begin_training()
docs = ["Kurt is in London."] * 10
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
pass

View File

@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
def test_issue4849():
nlp = English()
ruler = EntityRuler(
nlp,
patterns=[
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
],
phrase_matcher_attr="LOWER",
)
nlp.add_pipe(ruler)
text = """
The left is starting to take aim at Democratic front-runner Joe Biden.
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
"""
# USING 1 PROCESS
count_ents = 0
for doc in nlp.pipe([text], n_process=1):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2
# USING 2 PROCESSES
count_ents = 0
for doc in nlp.pipe([text], n_process=2):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2

View File

@ -1,40 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Span, Doc
class CustomPipe:
name = "my_pipe"
def __init__(self):
Span.set_extension("my_ext", getter=self._get_my_ext)
Doc.set_extension("my_ext", default=None)
def __call__(self, doc):
gathered_ext = []
for sent in doc.sents:
sent_ext = self._get_my_ext(sent)
sent._.set("my_ext", sent_ext)
gathered_ext.append(sent_ext)
doc._.set("my_ext", "\n".join(gathered_ext))
return doc
@staticmethod
def _get_my_ext(span):
return str(span.end)
def test_issue4903():
# ensures that this runs correctly and doesn't hang or crash on Windows / macOS
nlp = English()
custom_component = CustomPipe()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(custom_component, after="sentencizer")
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
docs = list(nlp.pipe(text, n_process=2))
assert docs[0].text == "I like bananas."
assert docs[1].text == "Do you like them?"
assert docs[2].text == "No, I prefer wasabi."

View File

@ -1,8 +0,0 @@
from spacy.gold import Example
from spacy.language import Language
def test_issue4924():
nlp = Language()
example = Example.from_dict(nlp.make_doc(""), {})
nlp.evaluate([example])

View File

@ -1,6 +1,8 @@
import pytest
from spacy.lang.en import English
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue5152():
# Test that comparing a Span and a Token works correctly
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
@ -8,7 +10,6 @@ def test_issue5152():
text = nlp("Talk about being boring!")
text_var = nlp("Talk of being boring!")
y = nlp("Let")
span = text[0:3] # Talk about being
span_2 = text[0:3] # Talk about being
span_3 = text_var[0:3] # Talk of being

View File

@ -63,7 +63,8 @@ def tagger():
# need to add model for two reasons:
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
tagger.begin_training(pipeline=nlp.pipeline)
with pytest.warns(UserWarning):
tagger.begin_training(pipeline=nlp.pipeline)
return tagger

View File

@ -0,0 +1,31 @@
from spacy.lang.en import English
from spacy.util import fix_random_seed
def test_issue5551():
"""Test that after fixing the random seed, the results of the pipeline are truly identical"""
component = "textcat"
pipe_cfg = {"exclusive_classes": False}
results = []
for i in range(3):
fix_random_seed(0)
nlp = English()
example = (
"Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g.",
{"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}},
)
nlp.add_pipe(nlp.create_pipe(component, config=pipe_cfg), last=True)
pipe = nlp.get_pipe(component)
for label in set(example[1]["cats"]):
pipe.add_label(label)
nlp.begin_training(component_cfg={component: pipe_cfg})
# Store the result of each iteration
result = pipe.model.predict([nlp.make_doc(example[0])])
results.append(list(result[0]))
# All results should be the same because of the fixed seed
assert len(results) == 3
assert results[0] == results[1]
assert results[0] == results[2]

View File

@ -1,3 +1,4 @@
import numpy
from spacy.errors import AlignmentError
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, iob_to_biluo
@ -5,6 +6,7 @@ from spacy.gold import Corpus, docs_to_json
from spacy.gold.example import Example
from spacy.gold.converters import json2docs
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Doc, DocBin
from spacy.util import get_words_and_spaces, minibatch
from thinc.api import compounding
@ -153,6 +155,27 @@ def test_gold_biluo_misalign(en_vocab):
assert tags == ["O", "O", "O", "-", "-", "-"]
def test_example_constructor(en_vocab):
words = ["I", "like", "stuff"]
tags = ["NOUN", "VERB", "NOUN"]
tag_ids = [en_vocab.strings.add(tag) for tag in tags]
predicted = Doc(en_vocab, words=words)
reference = Doc(en_vocab, words=words)
reference = reference.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
example = Example(predicted, reference)
tags = example.get_aligned("TAG", as_string=True)
assert tags == ["NOUN", "VERB", "NOUN"]
def test_example_from_dict_tags(en_vocab):
words = ["I", "like", "stuff"]
tags = ["NOUN", "VERB", "NOUN"]
predicted = Doc(en_vocab, words=words)
example = Example.from_dict(predicted, {"TAGS": tags})
tags = example.get_aligned("TAG", as_string=True)
assert tags == ["NOUN", "VERB", "NOUN"]
def test_example_from_dict_no_ner(en_vocab):
words = ["a", "b", "c", "d"]
spaces = [True, True, False, True]
@ -272,72 +295,72 @@ def test_split_sentences(en_vocab):
def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
words = ["Mr. and ", "Mrs. Smith", "flew to", "San Francisco Valley", "."]
words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
spaces = [True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr. and Mrs. Smith flew to "
prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr. and Mrs. Smith", "flew", "to", "San", "Francisco", "Valley", "."]
gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "U-LOC", "O"]
entities = [
(len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]
entities = [
(len("Mr. and "), len("Mr. and Mrs."), "PERSON"), # "Mrs." is a Person
(len("Mr and "), len("Mr and Mrs"), "PERSON"), # "Mrs" is a Person
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
gold_words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", None, "O", "U-LOC", "O"]
def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
words = ["Mr. and", "Mrs.", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
spaces = [True, True, True, True, True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr. and Mrs. Smith flew to "
prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr. and Mrs. Smith", "flew to", "San Francisco Valley", "."]
gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
entities = [
(len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San Francisco Valley", "."]
gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
words = ["Mr. and Mrs.", "Smith", "flew", "to", "San Francisco", "Valley", "."]
words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."]
spaces = [True, True, True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr. and Mrs. Smith flew to "
prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr.", "and Mrs. Smith", "flew to", "San", "Francisco Valley", "."]
gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
entities = [
(len("Mr. and "), len("Mr. and Mrs. Smith"), "PERSON"), # "Mrs. Smith" is a PERSON
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
gold_words = ["Mr. and", "Mrs. Smith", "flew to", "San", "Francisco Valley", "."]
gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]
@ -407,6 +430,49 @@ def test_biluo_spans(en_tokenizer):
assert spans[1].label_ == "GPE"
def test_aligned_spans_y2x(en_vocab, en_tokenizer):
words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
spaces = [True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr and Mrs Smith flew to "
entities = [
(0, len("Mr and Mrs Smith"), "PERSON"),
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
ents_ref = example.reference.ents
assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)]
ents_y2x = example.get_aligned_spans_y2x(ents_ref)
assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)]
def test_aligned_spans_x2y(en_vocab, en_tokenizer):
text = "Mr and Mrs Smith flew to San Francisco Valley"
nlp = English()
ruler = EntityRuler(nlp)
patterns = [{"label": "PERSON", "pattern": "Mr and Mrs Smith"},
{"label": "LOC", "pattern": "San Francisco Valley"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)]
prefix = "Mr and Mrs Smith flew to "
entities = [
(0, len("Mr and Mrs Smith"), "PERSON"),
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"]
example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)]
# Ensure that 'get_aligned_spans_x2y' has the aligned entities correct
ents_pred = example.predicted.ents
assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)]
ents_x2y = example.get_aligned_spans_x2y(ents_pred)
assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)]
def test_gold_ner_missing_tags(en_tokenizer):
doc = en_tokenizer("I flew to Silicon Valley via London.")
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
@ -414,6 +480,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
def test_projectivize(en_tokenizer):
doc = en_tokenizer("He pretty quickly walks away")
heads = [3, 2, 3, 0, 2]
example = Example.from_dict(doc, {"heads": heads})
proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
assert proj_heads == [3, 2, 3, 0, 3]
assert nonproj_heads == [3, 2, 3, 0, 2]
def test_iob_to_biluo():
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]

spacy/tests/test_models.py Normal file
View File

@ -0,0 +1,156 @@
from typing import List
import pytest
from thinc.api import fix_random_seed, Adam, set_dropout_rate
from numpy.testing import assert_array_equal
import numpy
from spacy.ml.models import build_Tok2Vec_model
from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
from spacy.lang.en import English
from spacy.lang.en.examples import sentences as EN_SENTENCES
def get_all_params(model):
params = []
for node in model.walk():
for name in node.param_names:
params.append(node.get_param(name).ravel())
return node.ops.xp.concatenate(params)
def get_docs():
nlp = English()
return list(nlp.pipe(EN_SENTENCES + [" ".join(EN_SENTENCES)]))
def get_gradient(model, Y):
if isinstance(Y, model.ops.xp.ndarray):
dY = model.ops.alloc(Y.shape, dtype=Y.dtype)
dY += model.ops.xp.random.uniform(-1.0, 1.0, Y.shape)
return dY
elif isinstance(Y, List):
return [get_gradient(model, y) for y in Y]
else:
raise ValueError(f"Could not get gradient for type {type(Y)}")
def default_tok2vec():
return build_Tok2Vec_model(**TOK2VEC_KWARGS)
TOK2VEC_KWARGS = {
"width": 96,
"embed_size": 2000,
"subword_features": True,
"char_embed": False,
"conv_depth": 4,
"bilstm_depth": 0,
"maxout_pieces": 4,
"window_size": 1,
"dropout": 0.1,
"nM": 0,
"nC": 0,
"pretrained_vectors": None,
}
TEXTCAT_KWARGS = {
"width": 64,
"embed_size": 2000,
"pretrained_vectors": None,
"exclusive_classes": False,
"ngram_size": 1,
"window_size": 1,
"conv_depth": 2,
"dropout": None,
"nO": 7
}
TEXTCAT_CNN_KWARGS = {
"tok2vec": default_tok2vec(),
"exclusive_classes": False,
"nO": 13,
}
@pytest.mark.parametrize(
"seed,model_func,kwargs",
[
(0, build_Tok2Vec_model, TOK2VEC_KWARGS),
(0, build_text_classifier, TEXTCAT_KWARGS),
(0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS),
],
)
def test_models_initialize_consistently(seed, model_func, kwargs):
fix_random_seed(seed)
model1 = model_func(**kwargs)
model1.initialize()
fix_random_seed(seed)
model2 = model_func(**kwargs)
model2.initialize()
params1 = get_all_params(model1)
params2 = get_all_params(model2)
assert_array_equal(params1, params2)
@pytest.mark.parametrize(
"seed,model_func,kwargs,get_X",
[
(0, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs),
(0, build_text_classifier, TEXTCAT_KWARGS, get_docs),
(0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs),
],
)
def test_models_predict_consistently(seed, model_func, kwargs, get_X):
fix_random_seed(seed)
model1 = model_func(**kwargs).initialize()
Y1 = model1.predict(get_X())
fix_random_seed(seed)
model2 = model_func(**kwargs).initialize()
Y2 = model2.predict(get_X())
if model1.has_ref("tok2vec"):
tok2vec1 = model1.get_ref("tok2vec").predict(get_X())
tok2vec2 = model2.get_ref("tok2vec").predict(get_X())
for i in range(len(tok2vec1)):
for j in range(len(tok2vec1[i])):
assert_array_equal(numpy.asarray(tok2vec1[i][j]), numpy.asarray(tok2vec2[i][j]))
if isinstance(Y1, numpy.ndarray):
assert_array_equal(Y1, Y2)
elif isinstance(Y1, List):
assert len(Y1) == len(Y2)
for y1, y2 in zip(Y1, Y2):
assert_array_equal(y1, y2)
else:
raise ValueError(f"Could not compare type {type(Y1)}")
@pytest.mark.parametrize(
"seed,dropout,model_func,kwargs,get_X",
[
(0, 0.2, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs),
(0, 0.2, build_text_classifier, TEXTCAT_KWARGS, get_docs),
(0, 0.2, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs),
],
)
def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
def get_updated_model():
fix_random_seed(seed)
optimizer = Adam(0.001)
model = model_func(**kwargs).initialize()
initial_params = get_all_params(model)
set_dropout_rate(model, dropout)
for _ in range(5):
Y, get_dX = model.begin_update(get_X())
dY = get_gradient(model, Y)
_ = get_dX(dY)
model.finish_update(optimizer)
updated_params = get_all_params(model)
with pytest.raises(AssertionError):
assert_array_equal(initial_params, updated_params)
return model
model1 = get_updated_model()
model2 = get_updated_model()
assert_array_equal(get_all_params(model1), get_all_params(model2))

View File

@ -0,0 +1,31 @@
import pytest
from spacy.cli.project.util import validate_project_commands
from spacy.schemas import ProjectConfigSchema, validate
@pytest.mark.parametrize(
"config",
[
{"commands": [{"name": "a"}, {"name": "a"}]},
{"commands": [{"name": "a"}], "workflows": {"a": []}},
{"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
],
)
def test_project_config_validation1(config):
with pytest.raises(SystemExit):
validate_project_commands(config)
@pytest.mark.parametrize(
"config,n_errors",
[
({"commands": {"a": []}}, 1),
({"commands": [{"help": "..."}]}, 1),
({"commands": [{"name": "a", "extra": "b"}]}, 1),
({"commands": [{"extra": "b"}]}, 2),
({"commands": [{"name": "a", "deps": [123]}]}, 1),
],
)
def test_project_config_validation2(config, n_errors):
errors = validate(ProjectConfigSchema, config)
assert len(errors) == n_errors

View File

@ -803,7 +803,7 @@ cdef class Doc:
attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in attrs]
if array.dtype != numpy.uint64:
warnings.warn(Warnings.W101.format(type=array.dtype))
warnings.warn(Warnings.W028.format(type=array.dtype))
if SENT_START in attrs and HEAD in attrs:
raise ValueError(Errors.E032)

View File

@ -20,7 +20,6 @@ import subprocess
from contextlib import contextmanager
import tempfile
import shutil
import hashlib
import shlex
try:
@ -449,6 +448,16 @@ def split_command(command: str) -> List[str]:
return shlex.split(command, posix=not is_windows)
def join_command(command: List[str]) -> str:
"""Join a command using shlex. shlex.join is only available for Python 3.8+,
so we're using a workaround here.
command (List[str]): The command to join.
RETURNS (str): The joined command
"""
return " ".join(shlex.quote(cmd) for cmd in command)
def run_command(command: Union[str, List[str]]) -> None:
"""Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed.
@ -501,23 +510,13 @@ def make_tempdir():
warnings.warn(Warnings.W091.format(dir=d, msg=e))
def get_hash(data) -> str:
"""Get the hash for a JSON-serializable object.
def is_cwd(path: Union[Path, str]) -> bool:
"""Check whether a path is the current working directory.
data: The data to hash.
RETURNS (str): The hash.
path (Union[Path, str]): The directory path.
RETURNS (bool): Whether the path is the current working directory.
"""
data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
return hashlib.md5(data_str).hexdigest()
def get_checksum(path: Union[Path, str]) -> str:
"""Get the checksum for a file given its file path.
path (Union[Path, str]): The file path.
RETURNS (str): The checksum.
"""
return hashlib.md5(Path(path).read_bytes()).hexdigest()
return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()
def is_in_jupyter():
@ -722,6 +721,51 @@ def minibatch(items, size=8):
yield list(batch)
def minibatch_by_padded_size(docs, size, buffer=256, discard_oversize=False):
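# Group docs into sub-batches whose padded size (longest doc length
# multiplied by the number of docs) stays within the target size.
# `buffer` sets how many docs are length-sorted at a time; sub-batches
# that still exceed the target are dropped if `discard_oversize` is set.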
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
for outer_batch in minibatch(docs, buffer):
outer_batch = list(outer_batch)
target_size = next(size_)
for indices in _batch_by_length(outer_batch, target_size):
subbatch = [outer_batch[i] for i in indices]
padded_size = max(len(seq) for seq in subbatch) * len(subbatch)
if discard_oversize and padded_size >= target_size:
pass
else:
yield subbatch
def _batch_by_length(seqs, max_words):
"""Given a list of sequences, return a batched list of indices into the
list, where the batches are grouped by length, in descending order.
Each batch may contain at most max_words, defined as the longest sequence length multiplied by the number of sequences in the batch.
"""
# Use negative index so we can get sort by position ascending.
lengths_indices = [(len(seq), i) for i, seq in enumerate(seqs)]
lengths_indices.sort()
batches = []
batch = []
for length, i in lengths_indices:
if not batch:
batch.append(i)
elif length * (len(batch) + 1) <= max_words:
batch.append(i)
else:
batches.append(batch)
batch = [i]
if batch:
batches.append(batch)
# Check lengths match
assert sum(len(b) for b in batches) == len(seqs)
batches = [list(sorted(batch)) for batch in batches]
batches.reverse()
return batches
def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
"""Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by
@ -768,7 +812,8 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
# yield the previous batch and start a new one. The new one gets the overflow examples.
else:
yield batch
if batch:
yield batch
target_size = next(size_)
tol_size = target_size * tolerance
batch = overflow
@ -788,15 +833,15 @@ def minibatch_by_words(docs, size, tolerance=0.2, discard_oversize=False):
# this example does not fit with the previous overflow: start another new batch
else:
yield batch
if batch:
yield batch
target_size = next(size_)
tol_size = target_size * tolerance
batch = [doc]
batch_size = n_words
# yield the final batch
batch.extend(overflow)
if batch:
batch.extend(overflow)
yield batch

View File

@ -4,4 +4,34 @@ teaser: Pre-defined model architectures included with the core library
source: spacy/ml/models
---
TODO: write
TODO: intro and how architectures work, link to
[`registry`](/api/top-level#registry),
[custom models](/usage/training#custom-models) usage etc.
## Parser architectures {source="spacy/ml/models/parser.py"}
### spacy.TransitionBasedParser.v1
<!-- TODO: intro -->
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TransitionBasedParser.v1"
> nr_feature_tokens = 6
> hidden_width = 64
> maxout_pieces = 2
>
> [model.tok2vec]
> # ...
> ```
| Name | Type | Description |
| ------------------- | ------------------------------------------ | ----------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
| `nr_feature_tokens` | int | |
| `hidden_width` | int | |
| `maxout_pieces` | int | |
| `use_upper` | bool | |
| `nO` | int | |

View File

@ -297,60 +297,41 @@ will not be available.
## Train {#train}
<!-- TODO: document new training -->
Train a model. Expects data in spaCy's
[JSON format](/api/data-formats#json-input). On each epoch, a model will be
saved out to the directory. Accuracy scores and model details will be added to a
[`meta.json`](/usage/training#models-generating) to allow packaging the model
using the [`package`](/api/cli#package) command.
[binary format](/api/data-formats#training) and a
[config file](/api/data-formats#config) with all settings and hyperparameters.
Will save out the best model from all epochs, as well as the final model. The
`--code` argument can be used to provide a Python file that's imported before
the training process starts. This lets you register
[custom functions](/usage/training#custom-models) and architectures and refer to
them in your config, all while still using spaCy's built-in `train` workflow. If
you need to manage complex multi-step training workflows, check out the new
[spaCy projects](/usage/projects).
<Infobox title="New in v3.0" variant="warning">
As of spaCy v3.0, the `train` command doesn't take a long list of command-line
arguments anymore and instead expects a single
[`config.cfg` file](/usage/training#config) containing all settings for the
pipeline, training process and hyperparameters.
</Infobox>
```bash
$ python -m spacy train [lang] [output_path] [train_path] [dev_path]
[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping]
[--n-examples] [--use-gpu] [--version] [--meta-path] [--init-tok2vec]
[--parser-multitasks] [--entity-multitasks] [--gold-preproc] [--noise-level]
[--orth-variant-level] [--learn-tokens] [--textcat-arch] [--textcat-multilabel]
[--textcat-positive-label] [--verbose]
$ python -m spacy train [train_path] [dev_path] [config_path] [--output]
[--code] [--verbose]
```
| Argument | Type | Description |
| --------------------------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang` | positional | Model language. |
| `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. |
| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
| `--base-model`, `-b` <Tag variant="new">2.1</Tag> | option | Optional name of base model to update. Can be any loadable spaCy model. |
| `--pipeline`, `-p` <Tag variant="new">2.1</Tag> | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
| `--replace-components`, `-R` | flag | Replace components from the base model. |
| `--vectors`, `-v` | option | Model to load vectors from. |
| `--n-iter`, `-n` | option | Number of iterations (default: `30`). |
| `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. |
| `--n-examples`, `-ns` | option | Number of examples to use (defaults to `0` for all examples). |
| `--use-gpu`, `-g` | option | GPU ID or `-1` for CPU only (default: `-1`). |
| `--version`, `-V` | option | Model version. Will be written out to the model's `meta.json` after training. |
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | option | Optional path to model [`meta.json`](/usage/training#models-generating). All relevant properties like `lang`, `pipeline` and `spacy_version` will be overwritten. |
| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
| `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` |
| `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` |
| `--width`, `-cw` <Tag variant="new">2.2.4</Tag> | option | Width of CNN layers of `Tok2Vec` component. |
| `--conv-depth`, `-cd` <Tag variant="new">2.2.4</Tag> | option | Depth of CNN layers of `Tok2Vec` component. |
| `--cnn-window`, `-cW` <Tag variant="new">2.2.4</Tag> | option | Window size for CNN layers of `Tok2Vec` component. |
| `--cnn-pieces`, `-cP` <Tag variant="new">2.2.4</Tag> | option | Maxout size for CNN layers of `Tok2Vec` component. |
| `--use-chars`, `-chr` <Tag variant="new">2.2.4</Tag> | flag | Whether to use character-based embedding of `Tok2Vec` component. |
| `--bilstm-depth`, `-lstm` <Tag variant="new">2.2.4</Tag> | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). |
| `--embed-rows`, `-er` <Tag variant="new">2.2.4</Tag> | option | Number of embedding rows of `Tok2Vec` component. |
| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. |
| `--orth-variant-level`, `-ovl` <Tag variant="new">2.2</Tag> | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). |
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese. |
| `--textcat-multilabel`, `-TML` <Tag variant="new">2.2</Tag> | flag | Text classification classes aren't mutually exclusive (multilabel). |
| `--textcat-arch`, `-ta` <Tag variant="new">2.2</Tag> | option | Text classification model architecture. Defaults to `"bow"`. |
| `--textcat-positive-label`, `-tpl` <Tag variant="new">2.2</Tag> | option | Text classification positive label for binary classes with two labels. |
| `--tag-map-path`, `-tm` <Tag variant="new">2.2.4</Tag> | option | Location of JSON-formatted tag map. |
| `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | model, pickle | A spaCy model on each epoch. |
| Argument | Type | Description |
| ----------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
| `train_path` | positional | Location of training data in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
| `dev_path` | positional | Location of development data for evaluation in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--output`, `-o` | positional | Directory to store model in. Will be created if it doesn't exist. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--verbose`, `-V` | flag | Show more detailed messages during training. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | model | The final model and the best model. |
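As a rough sketch, a file provided via `--code` could register a custom
architecture like this; the registry name and function here are placeholders,
not part of spaCy:

```python
from spacy.util import registry
from thinc.api import Model, Linear


@registry.architectures.register("my_org.CustomLayer.v1")
def build_custom_layer(nO: int, nI: int) -> Model:
    # Stand-in for a real custom architecture. Once registered, it can be
    # referenced from the config as @architectures = "my_org.CustomLayer.v1".
    return Linear(nO, nI)
```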
## Pretrain {#pretrain new="2.1" tag="experimental"}
@ -471,20 +452,20 @@ as separate files if the respective component is present in the model's
pipeline.
```bash
$ python -m spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit]
[--gpu-id] [--gold-preproc] [--return-scores]
$ python -m spacy evaluate [model] [data_path] [--output] [--displacy-path]
[--displacy-limit] [--gpu-id] [--gold-preproc]
```
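> #### Example
>
> A sketch, assuming a trained model directory `./model-best` and dev data in
> `./dev.spacy`:
>
> ```bash
> $ python -m spacy evaluate ./model-best ./dev.spacy --output metrics.json
> ```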
| Argument | Type | Description |
| ------------------------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. |
| `data_path` | positional | Location of JSON-formatted evaluation data. |
| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. |
| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. |
| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
| `--return-scores`, `-R` | flag | Return dict containing model scores. |
| **CREATES** | `stdout`, HTML | Training results and optional displaCy visualizations. |
| Argument | Type | Description |
| ------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. |
| `data_path` | positional | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). |
| `--output`, `-o` | option | Output JSON file for metrics. If not set, no metrics will be exported. |
| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. |
| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. |
| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. |
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
| **CREATES** | `stdout`, JSON, HTML | Training results and optional metrics and visualizations. |
## Package {#package}
@ -504,15 +485,17 @@ so you don't have to run `python setup.py sdist` separately anymore.
</Infobox>
```bash
$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force]
$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
[--version] [--force]
```
```bash
### Example
python -m spacy package /input /output
cd /output/en_model-0.0.0
pip install dist/en_model-0.0.0.tar.gz
```
> #### Example
>
> ```bash
> python -m spacy package /input /output
> cd /output/en_model-0.0.0
> pip install dist/en_model-0.0.0.tar.gz
> ```
| Argument | Type | Description |
| ------------------------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -525,18 +508,137 @@ pip install dist/en_model-0.0.0.tar.gz
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | directory | A Python package containing the spaCy model. |
## Project {#project}
## Project {#project new="3"}
<!-- TODO: document project command and subcommands. We should probably wait and only finalize this once we've finalized the design -->
The `spacy project` CLI includes subcommands for working with
[spaCy projects](/usage/projects), end-to-end workflows for building and
deploying custom spaCy models.
### project clone {#project-clone}
Clone a project template from a Git repository. Calls into `git` under the hood
and uses the sparse checkout feature, so you're only downloading what you need.
By default, spaCy's
[project templates repo](https://github.com/explosion/projects) is used, but you
can provide any other repo (public or private) that you have access to using the
`--repo` option.
<!-- TODO: update example once we've decided on repo structure -->
```bash
$ python -m spacy project clone [name] [dest] [--repo]
```
> #### Example
>
> ```bash
> $ python -m spacy project clone some_example
> ```
>
> Clone from custom repo:
>
> ```bash
> $ python -m spacy project clone template --repo https://github.com/your_org/your_repo
> ```
| Argument | Type | Description |
| -------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------- |
| `name` | positional | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. |
| `dest` | positional | Where to clone the project. Defaults to current working directory. |
| `--repo`, `-r` | option | The repository to clone from. Can be any public or private Git repo you have access to. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | directory | The cloned [project directory](/usage/projects#project-files). |
### project assets {#project-assets}
### project run-all {#project-run-all}
Fetch project assets like datasets and pretrained weights. Assets are defined in
the `assets` section of the [`project.yml`](/usage/projects#project-yml). If a
`checksum` is provided, the file is only downloaded if no local file with the
same checksum exists, and spaCy will show an error if the checksum of the
downloaded file doesn't match (see the sketch after the table below). If assets
don't specify a `url`, they're considered "private", and you have to take care
of putting them into the destination directory yourself. If a local path is
provided, the asset is copied into the current project.
```bash
$ python -m spacy project assets [project_dir]
```
> #### Example
>
> ```bash
> $ python -m spacy project assets
> ```
| Argument | Type | Description |
| -------------- | ---------- | ----------------------------------------------------------------- |
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | files | Downloaded or copied assets defined in the `project.yml`. |
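
The checksum behavior described above can be pictured with a short sketch. This
is a simplified illustration, not spaCy's actual implementation: the helper
names are hypothetical, and it assumes MD5-style hex digests.

```python
import hashlib
from pathlib import Path
from urllib.request import urlretrieve

def get_checksum(path: Path) -> str:
    # MD5 hex digest of the file contents (illustrative choice of hash)
    return hashlib.md5(path.read_bytes()).hexdigest()

def fetch_asset(url: str, dest: Path, checksum: str) -> None:
    # Only download if no local file with the same checksum exists
    if dest.exists() and get_checksum(dest) == checksum:
        return
    urlretrieve(url, dest)
    # Show an error if the checksum of the downloaded file doesn't match
    if get_checksum(dest) != checksum:
        raise ValueError(f"Checksum mismatch for {dest}")
```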
### project run {#project-run}
### project init {#project-init}
Run a named command or workflow defined in the
[`project.yml`](/usage/projects#project-yml). If a workflow name is specified,
all commands in the workflow are run, in order. If commands define
[dependencies or outputs](/usage/projects#deps-outputs), they are only re-run
if state has changed. For example, if the input dataset changes, a
preprocessing command that depends on those files will be re-run (a simplified
sketch of this check follows the table below).
### project update-dvc {#project-update-dvc}
```bash
$ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
```
> #### Example
>
> ```bash
> $ python -m spacy project run train
> ```
| Argument | Type | Description |
| --------------- | ---------- | ----------------------------------------------------------------- |
| `subcommand` | positional | Name of the command or workflow to run. |
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
| `--force`, `-F` | flag | Force re-running steps, even if nothing changed. |
| `--dry`, `-D` | flag | Perform a dry run and don't execute scripts. |
| `--help`, `-h` | flag | Show help message and available arguments. |
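
The "only re-run if state has changed" check can be sketched roughly as
follows. This is a simplified illustration with hypothetical helper names, not
spaCy's actual implementation: hash each declared dependency and compare
against the hashes recorded after the last successful run.

```python
import hashlib
from pathlib import Path
from typing import Dict, List

def file_hash(path: Path) -> str:
    return hashlib.md5(path.read_bytes()).hexdigest()

def needs_rerun(deps: List[str], recorded: Dict[str, str]) -> bool:
    # Re-run if any dependency is missing from the record or has changed
    return any(
        dep not in recorded or file_hash(Path(dep)) != recorded[dep]
        for dep in deps
    )
```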
### project dvc {#project-dvc}
Auto-generate a [Data Version Control](https://dvc.org) (DVC) config file. Calls
[`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under
the hood to generate the `dvc.yaml`. A DVC project can only define one pipeline,
so you need to specify one workflow defined in the
[`project.yml`](/usage/projects#project-yml). If no workflow is specified, the
first defined workflow is used. The DVC config will only be updated if the
`project.yml` has changed. For details, see the
[DVC integration](/usage/projects#dvc) docs.
<Infobox variant="warning">
This command requires DVC to be installed and initialized in the project
directory, e.g. via [`dvc init`](https://dvc.org/doc/command-reference/init).
You'll also need to add the assets you want to track with
[`dvc add`](https://dvc.org/doc/command-reference/add).
</Infobox>
```bash
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
```
> #### Example
>
> ```bash
> git init
> dvc init
> python -m spacy project dvc all
> ```
| Argument | Type | Description |
| ----------------- | ---------- | --------------------------------------------------------------------------------- |
| `project_dir` | positional | Path to project directory. Defaults to current working directory. |
| `workflow` | positional | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. |
| `--force`, `-F` | flag | Force-updating config file. |
| `--verbose`, `-V` | flag | Print more output generated by DVC. |
| `--help`, `-h` | flag | Show help message and available arguments. |
@ -122,7 +122,7 @@ where the rescuers keep passing out from low oxygen, causing another rescuer to
follow — only to succumb themselves. In short, just say no to optimizing your
Python. If it's not fast enough the first time, just switch to Cython.
<Infobox title="📖 Resources">
<Infobox title="Resources" emoji="📖">
- [Official Cython documentation](http://docs.cython.org/en/latest/)
(cython.org)
@ -2,7 +2,8 @@
title: Data formats
teaser: Details on spaCy's input and output data formats
menu:
- ['Training data', 'training']
- ['Training Data', 'training']
- ['Training Config', 'config']
- ['Vocabulary', 'vocab']
---
@ -74,6 +75,29 @@ from the English Wall Street Journal portion of the Penn Treebank:
https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json
```
## Training config {#config new="3"}
Config files define the training process and model pipeline and can be passed to
[`spacy train`](/api/cli#train). They use
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
hood. For details on how to use training configs, see the
[usage documentation](/usage/training#config).
<Infobox variant="warning">
The `@` syntax lets you refer to function names registered in the
[function registry](/api/top-level#registry). For example,
`@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of
the name `"spacy.HashEmbedCNN.v1"` and all other values defined in its block
will be passed into that function as arguments. Those arguments depend on the
registered function. See the [model architectures](/api/architectures) docs for
API details.
</Infobox>
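
As a rough sketch, assuming a `model.cfg` file whose `[model]` block fills in
all arguments the registered architecture requires, such a config can be
resolved into live objects with the same `util.load_config` pattern used in the
pipeline component examples further down:

```python
from spacy import util

# model.cfg is assumed to contain a [model] block, e.g.
# [model]
# @architectures = "spacy.HashEmbedCNN.v1"
# ...plus the arguments that architecture requires
model = util.load_config("model.cfg", create_objects=True)["model"]
```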
<!-- TODO: we need to come up with a good way to present the sections and their expected values visually? -->
<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
## Lexical data for vocabulary {#vocab-jsonl new="2"}
To populate a model's vocabulary, you can use the
@ -8,41 +8,46 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
component is available in the [processing pipeline](/usage/processing-pipelines)
via the ID `"parser"`.
## DependencyParser.Model {#model tag="classmethod"}
## Default config {#config}
Initialize a model for the pipe. The model should implement the
`thinc.neural.Model` API. Wrappers are under development for most major machine
learning libraries.
This is the default configuration used to initialize the model powering the
pipeline component. See the [model architectures](/api/architectures)
documentation for details on the architectures and their arguments and
hyperparameters. To learn more about how to customize the config and train
custom models, check out the [training config](/usage/training#config) docs.
| Name | Type | Description |
| ----------- | ------ | ------------------------------------- |
| `**kwargs` | - | Parameters for initializing the model |
| **RETURNS** | object | The initialized model. |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/parser_defaults.cfg
```
## DependencyParser.\_\_init\_\_ {#init tag="method"}
> #### Example
>
> ```python
> # Construction via create_pipe with default model
> parser = nlp.create_pipe("parser")
>
> # Construction via create_pipe with custom model
> config = {"model": {"@architectures": "my_parser"}}
> parser = nlp.create_pipe("parser", config)
>
> # Construction from class with custom model from file
> from spacy.pipeline import DependencyParser
> model = util.load_config("model.cfg", create_objects=True)["model"]
> parser = DependencyParser(nlp.vocab, model)
> ```
Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe).
> #### Example
>
> ```python
> # Construction via create_pipe
> parser = nlp.create_pipe("parser")
>
> # Construction from class
> from spacy.pipeline import DependencyParser
> parser = DependencyParser(nlp.vocab)
> parser.from_disk("/path/to/model")
> ```
| Name | Type | Description |
| ----------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `**cfg` | - | Configuration parameters. |
| **RETURNS** | `DependencyParser` | The newly constructed object. |
| Name | Type | Description |
| ----------- | ------------------ | ------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `**cfg` | - | Configuration parameters. |
| **RETURNS** | `DependencyParser` | The newly constructed object. |
## DependencyParser.\_\_call\_\_ {#call tag="method"}
@ -85,11 +90,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
> pass
> ```
| Name | Type | Description |
| ------------ | -------- | ------------------------------------------------------ |
| `stream` | iterable | A stream of documents. |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
| Name | Type | Description |
| ------------ | --------------- | ------------------------------------------------------ |
| `stream` | `Iterable[Doc]` | A stream of documents. |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
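
A minimal usage sketch, assuming an `nlp` object with a trained parser and a
list of `texts`, both set up elsewhere:

```python
parser = nlp.get_pipe("parser")
docs = (nlp.make_doc(text) for text in texts)
for doc in parser.pipe(docs, batch_size=50):
    pass  # each Doc is yielded with dependency annotations set
```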
## DependencyParser.predict {#predict tag="method"}
@ -104,7 +109,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
| Name | Type | Description |
| ----------- | ------------------- | ---------------------------------------------- |
| `docs` | iterable | The documents to predict. |
| `docs` | `Iterable[Doc]` | The documents to predict. |
| **RETURNS** | `syntax.StateClass` | A helper class for the parse state (internal). |
## DependencyParser.set_annotations {#set_annotations tag="method"}
@ -119,33 +124,34 @@ Modify a batch of documents, using pre-computed scores.
> parser.set_annotations([doc1, doc2], scores)
> ```
| Name | Type | Description |
| -------- | -------- | ---------------------------------------------------------- |
| `docs` | iterable | The documents to modify. |
| `scores` | - | The scores to set, produced by `DependencyParser.predict`. |
| Name | Type | Description |
| -------- | ------------------- | ---------------------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to modify. |
| `scores` | `syntax.StateClass` | The scores to set, produced by `DependencyParser.predict`. |
## DependencyParser.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/dependencyparser#predict) and
Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
model. Delegates to [`predict`](/api/dependencyparser#predict) and
[`get_loss`](/api/dependencyparser#get_loss).
> #### Example
>
> ```python
> parser = DependencyParser(nlp.vocab)
> losses = {}
> parser = DependencyParser(nlp.vocab, parser_model)
> optimizer = nlp.begin_training()
> parser.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
> losses = parser.update(examples, sgd=optimizer)
> ```
| Name | Type | Description |
| -------- | -------- | -------------------------------------------------------------------------------------------- |
| `docs` | iterable | A batch of documents to learn from. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
| `drop` | float | The dropout rate. |
| `sgd` | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. |
| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. |
| Name | Type | Description |
| ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
| _keyword-only_ | | |
| `drop` | float | The dropout rate. |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/dependencyparser#set_annotations). |
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
## DependencyParser.get_loss {#get_loss tag="method"}
@ -156,21 +162,20 @@ predicted scores.
>
> ```python
> parser = DependencyParser(nlp.vocab)
> scores = parser.predict([doc1, doc2])
> loss, d_loss = parser.get_loss([doc1, doc2], [gold1, gold2], scores)
> scores = parser.predict([eg.predicted for eg in examples])
> loss, d_loss = parser.get_loss(examples, scores)
> ```
| Name | Type | Description |
| ----------- | -------- | ------------------------------------------------------------ |
| `docs` | iterable | The batch of documents. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
| `scores` | - | Scores representing the model's predictions. |
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
| Name | Type | Description |
| ----------- | ------------------- | --------------------------------------------------- |
| `examples` | `Iterable[Example]` | The batch of examples. |
| `scores` | `syntax.StateClass` | Scores representing the model's predictions. |
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
## DependencyParser.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. If no model
has been initialized yet, the model is added.
Initialize the pipe for training, using data examples if available. Return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example
>
@ -180,16 +185,17 @@ has been initialized yet, the model is added.
> optimizer = parser.begin_training(pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. |
| `pipeline` | list | Optional list of pipeline components that this component is part of. |
| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`DependencyParser`](/api/dependencyparser#create_optimizer) if not set. |
| **RETURNS** | callable | An optimizer. |
| Name | Type | Description |
| -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set. |
| **RETURNS** | `Optimizer` | An optimizer. |
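
Putting `begin_training` and `update` together, a minimal training-loop sketch
might look like this. It assumes `examples` is an iterable of
[`Example`](/api/example) objects and `batches` is some batching of them; both
are placeholders, not part of the API shown here.

```python
parser = nlp.create_pipe("parser")
optimizer = parser.begin_training(examples, pipeline=nlp.pipeline)
losses = {}
for batch in batches:  # hypothetical minibatches of Example objects
    losses = parser.update(batch, sgd=optimizer, losses=losses)
```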
## DependencyParser.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component.
Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline
component.
> #### Example
>
@ -198,9 +204,9 @@ Create an optimizer for the pipeline component.
> optimizer = parser.create_optimizer()
> ```
| Name | Type | Description |
| ----------- | -------- | -------------- |
| **RETURNS** | callable | The optimizer. |
| Name | Type | Description |
| ----------- | ----------- | --------------------------------------------------------------- |
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
## DependencyParser.use_params {#use_params tag="method, contextmanager"}
@ -12,44 +12,47 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
component is available in the [processing pipeline](/usage/processing-pipelines)
via the ID `"entity_linker"`.
## EntityLinker.Model {#model tag="classmethod"}
## Default config {#config}
Initialize a model for the pipe. The model should implement the
`thinc.neural.Model` API, and should contain a field `tok2vec` that contains the
context encoder. Wrappers are under development for most major machine learning
libraries.
This is the default configuration used to initialize the model powering the
pipeline component. See the [model architectures](/api/architectures)
documentation for details on the architectures and their arguments and
hyperparameters. To learn more about how to customize the config and train
custom models, check out the [training config](/usage/training#config) docs.
| Name | Type | Description |
| ----------- | ------ | ------------------------------------- |
| `**kwargs` | - | Parameters for initializing the model |
| **RETURNS** | object | The initialized model. |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/entity_linker_defaults.cfg
```
## EntityLinker.\_\_init\_\_ {#init tag="method"}
> #### Example
>
> ```python
> # Construction via create_pipe with default model
> entity_linker = nlp.create_pipe("entity_linker")
>
> # Construction via create_pipe with custom model
> config = {"model": {"@architectures": "my_el"}}
> entity_linker = nlp.create_pipe("entity_linker", config)
>
> # Construction from class with custom model from file
> from spacy.pipeline import EntityLinker
> model = util.load_config("model.cfg", create_objects=True)["model"]
> entity_linker = EntityLinker(nlp.vocab, model)
> ```
Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe).
> #### Example
>
> ```python
> # Construction via create_pipe
> entity_linker = nlp.create_pipe("entity_linker")
>
> # Construction from class
> from spacy.pipeline import EntityLinker
> entity_linker = EntityLinker(nlp.vocab)
> entity_linker.from_disk("/path/to/model")
> ```
| Name | Type | Description |
| ------- | ------- | ------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `**cfg` | - | Configuration parameters. |
| Name | Type | Description |
| -------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `hidden_width` | int | Width of the hidden layer of the entity linking model, defaults to `128`. |
| `incl_prior` | bool | Whether or not to include prior probabilities in the model. Defaults to `True`. |
| `incl_context` | bool | Whether or not to include the local context in the model (if not: only prior probabilities are used). Defaults to `True`. |
| **RETURNS** | `EntityLinker` | The newly constructed object. |
| **RETURNS** | `EntityLinker` | The newly constructed object. |
## EntityLinker.\_\_call\_\_ {#call tag="method"}
@ -91,11 +94,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
> pass
> ```
| Name | Type | Description |
| ------------ | -------- | ------------------------------------------------------ |
| `stream` | iterable | A stream of documents. |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
| Name | Type | Description |
| ------------ | --------------- | ------------------------------------------------------ |
| `stream` | `Iterable[Doc]` | A stream of documents. |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
## EntityLinker.predict {#predict tag="method"}
@ -105,13 +108,13 @@ Apply the pipeline's model to a batch of docs, without modifying them.
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> kb_ids, tensors = entity_linker.predict([doc1, doc2])
> kb_ids = entity_linker.predict([doc1, doc2])
> ```
| Name | Type | Description |
| ----------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | The documents to predict. |
| **RETURNS** | tuple | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. |
| Name | Type | Description |
| ----------- | --------------- | ------------------------------------------------------------ |
| `docs` | `Iterable[Doc]` | The documents to predict. |
| **RETURNS** | `Iterable[str]` | The predicted KB identifiers for the entities in the `docs`. |
## EntityLinker.set_annotations {#set_annotations tag="method"}
@ -122,19 +125,18 @@ entities.
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> kb_ids, tensors = entity_linker.predict([doc1, doc2])
> entity_linker.set_annotations([doc1, doc2], kb_ids, tensors)
> kb_ids = entity_linker.predict([doc1, doc2])
> entity_linker.set_annotations([doc1, doc2], kb_ids)
> ```
| Name | Type | Description |
| --------- | -------- | ------------------------------------------------------------------------------------------------- |
| `docs` | iterable | The documents to modify. |
| `kb_ids` | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. |
| `tensors` | iterable | The token representations used to predict the identifiers. |
| Name | Type | Description |
| -------- | --------------- | ------------------------------------------------------------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to modify. |
| `kb_ids` | `Iterable[str]` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. |
## EntityLinker.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating both the
Learn from a batch of [`Example`](/api/example) objects, updating both the
pipe's entity linking model and context encoder. Delegates to
[`predict`](/api/entitylinker#predict) and
[`get_loss`](/api/entitylinker#get_loss).
@ -142,40 +144,20 @@ pipe's entity linking model and context encoder. Delegates to
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> losses = {}
> entity_linker = EntityLinker(nlp.vocab, nel_model)
> optimizer = nlp.begin_training()
> entity_linker.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
> losses = entity_linker.update(examples, sgd=optimizer)
> ```
| Name | Type | Description |
| -------- | -------- | ------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | A batch of documents to learn from. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
| `drop` | float | The dropout rate, used both for the EL model and the context encoder. |
| `sgd` | callable | The optimizer for the EL model. Should take two arguments `weights` and `gradient`, and an optional ID. |
| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. |
## EntityLinker.get_loss {#get_loss tag="method"}
Find the loss and gradient of loss for the entities in a batch of documents and
their predicted scores.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> kb_ids, tensors = entity_linker.predict(docs)
> loss, d_loss = entity_linker.get_loss(docs, [gold1, gold2], kb_ids, tensors)
> ```
| Name | Type | Description |
| ----------- | -------- | ------------------------------------------------------------ |
| `docs` | iterable | The batch of documents. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
| `kb_ids` | iterable | KB identifiers representing the model's predictions. |
| `tensors` | iterable | The token representations used to predict the identifiers |
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
| Name | Type | Description |
| ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
| _keyword-only_ | | |
| `drop` | float | The dropout rate. |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entitylinker#set_annotations). |
| `sgd` | `Optimizer` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
## EntityLinker.set_kb {#set_kb tag="method"}
@ -195,9 +177,9 @@ identifiers.
## EntityLinker.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. If no model
has been initialized yet, the model is added. Before calling this method, a
knowledge base should have been defined with
Initialize the pipe for training, using data examples if available. Return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this
method, a knowledge base should have been defined with
[`set_kb`](/api/entitylinker#set_kb).
> #### Example
@ -209,12 +191,12 @@ knowledge base should have been defined with
> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. |
| `pipeline` | list | Optional list of pipeline components that this component is part of. |
| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityLinker`](/api/entitylinker#create_optimizer) if not set. |
| **RETURNS** | callable | An optimizer. |
| Name | Type | Description |
| -------------- | ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entitylinker#create_optimizer) if not set. |
| **RETURNS** | `Optimizer` | An optimizer. |
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
@ -227,9 +209,9 @@ Create an optimizer for the pipeline component.
> optimizer = entity_linker.create_optimizer()
> ```
| Name | Type | Description |
| ----------- | -------- | -------------- |
| **RETURNS** | callable | The optimizer. |
| Name | Type | Description |
| ----------- | ----------- | --------------------------------------------------------------- |
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
## EntityLinker.use_params {#use_params tag="method, contextmanager"}
@ -8,41 +8,46 @@ This class is a subclass of `Pipe` and follows the same API. The pipeline
component is available in the [processing pipeline](/usage/processing-pipelines)
via the ID `"ner"`.
## EntityRecognizer.Model {#model tag="classmethod"}
## Default config {#config}
Initialize a model for the pipe. The model should implement the
`thinc.neural.Model` API. Wrappers are under development for most major machine
learning libraries.
This is the default configuration used to initialize the model powering the
pipeline component. See the [model architectures](/api/architectures)
documentation for details on the architectures and their arguments and
hyperparameters. To learn more about how to customize the config and train
custom models, check out the [training config](/usage/training#config) docs.
| Name | Type | Description |
| ----------- | ------ | ------------------------------------- |
| `**kwargs` | - | Parameters for initializing the model |
| **RETURNS** | object | The initialized model. |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/defaults/ner_defaults.cfg
```
## EntityRecognizer.\_\_init\_\_ {#init tag="method"}
Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe).
> #### Example
>
> ```python
> # Construction via create_pipe
> ner = nlp.create_pipe("ner")
>
> # Construction from class
> # Construction via create_pipe with custom model
> config = {"model": {"@architectures": "my_ner"}}
> parser = nlp.create_pipe("ner", config)
>
> # Construction from class with custom model from file
> from spacy.pipeline import EntityRecognizer
> ner = EntityRecognizer(nlp.vocab)
> ner.from_disk("/path/to/model")
> model = util.load_config("model.cfg", create_objects=True)["model"]
> ner = EntityRecognizer(nlp.vocab, model)
> ```
| Name | Type | Description |
| ----------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `**cfg` | - | Configuration parameters. |
| **RETURNS** | `EntityRecognizer` | The newly constructed object. |
Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe).
| Name | Type | Description |
| ----------- | ------------------ | ------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `**cfg` | - | Configuration parameters. |
| **RETURNS** | `EntityRecognizer` | The newly constructed object. |
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
@ -85,11 +90,11 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
> pass
> ```
| Name | Type | Description |
| ------------ | -------- | ------------------------------------------------------ |
| `stream` | iterable | A stream of documents. |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
| Name | Type | Description |
| ------------ | --------------- | ------------------------------------------------------ |
| `stream` | `Iterable[Doc]` | A stream of documents. |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
## EntityRecognizer.predict {#predict tag="method"}
@ -99,13 +104,13 @@ Apply the pipeline's model to a batch of docs, without modifying them.
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> scores, tensors = ner.predict([doc1, doc2])
> scores = ner.predict([doc1, doc2])
> ```
| Name | Type | Description |
| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | The documents to predict. |
| **RETURNS** | list | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal). |
| Name | Type | Description |
| ----------- | ------------------ | ---------------------------------------------------------------------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to predict. |
| **RETURNS** | `List[StateClass]` | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal). |
## EntityRecognizer.set_annotations {#set_annotations tag="method"}
@ -115,38 +120,38 @@ Modify a batch of documents, using pre-computed scores.
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> scores, tensors = ner.predict([doc1, doc2])
> ner.set_annotations([doc1, doc2], scores, tensors)
> scores = ner.predict([doc1, doc2])
> ner.set_annotations([doc1, doc2], scores)
> ```
| Name | Type | Description |
| --------- | -------- | ---------------------------------------------------------- |
| `docs` | iterable | The documents to modify. |
| `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. |
| `tensors` | iterable | The token representations used to predict the scores. |
| Name | Type | Description |
| -------- | ------------------ | ---------------------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to modify. |
| `scores` | `List[StateClass]` | The scores to set, produced by `EntityRecognizer.predict`. |
## EntityRecognizer.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/entityrecognizer#predict) and
Learn from a batch of [`Example`](/api/example) objects, updating the pipe's
model. Delegates to [`predict`](/api/entityrecognizer#predict) and
[`get_loss`](/api/entityrecognizer#get_loss).
> #### Example
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> losses = {}
> ner = EntityRecognizer(nlp.vocab, ner_model)
> optimizer = nlp.begin_training()
> ner.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
> losses = ner.update(examples, sgd=optimizer)
> ```
| Name | Type | Description |
| -------- | -------- | -------------------------------------------------------------------------------------------- |
| `docs` | iterable | A batch of documents to learn from. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
| `drop` | float | The dropout rate. |
| `sgd` | callable | The optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. |
| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. |
| Name | Type | Description |
| ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
| _keyword-only_ | | |
| `drop` | float | The dropout rate. |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entityrecognizer#set_annotations). |
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
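
Since `update` both modifies and returns the `losses` dict, losses can be
accumulated across batches. A hedged sketch, where `batches` is a placeholder
for your own batching of [`Example`](/api/example) objects:

```python
ner = nlp.get_pipe("ner")
optimizer = nlp.begin_training()
losses = {}
for batch in batches:  # hypothetical iterable of Example batches
    losses = ner.update(batch, sgd=optimizer, losses=losses)
```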
## EntityRecognizer.get_loss {#get_loss tag="method"}
@ -157,21 +162,20 @@ predicted scores.
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> scores = ner.predict([doc1, doc2])
> loss, d_loss = ner.get_loss([doc1, doc2], [gold1, gold2], scores)
> scores = ner.predict([eg.predicted for eg in examples])
> loss, d_loss = ner.get_loss(examples, scores)
> ```
| Name | Type | Description |
| ----------- | -------- | ------------------------------------------------------------ |
| `docs` | iterable | The batch of documents. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
| `scores` | - | Scores representing the model's predictions. |
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
| Name | Type | Description |
| ----------- | ------------------- | --------------------------------------------------- |
| `examples` | `Iterable[Example]` | The batch of examples. |
| `scores` | `List[StateClass]` | Scores representing the model's predictions. |
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
## EntityRecognizer.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. If no model
has been initialized yet, the model is added.
Initialize the pipe for training, using data examples if available. Return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example
>
@ -181,12 +185,12 @@ has been initialized yet, the model is added.
> optimizer = ner.begin_training(pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. |
| `pipeline` | list | Optional list of pipeline components that this component is part of. |
| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityRecognizer`](/api/entityrecognizer#create_optimizer) if not set. |
| **RETURNS** | callable | An optimizer. |
| Name | Type | Description |
| -------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Will be created via [`create_optimizer`](/api/entityrecognizer#create_optimizer) if not set. |
| **RETURNS** | `Optimizer` | An optimizer. |
## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
@ -199,9 +203,9 @@ Create an optimizer for the pipeline component.
> optimizer = ner.create_optimizer()
> ```
| Name | Type | Description |
| ----------- | -------- | -------------- |
| **RETURNS** | callable | The optimizer. |
| Name | Type | Description |
| ----------- | ----------- | --------------------------------------------------------------- |
| **RETURNS** | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
## EntityRecognizer.use_params {#use_params tag="method, contextmanager"}