Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-07-09 11:43:57 +02:00
commit 028f8210e8
94 changed files with 2453 additions and 2536 deletions

View File

@ -9,27 +9,28 @@ max_length = 5000
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.2
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
patience = 100000
max_epochs = 0
max_steps = 20000
eval_frequency = 500
max_steps = 0
eval_frequency = 1000
# Other settings
seed = 0
accumulate_gradient = 1
accumulate_gradient = 2
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "ents_p", "ents_r", "ents_f"]
score_weights = {"ents_f": 1.0}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false
discard_oversize = true
omit_extra_lookups = false
batch_by_words = true
[training.batch_size]
@schedules = "compounding.v1"
start = 100
start = 1000
stop = 1000
compound = 1.001
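As an aside, with start and stop both set to 1000 the new batch-size schedule is effectively constant. A rough, illustrative sketch (not Thinc's actual implementation) of what a compounding schedule like "compounding.v1" yields:
import itertools

def compounding_sketch(start: float, stop: float, compound: float):
    # Grow geometrically from start toward stop; with start == stop the
    # yielded value never changes.
    value = start
    while True:
        yield min(value, stop)
        value *= compound

print(list(itertools.islice(compounding_sketch(100, 1000, 1.001), 3)))  # approx. [100, 100.1, 100.2]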
@ -37,18 +38,18 @@ compound = 1.001
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = false
L2 = 1e-6
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
#[optimizer.learn_rate]
#[training.optimizer.learn_rate]
#@schedules = "warmup_linear.v1"
#warmup_steps = 250
#total_steps = 20000
#initial_rate = 0.001
#warmup_steps = 1000
#total_steps = 50000
#initial_rate = 0.003
[nlp]
lang = "en"
@ -58,8 +59,6 @@ vectors = null
factory = "ner"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
@ -75,6 +74,6 @@ width = 96
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
maxout_pieces = 1
subword_features = true
dropout = ${training:dropout}
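Note that dropout here is interpolated from the [training] block via ${training:dropout}. A minimal, hypothetical sketch of how such a reference resolves, assuming Thinc's Config.from_str and the colon-style interpolation used in this config:
from thinc.api import Config

# Hypothetical snippet: [model] reuses the dropout value defined under [training].
cfg_text = """
[training]
dropout = 0.1

[model]
dropout = ${training:dropout}
"""
config = Config().from_str(cfg_text)
print(config["model"]["dropout"])  # 0.1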

View File

@ -7,6 +7,7 @@ requires = [
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a12,<8.0.0a20",
"blis>=0.4.0,<0.5.0"
"blis>=0.4.0,<0.5.0",
"pytokenizations"
]
build-backend = "setuptools.build_meta"

View File

@ -14,6 +14,7 @@ numpy>=1.15.0
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.3.0,<2.0.0
pytokenizations
# Official Python utilities
setuptools
packaging

View File

@ -51,6 +51,7 @@ install_requires =
numpy>=1.15.0
requests>=2.13.0,<3.0.0
pydantic>=1.3.0,<2.0.0
pytokenizations
# Official Python utilities
setuptools
packaging

View File

@ -1,11 +1,11 @@
#!/usr/bin/env python
from setuptools import Extension, setup, find_packages
import sys
import platform
from distutils.command.build_ext import build_ext
from distutils.sysconfig import get_python_inc
import distutils.util
from distutils import ccompiler, msvccompiler
from setuptools import Extension, setup, find_packages
import numpy
from pathlib import Path
import shutil
@ -23,7 +23,6 @@ Options.docstrings = True
PACKAGES = find_packages()
MOD_NAMES = [
"spacy.gold.align",
"spacy.gold.example",
"spacy.parts_of_speech",
"spacy.strings",

View File

@ -1,8 +1,7 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0a1"
__version__ = "3.0.0a2"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
__projects__ = "https://github.com/explosion/spacy-boilerplates"

View File

@ -15,8 +15,10 @@ from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
from .init_model import init_model # noqa: F401
from .validate import validate # noqa: F401
from .project import project_clone, project_assets, project_run # noqa: F401
from .project import project_run_all # noqa: F401
from .project.clone import project_clone # noqa: F401
from .project.assets import project_assets # noqa: F401
from .project.run import project_run # noqa: F401
from .project.dvc import project_update_dvc # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

View File

@ -8,9 +8,16 @@ HELP = """spaCy Command-line Interface
DOCS: https://spacy.io/api/cli
"""
PROJECT_HELP = f"""Command-line interface for spaCy projects and working with
project templates. You'd typically start by cloning a project template to a local
directory and fetching its assets like datasets etc. See the project's
project.yml for the available commands.
"""
app = typer.Typer(name=NAME, help=HELP)
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
app.add_typer(project_cli)
# Wrappers for Typer's annotations. Initially created to set defaults and to
# keep the names short, but not needed at the moment.

View File

@ -1,4 +1,4 @@
from typing import Optional, Sequence, Union
from typing import Optional, Sequence
import requests
import sys
from wasabi import msg
@ -8,6 +8,23 @@ from ._app import app, Arg, Opt
from .. import about
from ..util import is_package, get_base_version, run_command
# These are the old shortcuts we previously supported in spacy download. As of
# v3, shortcuts are deprecated so we're not expecting to add anything to this
# list. It only exists to show users warnings.
OLD_SHORTCUTS = {
"en": "en_core_web_sm",
"de": "de_core_news_sm",
"es": "es_core_news_sm",
"pt": "pt_core_news_sm",
"fr": "fr_core_news_sm",
"it": "it_core_news_sm",
"nl": "nl_core_news_sm",
"el": "el_core_news_sm",
"nb": "nb_core_news_sm",
"lt": "lt_core_news_sm",
"xx": "xx_ent_wiki_sm",
}
@app.command(
"download",
@ -48,8 +65,13 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
version = components[-1]
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
else:
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
model_name = shortcuts.get(model, model)
model_name = model
if model in OLD_SHORTCUTS:
msg.warn(
f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. "
f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead."
)
model_name = OLD_SHORTCUTS[model]
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
@ -59,23 +81,19 @@ def download(model: str, direct: bool = False, *pip_args) -> None:
)
def get_json(url: str, desc: str) -> Union[dict, list]:
r = requests.get(url)
def get_compatibility() -> dict:
version = get_base_version(about.__version__)
r = requests.get(about.__compatibility__)
if r.status_code != 200:
msg.fail(
f"Server error ({r.status_code})",
f"Couldn't fetch {desc}. Please find a model for your spaCy "
f"Couldn't fetch compatibility table. Please find a model for your spaCy "
f"installation (v{about.__version__}), and download it manually. "
f"For more details, see the documentation: "
f"https://spacy.io/usage/models",
exits=1,
)
return r.json()
def get_compatibility() -> dict:
version = get_base_version(about.__version__)
comp_table = get_json(about.__compatibility__, "compatibility table")
comp_table = r.json()
comp = comp_table["spacy"]
if version not in comp:
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)

View File

@ -1,708 +0,0 @@
from typing import List, Dict, Any, Optional, Sequence
import typer
import srsly
from pathlib import Path
from wasabi import msg
import subprocess
import os
import re
import shutil
import sys
import requests
import tqdm
from ._app import app, Arg, Opt, COMMAND, NAME
from .. import about
from ..schemas import ProjectConfigSchema, validate
from ..util import ensure_path, run_command, make_tempdir, working_dir
from ..util import get_hash, get_checksum, split_command
CONFIG_FILE = "project.yml"
DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
DIRS = [
"assets",
"metas",
"configs",
"packages",
"metrics",
"scripts",
"notebooks",
"training",
"corpus",
]
CACHES = [
Path.home() / ".torch",
Path.home() / ".caches" / "torch",
os.environ.get("TORCH_HOME"),
Path.home() / ".keras",
]
DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit
# it directly. Edit the project.yml instead and re-run the project."""
CLI_HELP = f"""Command-line interface for spaCy projects and working with project
templates. You'd typically start by cloning a project template to a local
directory and fetching its assets like datasets etc. See the project's
{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
Version Control) to manage input and output files and to ensure steps are only
re-run if their inputs change.
"""
project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True)
@project_cli.callback(invoke_without_command=True)
def callback(ctx: typer.Context):
"""This runs before every project command and ensures DVC is installed."""
ensure_dvc()
################
# CLI COMMANDS #
################
@project_cli.command("clone")
def project_clone_cli(
# fmt: off
name: str = Arg(..., help="The name of the template to fetch"),
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
# fmt: on
):
"""Clone a project template from a repository. Calls into "git" and will
only download the files from the given subdirectory. The GitHub repo
defaults to the official spaCy template repo, but can be customized
(including using a private repo). Setting the --git flag will also
initialize the project directory as a Git repo. If the project is intended
to be a Git repo, it should be initialized with Git first, before
initializing DVC (Data Version Control). This allows DVC to integrate with
Git.
"""
if dest == Path.cwd():
dest = dest / name
project_clone(name, dest, repo=repo, git=git, no_init=no_init)
@project_cli.command("init")
def project_init_cli(
# fmt: off
path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
force: bool = Opt(False, "--force", "-F", help="Force initialization"),
# fmt: on
):
"""Initialize a project directory with DVC and optionally Git. This should
typically be taken care of automatically when you run the "project clone"
command, but you can also run it separately. If the project is intended to
be a Git repo, it should be initialized with Git first, before initializing
DVC. This allows DVC to integrate with Git.
"""
project_init(path, git=git, force=force, silent=True)
@project_cli.command("assets")
def project_assets_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Use DVC (Data Version Control) to fetch project assets. Assets are
defined in the "assets" section of the project config. If possible, DVC
will try to track the files so you can pull changes from upstream. It will
also try and store the checksum so the assets are versioned. If the file
can't be tracked or checked, it will be downloaded without DVC. If a checksum
is provided in the project config, the file is only downloaded if no local
file with the same checksum exists.
"""
project_assets(project_dir)
@project_cli.command(
"run-all",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_all_cli(
# fmt: off
ctx: typer.Context,
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on
):
"""Run all commands defined in the project. This command will use DVC and
the defined outputs and dependencies in the project config to determine
which steps need to be re-run and where to start. This means you're only
re-generating data if the inputs have changed.
This command calls into "dvc repro" and all additional arguments are passed
to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
"""
if show_help:
print_run_help(project_dir)
else:
project_run_all(project_dir, *ctx.args)
@project_cli.command(
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_cli(
# fmt: off
ctx: typer.Context,
subcommand: str = Arg(None, help="Name of command defined in project config"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on
):
"""Run a named script defined in the project config. If the command is
part of the default pipeline defined in the "run" section, DVC is used to
determine whether the step should re-run if its inputs have changed, or
whether everything is up to date. If the script is not part of the default
pipeline, it will be called separately without DVC.
If DVC is used, the command calls into "dvc repro" and all additional
arguments are passed to the "dvc repro" command:
https://dvc.org/doc/command-reference/repro
"""
if show_help or not subcommand:
print_run_help(project_dir, subcommand)
else:
project_run(project_dir, subcommand, *ctx.args)
@project_cli.command("exec", hidden=True)
def project_exec_cli(
# fmt: off
subcommand: str = Arg(..., help="Name of command defined in project config"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Execute a command defined in the project config. This CLI command is
only called internally in auto-generated DVC pipelines, as a shortcut for
multi-step commands in the project config. You typically shouldn't have to
call it yourself. To run a command, call "run" or "run-all".
"""
project_exec(project_dir, subcommand)
@project_cli.command("update-dvc")
def project_update_dvc_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
"""Update the auto-generated DVC config file. Uses the steps defined in the
"run" section of the project config. This typically happens automatically
when running a command, but can also be triggered manually if needed.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
if updated:
msg.good(f"Updated DVC config from {CONFIG_FILE}")
else:
msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
app.add_typer(project_cli, name="project")
#################
# CLI FUNCTIONS #
#################
def project_clone(
name: str,
dest: Path,
*,
repo: str = about.__projects__,
git: bool = False,
no_init: bool = False,
) -> None:
"""Clone a project template from a repository.
name (str): Name of subdirectory to clone.
dest (Path): Destination path of cloned project.
repo (str): URL of Git repo containing project templates.
git (bool): Initialize project as Git repo. Should be set to True if project
is intended as a repo, since it will allow DVC to integrate with Git.
no_init (bool): Don't initialize DVC and Git automatically. If True, the
"init" command or "git init" and "dvc init" need to be run manually.
"""
dest = ensure_path(dest)
check_clone(name, dest, repo)
project_dir = dest.resolve()
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
try:
run_command(cmd)
except SystemExit:
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
msg.fail(err)
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
f.write(name)
try:
run_command(["git", "-C", str(tmp_dir), "fetch"])
run_command(["git", "-C", str(tmp_dir), "checkout"])
except SystemExit:
err = f"Could not clone '{name}' in the repo '{repo}'."
msg.fail(err)
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
for sub_dir in DIRS:
dir_path = project_dir / sub_dir
if not dir_path.exists():
dir_path.mkdir(parents=True)
if not no_init:
project_init(project_dir, git=git, force=True, silent=True)
msg.good(f"Your project is now ready!", dest)
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
def project_init(
project_dir: Path,
*,
git: bool = False,
force: bool = False,
silent: bool = False,
analytics: bool = False,
):
"""Initialize a project as a DVC and (optionally) as a Git repo.
project_dir (Path): Path to project directory.
git (bool): Also call "git init" to initialize directory as a Git repo.
silent (bool): Don't print any output (via DVC).
analytics (bool): Opt-in to DVC analytics (defaults to False).
"""
with working_dir(project_dir) as cwd:
if git:
run_command(["git", "init"])
init_cmd = ["dvc", "init"]
if silent:
init_cmd.append("--quiet")
if not git:
init_cmd.append("--no-scm")
if force:
init_cmd.append("--force")
run_command(init_cmd)
# We don't want to have analytics on by default; our users should
# opt in explicitly. If they want it, they can always enable it.
if not analytics:
run_command(["dvc", "config", "core.analytics", "false"])
# Remove unused and confusing plot templates from .dvc directory
# TODO: maybe we shouldn't do this, but it's otherwise super confusing
# once you commit your changes via Git and it creates a bunch of files
# that have no purpose
plots_dir = cwd / DVC_DIR / "plots"
if plots_dir.exists():
shutil.rmtree(str(plots_dir))
config = load_project_config(cwd)
setup_check_dvc(cwd, config)
def project_assets(project_dir: Path) -> None:
"""Fetch assets for a project using DVC if possible.
project_dir (Path): Path to project directory.
"""
project_path = ensure_path(project_dir)
config = load_project_config(project_path)
setup_check_dvc(project_path, config)
assets = config.get("assets", {})
if not assets:
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
msg.info(f"Fetching {len(assets)} asset(s)")
variables = config.get("variables", {})
fetched_assets = []
for asset in assets:
url = asset["url"].format(**variables)
dest = asset["dest"].format(**variables)
fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
if fetched_path:
fetched_assets.append(str(fetched_path))
if fetched_assets:
with working_dir(project_path):
run_command(["dvc", "add", *fetched_assets, "--external"])
def fetch_asset(
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> Optional[Path]:
"""Fetch an asset from a given URL or path. Will try to import the file
using DVC's import-url if possible (fully tracked and versioned) and falls
back to get-url (versioned) and a non-DVC download if necessary. If a
checksum is provided and a local file exists, it's only re-downloaded if the
checksum doesn't match.
project_path (Path): Path to project directory.
url (str): URL or path to asset.
checksum (Optional[str]): Optional expected checksum of local file.
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
the asset failed.
"""
url = convert_asset_url(url)
dest_path = (project_path / dest).resolve()
if dest_path.exists() and checksum:
# If there's already a file, check for checksum
# TODO: add support for caches (dvc import-url with local path)
if checksum == get_checksum(dest_path):
msg.good(f"Skipping download with matching checksum: {dest}")
return dest_path
with working_dir(project_path):
try:
# If these fail, we don't want to output an error or info message.
# Try with tracking the source first, then just downloading with
# DVC, then a regular non-DVC download.
try:
dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
except subprocess.CalledProcessError:
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
except subprocess.CalledProcessError:
try:
download_file(url, dest_path)
except requests.exceptions.HTTPError as e:
msg.fail(f"Download failed: {dest}", e)
return None
if checksum and checksum != get_checksum(dest_path):
msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
msg.good(f"Fetched asset {dest}")
return dest_path
def project_run_all(project_dir: Path, *dvc_args) -> None:
"""Run all commands defined in the project using DVC.
project_dir (Path): Path to project directory.
*dvc_args: Other arguments passed to "dvc repro".
"""
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
dvc_cmd = ["dvc", "repro", *dvc_args]
with working_dir(project_dir):
run_command(dvc_cmd)
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
"""Simulate a CLI help prompt using the info available in the project config.
project_dir (Path): The project directory.
subcommand (Optional[str]): The subcommand or None. If a subcommand is
provided, the subcommand help is shown. Otherwise, the top-level help
and a list of available commands is printed.
"""
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
config_commands = config.get("commands", [])
commands = {cmd["name"]: cmd for cmd in config_commands}
if subcommand:
validate_subcommand(commands.keys(), subcommand)
print(f"Usage: {COMMAND} project run {subcommand} {project_dir}")
help_text = commands[subcommand].get("help")
if help_text:
msg.text(f"\n{help_text}\n")
else:
print(f"\nAvailable commands in {CONFIG_FILE}")
print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
msg.text("Run all commands defined in the 'run' block of the project config:")
print(f"{COMMAND} project run-all {project_dir}")
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
"""Run a named script defined in the project config. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to
execute the command, so it can determine whether to rerun it. It then
calls into "exec" to execute it.
project_dir (Path): Path to project directory.
subcommand (str): Name of command to run.
*dvc_args: Other arguments passed to "dvc repro".
"""
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
config_commands = config.get("commands", [])
variables = config.get("variables", {})
commands = {cmd["name"]: cmd for cmd in config_commands}
validate_subcommand(commands.keys(), subcommand)
if subcommand in config.get("run", []):
# This is one of the pipeline commands tracked in DVC
dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
with working_dir(project_dir):
run_command(dvc_cmd)
else:
cmd = commands[subcommand]
# Deps in non-DVC commands aren't tracked, but if they're defined,
# make sure they exist before running the command
for dep in cmd.get("deps", []):
if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}"
msg.fail(err, exits=1)
with working_dir(project_dir):
run_commands(cmd["script"], variables)
def project_exec(project_dir: Path, subcommand: str):
"""Execute a command defined in the project config.
project_dir (Path): Path to project directory.
subcommand (str): Name of command to run.
"""
config = load_project_config(project_dir)
config_commands = config.get("commands", [])
variables = config.get("variables", {})
commands = {cmd["name"]: cmd for cmd in config_commands}
with working_dir(project_dir):
run_commands(commands[subcommand]["script"], variables)
###########
# HELPERS #
###########
def load_project_config(path: Path) -> Dict[str, Any]:
"""Load the project config file from a directory and validate it.
path (Path): The path to the project directory.
RETURNS (Dict[str, Any]): The loaded project config.
"""
config_path = path / CONFIG_FILE
if not config_path.exists():
msg.fail("Can't find project config", config_path, exits=1)
invalid_err = f"Invalid project config in {CONFIG_FILE}"
try:
config = srsly.read_yaml(config_path)
except ValueError as e:
msg.fail(invalid_err, e, exits=1)
errors = validate(ProjectConfigSchema, config)
if errors:
msg.fail(invalid_err, "\n".join(errors), exits=1)
return config
def update_dvc_config(
path: Path,
config: Dict[str, Any],
verbose: bool = False,
silent: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
project directory. The file is auto-generated based on the config. The
first line of the auto-generated file specifies the hash of the config
dict, so if any of the config values change, the DVC config is regenerated.
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project config.
verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
config_hash = get_hash(config)
path = path.resolve()
dvc_config_path = path / DVC_CONFIG
if dvc_config_path.exists():
# Check if the file was generated using the current config, if not, redo
with dvc_config_path.open("r", encoding="utf8") as f:
ref_hash = f.readline().strip().replace("# ", "")
if ref_hash == config_hash and not force:
return False # Nothing has changed in project config, don't need to update
dvc_config_path.unlink()
variables = config.get("variables", {})
commands = []
# We only want to include commands that are part of the main list of "run"
# commands in project.yml and should be run in sequence
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
for name in config.get("run", []):
validate_subcommand(config_commands.keys(), name)
command = config_commands[name]
deps = command.get("deps", [])
outputs = command.get("outputs", [])
outputs_no_cache = command.get("outputs_no_cache", [])
if not deps and not outputs and not outputs_no_cache:
continue
# Default to the working dir as the project path since dvc.yaml is auto-generated
# and we don't want arbitrary paths in there
project_cmd = ["python", "-m", NAME, "project", "exec", name]
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
if verbose:
dvc_cmd.append("--verbose")
if silent:
dvc_cmd.append("--quiet")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
commands.append(" ".join(full_cmd))
with working_dir(path):
run_commands(commands, variables, silent=True)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
return True
def ensure_dvc() -> None:
"""Ensure that the "dvc" command is available and show an error if not."""
try:
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
"spaCy projects require DVC (Data Version Control) and the 'dvc' command",
"You can install the Python package from pip (pip install dvc) or "
"conda (conda install -c conda-forge dvc). For more details, see the "
"documentation: https://dvc.org/doc/install",
exits=1,
)
def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
"""Check that the project is set up correctly with DVC and update its
config if needed. Will raise an error if the project is not an initialized
DVC project.
project_dir (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project config.
"""
if not project_dir.exists():
msg.fail(f"Can't find project directory: {project_dir}")
if not (project_dir / ".dvc").exists():
msg.fail(
"Project not initialized as a DVC project.",
f"Make sure that the project template was cloned correctly. To "
f"initialize the project directory manually, you can run: "
f"{COMMAND} project init {project_dir}",
exits=1,
)
with msg.loading("Updating DVC config..."):
updated = update_dvc_config(project_dir, config, silent=True)
if updated:
msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
def run_commands(
commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
) -> None:
"""Run a sequence of commands in a subprocess, in order.
commands (List[str]): The string commands.
variables (Dict[str, str]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
silent (bool): Don't print the commands.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
command = split_command(command)
# Not sure if this is needed or a good idea. Motivation: users may often
# use commands in their config that reference "python" and we want to
# make sure that it's always executing the same Python that spaCy is
# executed with and the pip in the same env, not some other Python/pip.
# Also ensures cross-compatibility if user 1 writes "python3" (because
# that's how it's set up on their system), and user 2 without the
# shortcut tries to re-run the command.
if len(command) and command[0] in ("python", "python3"):
command[0] = sys.executable
elif len(command) and command[0] in ("pip", "pip3"):
command = [sys.executable, "-m", "pip", *command[1:]]
if not silent:
print(f"Running command: {' '.join(command)}")
run_command(command)
def convert_asset_url(url: str) -> str:
"""Check and convert the asset URL if needed.
url (str): The asset URL.
RETURNS (str): The converted URL.
"""
# If the asset URL is a regular GitHub URL it's likely a mistake
if re.match("(http(s?)):\/\/github.com", url):
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn(
"Downloading from a regular GitHub URL. This will only download "
"the source of the page, not the actual file. Converting the URL "
"to a raw URL.",
converted,
)
return converted
return url
def check_clone(name: str, dest: Path, repo: str) -> None:
"""Check and validate that the destination path can be used to clone. Will
check that Git is available and that the destination path is suitable.
name (str): Name of the directory to clone from the repo.
dest (Path): Local destination of cloned directory.
repo (str): URL of the repo to clone from.
"""
try:
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
f"Cloning spaCy project templates requires Git and the 'git' command. ",
f"To clone a project without Git, copy the files from the '{name}' "
f"directory in the {repo} to {dest} manually and then run:",
f"{COMMAND} project init {dest}",
exits=1,
)
if not dest:
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
if dest.exists():
# Directory already exists (not allowed, clone needs to create it)
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
if not dest.parent.exists():
# We're not creating parents, parent dir should exist
msg.fail(
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
exits=1,
)
def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
"""Check that a subcommand is valid and defined. Raises an error otherwise.
commands (Sequence[str]): The available commands.
subcommand (str): The subcommand.
"""
if subcommand not in commands:
msg.fail(
f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
f"Available commands: {', '.join(commands)}",
exits=1,
)
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
"""Download a file using requests.
url (str): The URL of the file.
dest (Path): The destination path.
chunk_size (int): The size of chunks to read/write.
"""
response = requests.get(url, stream=True)
response.raise_for_status()
total = int(response.headers.get("content-length", 0))
progress_settings = {
"total": total,
"unit": "iB",
"unit_scale": True,
"unit_divisor": chunk_size,
"leave": False,
}
with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
for data in response.iter_content(chunk_size=chunk_size):
size = f.write(data)
bar.update(size)

View File

spacy/cli/project/assets.py (new file, 154 lines)

@ -0,0 +1,154 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import requests
import tqdm
import re
import shutil
from ...util import ensure_path, get_checksum, working_dir
from .._app import project_cli, Arg
from .util import PROJECT_FILE, load_project_config
# TODO: find a solution for caches
# CACHES = [
# Path.home() / ".torch",
# Path.home() / ".caches" / "torch",
# os.environ.get("TORCH_HOME"),
# Path.home() / ".keras",
# ]
@project_cli.command("assets")
def project_assets_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Fetch project assets like datasets and pretrained weights. Assets are
defined in the "assets" section of the project.yml. If a checksum is
provided in the project.yml, the file is only downloaded if no local file
with the same checksum exists.
"""
project_assets(project_dir)
def project_assets(project_dir: Path) -> None:
"""Fetch assets for a project using DVC if possible.
project_dir (Path): Path to project directory.
"""
project_path = ensure_path(project_dir)
config = load_project_config(project_path)
assets = config.get("assets", {})
if not assets:
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
msg.info(f"Fetching {len(assets)} asset(s)")
variables = config.get("variables", {})
for asset in assets:
dest = asset["dest"].format(**variables)
url = asset.get("url")
checksum = asset.get("checksum")
if not url:
# project.yml defines asset without URL that the user has to place
check_private_asset(dest, checksum)
continue
url = url.format(**variables)
fetch_asset(project_path, url, dest, checksum)
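To illustrate the fields this loop reads, here is a small, hypothetical example of the assets-related parts of a loaded project.yml (keys taken from the code above; URLs, paths and checksums are made up):
example_assets_config = {
    "variables": {"version": "0.1"},
    "assets": [
        {
            # Has a URL, so it's downloaded and the checksum is verified afterwards.
            "dest": "assets/training-{version}.json",
            "url": "https://example.com/training-{version}.json",
            "checksum": "63373dd656daa1fd3043ce166a59474c",
        },
        {
            # No URL: a private asset the user has to place at "dest" themselves.
            "dest": "assets/private.json",
            "checksum": "5113dc04e03f079525edd8df3f4f39e3",
        },
    ],
}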
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
"""Check and validate assets without a URL (private assets that the user
has to provide themselves) and give feedback about the checksum.
dest (Path): Destination path of the asset.
checksum (Optional[str]): Optional checksum of the expected file.
"""
if not Path(dest).exists():
err = f"No URL provided for asset. You need to add this file yourself: {dest}"
msg.warn(err)
else:
if checksum and checksum == get_checksum(dest):
msg.good(f"Asset exists with matching checksum: {dest}")
else:
msg.fail(f"Asset available but with incorrect checksum: {dest}")
def fetch_asset(
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> Optional[Path]:
"""Fetch an asset from a given URL or path. If a checksum is provided and a
local file exists, it's only re-downloaded if the checksum doesn't match.
project_path (Path): Path to project directory.
url (str): URL or path to asset.
checksum (Optional[str]): Optional expected checksum of local file.
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
the asset failed.
"""
# TODO: add support for caches
dest_path = (project_path / dest).resolve()
if dest_path.exists() and checksum:
# If there's already a file, check for checksum
if checksum == get_checksum(dest_path):
msg.good(f"Skipping download with matching checksum: {dest}")
return dest_path
with working_dir(project_path):
url = convert_asset_url(url)
try:
download_file(url, dest_path)
msg.good(f"Downloaded asset {dest}")
except requests.exceptions.RequestException as e:
if Path(url).exists() and Path(url).is_file():
# If it's a local file, copy to destination
shutil.copy(url, str(dest_path))
msg.good(f"Copied local asset {dest}")
else:
msg.fail(f"Download failed: {dest}", e)
return
if checksum and checksum != get_checksum(dest_path):
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
def convert_asset_url(url: str) -> str:
"""Check and convert the asset URL if needed.
url (str): The asset URL.
RETURNS (str): The converted URL.
"""
# If the asset URL is a regular GitHub URL it's likely a mistake
if re.match(r"(http(s?)):\/\/github.com", url):
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn(
"Downloading from a regular GitHub URL. This will only download "
"the source of the page, not the actual file. Converting the URL "
"to a raw URL.",
converted,
)
return converted
return url
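For example (illustrative URL only), a regular GitHub "blob" link would be rewritten like this:
# convert_asset_url("https://github.com/explosion/projects/blob/master/data/train.json")
# -> "https://raw.githubusercontent.com/explosion/projects/master/data/train.json"
# (github.com becomes raw.githubusercontent.com and the "/blob/" segment is dropped)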
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
"""Download a file using requests.
url (str): The URL of the file.
dest (Path): The destination path.
chunk_size (int): The size of chunks to read/write.
"""
response = requests.get(url, stream=True)
response.raise_for_status()
total = int(response.headers.get("content-length", 0))
progress_settings = {
"total": total,
"unit": "iB",
"unit_scale": True,
"unit_divisor": chunk_size,
"leave": False,
}
with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
for data in response.iter_content(chunk_size=chunk_size):
size = f.write(data)
bar.update(size)

spacy/cli/project/clone.py (new file, 110 lines)

@ -0,0 +1,110 @@
from pathlib import Path
from wasabi import msg
import subprocess
import shutil
from ... import about
from ...util import ensure_path, run_command, make_tempdir
from .._app import project_cli, Arg, Opt, COMMAND
DIRS = [
"assets",
"metas",
"configs",
"packages",
"metrics",
"scripts",
"notebooks",
"training",
"corpus",
]
@project_cli.command("clone")
def project_clone_cli(
# fmt: off
name: str = Arg(..., help="The name of the template to fetch"),
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
# fmt: on
):
"""Clone a project template from a repository. Calls into "git" and will
only download the files from the given subdirectory. The GitHub repo
defaults to the official spaCy template repo, but can be customized
(including using a private repo).
"""
if dest == Path.cwd():
dest = dest / name
project_clone(name, dest, repo=repo)
def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
"""Clone a project template from a repository.
name (str): Name of subdirectory to clone.
dest (Path): Destination path of cloned project.
repo (str): URL of Git repo containing project templates.
"""
dest = ensure_path(dest)
check_clone(name, dest, repo)
project_dir = dest.resolve()
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
try:
run_command(cmd)
except subprocess.CalledProcessError:
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
msg.fail(err)
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
f.write(name)
try:
run_command(["git", "-C", str(tmp_dir), "fetch"])
run_command(["git", "-C", str(tmp_dir), "checkout"])
except subprocess.CalledProcessError:
err = f"Could not clone '{name}' in the repo '{repo}'."
msg.fail(err)
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
for sub_dir in DIRS:
dir_path = project_dir / sub_dir
if not dir_path.exists():
dir_path.mkdir(parents=True)
msg.good(f"Your project is now ready!", dest)
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
def check_clone(name: str, dest: Path, repo: str) -> None:
"""Check and validate that the destination path can be used to clone. Will
check that Git is available and that the destination path is suitable.
name (str): Name of the directory to clone from the repo.
dest (Path): Local destination of cloned directory.
repo (str): URL of the repo to clone from.
"""
try:
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
f"Cloning spaCy project templates requires Git and the 'git' command. ",
f"To clone a project without Git, copy the files from the '{name}' "
f"directory in the {repo} to {dest} manually and then run:",
f"{COMMAND} project init {dest}",
exits=1,
)
if not dest:
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
if dest.exists():
# Directory already exists (not allowed, clone needs to create it)
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
if not dest.parent.exists():
# We're not creating parents, parent dir should exist
msg.fail(
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
exits=1,
)

spacy/cli/project/dvc.py (new file, 206 lines)

@ -0,0 +1,206 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Control (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional
import subprocess
from pathlib import Path
from wasabi import msg
from .util import PROJECT_FILE, load_project_config
from .._app import project_cli, Arg, Opt, NAME, COMMAND
from ...util import get_hash, working_dir, split_command, join_command, run_command
DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
UPDATE_COMMAND = "dvc"
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} project {UPDATE_COMMAND}"""
@project_cli.command(UPDATE_COMMAND)
def project_update_dvc_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
"""Auto-generate Data Version Control (DVC) config. A DVC
project can only define one pipeline, so you need to specify one workflow
defined in the project.yml. If no workflow is specified, the first defined
workflow is used. The DVC config will only be updated if the project.yml changed.
"""
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
def project_update_dvc(
project_dir: Path,
workflow: Optional[str] = None,
*,
verbose: bool = False,
force: bool = False,
) -> None:
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
project can only define one pipeline, so you need to specify one workflow
defined in the project.yml. Will only update the file if the checksum changed.
project_dir (Path): The project directory.
workflow (Optional[str]): Optional name of workflow defined in project.yml.
If not set, the first workflow will be used.
verbose (bool): Print more info.
force (bool): Force update DVC config.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(
project_dir, config, workflow, verbose=verbose, force=force
)
help_msg = "To execute the workflow with DVC, run: dvc repro"
if updated:
msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
else:
msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
def update_dvc_config(
path: Path,
config: Dict[str, Any],
workflow: Optional[str] = None,
verbose: bool = False,
silent: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
project directory. The file is auto-generated based on the config. The
first line of the auto-generated file specifies the hash of the config
dict, so if any of the config values change, the DVC config is regenerated.
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project.yml.
workflow (Optional[str]): Optional name of workflow defined in project.yml. If not set, the first workflow is used.
verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
ensure_dvc(path)
workflows = config.get("workflows", {})
workflow_names = list(workflows.keys())
check_workflows(workflow_names, workflow)
if not workflow:
workflow = workflow_names[0]
config_hash = get_hash(config)
path = path.resolve()
dvc_config_path = path / DVC_CONFIG
if dvc_config_path.exists():
# Check if the file was generated using the current config, if not, redo
with dvc_config_path.open("r", encoding="utf8") as f:
ref_hash = f.readline().strip().replace("# ", "")
if ref_hash == config_hash and not force:
return False # Nothing has changed in project.yml, don't need to update
dvc_config_path.unlink()
variables = config.get("variables", {})
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
for name in workflows[workflow]:
command = config_commands[name]
deps = command.get("deps", [])
outputs = command.get("outputs", [])
outputs_no_cache = command.get("outputs_no_cache", [])
if not deps and not outputs and not outputs_no_cache:
continue
# Default to the working dir as the project path since dvc.yaml is auto-generated
# and we don't want arbitrary paths in there
project_cmd = ["python", "-m", NAME, "project", "run", name]
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
dvc_commands.append(join_command(full_cmd))
with working_dir(path):
dvc_flags = {"--verbose": verbose, "--quiet": silent}
run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
return True
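To make the generated pipeline concrete: for a hypothetical command "train" with deps ["corpus/train.json"] and outputs ["training/model-best"], and assuming NAME resolves to "spacy", the command string built above (before run_dvc_commands prepends "dvc" and appends any flags) would be roughly:
# run -n train -w /path/to/project --no-exec -d corpus/train.json -o training/model-best python -m spacy project run train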
def run_dvc_commands(
commands: List[str] = tuple(),
variables: Dict[str, str] = {},
flags: Dict[str, bool] = {},
) -> None:
"""Run a sequence of DVC commands in a subprocess, in order.
commands (List[str]): The string commands without the leading "dvc".
variables (Dict[str, str]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
easier to pass flags like --quiet that depend on a variable or
command-line setting while avoiding lots of nested conditionals.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
command = split_command(command)
dvc_command = ["dvc", *command]
# Add the flags if they are set to True
for flag, is_active in flags.items():
if is_active:
dvc_command.append(flag)
run_command(dvc_command)
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
"""Validate workflows provided in project.yml and check that a given
workflow can be used to generate a DVC config.
workflows (List[str]): Names of the available workflows.
workflow (Optional[str]): The name of the workflow to convert.
"""
if not workflows:
msg.fail(
f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
f"define at least one list of commands.",
exits=1,
)
if workflow is not None and workflow not in workflows:
msg.fail(
f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
f"Available workflows: {', '.join(workflows)}",
exits=1,
)
if not workflow:
msg.warn(
f"No workflow specified for DVC pipeline. Using the first workflow "
f"defined in {PROJECT_FILE}: '{workflows[0]}'"
)
def ensure_dvc(project_dir: Path) -> None:
"""Ensure that the "dvc" command is available and that the current project
directory is an initialized DVC project.
"""
try:
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
"To use spaCy projects with DVC (Data Version Control), DVC needs "
"to be installed and the 'dvc' command needs to be available",
"You can install the Python package from pip (pip install dvc) or "
"conda (conda install -c conda-forge dvc). For more details, see the "
"documentation: https://dvc.org/doc/install",
exits=1,
)
if not (project_dir / ".dvc").exists():
msg.fail(
"Project not initialized as a DVC project",
"To initialize a DVC project, you can run 'dvc init' in the project "
"directory. For more details, see the documentation: "
"https://dvc.org/doc/command-reference/init",
exits=1,
)

spacy/cli/project/run.py (new file, 250 lines)

@ -0,0 +1,250 @@
from typing import Optional, List, Dict, Sequence, Any
from pathlib import Path
from wasabi import msg
import typer
import sys
import srsly
from ...util import working_dir, run_command, split_command, is_cwd, get_checksum
from ...util import get_hash, join_command
from .._app import project_cli, Arg, Opt, COMMAND
from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config
@project_cli.command(
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_cli(
# fmt: off
ctx: typer.Context,
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute commands"),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on
):
"""Run a named script or workflow defined in the project.yml. If a workflow
name is specified, all commands in the workflow are run, in order. If
commands define inputs and/or outputs, they will only be re-run if state
has changed.
"""
if show_help or not subcommand:
print_run_help(project_dir, subcommand)
else:
project_run(project_dir, subcommand, *ctx.args, force=force, dry=dry)
def project_run(
project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False
) -> None:
"""Run a named script defined in the project.yml. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to
execute the command, so it can determine whether to rerun it. It then
calls into "exec" to execute it.
project_dir (Path): Path to project directory.
subcommand (str): Name of command to run.
force (bool): Force re-running, even if nothing changed.
dry (bool): Perform a dry run and don't execute commands.
"""
config = load_project_config(project_dir)
variables = config.get("variables", {})
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
workflows = config.get("workflows", {})
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
if subcommand in workflows:
msg.info(f"Running workflow '{subcommand}'")
for cmd in workflows[subcommand]:
project_run(project_dir, cmd, force=force, dry=dry)
else:
cmd = commands[subcommand]
variables = config.get("variables", {})
for dep in cmd.get("deps", []):
dep = dep.format(**variables)
if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}"
err_kwargs = {"exits": 1} if not dry else {}
msg.fail(err, **err_kwargs)
with working_dir(project_dir) as current_dir:
rerun = check_rerun(current_dir, cmd, variables)
if not rerun and not force:
msg.info(f"Skipping '{cmd['name']}': nothing changed")
else:
msg.divider(subcommand)
run_commands(cmd["script"], variables, dry=dry)
update_lockfile(current_dir, cmd, variables)
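As an illustration of the commands/workflows split handled above, a hypothetical loaded project.yml where running the "all" workflow runs "preprocess" and then "train" (names, scripts and paths are made up):
example_run_config = {
    "commands": [
        {"name": "preprocess", "script": ["python scripts/preprocess.py"], "outputs": ["corpus/train.json"]},
        {"name": "train", "script": ["python scripts/train.py"], "deps": ["corpus/train.json"]},
    ],
    "workflows": {"all": ["preprocess", "train"]},
}
# project_run(project_dir, "all") recurses into project_run(..., "preprocess") and
# then project_run(..., "train"), skipping any step whose lockfile entry is unchanged.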
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
"""Simulate a CLI help prompt using the info available in the project.yml.
project_dir (Path): The project directory.
subcommand (Optional[str]): The subcommand or None. If a subcommand is
provided, the subcommand help is shown. Otherwise, the top-level help
and a list of available commands is printed.
"""
config = load_project_config(project_dir)
config_commands = config.get("commands", [])
commands = {cmd["name"]: cmd for cmd in config_commands}
project_loc = "" if is_cwd(project_dir) else project_dir
if subcommand:
validate_subcommand(commands.keys(), config.get("workflows", {}).keys(), subcommand)
print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
help_text = commands[subcommand].get("help")
if help_text:
msg.text(f"\n{help_text}\n")
else:
print(f"\nAvailable commands in {PROJECT_FILE}")
print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:")
print(f"{COMMAND} project run {project_loc}")
def run_commands(
commands: List[str] = tuple(),
variables: Dict[str, Any] = {},
silent: bool = False,
dry: bool = False,
) -> None:
"""Run a sequence of commands in a subprocess, in order.
commands (List[str]): The string commands.
variables (Dict[str, Any]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
silent (bool): Don't print the commands.
dry (bool): Perform a dry run and don't execute anything.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
command = split_command(command)
# Not sure if this is needed or a good idea. Motivation: users may often
# use commands in their config that reference "python" and we want to
# make sure that it's always executing the same Python that spaCy is
# executed with and the pip in the same env, not some other Python/pip.
# Also ensures cross-compatibility if user 1 writes "python3" (because
# that's how it's set up on their system), and user 2 without the
# shortcut tries to re-run the command.
if len(command) and command[0] in ("python", "python3"):
command[0] = sys.executable
elif len(command) and command[0] in ("pip", "pip3"):
command = [sys.executable, "-m", "pip", *command[1:]]
if not silent:
print(f"Running command: {join_command(command)}")
if not dry:
run_command(command)
def validate_subcommand(
commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
"""Check that a subcommand is valid and defined. Raises an error otherwise.
commands (Sequence[str]): The available commands.
workflows (Sequence[str]): The available workflows.
subcommand (str): The subcommand.
"""
if not commands and not workflows:
msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
if subcommand not in commands and subcommand not in workflows:
help_msg = []
if commands:
help_msg.append(f"Available commands: {', '.join(commands)}")
if workflows:
help_msg.append(f"Available workflows: {', '.join(workflows)}")
msg.fail(
f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
". ".join(help_msg),
exits=1,
)
def check_rerun(
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> bool:
"""Check if a command should be rerun because its settings or inputs/outputs
changed.
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
variables (Dict[str, Any]): The variables defined in the project.yml.
RETURNS (bool): Whether to re-run the command.
"""
lock_path = project_dir / PROJECT_LOCK
if not lock_path.exists(): # We don't have a lockfile, run command
return True
data = srsly.read_yaml(lock_path)
if command["name"] not in data: # We don't have info about this command
return True
entry = data[command["name"]]
# If the entry in the lockfile matches the lockfile entry that would be
# generated from the current command, we don't rerun because it means that
# all inputs/outputs, hashes and scripts are the same and nothing changed
return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry)
def update_lockfile(
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> None:
"""Update the lockfile after running a command. Will create a lockfile if
it doesn't yet exist and will add an entry for the current command, its
script and dependencies/outputs.
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
variables (Dict[str, Any]): The variables defined in the project.yml.
"""
lock_path = project_dir / PROJECT_LOCK
if not lock_path.exists():
srsly.write_yaml(lock_path, {})
data = {}
else:
data = srsly.read_yaml(lock_path)
data[command["name"]] = get_lock_entry(project_dir, command, variables)
srsly.write_yaml(lock_path, data)
def get_lock_entry(
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
) -> Dict[str, Any]:
"""Get a lockfile entry for a given command. An entry includes the command,
the script (command steps) and a list of dependencies and outputs with
their paths and file hashes, if available. The format is based on the
dvc.lock files, to keep things consistent.
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
variables (Dict[str, Any]): The variables defined in the project.yml.
RETURNS (Dict[str, Any]): The lockfile entry.
"""
deps = get_fileinfo(project_dir, command.get("deps", []), variables)
outs = get_fileinfo(project_dir, command.get("outputs", []), variables)
outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables)
return {
"cmd": f"{COMMAND} run {command['name']}",
"script": command["script"],
"deps": deps,
"outs": [*outs, *outs_nc],
}
def get_fileinfo(
project_dir: Path, paths: List[str], variables: Dict[str, Any]
) -> List[Dict[str, str]]:
"""Generate the file information for a list of paths (dependencies, outputs).
Includes the file path and the file's checksum.
project_dir (Path): The current project directory.
paths (List[str]): The file paths.
variables (Dict[str, Any]): The variables defined in the project.yml.
RETURNS (List[Dict[str, str]]): The file information for each path, including its checksum.
"""
data = []
for path in paths:
path = path.format(**variables)
file_path = project_dir / path
md5 = get_checksum(file_path) if file_path.exists() else None
data.append({"path": path, "md5": md5})
return data
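The resulting lock entry for the hypothetical "preprocess" command above would roughly take this shape (paths, hashes and the exact cmd prefix are illustrative, following the dvc.lock-style format described in get_lock_entry):

{
    "cmd": "python -m spacy run preprocess",  # f"{COMMAND} run {name}"; the value of COMMAND is assumed here
    "script": ["python scripts/preprocess.py assets/data.json corpus/data.spacy"],
    "deps": [{"path": "assets/data.json", "md5": "d41d8cd98f00b204e9800998ecf8427e"}],
    "outs": [{"path": "corpus/data.spacy", "md5": None}],
}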

57
spacy/cli/project/util.py Normal file
View File

@ -0,0 +1,57 @@
from typing import Dict, Any
from pathlib import Path
from wasabi import msg
import srsly
from ...schemas import ProjectConfigSchema, validate
PROJECT_FILE = "project.yml"
PROJECT_LOCK = "project.lock"
def load_project_config(path: Path) -> Dict[str, Any]:
"""Load the project.yml file from a directory and validate it.
path (Path): The path to the project directory.
RETURNS (Dict[str, Any]): The loaded project.yml.
"""
config_path = path / PROJECT_FILE
if not config_path.exists():
msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
try:
config = srsly.read_yaml(config_path)
except ValueError as e:
msg.fail(invalid_err, e, exits=1)
errors = validate(ProjectConfigSchema, config)
if errors:
msg.fail(invalid_err, "\n".join(errors), exits=1)
validate_project_commands(config)
return config
def validate_project_commands(config: Dict[str, Any]) -> None:
"""Check that project commands and workflows are valid, don't contain
duplicates, don't clash and only refer to commands that exist.
config (Dict[str, Any]): The loaded config.
"""
command_names = [cmd["name"] for cmd in config.get("commands", [])]
workflows = config.get("workflows", {})
duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
if duplicates:
err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
msg.fail(err, exits=1)
for workflow_name, workflow_steps in workflows.items():
if workflow_name in command_names:
err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
msg.fail(err, exits=1)
for step in workflow_steps:
if step not in command_names:
msg.fail(
f"Unknown command specified in workflow '{workflow_name}': {step}",
f"Workflows can only refer to commands defined in the 'commands' "
f"section of the {PROJECT_FILE}.",
exits=1,
)
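A quick sketch of what the checks above accept and reject, using hypothetical in-memory configs:

# Passes: no duplicate commands, no workflow name clashing with a command,
# and every workflow step refers to a defined command.
good_config = {
    "commands": [{"name": "preprocess"}, {"name": "train"}],
    "workflows": {"all": ["preprocess", "train"]},
}
validate_project_commands(good_config)

# Would exit with an error: the workflow name shadows an existing command.
bad_config = {
    "commands": [{"name": "train"}],
    "workflows": {"train": ["train"]},
}
# validate_project_commands(bad_config)  # -> msg.fail(..., exits=1)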

View File

@ -203,7 +203,8 @@ def train(
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
train_examples = list(
corpus.train_dataset(
nlp, shuffle=False, gold_preproc=training["gold_preproc"]
nlp, shuffle=False, gold_preproc=training["gold_preproc"],
max_length=training["max_length"]
)
)
nlp.begin_training(lambda: train_examples)
@ -306,11 +307,18 @@ def create_train_batches(nlp, corpus, cfg):
if len(train_examples) == 0:
raise ValueError(Errors.E988)
epoch += 1
batches = util.minibatch_by_words(
train_examples,
size=cfg["batch_size"],
discard_oversize=cfg["discard_oversize"],
)
if cfg.get("batch_by_words", True):
batches = util.minibatch_by_words(
train_examples,
size=cfg["batch_size"],
discard_oversize=cfg["discard_oversize"],
)
else:
batches = util.minibatch(
train_examples,
size=cfg["batch_size"],
)
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
try:
first = next(batches)

View File

@ -477,15 +477,14 @@ class Errors(object):
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
# TODO: fix numbering after merging develop into master
E969 = ("Expected string values for field '{field}', but received {types} instead. ")
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
"array and {doc_length} for the Doc itself.")
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
E973 = ("Unexpected type for NER data")
E974 = ("Unknown {obj} attribute: {key}")
E975 = ("The method 'Example.from_dict' expects a Doc as first argument, "
"but got {type}")
E976 = ("The method 'Example.from_dict' expects a dict as second argument, "
E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
"but received None.")
E977 = ("Can not compare a MorphAnalysis with a string object. "
"This is likely a bug in spaCy, so feel free to open an issue.")

View File

@ -1,6 +1,6 @@
from .corpus import Corpus
from .example import Example
from .align import align
from .align import Alignment
from .iob_utils import iob_to_biluo, biluo_to_iob
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags

View File

@ -1,8 +0,0 @@
cdef class Alignment:
cdef public object cost
cdef public object i2j
cdef public object j2i
cdef public object i2j_multi
cdef public object j2i_multi
cdef public object cand_to_gold
cdef public object gold_to_cand

30
spacy/gold/align.py Normal file
View File

@ -0,0 +1,30 @@
from typing import List
import numpy
from thinc.types import Ragged
from dataclasses import dataclass
import tokenizations
@dataclass
class Alignment:
x2y: Ragged
y2x: Ragged
@classmethod
def from_indices(cls, x2y: List[List[int]], y2x: List[List[int]]) -> "Alignment":
x2y = _make_ragged(x2y)
y2x = _make_ragged(y2x)
return Alignment(x2y=x2y, y2x=y2x)
@classmethod
def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
x2y, y2x = tokenizations.get_alignments(A, B)
return Alignment.from_indices(x2y=x2y, y2x=y2x)
def _make_ragged(indices):
lengths = numpy.array([len(x) for x in indices], dtype="i")
flat = []
for x in indices:
flat.extend(x)
return Ragged(numpy.array(flat, dtype="i"), lengths)
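A quick sketch of how the new Alignment is used (token lists are made up; the exact lengths depend on pytokenizations' character-level alignment):

from spacy.gold import Alignment

spacy_toks = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
other_toks = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
align = Alignment.from_strings(spacy_toks, other_toks)
# "'s" covers "'" and "s", so x2y.lengths is expected to be [1, 1, 1, 1, 2, 1, 1].
print(align.x2y.lengths.tolist())
# Flat target indices, read the same way example.pyx reads them below.
print(align.x2y.data.ravel().tolist())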

View File

@ -1,101 +0,0 @@
import numpy
from ..errors import Errors, AlignmentError
cdef class Alignment:
def __init__(self, spacy_words, gold_words):
# Do many-to-one alignment for misaligned tokens.
# If we over-segment, we'll have one gold word that covers a sequence
# of predicted words
# If we under-segment, we'll have one predicted word that covers a
# sequence of gold words.
# If we "mis-segment", we'll have a sequence of predicted words covering
# a sequence of gold words. That's many-to-many -- we don't do that
# except for NER spans where the start and end can be aligned.
cost, i2j, j2i, i2j_multi, j2i_multi = align(spacy_words, gold_words)
self.cost = cost
self.i2j = i2j
self.j2i = j2i
self.i2j_multi = i2j_multi
self.j2i_multi = j2i_multi
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
def align(tokens_a, tokens_b):
"""Calculate alignment tables between two tokenizations.
tokens_a (List[str]): The candidate tokenization.
tokens_b (List[str]): The reference tokenization.
RETURNS: (tuple): A 5-tuple consisting of the following information:
* cost (int): The number of misaligned tokens.
* a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
to `tokens_b[6]`. If there's no one-to-one alignment for a token,
it has the value -1.
* b2a (List[int]): The same as `a2b`, but mapping the other direction.
* a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
the same token of `tokens_b`.
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
direction.
"""
tokens_a = _normalize_for_alignment(tokens_a)
tokens_b = _normalize_for_alignment(tokens_b)
cost = 0
a2b = numpy.empty(len(tokens_a), dtype="i")
b2a = numpy.empty(len(tokens_b), dtype="i")
a2b.fill(-1)
b2a.fill(-1)
a2b_multi = {}
b2a_multi = {}
i = 0
j = 0
offset_a = 0
offset_b = 0
while i < len(tokens_a) and j < len(tokens_b):
a = tokens_a[i][offset_a:]
b = tokens_b[j][offset_b:]
if a == b:
if offset_a == offset_b == 0:
a2b[i] = j
b2a[j] = i
elif offset_a == 0:
cost += 2
a2b_multi[i] = j
elif offset_b == 0:
cost += 2
b2a_multi[j] = i
offset_a = offset_b = 0
i += 1
j += 1
elif a == "":
assert offset_a == 0
cost += 1
i += 1
elif b == "":
assert offset_b == 0
cost += 1
j += 1
elif b.startswith(a):
cost += 1
if offset_a == 0:
a2b_multi[i] = j
i += 1
offset_a = 0
offset_b += len(a)
elif a.startswith(b):
cost += 1
if offset_b == 0:
b2a_multi[j] = i
j += 1
offset_b = 0
offset_a += len(b)
else:
assert "".join(tokens_a) != "".join(tokens_b)
raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
return cost, a2b, b2a, a2b_multi, b2a_multi
def _normalize_for_alignment(tokens):
return [w.replace(" ", "").lower() for w in tokens]

View File

@ -1,8 +1,7 @@
from ..tokens.doc cimport Doc
from .align cimport Alignment
cdef class Example:
cdef readonly Doc x
cdef readonly Doc y
cdef readonly Alignment _alignment
cdef readonly object _alignment

View File

@ -6,10 +6,9 @@ from ..tokens.doc cimport Doc
from ..tokens.span cimport Span
from ..tokens.span import Span
from ..attrs import IDS
from .align cimport Alignment
from .align import Alignment
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
from .iob_utils import spans_from_biluo_tags
from .align import Alignment
from ..errors import Errors, Warnings
from ..syntax import nonproj
@ -28,8 +27,7 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
cdef class Example:
def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None):
""" Doc can either be text, or an actual Doc """
def __init__(self, Doc predicted, Doc reference, *, alignment=None):
if predicted is None:
raise TypeError(Errors.E972.format(arg="predicted"))
if reference is None:
@ -60,17 +58,15 @@ cdef class Example:
@classmethod
def from_dict(cls, Doc predicted, dict example_dict):
if predicted is None:
raise ValueError(Errors.E976.format(n="first", type="Doc"))
if example_dict is None:
raise ValueError(Errors.E976)
if not isinstance(predicted, Doc):
raise TypeError(Errors.E975.format(type=type(predicted)))
raise ValueError(Errors.E976.format(n="second", type="dict"))
example_dict = _fix_legacy_dict_data(example_dict)
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
if "ORTH" not in tok_dict:
tok_dict["ORTH"] = [tok.text for tok in predicted]
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
if not _has_field(tok_dict, "SPACY"):
spaces = _guess_spaces(predicted.text, tok_dict["ORTH"])
return Example(
predicted,
annotations2doc(predicted.vocab, tok_dict, doc_dict)
@ -83,34 +79,38 @@ cdef class Example:
gold_words = [token.orth_ for token in self.reference]
if gold_words == []:
gold_words = spacy_words
self._alignment = Alignment(spacy_words, gold_words)
self._alignment = Alignment.from_strings(spacy_words, gold_words)
return self._alignment
def get_aligned(self, field, as_string=False):
"""Return an aligned array for a token attribute."""
i2j_multi = self.alignment.i2j_multi
cand_to_gold = self.alignment.cand_to_gold
align = self.alignment.x2y
vocab = self.reference.vocab
gold_values = self.reference.to_array([field])
output = [None] * len(self.predicted)
for i, gold_i in enumerate(cand_to_gold):
if self.predicted[i].text.isspace():
output[i] = None
if gold_i is None:
if i in i2j_multi:
output[i] = gold_values[i2j_multi[i]]
else:
output[i] = None
for token in self.predicted:
if token.is_space:
output[token.i] = None
else:
output[i] = gold_values[gold_i]
values = gold_values[align[token.i].dataXd]
values = values.ravel()
if len(values) == 0:
output[token.i] = None
elif len(values) == 1:
output[token.i] = values[0]
elif len(set(list(values))) == 1:
# If all aligned tokens have the same value, use it.
output[token.i] = values[0]
else:
output[token.i] = None
if as_string and field not in ["ENT_IOB", "SENT_START"]:
output = [vocab.strings[o] if o is not None else o for o in output]
return output
def get_aligned_parse(self, projectivize=True):
cand_to_gold = self.alignment.cand_to_gold
gold_to_cand = self.alignment.gold_to_cand
cand_to_gold = self.alignment.x2y
gold_to_cand = self.alignment.y2x
aligned_heads = [None] * self.x.length
aligned_deps = [None] * self.x.length
heads = [token.head.i for token in self.y]
@ -118,52 +118,51 @@ cdef class Example:
if projectivize:
heads, deps = nonproj.projectivize(heads, deps)
for cand_i in range(self.x.length):
gold_i = cand_to_gold[cand_i]
if gold_i is not None: # Alignment found
gold_head = gold_to_cand[heads[gold_i]]
if gold_head is not None:
aligned_heads[cand_i] = gold_head
if cand_to_gold.lengths[cand_i] == 1:
gold_i = cand_to_gold[cand_i].dataXd[0, 0]
if gold_to_cand.lengths[heads[gold_i]] == 1:
aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]].dataXd[0, 0])
aligned_deps[cand_i] = deps[gold_i]
return aligned_heads, aligned_deps
def get_aligned_spans_x2y(self, x_spans):
return self._get_aligned_spans(self.y, x_spans, self.alignment.x2y)
def get_aligned_spans_y2x(self, y_spans):
return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x)
def _get_aligned_spans(self, doc, spans, align):
seen = set()
output = []
for span in spans:
indices = align[span.start : span.end].data.ravel()
indices = [idx for idx in indices if idx not in seen]
if len(indices) >= 1:
aligned_span = Span(doc, indices[0], indices[-1] + 1, label=span.label)
target_text = span.text.lower().strip().replace(" ", "")
our_text = aligned_span.text.lower().strip().replace(" ", "")
if our_text == target_text:
output.append(aligned_span)
seen.update(indices)
return output
def get_aligned_ner(self):
if not self.y.is_nered:
return [None] * len(self.x) # should this be 'missing' instead of 'None' ?
x_text = self.x.text
# Get a list of entities, and make spans for non-entity tokens.
# We then work through the spans in order, trying to find them in
# the text and using that to get the offset. Any token that doesn't
# get a tag set this way is tagged None.
# This could maybe be improved? It at least feels easy to reason about.
y_spans = list(self.y.ents)
y_spans.sort()
x_text_offset = 0
x_spans = []
for y_span in y_spans:
if x_text.count(y_span.text) >= 1:
start_char = x_text.index(y_span.text) + x_text_offset
end_char = start_char + len(y_span.text)
x_span = self.x.char_span(start_char, end_char, label=y_span.label)
if x_span is not None:
x_spans.append(x_span)
x_text = self.x.text[end_char:]
x_text_offset = end_char
x_ents = self.get_aligned_spans_y2x(self.y.ents)
# Default to 'None' for missing values
x_tags = biluo_tags_from_offsets(
self.x,
[(e.start_char, e.end_char, e.label_) for e in x_spans],
[(e.start_char, e.end_char, e.label_) for e in x_ents],
missing=None
)
gold_to_cand = self.alignment.gold_to_cand
for token in self.y:
if token.ent_iob_ == "O":
cand_i = gold_to_cand[token.i]
if cand_i is not None and x_tags[cand_i] is None:
x_tags[cand_i] = "O"
i2j_multi = self.alignment.i2j_multi
for i, tag in enumerate(x_tags):
if tag is None and i in i2j_multi:
gold_i = i2j_multi[i]
if gold_i is not None and self.y[gold_i].ent_iob_ == "O":
# Now fill the tokens we can align to O.
O = 2 # I=1, O=2, B=3
for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
if x_tags[i] is None:
if ent_iob == O:
x_tags[i] = "O"
elif self.x[i].is_space:
x_tags[i] = "O"
return x_tags
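A rough illustration of the new y2x-based entity alignment (sentence and offsets are hypothetical; tags follow spaCy's BILUO scheme):

from spacy.lang.en import English
from spacy.gold import Example

nlp = English()
doc = nlp.make_doc("I like New York City")
example = Example.from_dict(doc, {"entities": [(7, 20, "LOC")]})
# With identical tokenization on both sides the reference entity projects
# straight back, e.g. ["O", "O", "B-LOC", "I-LOC", "L-LOC"].
print(example.get_aligned_ner())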
@ -194,25 +193,22 @@ cdef class Example:
links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
return links
def split_sents(self):
""" Split the token annotations into multiple Examples based on
sent_starts and return a list of the new Examples"""
if not self.reference.is_sentenced:
return [self]
sent_starts = self.get_aligned("SENT_START")
sent_starts.append(1) # appending virtual start of a next sentence to facilitate search
align = self.alignment.y2x
seen_indices = set()
output = []
pred_start = 0
for sent in self.reference.sents:
new_ref = sent.as_doc()
pred_end = sent_starts.index(1, pred_start+1) # find where the next sentence starts
new_pred = self.predicted[pred_start : pred_end].as_doc()
output.append(Example(new_pred, new_ref))
pred_start = pred_end
for y_sent in self.reference.sents:
indices = align[y_sent.start : y_sent.end].data.ravel()
indices = [idx for idx in indices if idx not in seen_indices]
if indices:
x_sent = self.predicted[indices[0] : indices[-1] + 1]
output.append(Example(x_sent.as_doc(), y_sent.as_doc()))
seen_indices.update(indices)
return output
property text:
@ -258,7 +254,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
values.append([vocab.morphology.add(v) for v in value])
else:
attrs.append(key)
values.append([vocab.strings.add(v) for v in value])
try:
values.append([vocab.strings.add(v) for v in value])
except TypeError:
types = set([type(v) for v in value])
raise TypeError(Errors.E969.format(field=key, types=types))
array = numpy.asarray(values, dtype="uint64")
return attrs, array.T

View File

@ -540,19 +540,15 @@ class Language(object):
if component_cfg is None:
component_cfg = {}
component_deps = count_pipeline_interdependencies(self.pipeline)
# Determine whether component should set annotations. In theory I guess
# we should do this by inspecting the meta? Or we could just always
# say "yes"
for i, (name, proc) in enumerate(self.pipeline):
component_cfg.setdefault(name, {})
component_cfg[name].setdefault("drop", drop)
component_cfg[name]["set_annotations"] = bool(component_deps[i])
component_cfg[name].setdefault("set_annotations", False)
for name, proc in self.pipeline:
if not hasattr(proc, "update"):
continue
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
if sgd is not False:
if sgd not in (None, False):
for name, proc in self.pipeline:
if hasattr(proc, "model"):
proc.model.finish_update(sgd)

View File

@ -1,13 +1,14 @@
from thinc.api import Model, normal_init
def PrecomputableAffine(nO, nI, nF, nP):
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
model = Model(
"precomputable_affine",
forward,
init=init,
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
params={"W": None, "b": None, "pad": None},
attrs={"dropout_rate": dropout}
)
return model
@ -48,17 +49,14 @@ def forward(model, X, is_train):
model.inc_grad("b", dY.sum(axis=0))
dY = dY.reshape((dY.shape[0], nO * nP))
Wopfi = model.ops.as_contig(W.transpose((1, 2, 0, 3)))
Wopfi = W.transpose((1, 2, 0, 3))
Wopfi = Wopfi.reshape((nO * nP, nF * nI))
dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
# Reuse the buffer
dWopfi = Wopfi
dWopfi.fill(0.0)
model.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
dWopfi = model.ops.gemm(dY, Xf, trans1=True)
dWopfi = dWopfi.reshape((nO, nP, nF, nI))
# (o, p, f, i) --> (f, o, p, i)
dWopfi = model.ops.as_contig(dWopfi.transpose((2, 0, 1, 3)))
dWopfi = dWopfi.transpose((2, 0, 1, 3))
model.inc_grad("W", dWopfi)
return dXf.reshape((dXf.shape[0], nF, nI))

View File

@ -263,20 +263,20 @@ def build_Tok2Vec_model(
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
norm = HashEmbed(
nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout,
nO=width, nV=embed_size, column=cols.index(NORM), dropout=None,
seed=0
)
if subword_features:
prefix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout,
nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=None,
seed=1
)
suffix = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout,
nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=None,
seed=2
)
shape = HashEmbed(
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout,
nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=None,
seed=3
)
else:
@ -296,7 +296,7 @@ def build_Tok2Vec_model(
>> Maxout(
nO=width,
nI=width * columns,
nP=maxout_pieces,
nP=3,
dropout=0.0,
normalize=True,
),
@ -309,7 +309,7 @@ def build_Tok2Vec_model(
>> Maxout(
nO=width,
nI=width * columns,
nP=maxout_pieces,
nP=3,
dropout=0.0,
normalize=True,
),
@ -322,7 +322,7 @@ def build_Tok2Vec_model(
>> Maxout(
nO=width,
nI=width * columns,
nP=maxout_pieces,
nP=3,
dropout=0.0,
normalize=True,
),
@ -335,7 +335,7 @@ def build_Tok2Vec_model(
reduce_dimensions = Maxout(
nO=width,
nI=nM * nC + width,
nP=maxout_pieces,
nP=3,
dropout=0.0,
normalize=True,
)

View File

@ -2,7 +2,7 @@ from thinc.api import Model, noop, use_ops, Linear
from ..syntax._parser_model import ParserStepModel
def TransitionModel(tok2vec, lower, upper, unseen_classes=set()):
def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()):
"""Set up a stepwise transition-based model"""
if upper is None:
has_upper = False

View File

@ -272,7 +272,7 @@ cdef class Morphology:
@staticmethod
def feats_to_dict(feats):
if not feats:
if not feats or feats == Morphology.EMPTY_MORPH:
return {}
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
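A small illustration of the behaviour this hunk guards (the separator values "|" and "=" are spaCy's usual FEATS conventions and are assumed here):

from spacy.morphology import Morphology

print(Morphology.feats_to_dict("Case=Nom|Number=Sing"))
# -> {"Case": "Nom", "Number": "Sing"}
# The EMPTY_MORPH sentinel is now treated like an empty/falsy FEATS string.
print(Morphology.feats_to_dict(Morphology.EMPTY_MORPH))  # -> {}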

View File

@ -3,7 +3,7 @@ cimport numpy as np
import numpy
import srsly
from thinc.api import to_categorical
from thinc.api import SequenceCategoricalCrossentropy
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
@ -85,13 +85,10 @@ class Morphologizer(Tagger):
doc.is_morphed = True
def get_loss(self, examples, scores):
scores = self.model.ops.flatten(scores)
tag_index = {tag: i for i, tag in enumerate(self.labels)}
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype="i")
guesses = scores.argmax(axis=1)
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
truths = []
for eg in examples:
eg_truths = []
pos_tags = eg.get_aligned("POS", as_string=True)
morphs = eg.get_aligned("MORPH", as_string=True)
for i in range(len(morphs)):
@ -104,20 +101,11 @@ class Morphologizer(Tagger):
morph = self.vocab.strings[self.vocab.morphology.add(feats)]
if morph == "":
morph = Morphology.EMPTY_MORPH
if morph is None:
correct[idx] = guesses[idx]
elif morph in tag_index:
correct[idx] = tag_index[morph]
else:
correct[idx] = 0
known_labels[idx] = 0.
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
d_scores *= self.model.ops.asarray(known_labels)
loss = (d_scores**2).sum()
docs = [eg.predicted for eg in examples]
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
eg_truths.append(morph)
truths.append(eg_truths)
d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
def to_bytes(self, exclude=tuple()):

View File

@ -334,7 +334,7 @@ class Tagger(Pipe):
losses[self.name] += (gradient**2).sum()
def get_loss(self, examples, scores):
loss_func = SequenceCategoricalCrossentropy(names=self.labels)
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss):
@ -521,29 +521,23 @@ class SentenceRecognizer(Tagger):
doc.c[j].sent_start = -1
def get_loss(self, examples, scores):
scores = self.model.ops.flatten(scores)
tag_index = range(len(self.labels))
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype="i")
guesses = scores.argmax(axis=1)
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
labels = self.labels
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
truths = []
for eg in examples:
sent_starts = eg.get_aligned("sent_start")
for sent_start in sent_starts:
if sent_start is None:
correct[idx] = guesses[idx]
elif sent_start in tag_index:
correct[idx] = sent_start
eg_truth = []
for x in eg.get_aligned("sent_start"):
if x is None:
eg_truth.append(None)
elif x == 1:
eg_truth.append(labels[1])
else:
correct[idx] = 0
known_labels[idx] = 0.
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
d_scores *= self.model.ops.asarray(known_labels)
loss = (d_scores**2).sum()
docs = [eg.predicted for eg in examples]
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
# anything other than 1: 0, -1, -1 as uint64
eg_truth.append(labels[0])
truths.append(eg_truth)
d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,

View File

@ -222,7 +222,7 @@ class TrainingSchema(BaseModel):
class ProjectConfigAsset(BaseModel):
# fmt: off
dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: StrictStr = Field(..., title="URL of asset")
url: Optional[StrictStr] = Field(None, title="URL of asset")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
# fmt: on
@ -246,7 +246,7 @@ class ProjectConfigSchema(BaseModel):
# fmt: off
variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
assets: List[ProjectConfigAsset] = Field([], title="Data assets")
run: List[StrictStr] = Field([], title="Names of project commands to execute, in order")
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
# fmt: on

View File

@ -326,10 +326,11 @@ class Scorer(object):
for token in doc:
if token.orth_.isspace():
continue
gold_i = align.cand_to_gold[token.i]
if gold_i is None:
if align.x2y.lengths[token.i] != 1:
self.tokens.fp += 1
gold_i = None
else:
gold_i = align.x2y[token.i].dataXd[0, 0]
self.tokens.tp += 1
cand_tags.add((gold_i, token.tag_))
cand_pos.add((gold_i, token.pos_))
@ -345,7 +346,10 @@ class Scorer(object):
if token.is_sent_start:
cand_sent_starts.add(gold_i)
if token.dep_.lower() not in punct_labels and token.orth_.strip():
gold_head = align.cand_to_gold[token.head.i]
if align.x2y.lengths[token.head.i] == 1:
gold_head = align.x2y[token.head.i].dataXd[0, 0]
else:
gold_head = None
# None is indistinct, so we can't just add it to the set
# Multiple (None, None) deps are possible
if gold_i is None or gold_head is None:
@ -381,15 +385,9 @@ class Scorer(object):
gold_ents.add(gold_ent)
gold_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
cand_per_ents = {ent_label: set() for ent_label in ent_labels}
for ent in doc.ents:
first = align.cand_to_gold[ent.start]
last = align.cand_to_gold[ent.end - 1]
if first is None or last is None:
self.ner.fp += 1
self.ner_per_ents[ent.label_].fp += 1
else:
cand_ents.add((ent.label_, first, last))
cand_per_ents[ent.label_].add((ent.label_, first, last))
for ent in example.get_aligned_spans_x2y(doc.ents):
cand_ents.add((ent.label_, ent.start, ent.end - 1))
cand_per_ents[ent.label_].add((ent.label_, ent.start, ent.end - 1))
# Scores per ent
for k, v in self.ner_per_ents.items():
if k in cand_per_ents:

View File

@ -219,9 +219,11 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
class ParserStepModel(Model):
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True):
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
dropout=0.1):
Model.__init__(self, name="parser_step_model", forward=step_forward)
self.attrs["has_upper"] = has_upper
self.attrs["dropout_rate"] = dropout
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
if layers[1].get_dim("nP") >= 2:
activation = "maxout"
@ -289,11 +291,17 @@ class ParserStepModel(Model):
self.bp_tokvecs(d_tokvecs[:-1])
return d_tokvecs
NUMPY_OPS = NumpyOps()
def step_forward(model: ParserStepModel, states, is_train):
token_ids = model.get_token_ids(states)
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
mask = None
if model.attrs["has_upper"]:
dropout_rate = model.attrs["dropout_rate"]
if is_train and dropout_rate > 0:
mask = NUMPY_OPS.get_dropout_mask(vector.shape, dropout_rate)
vector *= mask
scores, get_d_vector = model.vec2scores(vector, is_train)
else:
scores = NumpyOps().asarray(vector)
@ -305,6 +313,8 @@ def step_forward(model: ParserStepModel, states, is_train):
# Zero vectors for unseen classes
d_scores *= model._class_mask
d_vector = get_d_vector(d_scores)
if mask is not None:
d_vector *= mask
if isinstance(model.state2vec.ops, CupyOps) \
and not isinstance(token_ids, model.state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
@ -437,7 +447,7 @@ cdef class precompute_hiddens:
sum_state_features(<float*>state_vector.data,
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector = state_vector + self.bias
state_vector += self.bias
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
def backward(d_state_vector_ids):

View File

@ -65,7 +65,6 @@ cdef class Parser:
self.set_output(self.moves.n_moves)
self.cfg = dict(cfg)
self.cfg.setdefault("update_with_oracle_cut_size", 100)
self.cfg.setdefault("normalize_gradients_with_batch_size", True)
self._multitasks = []
for multitask in cfg.get("multitasks", []):
self.add_multitask_objective(multitask)
@ -280,11 +279,12 @@ cdef class Parser:
[eg.predicted for eg in examples])
if self.cfg["update_with_oracle_cut_size"] >= 1:
# Chop sequences into lengths of this many transitions, to make the
# batch uniform length. We randomize this to overfit less.
# batch uniform length.
# We used to randomize this, but it's not clear that actually helps?
cut_size = self.cfg["update_with_oracle_cut_size"]
states, golds, max_steps = self._init_gold_batch(
examples,
max_length=numpy.random.choice(range(5, cut_size))
max_length=cut_size
)
else:
states, golds, _ = self.moves.init_gold_batch(examples)
@ -292,24 +292,15 @@ cdef class Parser:
if not states:
return losses
all_states = list(states)
states_golds = zip(states, golds)
for _ in range(max_steps):
if not states_golds:
break
states_golds = list(zip(states, golds))
while states_golds:
states, golds = zip(*states_golds)
scores, backprop = model.begin_update(states)
d_scores = self.get_batch_loss(states, golds, scores, losses)
if self.cfg["normalize_gradients_with_batch_size"]:
# We have to be very careful how we do this, because of the way we
# cut up the batch. We subdivide long sequences. If we normalize
# naively, we end up normalizing by sequence length, which
# is bad: that would mean that states in long sequences
# consistently get smaller gradients. Imagine if we have two
# sequences, one length 1000, one length 20. If we cut up
# the 1k sequence so that we have a "batch" of 50 subsequences,
# we don't want the gradients to get 50 times smaller!
d_scores /= n_examples
# Note that the gradient isn't normalized by the batch size
# here, because our "samples" are really the states... But we
# can't normalize by the number of states either, as then we'd
# be getting smaller gradients for states in long sequences.
backprop(d_scores)
# Follow the predicted action
self.transition_states(states, scores)
@ -407,6 +398,7 @@ cdef class Parser:
cpu_log_loss(c_d_scores,
costs, is_valid, &scores[i, 0], d_scores.shape[1])
c_d_scores += d_scores.shape[1]
# Note that we don't normalize this. See comment in update() for why.
if losses is not None:
losses.setdefault(self.name, 0.)
losses[self.name] += (d_scores**2).sum()
@ -525,21 +517,25 @@ cdef class Parser:
StateClass state
Transition action
all_states = self.moves.init_batch([eg.predicted for eg in examples])
states = []
golds = []
kept = []
max_length_seen = 0
for state, eg in zip(all_states, examples):
if self.moves.has_gold(eg) and not state.is_final():
gold = self.moves.init_gold(state, eg)
oracle_actions = self.moves.get_oracle_sequence_from_state(
state.copy(), gold)
kept.append((eg, state, gold, oracle_actions))
min_length = min(min_length, len(oracle_actions))
max_length_seen = max(max_length, len(oracle_actions))
if len(eg.x) < max_length:
states.append(state)
golds.append(gold)
else:
oracle_actions = self.moves.get_oracle_sequence_from_state(
state.copy(), gold)
kept.append((eg, state, gold, oracle_actions))
min_length = min(min_length, len(oracle_actions))
max_length_seen = max(max_length, len(oracle_actions))
if not kept:
return [], [], 0
return states, golds, 0
max_length = max(min_length, min(max_length, max_length_seen))
states = []
golds = []
cdef int clas
max_moves = 0
for eg, state, gold, oracle_actions in kept:

View File

@ -45,7 +45,7 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
assert contains_cycle(tree) is None
assert contains_cycle(cyclic_tree) == set([3, 4, 5])
assert contains_cycle(cyclic_tree) == {3, 4, 5}
assert contains_cycle(partial_tree) is None
assert contains_cycle(multirooted_tree) is None

View File

@ -38,6 +38,11 @@ def test_overfitting_IO():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# add some cases where SENT_START == -1
train_examples[0].reference[10].is_sent_start = False
train_examples[1].reference[1].is_sent_start = False
train_examples[1].reference[11].is_sent_start = False
nlp.add_pipe(senter)
optimizer = nlp.begin_training()

View File

@ -23,6 +23,7 @@ def test_issue2070():
assert len(doc) == 11
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2179():
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
nlp = Italian()
@ -134,6 +135,7 @@ def test_issue2464(en_vocab):
assert len(matches) == 3
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2482():
"""Test we can serialize and deserialize a blank NER or parser model."""
nlp = Italian()

View File

@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls):
assert doc[0].like_num
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue2800():
"""Test issue that arises when too many labels are added to NER model.
Used to cause segfault.
"""
nlp = English()
train_data = []
train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})])
train_data.extend(
[Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
)
entity_types = [str(i) for i in range(1000)]
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)

View File

@ -88,6 +88,7 @@ def test_issue3199():
assert list(doc[0:3].noun_chunks) == []
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3209():
"""Test issue that occurred in spaCy nightly where NER labels were being
mapped to classes incorrectly after loading the model, when the labels

View File

@ -0,0 +1,472 @@
import pytest
from spacy.language import Language
from spacy.vocab import Vocab
from spacy.pipeline import EntityRuler, DependencyParser
from spacy.pipeline.defaults import default_parser
from spacy import displacy, load
from spacy.displacy import parse_deps
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher, PhraseMatcher
from spacy.errors import MatchPatternError
from spacy.util import minibatch
from spacy.gold import Example
from spacy.lang.hi import Hindi
from spacy.lang.es import Spanish
from spacy.lang.en import English
from spacy.attrs import IS_ALPHA
from thinc.api import compounding
import spacy
import srsly
import numpy
from ..util import make_tempdir, get_doc
@pytest.mark.parametrize("word", ["don't", "dont", "I'd", "Id"])
def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop
def test_issue_3526_1(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
ruler_bytes = ruler.to_bytes()
assert len(ruler) == len(patterns)
assert len(ruler.labels) == 4
assert ruler.overwrite
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(ruler_bytes)
assert len(new_ruler) == len(ruler)
assert len(new_ruler.labels) == 4
assert new_ruler.overwrite == ruler.overwrite
assert new_ruler.ent_id_sep == ruler.ent_id_sep
def test_issue_3526_2(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(bytes_old_style)
assert len(new_ruler) == len(ruler)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert new_ruler.overwrite is not ruler.overwrite
def test_issue_3526_3(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
with make_tempdir() as tmpdir:
out_file = tmpdir / "entity_ruler"
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
new_ruler = EntityRuler(nlp).from_disk(out_file)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert len(new_ruler) == len(ruler)
assert new_ruler.overwrite is not ruler.overwrite
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue_3526_4(en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.add_pipe(ruler)
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir)
ruler = nlp.get_pipe("entity_ruler")
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert ruler.overwrite is True
nlp2 = load(tmpdir)
new_ruler = nlp2.get_pipe("entity_ruler")
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert new_ruler.overwrite is True
def test_issue3531():
"""Test that displaCy renderer doesn't require "settings" key."""
example_dep = {
"words": [
{"text": "But", "tag": "CCONJ"},
{"text": "Google", "tag": "PROPN"},
{"text": "is", "tag": "VERB"},
{"text": "starting", "tag": "VERB"},
{"text": "from", "tag": "ADP"},
{"text": "behind.", "tag": "ADV"},
],
"arcs": [
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
],
}
example_ent = {
"text": "But Google is starting from behind.",
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
}
dep_html = displacy.render(example_dep, style="dep", manual=True)
assert dep_html
ent_html = displacy.render(example_ent, style="ent", manual=True)
assert ent_html
def test_issue3540(en_vocab):
words = ["I", "live", "in", "NewYork", "right", "now"]
tensor = numpy.asarray(
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
dtype="f",
)
doc = Doc(en_vocab, words=words)
doc.tensor = tensor
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_1 = [token.vector for token in doc]
assert len(vectors_1) == len(doc)
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_2 = [token.vector for token in doc]
assert len(vectors_2) == len(doc)
assert vectors_1[0].tolist() == vectors_2[0].tolist()
assert vectors_1[1].tolist() == vectors_2[1].tolist()
assert vectors_1[2].tolist() == vectors_2[2].tolist()
assert vectors_1[4].tolist() == vectors_2[5].tolist()
assert vectors_1[5].tolist() == vectors_2[6].tolist()
def test_issue3549(en_vocab):
"""Test that match pattern validation doesn't raise on empty errors."""
matcher = Matcher(en_vocab, validate=True)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("GOOD", [pattern])
with pytest.raises(MatchPatternError):
matcher.add("BAD", [[{"X": "Y"}]])
@pytest.mark.xfail
def test_issue3555(en_vocab):
"""Test that custom extensions with default None don't break matcher."""
Token.set_extension("issue3555", default=None)
matcher = Matcher(en_vocab)
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["have", "apple"])
matcher(doc)
def test_issue3611():
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training(X=x_train, Y=y_train)
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
def test_issue3625():
"""Test that default punctuation rules applies to hindi unicode characters"""
nlp = Hindi()
doc = nlp("hi. how हुए. होटल, होटल")
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
assert [token.text for token in doc] == expected
def test_issue3803():
"""Test that spanish num-like tokens have True for like_num attribute."""
nlp = Spanish()
text = "2 dos 1000 mil 12 doce"
doc = nlp(text)
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" not in parser.labels
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
"learn_tokens": True,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" in parser.labels
def test_issue3839(en_vocab):
"""Test that match IDs returned by the matcher are correct, are in the string """
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
matcher = Matcher(en_vocab)
match_id = "PATTERN"
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
matcher.add(match_id, [pattern1])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
matcher = Matcher(en_vocab)
matcher.add(match_id, [pattern2])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
@pytest.mark.parametrize(
"sentence",
[
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
],
)
def test_issue3869(sentence):
"""Test that the Doc's count_by function works consistently"""
nlp = English()
doc = nlp(sentence)
count = 0
for token in doc:
count += token.is_alpha
assert count == doc.count_by(IS_ALPHA).get(1, 0)
def test_issue3879(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
assert len(doc) == 5
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
matcher = Matcher(en_vocab)
matcher.add("TEST", [pattern])
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
"""Test that `nlp.pipe()` works when an empty string ends the batch.
Fixed in v7.0.5 of Thinc.
"""
texts = ["hello", "world", "", ""]
nlp = English()
nlp.add_pipe(nlp.create_pipe("parser"))
nlp.add_pipe(nlp.create_pipe("ner"))
nlp.add_pipe(nlp.create_pipe("tagger"))
nlp.get_pipe("parser").add_label("dep")
nlp.get_pipe("ner").add_label("PERSON")
nlp.get_pipe("tagger").add_label("NN")
nlp.begin_training()
for doc in nlp.pipe(texts):
pass
def test_issue3882(en_vocab):
"""Test that displaCy doesn't serialize the doc.user_data when making a
copy of the Doc.
"""
doc = Doc(en_vocab, words=["Hello", "world"])
doc.is_parsed = True
doc.user_data["test"] = set()
parse_deps(doc)
def test_issue3951(en_vocab):
"""Test that combinations of optional rules are matched correctly."""
matcher = Matcher(en_vocab)
pattern = [
{"LOWER": "hello"},
{"LOWER": "this", "OP": "?"},
{"OP": "?"},
{"LOWER": "world"},
]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
matches = matcher(doc)
assert len(matches) == 0
def test_issue3959():
""" Ensure that a modified pos attribute is serialized correctly."""
nlp = English()
doc = nlp(
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
)
assert doc[0].pos_ == ""
doc[0].pos_ = "NOUN"
assert doc[0].pos_ == "NOUN"
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
with make_tempdir() as tmp_dir:
file_path = tmp_dir / "my_doc"
doc.to_disk(file_path)
doc2 = nlp("")
doc2.from_disk(file_path)
assert doc2[0].pos_ == "NOUN"
def test_issue3962(en_vocab):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
# fmt: off
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
# fmt: on
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
span2 = doc[1:5] # "jests at scars ,"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
# head set to itself, being the new artificial root
assert doc2[0].head.text == "jests"
assert doc2[0].dep_ == "dep"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests" # head set to the new artificial root
assert doc2[3].dep_ == "dep"
# We should still have 1 sentence
assert len(list(doc2.sents)) == 1
span3 = doc[6:9] # "never felt a"
doc3 = span3.as_doc()
doc3_json = doc3.to_json()
assert doc3_json
assert doc3[0].head.text == "felt"
assert doc3[0].dep_ == "neg"
assert doc3[1].head.text == "felt"
assert doc3[1].dep_ == "ROOT"
assert doc3[2].head.text == "felt" # head set to ancestor
assert doc3[2].dep_ == "dep"
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
assert len(list(doc3.sents)) == 1
def test_issue3962_long(en_vocab):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
# fmt: off
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
# fmt: on
two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
span2 = two_sent_doc[1:7] # "jests at scars. They never"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
# head set to itself, being the new artificial root (in sentence 1)
assert doc2[0].head.text == "jests"
assert doc2[0].dep_ == "ROOT"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests"
assert doc2[3].dep_ == "punct"
# head set to itself, being the new artificial root (in sentence 2)
assert doc2[4].head.text == "They"
assert doc2[4].dep_ == "dep"
# head set to the new artificial head (in sentence 2)
assert doc2[4].head.text == "They"
assert doc2[4].dep_ == "dep"
# We should still have 2 sentences
sents = list(doc2.sents)
assert len(sents) == 2
assert sents[0].text == "jests at scars ."
assert sents[1].text == "They never"
def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
"""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
matches = matcher(doc)
assert len(matches) == 2
# We should have a match for each of the two rules
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
assert "A" in found_ids
assert "B" in found_ids

View File

@ -1,8 +0,0 @@
import pytest
@pytest.mark.parametrize("word", ["don't", "dont", "I'd", "Id"])
def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop

View File

@ -1,85 +0,0 @@
import pytest
from spacy.tokens import Span
from spacy.language import Language
from spacy.pipeline import EntityRuler
from spacy import load
import srsly
from ..util import make_tempdir
@pytest.fixture
def patterns():
return [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
@pytest.fixture
def add_ent():
def add_ent_component(doc):
doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
return doc
return add_ent_component
def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
ruler_bytes = ruler.to_bytes()
assert len(ruler) == len(patterns)
assert len(ruler.labels) == 4
assert ruler.overwrite
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(ruler_bytes)
assert len(new_ruler) == len(ruler)
assert len(new_ruler.labels) == 4
assert new_ruler.overwrite == ruler.overwrite
assert new_ruler.ent_id_sep == ruler.ent_id_sep
def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(bytes_old_style)
assert len(new_ruler) == len(ruler)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert new_ruler.overwrite is not ruler.overwrite
def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
with make_tempdir() as tmpdir:
out_file = tmpdir / "entity_ruler"
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
new_ruler = EntityRuler(nlp).from_disk(out_file)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert len(new_ruler) == len(ruler)
assert new_ruler.overwrite is not ruler.overwrite
def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.add_pipe(ruler)
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir)
ruler = nlp.get_pipe("entity_ruler")
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert ruler.overwrite is True
nlp2 = load(tmpdir)
new_ruler = nlp2.get_pipe("entity_ruler")
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert new_ruler.overwrite is True

View File

@ -1,30 +0,0 @@
from spacy import displacy
def test_issue3531():
"""Test that displaCy renderer doesn't require "settings" key."""
example_dep = {
"words": [
{"text": "But", "tag": "CCONJ"},
{"text": "Google", "tag": "PROPN"},
{"text": "is", "tag": "VERB"},
{"text": "starting", "tag": "VERB"},
{"text": "from", "tag": "ADP"},
{"text": "behind.", "tag": "ADV"},
],
"arcs": [
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
],
}
example_ent = {
"text": "But Google is starting from behind.",
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
}
dep_html = displacy.render(example_dep, style="dep", manual=True)
assert dep_html
ent_html = displacy.render(example_ent, style="ent", manual=True)
assert ent_html

View File

@ -1,44 +0,0 @@
from spacy.tokens import Doc
import numpy as np
def test_issue3540(en_vocab):
words = ["I", "live", "in", "NewYork", "right", "now"]
tensor = np.asarray(
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
dtype="f",
)
doc = Doc(en_vocab, words=words)
doc.tensor = tensor
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_1 = [token.vector for token in doc]
assert len(vectors_1) == len(doc)
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_2 = [token.vector for token in doc]
assert len(vectors_2) == len(doc)
assert vectors_1[0].tolist() == vectors_2[0].tolist()
assert vectors_1[1].tolist() == vectors_2[1].tolist()
assert vectors_1[2].tolist() == vectors_2[2].tolist()
assert vectors_1[4].tolist() == vectors_2[5].tolist()
assert vectors_1[5].tolist() == vectors_2[6].tolist()

View File

@ -1,12 +0,0 @@
import pytest
from spacy.matcher import Matcher
from spacy.errors import MatchPatternError
def test_issue3549(en_vocab):
"""Test that match pattern validation doesn't raise on empty errors."""
matcher = Matcher(en_vocab, validate=True)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("GOOD", [pattern])
with pytest.raises(MatchPatternError):
matcher.add("BAD", [[{"X": "Y"}]])

View File

@ -1,14 +0,0 @@
import pytest
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher
@pytest.mark.xfail
def test_issue3555(en_vocab):
"""Test that custom extensions with default None don't break matcher."""
Token.set_extension("issue3555", default=None)
matcher = Matcher(en_vocab)
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["have", "apple"])
matcher(doc)

View File

@ -1,45 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example
def test_issue3611():
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training(X=x_train, Y=y_train)
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)

View File

@ -1,9 +0,0 @@
from spacy.lang.hi import Hindi
def test_issue3625():
"""Test that default punctuation rules applies to hindi unicode characters"""
nlp = Hindi()
doc = nlp("hi. how हुए. होटल, होटल")
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
assert [token.text for token in doc] == expected

View File

@ -1,10 +0,0 @@
from spacy.lang.es import Spanish
def test_issue3803():
"""Test that spanish num-like tokens have True for like_num attribute."""
nlp = Spanish()
text = "2 dos 1000 mil 12 doce"
doc = nlp(text)
assert [t.like_num for t in doc] == [True, True, True, True, True, True]

View File

@ -1,34 +0,0 @@
from spacy.pipeline.pipes import DependencyParser
from spacy.vocab import Vocab
from spacy.pipeline.defaults import default_parser
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" not in parser.labels
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
"learn_tokens": True,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
parser = DependencyParser(Vocab(), default_parser(), **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" in parser.labels

View File

@ -1,18 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue3839(en_vocab):
"""Test that match IDs returned by the matcher are correct, are in the string """
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
matcher = Matcher(en_vocab)
match_id = "PATTERN"
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
matcher.add(match_id, [pattern1])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
matcher = Matcher(en_vocab)
matcher.add(match_id, [pattern2])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]

View File

@ -1,25 +0,0 @@
import pytest
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
@pytest.mark.parametrize(
"sentence",
[
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
],
)
def test_issue3869(sentence):
"""Test that the Doc's count_by function works consistently"""
nlp = English()
doc = nlp(sentence)
count = 0
for token in doc:
count += token.is_alpha
assert count == doc.count_by(IS_ALPHA).get(1, 0)

View File

@ -1,11 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue3879(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
assert len(doc) == 5
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
matcher = Matcher(en_vocab)
matcher.add("TEST", [pattern])
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'

View File

@ -1,21 +0,0 @@
from spacy.lang.en import English
import pytest
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
"""Test that `nlp.pipe()` works when an empty string ends the batch.
Fixed in v7.0.5 of Thinc.
"""
texts = ["hello", "world", "", ""]
nlp = English()
nlp.add_pipe(nlp.create_pipe("parser"))
nlp.add_pipe(nlp.create_pipe("ner"))
nlp.add_pipe(nlp.create_pipe("tagger"))
nlp.get_pipe("parser").add_label("dep")
nlp.get_pipe("ner").add_label("PERSON")
nlp.get_pipe("tagger").add_label("NN")
nlp.begin_training()
for doc in nlp.pipe(texts):
pass

View File

@ -1,12 +0,0 @@
from spacy.displacy import parse_deps
from spacy.tokens import Doc
def test_issue3882(en_vocab):
"""Test that displaCy doesn't serialize the doc.user_data when making a
copy of the Doc.
"""
doc = Doc(en_vocab, words=["Hello", "world"])
doc.is_parsed = True
doc.user_data["test"] = set()
parse_deps(doc)

View File

@ -1,17 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue3951(en_vocab):
"""Test that combinations of optional rules are matched correctly."""
matcher = Matcher(en_vocab)
pattern = [
{"LOWER": "hello"},
{"LOWER": "this", "OP": "?"},
{"OP": "?"},
{"LOWER": "world"},
]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
matches = matcher(doc)
assert len(matches) == 0

View File

@ -1,26 +0,0 @@
from spacy.lang.en import English
from ..util import make_tempdir
def test_issue3959():
""" Ensure that a modified pos attribute is serialized correctly."""
nlp = English()
doc = nlp(
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
)
assert doc[0].pos_ == ""
doc[0].pos_ = "NOUN"
assert doc[0].pos_ == "NOUN"
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
with make_tempdir() as tmp_dir:
file_path = tmp_dir / "my_doc"
doc.to_disk(file_path)
doc2 = nlp("")
doc2.from_disk(file_path)
assert doc2[0].pos_ == "NOUN"

View File

@ -1,117 +0,0 @@
import pytest
from ..util import get_doc
@pytest.fixture
def doc(en_tokenizer):
text = "He jests at scars, that never felt a wound."
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
deps = [
"nsubj",
"ccomp",
"prep",
"pobj",
"punct",
"nsubj",
"neg",
"ROOT",
"det",
"dobj",
"punct",
]
tokens = en_tokenizer(text)
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
def test_issue3962(doc):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
span2 = doc[1:5] # "jests at scars ,"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
assert (
doc2[0].head.text == "jests"
) # head set to itself, being the new artificial root
assert doc2[0].dep_ == "dep"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests" # head set to the new artificial root
assert doc2[3].dep_ == "dep"
# We should still have 1 sentence
assert len(list(doc2.sents)) == 1
span3 = doc[6:9] # "never felt a"
doc3 = span3.as_doc()
doc3_json = doc3.to_json()
assert doc3_json
assert doc3[0].head.text == "felt"
assert doc3[0].dep_ == "neg"
assert doc3[1].head.text == "felt"
assert doc3[1].dep_ == "ROOT"
assert doc3[2].head.text == "felt" # head set to ancestor
assert doc3[2].dep_ == "dep"
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
assert len(list(doc3.sents)) == 1
@pytest.fixture
def two_sent_doc(en_tokenizer):
text = "He jests at scars. They never felt a wound."
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
deps = [
"nsubj",
"ROOT",
"prep",
"pobj",
"punct",
"nsubj",
"neg",
"ROOT",
"det",
"dobj",
"punct",
]
tokens = en_tokenizer(text)
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
def test_issue3962_long(two_sent_doc):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
span2 = two_sent_doc[1:7] # "jests at scars. They never"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
assert (
doc2[0].head.text == "jests"
) # head set to itself, being the new artificial root (in sentence 1)
assert doc2[0].dep_ == "ROOT"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests"
assert doc2[3].dep_ == "punct"
assert (
doc2[4].head.text == "They"
) # head set to itself, being the new artificial root (in sentence 2)
assert doc2[4].dep_ == "dep"
assert (
doc2[4].head.text == "They"
) # head set to the new artificial head (in sentence 2)
assert doc2[4].dep_ == "dep"
# We should still have 2 sentences
sents = list(doc2.sents)
assert len(sents) == 2
assert sents[0].text == "jests at scars ."
assert sents[1].text == "They never"

View File

@ -1,19 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
"""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
matches = matcher(doc)
assert len(matches) == 2
# We should have a match for each of the two rules
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
assert "A" in found_ids
assert "B" in found_ids

View File

@ -0,0 +1,469 @@
import pytest
from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe
from spacy.pipeline.defaults import default_ner
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example, Corpus
from spacy.gold.converters import json2docs
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from spacy.tokenizer import Tokenizer
from spacy.lang.el import Greek
from spacy.language import Language
import spacy
from thinc.api import compounding
from collections import defaultdict
from ..util import make_tempdir
def test_issue4002(en_vocab):
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
"""
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern1 = Doc(en_vocab, words=["c", "d"])
assert [t.norm_ for t in pattern1] == ["c", "d"]
matcher.add("TEST", [pattern1])
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
matches = matcher(doc)
assert len(matches) == 1
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern2 = Doc(en_vocab, words=["1", "2"])
pattern2[0].norm_ = "c"
pattern2[1].norm_ = "d"
assert [t.norm_ for t in pattern2] == ["c", "d"]
matcher.add("TEST", [pattern2])
matches = matcher(doc)
assert len(matches) == 1
def test_issue4030():
""" Test whether textcat works fine with empty doc """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
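# (compounding(4.0, 32.0, 1.001) is a schedule of batch sizes: it starts at 4
# and grows by a factor of 1.001 per batch, capped at 32.)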
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
# processing of an empty doc should result in 0.0 for all categories
doc = nlp("")
assert doc.cats["offensive"] == 0.0
assert doc.cats["inoffensive"] == 0.0
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042():
"""Test that serialization of an EntityRuler before NER works fine."""
nlp = English()
# add ner pipe
ner = nlp.create_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.add_pipe(ner)
nlp.begin_training()
# Add entity ruler
ruler = EntityRuler(nlp)
patterns = [
{"label": "MY_ORG", "pattern": "Apple"},
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler, before="ner") # works fine with "after"
doc1 = nlp("What do you think about Apple ?")
assert doc1.ents[0].label_ == "MY_ORG"
with make_tempdir() as d:
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
nlp2 = load_model(output_dir)
doc2 = nlp2("What do you think about Apple ?")
assert doc2.ents[0].label_ == "MY_ORG"
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042_bug2():
"""
Test that serialization of the NER works fine when new labels were added.
This is the second of two bugs underlying issue 4042.
"""
nlp1 = English()
vocab = nlp1.vocab
# add ner pipe
ner1 = nlp1.create_pipe("ner")
ner1.add_label("SOME_LABEL")
nlp1.add_pipe(ner1)
nlp1.begin_training()
# add a new label to the doc
doc1 = nlp1("What do you think about Apple ?")
assert len(ner1.labels) == 1
assert "SOME_LABEL" in ner1.labels
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
doc1.ents = list(doc1.ents) + [apple_ent]
# reapply the NER - at this point it should resize itself
ner1(doc1)
assert len(ner1.labels) == 2
assert "SOME_LABEL" in ner1.labels
assert "MY_ORG" in ner1.labels
with make_tempdir() as d:
# assert IO goes fine
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(vocab, default_ner(), **config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2
def test_issue4054(en_vocab):
"""Test that a new blank model can be made with a vocab from file,
and that serialization does not drop the language at any point."""
nlp1 = English()
vocab1 = nlp1.vocab
with make_tempdir() as d:
vocab_dir = ensure_path(d / "vocab")
if not vocab_dir.exists():
vocab_dir.mkdir()
vocab1.to_disk(vocab_dir)
vocab2 = Vocab().from_disk(vocab_dir)
print("lang", vocab2.lang)
nlp2 = spacy.blank("en", vocab=vocab2)
nlp_dir = ensure_path(d / "nlp")
if not nlp_dir.exists():
nlp_dir.mkdir()
nlp2.to_disk(nlp_dir)
nlp3 = load_model(nlp_dir)
assert nlp3.lang == "en"
def test_issue4120(en_vocab):
"""Test that matches without a final {OP: ?} token are returned."""
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
doc1 = Doc(en_vocab, words=["a"])
assert len(matcher(doc1)) == 1 # works
doc2 = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc2)) == 2 # fixed
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc3)) == 2 # works
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc4)) == 3 # fixed
def test_issue4133(en_vocab):
nlp = English()
vocab_bytes = nlp.vocab.to_bytes()
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
doc = Doc(en_vocab, words=words)
for i, token in enumerate(doc):
token.pos_ = pos[i]
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
doc_bytes = doc.to_bytes()
vocab = Vocab()
vocab = vocab.from_bytes(vocab_bytes)
doc = Doc(vocab).from_bytes(doc_bytes)
actual = []
for token in doc:
actual.append(token.pos_)
assert actual == pos
def test_issue4190():
def customize_tokenizer(nlp):
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = compile_infix_regex(nlp.Defaults.infixes)
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
exceptions = {
k: v
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
if not (len(k) == 2 and k[1] == ".")
}
new_tokenizer = Tokenizer(
nlp.vocab,
exceptions,
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=nlp.tokenizer.token_match,
)
nlp.tokenizer = new_tokenizer
test_string = "Test c."
# Load default language
nlp_1 = English()
doc_1a = nlp_1(test_string)
result_1a = [token.text for token in doc_1a] # noqa: F841
# Modify tokenizer
customize_tokenizer(nlp_1)
doc_1b = nlp_1(test_string)
result_1b = [token.text for token in doc_1b]
# Save and Reload
with make_tempdir() as model_dir:
nlp_1.to_disk(model_dir)
nlp_2 = load_model(model_dir)
# This should be the modified tokenizer
doc_2 = nlp_2(test_string)
result_2 = [token.text for token in doc_2]
assert result_1b == result_2
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4267():
""" Test that running an entity_ruler after ner gives consistent results"""
nlp = English()
ner = nlp.create_pipe("ner")
ner.add_label("PEOPLE")
nlp.add_pipe(ner)
nlp.begin_training()
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")
assert doc1.is_nered
for token in doc1:
assert token.ent_iob == 2
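# (ent_iob == 2 corresponds to "O", i.e. the token is explicitly marked as
# outside any entity; 0 would mean no annotation was set at all.)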
# add entity ruler and run again
ruler = EntityRuler(nlp)
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
assert "entity_ruler" in nlp.pipe_names
assert "ner" in nlp.pipe_names
# assert that we still have correct IOB annotations
doc2 = nlp("hi")
assert doc2.is_nered
for token in doc2:
assert token.ent_iob == 2
def test_issue4272():
"""Test that lookup table can be accessed from Token.lemma if no POS tags
are available."""
nlp = Greek()
doc = nlp("Χθες")
assert doc[0].lemma_
def test_multiple_predictions():
class DummyPipe(Pipe):
def __init__(self):
self.model = "dummy_model"
def predict(self, docs):
return ([1, 2, 3], [4, 5, 6])
def set_annotations(self, docs, scores, tensors=None):
return docs
nlp = Language()
doc = nlp.make_doc("foo")
dummy_pipe = DummyPipe()
dummy_pipe(doc)
@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
def test_issue4313():
""" This should not crash or exit with some strange error code """
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
ner.add_label("SOME_LABEL")
ner.begin_training([])
nlp.add_pipe(ner)
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1
assert "SOME_LABEL" in ner.labels
apple_ent = Span(doc, 5, 6, label="MY_ORG")
doc.ents = list(doc.ents) + [apple_ent]
# ensure the beam_parse still works with the new label
docs = [doc]
beams = nlp.entity.beam_parse(
docs, beam_width=beam_width, beam_density=beam_density
)
for doc, beam in zip(docs, beams):
entity_scores = defaultdict(float)
for score, ents in nlp.entity.moves.get_beam_parses(beam):
for start, end, label in ents:
entity_scores[(start, end, label)] += score
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
"""Test that training the tagger with empty data, doesn't throw errors"""
nlp = English()
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
TRAIN_DATA = [example, example]
tagger = nlp.create_pipe("tagger")
nlp.add_pipe(tagger)
optimizer = nlp.begin_training()
for i in range(5):
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)
def test_issue4367():
"""Test that docbin init goes well"""
DocBin()
DocBin(attrs=["LEMMA"])
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
def test_issue4373():
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
matcher = Matcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
matcher = PhraseMatcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
def test_issue4402():
json_data = {
"id": 0,
"paragraphs": [
{
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "How", "ner": "O"},
{"id": 1, "orth": "should", "ner": "O"},
{"id": 2, "orth": "I", "ner": "O"},
{"id": 3, "orth": "cook", "ner": "O"},
{"id": 4, "orth": "bacon", "ner": "O"},
{"id": 5, "orth": "in", "ner": "O"},
{"id": 6, "orth": "an", "ner": "O"},
{"id": 7, "orth": "oven", "ner": "O"},
{"id": 8, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{
"tokens": [
{"id": 9, "orth": "\n", "ner": "O"},
{"id": 10, "orth": "I", "ner": "O"},
{"id": 11, "orth": "'ve", "ner": "O"},
{"id": 12, "orth": "heard", "ner": "O"},
{"id": 13, "orth": "of", "ner": "O"},
{"id": 14, "orth": "people", "ner": "O"},
{"id": 15, "orth": "cooking", "ner": "O"},
{"id": 16, "orth": "bacon", "ner": "O"},
{"id": 17, "orth": "in", "ner": "O"},
{"id": 18, "orth": "an", "ner": "O"},
{"id": 19, "orth": "oven", "ner": "O"},
{"id": 20, "orth": ".", "ner": "O"},
],
"brackets": [],
},
],
"cats": [
{"label": "baking", "value": 1.0},
{"label": "not_baking", "value": 0.0},
],
},
{
"raw": "What is the difference between white and brown eggs?\n",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "What", "ner": "O"},
{"id": 1, "orth": "is", "ner": "O"},
{"id": 2, "orth": "the", "ner": "O"},
{"id": 3, "orth": "difference", "ner": "O"},
{"id": 4, "orth": "between", "ner": "O"},
{"id": 5, "orth": "white", "ner": "O"},
{"id": 6, "orth": "and", "ner": "O"},
{"id": 7, "orth": "brown", "ner": "O"},
{"id": 8, "orth": "eggs", "ner": "O"},
{"id": 9, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
],
"cats": [
{"label": "baking", "value": 0.0},
{"label": "not_baking", "value": 1.0},
],
},
],
}
nlp = English()
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
with make_tempdir() as tmpdir:
output_file = tmpdir / "test4402.spacy"
docs = json2docs([json_data])
data = DocBin(docs=docs, attrs=attrs).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
train_data = list(corpus.train_dataset(nlp))
assert len(train_data) == 2
split_train_data = []
for eg in train_data:
split_train_data.extend(eg.split_sents())
assert len(split_train_data) == 4
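# ---------------------------------------------------------------------------
# A minimal sketch (not part of the original tests, reusing the imports at the
# top of this file) of the round trip that test_issue4402 relies on: json2docs
# turns the v2-style training JSON into one Doc per paragraph (hence 2
# Examples), DocBin serializes them to the binary .spacy format, and Corpus
# reads that file back as Examples whose split_sents() yields one Example per
# sentence (hence 4). `tmp_path` is assumed to be a pathlib.Path to write to.
def _sketch_json_to_docbin_roundtrip(json_data, tmp_path):
    docs = json2docs([json_data])
    output_file = tmp_path / "sketch.spacy"
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    output_file.write_bytes(DocBin(docs=docs, attrs=attrs).to_bytes())
    corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
    return list(corpus.train_dataset(English()))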

View File

@ -1,23 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
def test_issue4002(en_vocab):
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
"""
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern1 = Doc(en_vocab, words=["c", "d"])
assert [t.norm_ for t in pattern1] == ["c", "d"]
matcher.add("TEST", [pattern1])
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
matches = matcher(doc)
assert len(matches) == 1
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern2 = Doc(en_vocab, words=["1", "2"])
pattern2[0].norm_ = "c"
pattern2[1].norm_ = "d"
assert [t.norm_ for t in pattern2] == ["c", "d"]
matcher.add("TEST", [pattern2])
matches = matcher(doc)
assert len(matches) == 1

View File

@ -1,50 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example
def test_issue4030():
""" Test whether textcat works fine with empty doc """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
textcat = nlp.create_pipe(
"textcat",
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
)
for label in unique_classes:
textcat.add_label(label)
nlp.add_pipe(textcat, last=True)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
# processing of an empty doc should result in 0.0 for all categories
doc = nlp("")
assert doc.cats["offensive"] == 0.0
assert doc.cats["inoffensive"] == 0.0

View File

@ -1,85 +0,0 @@
import spacy
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.util import ensure_path
from spacy.pipeline.defaults import default_ner
from ..util import make_tempdir
def test_issue4042():
"""Test that serialization of an EntityRuler before NER works fine."""
nlp = English()
# add ner pipe
ner = nlp.create_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.add_pipe(ner)
nlp.begin_training()
# Add entity ruler
ruler = EntityRuler(nlp)
patterns = [
{"label": "MY_ORG", "pattern": "Apple"},
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler, before="ner") # works fine with "after"
doc1 = nlp("What do you think about Apple ?")
assert doc1.ents[0].label_ == "MY_ORG"
with make_tempdir() as d:
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
nlp2 = spacy.load(output_dir)
doc2 = nlp2("What do you think about Apple ?")
assert doc2.ents[0].label_ == "MY_ORG"
def test_issue4042_bug2():
"""
Test that serialization of an NER works fine when new labels were added.
This is the second bug of two bugs underlying the issue 4042.
"""
nlp1 = English()
vocab = nlp1.vocab
# add ner pipe
ner1 = nlp1.create_pipe("ner")
ner1.add_label("SOME_LABEL")
nlp1.add_pipe(ner1)
nlp1.begin_training()
# add a new label to the doc
doc1 = nlp1("What do you think about Apple ?")
assert len(ner1.labels) == 1
assert "SOME_LABEL" in ner1.labels
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
doc1.ents = list(doc1.ents) + [apple_ent]
# reapply the NER - at this point it should resize itself
ner1(doc1)
assert len(ner1.labels) == 2
assert "SOME_LABEL" in ner1.labels
assert "MY_ORG" in ner1.labels
with make_tempdir() as d:
# assert IO goes fine
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner2 = EntityRecognizer(vocab, default_ner(), **config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2

View File

@ -1,30 +0,0 @@
from spacy.vocab import Vocab
import spacy
from spacy.lang.en import English
from spacy.util import ensure_path
from ..util import make_tempdir
def test_issue4054(en_vocab):
"""Test that a new blank model can be made with a vocab from file,
and that serialization does not drop the language at any point."""
nlp1 = English()
vocab1 = nlp1.vocab
with make_tempdir() as d:
vocab_dir = ensure_path(d / "vocab")
if not vocab_dir.exists():
vocab_dir.mkdir()
vocab1.to_disk(vocab_dir)
vocab2 = Vocab().from_disk(vocab_dir)
print("lang", vocab2.lang)
nlp2 = spacy.blank("en", vocab=vocab2)
nlp_dir = ensure_path(d / "nlp")
if not nlp_dir.exists():
nlp_dir.mkdir()
nlp2.to_disk(nlp_dir)
nlp3 = spacy.load(nlp_dir)
assert nlp3.lang == "en"

View File

@ -1,23 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc
def test_issue4120(en_vocab):
"""Test that matches without a final {OP: ?} token are returned."""
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
doc1 = Doc(en_vocab, words=["a"])
assert len(matcher(doc1)) == 1 # works
doc2 = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc2)) == 2 # fixed
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc3)) == 2 # works
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc4)) == 3 # fixed

View File

@ -1,28 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.vocab import Vocab
def test_issue4133(en_vocab):
nlp = English()
vocab_bytes = nlp.vocab.to_bytes()
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
doc = Doc(en_vocab, words=words)
for i, token in enumerate(doc):
token.pos_ = pos[i]
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
doc_bytes = doc.to_bytes()
vocab = Vocab()
vocab = vocab.from_bytes(vocab_bytes)
doc = Doc(vocab).from_bytes(doc_bytes)
actual = []
for token in doc:
actual.append(token.pos_)
assert actual == pos

View File

@ -1,46 +0,0 @@
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy import util
from ..util import make_tempdir
def test_issue4190():
test_string = "Test c."
# Load default language
nlp_1 = English()
doc_1a = nlp_1(test_string)
result_1a = [token.text for token in doc_1a] # noqa: F841
# Modify tokenizer
customize_tokenizer(nlp_1)
doc_1b = nlp_1(test_string)
result_1b = [token.text for token in doc_1b]
# Save and Reload
with make_tempdir() as model_dir:
nlp_1.to_disk(model_dir)
nlp_2 = util.load_model(model_dir)
# This should be the modified tokenizer
doc_2 = nlp_2(test_string)
result_2 = [token.text for token in doc_2]
assert result_1b == result_2
def customize_tokenizer(nlp):
prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
exceptions = {
k: v
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
if not (len(k) == 2 and k[1] == ".")
}
new_tokenizer = Tokenizer(
nlp.vocab,
exceptions,
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=nlp.tokenizer.token_match,
)
nlp.tokenizer = new_tokenizer

View File

@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
def test_issue4267():
""" Test that running an entity_ruler after ner gives consistent results"""
nlp = English()
ner = nlp.create_pipe("ner")
ner.add_label("PEOPLE")
nlp.add_pipe(ner)
nlp.begin_training()
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")
assert doc1.is_nered
for token in doc1:
assert token.ent_iob == 2
# add entity ruler and run again
ruler = EntityRuler(nlp)
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
assert "entity_ruler" in nlp.pipe_names
assert "ner" in nlp.pipe_names
# assert that we still have correct IOB annotations
doc2 = nlp("hi")
assert doc2.is_nered
for token in doc2:
assert token.ent_iob == 2

View File

@ -1,9 +0,0 @@
from spacy.lang.el import Greek
def test_issue4272():
"""Test that lookup table can be accessed from Token.lemma if no POS tags
are available."""
nlp = Greek()
doc = nlp("Χθες")
assert doc[0].lemma_

View File

@ -1,25 +0,0 @@
import pytest
from spacy.language import Language
from spacy.pipeline import Pipe
class DummyPipe(Pipe):
def __init__(self):
self.model = "dummy_model"
def predict(self, docs):
return ([1, 2, 3], [4, 5, 6])
def set_annotations(self, docs, scores, tensors=None):
return docs
@pytest.fixture
def nlp():
return Language()
def test_multiple_predictions(nlp):
doc = nlp.make_doc("foo")
dummy_pipe = DummyPipe()
dummy_pipe(doc)

View File

@ -1,47 +0,0 @@
from collections import defaultdict
import pytest
from spacy.pipeline.defaults import default_ner
from spacy.pipeline import EntityRecognizer
from spacy.lang.en import English
from spacy.tokens import Span
# skipped after removing Beam stuff during the Example/GoldParse refactor
@pytest.mark.skip
def test_issue4313():
""" This should not crash or exit with some strange error code """
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {
"learn_tokens": False,
"min_action_freq": 30,
"beam_width": 1,
"beam_update_prob": 1.0,
}
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
ner.add_label("SOME_LABEL")
ner.begin_training([])
nlp.add_pipe(ner)
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1
assert "SOME_LABEL" in ner.labels
apple_ent = Span(doc, 5, 6, label="MY_ORG")
doc.ents = list(doc.ents) + [apple_ent]
# ensure the beam_parse still works with the new label
docs = [doc]
beams = nlp.entity.beam_parse(
docs, beam_width=beam_width, beam_density=beam_density
)
for doc, beam in zip(docs, beams):
entity_scores = defaultdict(float)
for score, ents in nlp.entity.moves.get_beam_parses(beam):
for start, end, label in ents:
entity_scores[(start, end, label)] += score

View File

@ -1,24 +0,0 @@
from spacy.gold import Example
from spacy.lang.en import English
from spacy.util import minibatch
from thinc.api import compounding
import pytest
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
"""Test that training the tagger with empty data, doesn't throw errors"""
nlp = English()
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
TRAIN_DATA = [example, example]
tagger = nlp.create_pipe("tagger")
nlp.add_pipe(tagger)
optimizer = nlp.begin_training()
for i in range(5):
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)

View File

@ -1,8 +0,0 @@
from spacy.tokens import DocBin
def test_issue4367():
"""Test that docbin init goes well"""
DocBin()
DocBin(attrs=["LEMMA"])
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])

View File

@ -1,10 +0,0 @@
from spacy.matcher import Matcher, PhraseMatcher
from spacy.vocab import Vocab
def test_issue4373():
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
matcher = Matcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
matcher = PhraseMatcher(Vocab())
assert isinstance(matcher.vocab, Vocab)

View File

@ -1,98 +0,0 @@
from spacy.gold import Corpus
from spacy.lang.en import English
from ..util import make_tempdir
from ...gold.converters import json2docs
from ...tokens import DocBin
def test_issue4402():
nlp = English()
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
with make_tempdir() as tmpdir:
output_file = tmpdir / "test4402.spacy"
docs = json2docs([json_data])
data = DocBin(docs=docs, attrs=attrs).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
train_data = list(corpus.train_dataset(nlp))
assert len(train_data) == 2
split_train_data = []
for eg in train_data:
split_train_data.extend(eg.split_sents())
assert len(split_train_data) == 4
json_data = {
"id": 0,
"paragraphs": [
{
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "How", "ner": "O"},
{"id": 1, "orth": "should", "ner": "O"},
{"id": 2, "orth": "I", "ner": "O"},
{"id": 3, "orth": "cook", "ner": "O"},
{"id": 4, "orth": "bacon", "ner": "O"},
{"id": 5, "orth": "in", "ner": "O"},
{"id": 6, "orth": "an", "ner": "O"},
{"id": 7, "orth": "oven", "ner": "O"},
{"id": 8, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{
"tokens": [
{"id": 9, "orth": "\n", "ner": "O"},
{"id": 10, "orth": "I", "ner": "O"},
{"id": 11, "orth": "'ve", "ner": "O"},
{"id": 12, "orth": "heard", "ner": "O"},
{"id": 13, "orth": "of", "ner": "O"},
{"id": 14, "orth": "people", "ner": "O"},
{"id": 15, "orth": "cooking", "ner": "O"},
{"id": 16, "orth": "bacon", "ner": "O"},
{"id": 17, "orth": "in", "ner": "O"},
{"id": 18, "orth": "an", "ner": "O"},
{"id": 19, "orth": "oven", "ner": "O"},
{"id": 20, "orth": ".", "ner": "O"},
],
"brackets": [],
},
],
"cats": [
{"label": "baking", "value": 1.0},
{"label": "not_baking", "value": 0.0},
],
},
{
"raw": "What is the difference between white and brown eggs?\n",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "What", "ner": "O"},
{"id": 1, "orth": "is", "ner": "O"},
{"id": 2, "orth": "the", "ner": "O"},
{"id": 3, "orth": "difference", "ner": "O"},
{"id": 4, "orth": "between", "ner": "O"},
{"id": 5, "orth": "white", "ner": "O"},
{"id": 6, "orth": "and", "ner": "O"},
{"id": 7, "orth": "brown", "ner": "O"},
{"id": 8, "orth": "eggs", "ner": "O"},
{"id": 9, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
],
"cats": [
{"label": "baking", "value": 0.0},
{"label": "not_baking", "value": 1.0},
],
},
],
}

View File

@ -0,0 +1,288 @@
import pytest
from mock import Mock
from spacy.pipeline import EntityRuler
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example
from spacy.gold.converters.conllu2docs import conllu2docs
from spacy.lang.en import English
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.util import ensure_path, load_model_from_path
import numpy
import pickle
from ..util import get_doc, make_tempdir
def test_issue4528(en_vocab):
"""Test that user_data is correctly serialized in DocBin."""
doc = Doc(en_vocab, words=["hello", "world"])
doc.user_data["foo"] = "bar"
# This is how extension attribute values are stored in the user data
doc.user_data[("._.", "foo", None, None)] = "bar"
doc_bin = DocBin(store_user_data=True)
doc_bin.add(doc)
doc_bin_bytes = doc_bin.to_bytes()
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
assert new_doc.user_data["foo"] == "bar"
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
@pytest.mark.parametrize(
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
doc = en_tokenizer(text)
Example.from_dict(doc, {"words": words})
def test_issue4590(en_vocab):
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
pattern = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
]
on_match = Mock()
matcher = DependencyMatcher(en_vocab)
matcher.add("pattern", on_match, pattern)
text = "The quick brown fox jumped over the lazy fox"
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
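# (heads here are offsets relative to each token's own index: 0 means the token
# heads itself and negative values point backwards; the get_doc test helper
# converts them to absolute indices.)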
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
matches = matcher(doc)
on_match_args = on_match.call_args
assert on_match_args[0][3] == matches
def test_issue4651_with_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded
def test_issue4651_without_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
not specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp)
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded
def test_issue4665():
"""
conllu2docs should not raise an exception if the HEAD column contains an
underscore
"""
input_data = """
1 [ _ PUNCT -LRB- _ _ punct _ _
2 This _ DET DT _ _ det _ _
3 killing _ NOUN NN _ _ nsubj _ _
4 of _ ADP IN _ _ case _ _
5 a _ DET DT _ _ det _ _
6 respected _ ADJ JJ _ _ amod _ _
7 cleric _ NOUN NN _ _ nmod _ _
8 will _ AUX MD _ _ aux _ _
9 be _ AUX VB _ _ aux _ _
10 causing _ VERB VBG _ _ root _ _
11 us _ PRON PRP _ _ iobj _ _
12 trouble _ NOUN NN _ _ dobj _ _
13 for _ ADP IN _ _ case _ _
14 years _ NOUN NNS _ _ nmod _ _
15 to _ PART TO _ _ mark _ _
16 come _ VERB VB _ _ acl _ _
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
conllu2docs(input_data)
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
with pytest.warns(UserWarning):
kb.set_entities(
entity_list=["Q1", "Q1"],
freq_list=[32, 111],
vector_list=[vector1, vector2],
)
assert kb.get_size_entities() == 1
# dumping to file & loading back in
with make_tempdir() as d:
dir_path = ensure_path(d)
if not dir_path.exists():
dir_path.mkdir()
file_path = dir_path / "kb"
kb.dump(str(file_path))
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb2.load_bulk(str(file_path))
assert kb2.get_size_entities() == 1
def test_issue4707():
"""Tests that disabled component names are also excluded from nlp.from_disk
by default when loading a model.
"""
nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
exclude = ["tokenizer", "sentencizer"]
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir, exclude=exclude)
new_nlp = load_model_from_path(tmpdir, disable=exclude)
assert "sentencizer" not in new_nlp.pipe_names
assert "entity_ruler" in new_nlp.pipe_names
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_1():
""" Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector")
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
with make_tempdir() as tmp_path:
with (tmp_path / "ner.pkl").open("wb") as file_:
pickle.dump(ner, file_)
assert ner.cfg["min_action_freq"] == 342
with (tmp_path / "ner.pkl").open("rb") as file_:
ner2 = pickle.load(file_)
assert ner2.cfg["min_action_freq"] == 342
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_2():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
vocab = Vocab(vectors_name="test_vocab_add_vector")
data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0
data[1] = 2.0
vocab.set_vector("cat", data[0])
vocab.set_vector("dog", data[1])
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
nlp.begin_training()
docs = ["Kurt is in London."] * 10
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
pass
def test_issue4849():
nlp = English()
ruler = EntityRuler(
nlp,
patterns=[
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
],
phrase_matcher_attr="LOWER",
)
nlp.add_pipe(ruler)
text = """
The left is starting to take aim at Democratic front-runner Joe Biden.
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
"""
# USING 1 PROCESS
count_ents = 0
for doc in nlp.pipe([text], n_process=1):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
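# (ent_id is non-zero only for entities the ruler assigned an "id" to, so this
# counts just the pattern-matched spans that carry an ID.)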
assert count_ents == 2
# USING 2 PROCESSES
count_ents = 0
for doc in nlp.pipe([text], n_process=2):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2
class CustomPipe:
name = "my_pipe"
def __init__(self):
Span.set_extension("my_ext", getter=self._get_my_ext)
Doc.set_extension("my_ext", default=None)
def __call__(self, doc):
gathered_ext = []
for sent in doc.sents:
sent_ext = self._get_my_ext(sent)
sent._.set("my_ext", sent_ext)
gathered_ext.append(sent_ext)
doc._.set("my_ext", "\n".join(gathered_ext))
return doc
@staticmethod
def _get_my_ext(span):
return str(span.end)
def test_issue4903():
"""Ensure that this runs correctly and doesn't hang or crash on Windows /
macOS."""
nlp = English()
custom_component = CustomPipe()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(custom_component, after="sentencizer")
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
docs = list(nlp.pipe(text, n_process=2))
assert docs[0].text == "I like bananas."
assert docs[1].text == "Do you like them?"
assert docs[2].text == "No, I prefer wasabi."
def test_issue4924():
nlp = Language()
example = Example.from_dict(nlp.make_doc(""), {})
nlp.evaluate([example])

View File

@ -1,16 +0,0 @@
from spacy.tokens import Doc, DocBin
def test_issue4528(en_vocab):
"""Test that user_data is correctly serialized in DocBin."""
doc = Doc(en_vocab, words=["hello", "world"])
doc.user_data["foo"] = "bar"
# This is how extension attribute values are stored in the user data
doc.user_data[("._.", "foo", None, None)] = "bar"
doc_bin = DocBin(store_user_data=True)
doc_bin.add(doc)
doc_bin_bytes = doc_bin.to_bytes()
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
assert new_doc.user_data["foo"] == "bar"
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"

View File

@ -1,11 +0,0 @@
import pytest
from spacy.gold import Example
@pytest.mark.parametrize(
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
doc = en_tokenizer(text)
Example.from_dict(doc, {"words": words})

View File

@ -1,35 +0,0 @@
from mock import Mock
from spacy.matcher import DependencyMatcher
from ..util import get_doc
def test_issue4590(en_vocab):
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
pattern = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
]
on_match = Mock()
matcher = DependencyMatcher(en_vocab)
matcher.add("pattern", on_match, pattern)
text = "The quick brown fox jumped over the lazy fox"
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
matches = matcher(doc)
on_match_args = on_match.call_args
assert on_match_args[0][3] == matches

View File

@ -1,62 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from ..util import make_tempdir
def test_issue4651_with_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded
def test_issue4651_without_phrase_matcher_attr():
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
the method from_disk when the EntityRuler argument phrase_matcher_attr is
not specified.
"""
text = "Spacy is a python library for nlp"
nlp = English()
ruler = EntityRuler(nlp)
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
nlp_reloaded = English()
with make_tempdir() as d:
file_path = d / "entityruler"
ruler.to_disk(file_path)
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
nlp_reloaded.add_pipe(ruler_reloaded)
doc_reloaded = nlp_reloaded(text)
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
assert res == res_reloaded

View File

@ -1,35 +0,0 @@
import pytest
# TODO
# from spacy.gold.converters.conllu2docs import conllu2docs
input_data = """
1 [ _ PUNCT -LRB- _ _ punct _ _
2 This _ DET DT _ _ det _ _
3 killing _ NOUN NN _ _ nsubj _ _
4 of _ ADP IN _ _ case _ _
5 a _ DET DT _ _ det _ _
6 respected _ ADJ JJ _ _ amod _ _
7 cleric _ NOUN NN _ _ nmod _ _
8 will _ AUX MD _ _ aux _ _
9 be _ AUX VB _ _ aux _ _
10 causing _ VERB VBG _ _ root _ _
11 us _ PRON PRP _ _ iobj _ _
12 trouble _ NOUN NN _ _ dobj _ _
13 for _ ADP IN _ _ case _ _
14 years _ NOUN NNS _ _ nmod _ _
15 to _ PART TO _ _ mark _ _
16 come _ VERB VB _ _ acl _ _
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
@pytest.mark.xfail
def test_issue4665():
"""
conllu2json should not raise an exception if the HEAD column contains an
underscore
"""
pass
# conllu2json(input_data)

View File

@ -1,36 +0,0 @@
import pytest
from spacy.kb import KnowledgeBase
from spacy.util import ensure_path
from spacy.lang.en import English
from ..util import make_tempdir
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
with pytest.warns(UserWarning):
kb.set_entities(
entity_list=["Q1", "Q1"],
freq_list=[32, 111],
vector_list=[vector1, vector2],
)
assert kb.get_size_entities() == 1
# dumping to file & loading back in
with make_tempdir() as d:
dir_path = ensure_path(d)
if not dir_path.exists():
dir_path.mkdir()
file_path = dir_path / "kb"
kb.dump(str(file_path))
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb2.load_bulk(str(file_path))
assert kb2.get_size_entities() == 1

View File

@ -1,20 +0,0 @@
from spacy.util import load_model_from_path
from spacy.lang.en import English
from ..util import make_tempdir
def test_issue4707():
"""Tests that disabled component names are also excluded from nlp.from_disk
by default when loading a model.
"""
nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
exclude = ["tokenizer", "sentencizer"]
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir, exclude=exclude)
new_nlp = load_model_from_path(tmpdir, disable=exclude)
assert "sentencizer" not in new_nlp.pipe_names
assert "entity_ruler" in new_nlp.pipe_names

View File

@ -1,41 +0,0 @@
import pickle
import numpy
from spacy.lang.en import English
from spacy.vocab import Vocab
from spacy.tests.util import make_tempdir
def test_pickle_ner():
""" Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector")
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
with make_tempdir() as tmp_path:
with (tmp_path / "ner.pkl").open("wb") as file_:
pickle.dump(ner, file_)
assert ner.cfg["min_action_freq"] == 342
with (tmp_path / "ner.pkl").open("rb") as file_:
ner2 = pickle.load(file_)
assert ner2.cfg["min_action_freq"] == 342
def test_issue4725():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
vocab = Vocab(vectors_name="test_vocab_add_vector")
data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0
data[1] = 2.0
vocab.set_vector("cat", data[0])
vocab.set_vector("dog", data[1])
nlp = English(vocab=vocab)
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
nlp.begin_training()
docs = ["Kurt is in London."] * 10
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
pass

View File

@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
def test_issue4849():
nlp = English()
ruler = EntityRuler(
nlp,
patterns=[
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
],
phrase_matcher_attr="LOWER",
)
nlp.add_pipe(ruler)
text = """
The left is starting to take aim at Democratic front-runner Joe Biden.
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
"""
# USING 1 PROCESS
count_ents = 0
for doc in nlp.pipe([text], n_process=1):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2
# USING 2 PROCESSES
count_ents = 0
for doc in nlp.pipe([text], n_process=2):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert count_ents == 2

View File

@ -1,40 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Span, Doc
class CustomPipe:
name = "my_pipe"
def __init__(self):
Span.set_extension("my_ext", getter=self._get_my_ext)
Doc.set_extension("my_ext", default=None)
def __call__(self, doc):
gathered_ext = []
for sent in doc.sents:
sent_ext = self._get_my_ext(sent)
sent._.set("my_ext", sent_ext)
gathered_ext.append(sent_ext)
doc._.set("my_ext", "\n".join(gathered_ext))
return doc
@staticmethod
def _get_my_ext(span):
return str(span.end)
def test_issue4903():
# ensures that this runs correctly and doesn't hang or crash on Windows / macOS
nlp = English()
custom_component = CustomPipe()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(custom_component, after="sentencizer")
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
docs = list(nlp.pipe(text, n_process=2))
assert docs[0].text == "I like bananas."
assert docs[1].text == "Do you like them?"
assert docs[2].text == "No, I prefer wasabi."

View File

@ -1,8 +0,0 @@
from spacy.gold import Example
from spacy.language import Language
def test_issue4924():
nlp = Language()
example = Example.from_dict(nlp.make_doc(""), {})
nlp.evaluate([example])

View File

@ -1,6 +1,8 @@
import pytest
from spacy.lang.en import English
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue5152():
# Test that comparing a Span with a Token works correctly
# There was a bug when the number of tokens in the span equaled the number of characters in the token
@ -8,7 +10,6 @@ def test_issue5152():
text = nlp("Talk about being boring!")
text_var = nlp("Talk of being boring!")
y = nlp("Let")
span = text[0:3] # Talk about being
span_2 = text[0:3] # Talk about being
span_3 = text_var[0:3] # Talk of being

View File

@ -63,7 +63,8 @@ def tagger():
# need to add a model for two reasons:
# 1. no model leads to an error in serialization,
# 2. the affected line is the one for model serialization
tagger.begin_training(pipeline=nlp.pipeline)
with pytest.warns(UserWarning):
tagger.begin_training(pipeline=nlp.pipeline)
return tagger

View File

@ -1,10 +1,11 @@
from spacy.errors import AlignmentError
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align
from spacy.gold import spans_from_biluo_tags, iob_to_biluo
from spacy.gold import Corpus, docs_to_json
from spacy.gold.example import Example
from spacy.gold.converters import json2docs
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Doc, DocBin
from spacy.util import get_words_and_spaces, minibatch
from thinc.api import compounding
@ -271,75 +272,76 @@ def test_split_sentences(en_vocab):
assert split_examples[1].text == "had loads of fun "
@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
words = ["I", "flew to", "San Francisco Valley", "."]
spaces = [True, True, False, False]
words = ["Mr and ", "Mrs Smith", "flew to", "San Francisco Valley", "."]
spaces = [True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr and Mrs Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "U-LOC", "O"]
assert ner_tags == ["O", "O", "O", "U-LOC", "O"]
entities = [
(len("I "), len("I flew to"), "ORG"),
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "U-ORG", "U-LOC", "O"]
assert ner_tags == ["O", "U-PERSON", "O", "U-LOC", "O"]
entities = [
(len("I "), len("I flew"), "ORG"),
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
(len("Mr and "), len("Mr and Mrs"), "PERSON"), # "Mrs" is a Person
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
gold_words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", None, "U-LOC", "O"]
assert ner_tags == ["O", None, "O", "U-LOC", "O"]
def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
words = ["Mr and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
spaces = [True, True, True, True, True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr and Mrs Smith", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
entities = [
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
gold_words = ["Mr and", "Mrs Smith", "flew to", "San Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "B-PERSON", "L-PERSON", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
words = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley", "."]
spaces = [True, True, True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
gold_words = ["I", "flew to", "San Francisco Valley", "."]
prefix = "Mr and Mrs Smith flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["Mr", "and Mrs Smith", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]
entities = [
(len("I "), len("I flew to"), "ORG"),
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
(len("Mr and "), len("Mr and Mrs Smith"), "PERSON"), # "Mrs Smith" is a PERSON
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
gold_words = ["I", "flew to", "San Francisco Valley", "."]
gold_words = ["Mr and", "Mrs Smith", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "B-ORG", "L-ORG", "B-LOC", "I-LOC", "L-LOC", "O"]
@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
words = ["I flew", "to", "San Francisco", "Valley", "."]
spaces = [True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == ["O", "O", "B-LOC", "L-LOC", "O"]
entities = [
(len("I "), len("I flew to"), "ORG"),
(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
]
gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
ner_tags = example.get_aligned_ner()
assert ner_tags == [None, None, "B-LOC", "L-LOC", "O"]
assert ner_tags == [None, None, "O", "O", "B-LOC", "L-LOC", "O"]
def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
@ -349,7 +351,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
"I flew to San Francisco Valley.",
)
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
prefix = "I flew to "
entities = [(len(prefix), len(prefix + "San Francisco Valley"), "LOC")]
gold_words = ["I", "flew", " ", "to", "San Francisco Valley", "."]
gold_spaces = [True, True, False, True, False, False]
example = Example.from_dict(
@ -405,6 +408,49 @@ def test_biluo_spans(en_tokenizer):
assert spans[1].label_ == "GPE"
def test_aligned_spans_y2x(en_vocab, en_tokenizer):
words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
spaces = [True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
prefix = "Mr and Mrs Smith flew to "
entities = [
(0, len("Mr and Mrs Smith"), "PERSON"),
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
ents_ref = example.reference.ents
assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)]
ents_y2x = example.get_aligned_spans_y2x(ents_ref)
assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)]
def test_aligned_spans_x2y(en_vocab, en_tokenizer):
text = "Mr and Mrs Smith flew to San Francisco Valley"
nlp = English()
ruler = EntityRuler(nlp)
patterns = [{"label": "PERSON", "pattern": "Mr and Mrs Smith"},
{"label": "LOC", "pattern": "San Francisco Valley"}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(text)
assert [(ent.start, ent.end) for ent in doc.ents] == [(0, 4), (6, 9)]
prefix = "Mr and Mrs Smith flew to "
entities = [
(0, len("Mr and Mrs Smith"), "PERSON"),
(len(prefix), len(prefix + "San Francisco Valley"), "LOC"),
]
tokens_ref = ["Mr and Mrs", "Smith", "flew", "to", "San Francisco", "Valley"]
example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})
assert [(ent.start, ent.end) for ent in example.reference.ents] == [(0, 2), (4, 6)]
# Ensure that 'get_aligned_spans_x2y' returns the correctly aligned entities
ents_pred = example.predicted.ents
assert [(ent.start, ent.end) for ent in ents_pred] == [(0, 4), (6, 9)]
ents_x2y = example.get_aligned_spans_x2y(ents_pred)
assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)]
def test_gold_ner_missing_tags(en_tokenizer):
doc = en_tokenizer("I flew to Silicon Valley via London.")
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
@ -412,6 +458,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2]
def test_projectivize(en_tokenizer):
doc = en_tokenizer("He pretty quickly walks away")
heads = [3, 2, 3, 0, 2]
example = Example.from_dict(doc, {"heads": heads})
proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
assert proj_heads == [3, 2, 3, 0, 3]
assert nonproj_heads == [3, 2, 3, 0, 2]
def test_iob_to_biluo():
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
@ -514,6 +570,7 @@ def test_make_orth_variants(doc):
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
@pytest.mark.skip("Outdated")
@pytest.mark.parametrize(
"tokens_a,tokens_b,expected",
[
@ -537,12 +594,12 @@ def test_make_orth_variants(doc):
([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
],
)
def test_align(tokens_a, tokens_b, expected):
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)
assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected
def test_align(tokens_a, tokens_b, expected): # noqa
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b) # noqa
assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected # noqa
# check symmetry
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)
assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) # noqa
assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected # noqa
def test_goldparse_startswith_space(en_tokenizer):
@ -556,7 +613,7 @@ def test_goldparse_startswith_space(en_tokenizer):
doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads}
)
ner_tags = example.get_aligned_ner()
assert ner_tags == [None, "U-DATE"]
assert ner_tags == ["O", "U-DATE"]
assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"]

View File

@ -55,7 +55,7 @@ def test_aligned_tags():
predicted = Doc(vocab, words=pred_words)
example = Example.from_dict(predicted, annots)
aligned_tags = example.get_aligned("tag", as_string=True)
assert aligned_tags == ["VERB", "DET", None, "SCONJ", "PRON", "VERB", "VERB"]
assert aligned_tags == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
def test_aligned_tags_multi():

View File

@ -0,0 +1,31 @@
import pytest
from spacy.cli.project.util import validate_project_commands
from spacy.schemas import ProjectConfigSchema, validate
@pytest.mark.parametrize(
"config",
[
{"commands": [{"name": "a"}, {"name": "a"}]},
{"commands": [{"name": "a"}], "workflows": {"a": []}},
{"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
],
)
def test_project_config_validation1(config):
with pytest.raises(SystemExit):
validate_project_commands(config)
@pytest.mark.parametrize(
"config,n_errors",
[
({"commands": {"a": []}}, 1),
({"commands": [{"help": "..."}]}, 1),
({"commands": [{"name": "a", "extra": "b"}]}, 1),
({"commands": [{"extra": "b"}]}, 2),
({"commands": [{"name": "a", "deps": [123]}]}, 1),
],
)
def test_project_config_validation2(config, n_errors):
errors = validate(ProjectConfigSchema, config)
assert len(errors) == n_errors

View File

@ -449,6 +449,16 @@ def split_command(command: str) -> List[str]:
return shlex.split(command, posix=not is_windows)
def join_command(command: List[str]) -> str:
"""Join a command using shlex. shlex.join is only available for Python 3.8+,
so we're using a workaround here.
command (List[str]): The command to join.
RETURNS (str): The joined command
"""
return " ".join(shlex.quote(cmd) for cmd in command)
def run_command(command: Union[str, List[str]]) -> None:
"""Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed.
@ -520,6 +530,15 @@ def get_checksum(path: Union[Path, str]) -> str:
return hashlib.md5(Path(path).read_bytes()).hexdigest()
def is_cwd(path: Union[Path, str]) -> bool:
"""Check whether a path is the current working directory.
path (Union[Path, str]): The directory path.
RETURNS (bool): Whether the path is the current working directory.
"""
return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()
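A similar sketch for is_cwd (same assumption about the import path; also assumes the working directory is not the filesystem root):
# usage sketch; assumes the helper above is importable from spacy.util
from pathlib import Path
from spacy.util import is_cwd

assert is_cwd(Path.cwd())             # the current working directory matches itself
assert not is_cwd(Path.cwd().parent)  # a different directory does not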
def is_in_jupyter():
"""Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer.