Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-08-05 21:00:19 +03:00 · 2020-07-09 19:34:11 +02:00 · 2020-07-09 19:34:11 +02:00 · 7010f1a2be
commit 7010f1a2be
parent 0becc5954b 8f9552d9e7
14 changed files with 856 additions and 809 deletions
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -15,8 +15,10 @@ from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_model import init_model  # noqa: F401
 from .validate import validate  # noqa: F401
-from .project import project_clone, project_assets, project_run  # noqa: F401
-from .project import project_run_all  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401


@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
--- a/spacy/cli/_app.py
+++ b/spacy/cli/_app.py
@@ -8,9 +8,16 @@ HELP = """spaCy Command-line Interface

 DOCS: https://spacy.io/api/cli
 """
+PROJECT_HELP = """Command-line interface for spaCy projects and working with
+project templates. You'd typically start by cloning a project template to a local
+directory and fetching its assets like datasets etc. See the project's
+project.yml for the available commands.
+"""


 app = typer.Typer(name=NAME, help=HELP)
+project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
+app.add_typer(project_cli)

 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
--- a/spacy/cli/project.py
+++ b/spacy/cli/project.py
@@ -1,805 +0,0 @@
-from typing import List, Dict, Any, Optional, Sequence, Union
-import typer
-import srsly
-from pathlib import Path
-from wasabi import msg
-import subprocess
-import os
-import re
-import shutil
-import sys
-import requests
-import tqdm
-
-from ._app import app, Arg, Opt, COMMAND, NAME
-from .. import about
-from ..schemas import ProjectConfigSchema, validate
-from ..util import ensure_path, run_command, make_tempdir, working_dir
-from ..util import get_hash, get_checksum, split_command
-
-
-PROJECT_FILE = "project.yml"
-DVC_CONFIG = "dvc.yaml"
-DVC_DIR = ".dvc"
-DIRS = [
-    "assets",
-    "metas",
-    "configs",
-    "packages",
-    "metrics",
-    "scripts",
-    "notebooks",
-    "training",
-    "corpus",
-]
-CACHES = [
-    Path.home() / ".torch",
-    Path.home() / ".caches" / "torch",
-    os.environ.get("TORCH_HOME"),
-    Path.home() / ".keras",
-]
-DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. Do not edit
-# it directly and edit the {PROJECT_FILE} instead and re-run the project."""
-CLI_HELP = f"""Command-line interface for spaCy projects and working with project
-templates. You'd typically start by cloning a project template to a local
-directory and fetching its assets like datasets etc. See the project's
-{PROJECT_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
-Version Control) to manage input and output files and to ensure steps are only
-re-run if their inputs change.
-"""
-
-project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True)
-
-
-@project_cli.callback(invoke_without_command=True)
-def callback(ctx: typer.Context):
-    """This runs before every project command and ensures DVC is installed."""
-    ensure_dvc()
-
-
-################
-# CLI COMMANDS #
-################
-
-
-@project_cli.command("clone")
-def project_clone_cli(
-    # fmt: off
-    name: str = Arg(..., help="The name of the template to fetch"),
-    dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
-    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
-    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
-    no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
-    # fmt: on
-):
-    """Clone a project template from a repository. Calls into "git" and will
-    only download the files from the given subdirectory. The GitHub repo
-    defaults to the official spaCy template repo, but can be customized
-    (including using a private repo). Setting the --git flag will also
-    initialize the project directory as a Git repo. If the project is intended
-    to be a Git repo, it should be initialized with Git first, before
-    initializing DVC (Data Version Control). This allows DVC to integrate with
-    Git.
-    """
-    if dest == Path.cwd():
-        dest = dest / name
-    project_clone(name, dest, repo=repo, git=git, no_init=no_init)
-
-
-@project_cli.command("init")
-def project_init_cli(
-    # fmt: off
-    path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
-    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
-    force: bool = Opt(False, "--force", "-F", "-f", help="Force initiziation"),
-    # fmt: on
-):
-    """Initialize a project directory with DVC and optionally Git. This should
-    typically be taken care of automatically when you run the "project clone"
-    command, but you can also run it separately. If the project is intended to
-    be a Git repo, it should be initialized with Git first, before initializing
-    DVC. This allows DVC to integrate with Git.
-    """
-    project_init(path, git=git, force=force)
-
-
-@project_cli.command("assets")
-def project_assets_cli(
-    # fmt: off
-    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
-    # fmt: on
-):
-    """Use DVC (Data Version Control) to fetch project assets. Assets are
-    defined in the "assets" section of the project.yml. If possible, DVC
-    will try to track the files so you can pull changes from upstream. It will
-    also try and store the checksum so the assets are versioned. If the file
-    can't be tracked or checked, it will be downloaded without DVC. If a checksum
-    is provided in the project.yml, the file is only downloaded if no local
-    file with the same checksum exists.
-    """
-    project_assets(project_dir)
-
-
-@project_cli.command(
-    "run-all",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-)
-def project_run_all_cli(
-    # fmt: off
-    ctx: typer.Context,
-    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
-    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
-    # fmt: on
-):
-    """Run all commands defined in the project. This command will use DVC and
-    the defined outputs and dependencies in the project.yml to determine
-    which steps need to be re-run and where to start. This means you're only
-    re-generating data if the inputs have changed.
-
-    This command calls into "dvc repro" and all additional arguments are passed
-    to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
-    """
-    if show_help:
-        print_run_help(project_dir)
-    else:
-        project_run_all(project_dir, *ctx.args)
-
-
-@project_cli.command(
-    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-)
-def project_run_cli(
-    # fmt: off
-    ctx: typer.Context,
-    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
-    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
-    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
-    # fmt: on
-):
-    """Run a named script defined in the project.yml. If the command is
-    part of the default pipeline defined in the "run" section, DVC is used to
-    determine whether the step should re-run if its inputs have changed, or
-    whether everything is up to date. If the script is not part of the default
-    pipeline, it will be called separately without DVC.
-
-    If DVC is used, the command calls into "dvc repro" and all additional
-    arguments are passed to the "dvc repro" command:
-    https://dvc.org/doc/command-reference/repro
-    """
-    if show_help or not subcommand:
-        print_run_help(project_dir, subcommand)
-    else:
-        project_run(project_dir, subcommand, *ctx.args)
-
-
-@project_cli.command("exec", hidden=True)
-def project_exec_cli(
-    # fmt: off
-    subcommand: str = Arg(..., help=f"Name of command defined in the {PROJECT_FILE}"),
-    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
-    # fmt: on
-):
-    """Execute a command defined in the project.yml. This CLI command is
-    only called internally in auto-generated DVC pipelines, as a shortcut for
-    multi-step commands in the project.yml. You typically shouldn't have to
-    call it yourself. To run a command, call "run" or "run-all".
-    """
-    project_exec(project_dir, subcommand)
-
-
-@project_cli.command("update-dvc")
-def project_update_dvc_cli(
-    # fmt: off
-    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
-    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
-    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
-    # fmt: on
-):
-    """Update the auto-generated DVC config file. Uses the steps defined in the
-    "run" section of the project.yml. This typically happens automatically
-    when running a command, but can also be triggered manually if needed.
-    """
-    config = load_project_config(project_dir)
-    updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
-    if updated:
-        msg.good(f"Updated DVC config from {PROJECT_FILE}")
-    else:
-        msg.info(f"No changes found in {PROJECT_FILE}, no update needed")
-
-
-app.add_typer(project_cli, name="project")
-
-
-#################
-# CLI FUNCTIONS #
-#################
-
-
-def project_clone(
-    name: str,
-    dest: Path,
-    *,
-    repo: str = about.__projects__,
-    git: bool = False,
-    no_init: bool = False,
-) -> None:
-    """Clone a project template from a repository.
-
-    name (str): Name of subdirectory to clone.
-    dest (Path): Destination path of cloned project.
-    repo (str): URL of Git repo containing project templates.
-    git (bool): Initialize project as Git repo. Should be set to True if project
-        is intended as a repo, since it will allow DVC to integrate with Git.
-    no_init (bool): Don't initialize DVC and Git automatically. If True, the
-        "init" command or "git init" and "dvc init" need to be run manually.
-    """
-    dest = ensure_path(dest)
-    check_clone(name, dest, repo)
-    project_dir = dest.resolve()
-    # We're using Git and sparse checkout to only clone the files we need
-    with make_tempdir() as tmp_dir:
-        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
-        try:
-            run_command(cmd)
-        except DVCError:
-            err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
-            msg.fail(err)
-        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
-            f.write(name)
-        try:
-            run_command(["git", "-C", str(tmp_dir), "fetch"])
-            run_command(["git", "-C", str(tmp_dir), "checkout"])
-        except DVCError:
-            err = f"Could not clone '{name}' in the repo '{repo}'."
-            msg.fail(err)
-        shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
-    msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
-    for sub_dir in DIRS:
-        dir_path = project_dir / sub_dir
-        if not dir_path.exists():
-            dir_path.mkdir(parents=True)
-    if not no_init:
-        project_init(project_dir, git=git, force=True, silent=True)
-    msg.good(f"Your project is now ready!", dest)
-    print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
-
-
-def project_init(
-    project_dir: Path,
-    *,
-    git: bool = False,
-    force: bool = False,
-    silent: bool = False,
-    analytics: bool = False,
-):
-    """Initialize a project as a DVC and (optionally) as a Git repo.
-
-    project_dir (Path): Path to project directory.
-    git (bool): Also call "git init" to initialize directory as a Git repo.
-    silent (bool): Don't print any output (via DVC).
-    analytics (bool): Opt-in to DVC analytics (defaults to False).
-    """
-    with working_dir(project_dir) as cwd:
-        if git:
-            run_command(["git", "init"])
-        flags = {"--force": force, "--quiet": silent, "--no-scm": not git}
-        try:
-            run_dvc_command(["init"], flags=flags)
-        except DVCError:
-            msg.fail(
-                "Failed to initialize project. This likely means that the "
-                "project is already initialized and has a .dvc directory. "
-                "To force-initialize, use the --force flag.",
-                exits=1,
-            )
-        # We don't want to have analytics on by default – our users should
-        # opt-in explicitly. If they want it, they can always enable it.
-        if not analytics:
-            run_dvc_command(["config", "core.analytics", "false"])
-        # Remove unused and confusing plot templates from .dvc directory.
-        # Otherwise super confusing once you commit your changes via Git and it
-        # creates a bunch of files that have no purpose.
-        plots_dir = cwd / DVC_DIR / "plots"
-        if plots_dir.exists():
-            shutil.rmtree(str(plots_dir))
-        config = load_project_config(cwd)
-        setup_check_dvc(cwd, config)
-    msg.good("Initialized project")
-
-
-def project_assets(project_dir: Path) -> None:
-    """Fetch assets for a project using DVC if possible.
-
-    project_dir (Path): Path to project directory.
-    """
-    project_path = ensure_path(project_dir)
-    config = load_project_config(project_path)
-    setup_check_dvc(project_path, config)
-    assets = config.get("assets", {})
-    if not assets:
-        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
-    msg.info(f"Fetching {len(assets)} asset(s)")
-    variables = config.get("variables", {})
-    fetched_assets = []
-    for asset in assets:
-        dest = asset["dest"].format(**variables)
-        url = asset.get("url")
-        checksum = asset.get("checksum")
-        if not url:
-            # project.yml defines asset without URL that the user has to place
-            if not Path(dest).exists():
-                err = f"No URL provided for asset. You need to add this file yourself: {dest}"
-                msg.warn(err)
-            else:
-                if checksum == get_checksum(dest):
-                    msg.good(f"Asset exists with matching checksum: {dest}")
-                    fetched_assets.append((project_path / dest).resolve())
-                else:
-                    msg.fail(f"Asset available but with incorrect checksum: {dest}")
-            continue
-        url = url.format(**variables)
-        fetched_path = fetch_asset(project_path, url, dest, checksum)
-        if fetched_path:
-            fetched_assets.append(str(fetched_path))
-    if fetched_assets:
-        with working_dir(project_path):
-            run_dvc_command(["add", *fetched_assets, "--external"])
-
-
-def fetch_asset(
-    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
-) -> Optional[Path]:
-    """Fetch an asset from a given URL or path. Will try to import the file
-    using DVC's import-url if possible (fully tracked and versioned) and falls
-    back to get-url (versioned) and a non-DVC download if necessary. If a
-    checksum is provided and a local file exists, it's only re-downloaded if the
-    checksum doesn't match.
-
-    project_path (Path): Path to project directory.
-    url (str): URL or path to asset.
-    checksum (Optional[str]): Optional expected checksum of local file.
-    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
-        the asset failed.
-    """
-    url = convert_asset_url(url)
-    dest_path = (project_path / dest).resolve()
-    if dest_path.exists() and checksum:
-        # If there's already a file, check for checksum
-        # TODO: add support for caches (dvc import-url with local path)
-        if checksum == get_checksum(dest_path):
-            msg.good(f"Skipping download with matching checksum: {dest}")
-            return dest_path
-    with working_dir(project_path):
-        try:
-            # If these fail, we don't want to output an error or info message.
-            # Try with tracking the source first, then just downloading with
-            # DVC, then a regular non-DVC download.
-            try:
-                run_dvc_command(["import-url", url, str(dest_path)])
-            except DVCError:
-                run_dvc_command(["get-url", url, str(dest_path)])
-        except DVCError:
-            try:
-                download_file(url, dest_path)
-            except requests.exceptions.HTTPError as e:
-                msg.fail(f"Download failed: {dest}", e)
-                return None
-    if checksum and checksum != get_checksum(dest_path):
-        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
-    msg.good(f"Fetched asset {dest}")
-    return dest_path
-
-
-def project_run_all(project_dir: Path, *dvc_args) -> None:
-    """Run all commands defined in the project using DVC.
-
-    project_dir (Path): Path to project directory.
-    *dvc_args: Other arguments passed to "dvc repro".
-    """
-    config = load_project_config(project_dir)
-    setup_check_dvc(project_dir, config)
-    with working_dir(project_dir):
-        try:
-            run_dvc_command(["repro", *dvc_args])
-        except DVCError:
-            # We could raise a custom error here, but the output produced by
-            # DVC is already pretty substantial.
-            sys.exit(1)
-
-
-def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
-    """Simulate a CLI help prompt using the info available in the project.yml.
-
-    project_dir (Path): The project directory.
-    subcommand (Optional[str]): The subcommand or None. If a subcommand is
-        provided, the subcommand help is shown. Otherwise, the top-level help
-        and a list of available commands is printed.
-    """
-    config = load_project_config(project_dir)
-    setup_check_dvc(project_dir, config)
-    config_commands = config.get("commands", [])
-    commands = {cmd["name"]: cmd for cmd in config_commands}
-    if subcommand:
-        validate_subcommand(commands.keys(), subcommand)
-        print(f"Usage: {COMMAND} project run {subcommand} {project_dir}")
-        help_text = commands[subcommand].get("help")
-        if help_text:
-            msg.text(f"\n{help_text}\n")
-    else:
-        print(f"\nAvailable commands in {PROJECT_FILE}")
-        print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
-        msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
-        msg.text(f"Run all commands defined in the 'run' block of the {PROJECT_FILE}:")
-        print(f"{COMMAND} project run-all {project_dir}")
-
-
-def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
-    """Run a named script defined in the project.yml. If the script is part
-    of the default pipeline (defined in the "run" section), DVC is used to
-    execute the command, so it can determine whether to rerun it. It then
-    calls into "exec" to execute it.
-
-    project_dir (Path): Path to project directory.
-    subcommand (str): Name of command to run.
-    *dvc_args: Other arguments passed to "dvc repro".
-    """
-    config = load_project_config(project_dir)
-    setup_check_dvc(project_dir, config)
-    config_commands = config.get("commands", [])
-    variables = config.get("variables", {})
-    commands = {cmd["name"]: cmd for cmd in config_commands}
-    validate_subcommand(commands.keys(), subcommand)
-    if subcommand in config.get("run", []):
-        # This is one of the pipeline commands tracked in DVC
-        with working_dir(project_dir):
-            try:
-                run_dvc_command(["repro", subcommand, *dvc_args])
-            except DVCError:
-                # We could raise a custom error here, but the output produced by
-                # DVC is already pretty substantial.
-                sys.exit(1)
-    else:
-        cmd = commands[subcommand]
-        # Deps in non-DVC commands aren't tracked, but if they're defined,
-        # make sure they exist before running the command
-        for dep in cmd.get("deps", []):
-            if not (project_dir / dep).exists():
-                err = f"Missing dependency specified by command '{subcommand}': {dep}"
-                msg.fail(err, exits=1)
-        with working_dir(project_dir):
-            run_commands(cmd["script"], variables)
-
-
-def project_exec(project_dir: Path, subcommand: str) -> None:
-    """Execute a command defined in the project.yml.
-
-    project_dir (Path): Path to project directory.
-    subcommand (str): Name of command to run.
-    """
-    config = load_project_config(project_dir)
-    config_commands = config.get("commands", [])
-    variables = config.get("variables", {})
-    commands = {cmd["name"]: cmd for cmd in config_commands}
-    with working_dir(project_dir):
-        run_commands(commands[subcommand]["script"], variables)
-
-
-###########
-# HELPERS #
-###########
-
-
-def load_project_config(path: Path) -> Dict[str, Any]:
-    """Load the project.yml file from a directory and validate it.
-
-    path (Path): The path to the project directory.
-    RETURNS (Dict[str, Any]): The loaded project.yml.
-    """
-    config_path = path / PROJECT_FILE
-    if not config_path.exists():
-        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
-    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
-    try:
-        config = srsly.read_yaml(config_path)
-    except ValueError as e:
-        msg.fail(invalid_err, e, exits=1)
-    errors = validate(ProjectConfigSchema, config)
-    if errors:
-        msg.fail(invalid_err, "\n".join(errors), exits=1)
-    return config
-
-
-def update_dvc_config(
-    path: Path,
-    config: Dict[str, Any],
-    verbose: bool = False,
-    silent: bool = False,
-    force: bool = False,
-) -> bool:
-    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
-    project directory. The file is auto-generated based on the config. The
-    first line of the auto-generated file specifies the hash of the config
-    dict, so if any of the config values change, the DVC config is regenerated.
-
-    path (Path): The path to the project directory.
-    config (Dict[str, Any]): The loaded project.yml.
-    verbose (bool): Whether to print additional info (via DVC).
-    silent (bool): Don't output anything (via DVC).
-    force (bool): Force update, even if hashes match.
-    RETURNS (bool): Whether the DVC config file was updated.
-    """
-    config_hash = get_hash(config)
-    path = path.resolve()
-    dvc_config_path = path / DVC_CONFIG
-    if dvc_config_path.exists():
-        # Check if the file was generated using the current config, if not, redo
-        with dvc_config_path.open("r", encoding="utf8") as f:
-            ref_hash = f.readline().strip().replace("# ", "")
-        if ref_hash == config_hash and not force:
-            return False  # Nothing has changed in project.yml, don't need to update
-        dvc_config_path.unlink()
-    variables = config.get("variables", {})
-    dvc_commands = []
-    # We only want to include commands that are part of the main list of "run"
-    # commands in project.yml and should be run in sequence
-    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
-    for name in config.get("run", []):
-        validate_subcommand(config_commands.keys(), name)
-        command = config_commands[name]
-        deps = command.get("deps", [])
-        outputs = command.get("outputs", [])
-        outputs_no_cache = command.get("outputs_no_cache", [])
-        if not deps and not outputs and not outputs_no_cache:
-            continue
-        # Default to the working dir as the project path since dvc.yaml is auto-generated
-        # and we don't want arbitrary paths in there
-        project_cmd = ["python", "-m", NAME, "project", "exec", name]
-        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
-        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
-        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
-        dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
-        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
-        dvc_commands.append(" ".join(full_cmd))
-    with working_dir(path):
-        dvc_flags = {"--verbose": verbose, "--quiet": silent}
-        run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
-    with dvc_config_path.open("r+", encoding="utf8") as f:
-        content = f.read()
-        f.seek(0, 0)
-        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
-    return True
-
-
-def ensure_dvc() -> None:
-    """Ensure that the "dvc" command is available and show an error if not."""
-    try:
-        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
-    except Exception:
-        msg.fail(
-            "spaCy projects require DVC (Data Version Control) and the 'dvc' command",
-            "You can install the Python package from pip (pip install dvc) or "
-            "conda (conda install -c conda-forge dvc). For more details, see the "
-            "documentation: https://dvc.org/doc/install",
-            exits=1,
-        )
-
-
-def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
-    """Check that the project is set up correctly with DVC and update its
-    config if needed. Will raise an error if the project is not an initialized
-    DVC project.
-
-    project_dir (Path): The path to the project directory.
-    config (Dict[str, Any]): The loaded project.yml.
-    """
-    if not project_dir.exists():
-        msg.fail(f"Can't find project directory: {project_dir}")
-    if not (project_dir / ".dvc").exists():
-        msg.fail(
-            "Project not initialized as a DVC project.",
-            f"Make sure that the project template was cloned correctly. To "
-            f"initialize the project directory manually, you can run: "
-            f"{COMMAND} project init {project_dir}",
-            exits=1,
-        )
-    with msg.loading("Updating DVC config..."):
-        updated = update_dvc_config(project_dir, config, silent=True)
-    if updated:
-        msg.good(f"Updated DVC config from changed {PROJECT_FILE}")
-
-
-def convert_asset_url(url: str) -> str:
-    """Check and convert the asset URL if needed.
-
-    url (str): The asset URL.
-    RETURNS (str): The converted URL.
-    """
-    # If the asset URL is a regular GitHub URL it's likely a mistake
-    if re.match(r"(http(s?)):\/\/github.com", url):
-        converted = url.replace("github.com", "raw.githubusercontent.com")
-        converted = re.sub(r"/(tree|blob)/", "/", converted)
-        msg.warn(
-            "Downloading from a regular GitHub URL. This will only download "
-            "the source of the page, not the actual file. Converting the URL "
-            "to a raw URL.",
-            converted,
-        )
-        return converted
-    return url
-
-
-def check_clone(name: str, dest: Path, repo: str) -> None:
-    """Check and validate that the destination path can be used to clone. Will
-    check that Git is available and that the destination path is suitable.
-
-    name (str): Name of the directory to clone from the repo.
-    dest (Path): Local destination of cloned directory.
-    repo (str): URL of the repo to clone from.
-    """
-    try:
-        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
-    except Exception:
-        msg.fail(
-            f"Cloning spaCy project templates requires Git and the 'git' command. ",
-            f"To clone a project without Git, copy the files from the '{name}' "
-            f"directory in the {repo} to {dest} manually and then run:",
-            f"{COMMAND} project init {dest}",
-            exits=1,
-        )
-    if not dest:
-        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
-    if dest.exists():
-        # Directory already exists (not allowed, clone needs to create it)
-        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
-    if not dest.parent.exists():
-        # We're not creating parents, parent dir should exist
-        msg.fail(
-            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
-            exits=1,
-        )
-
-
-def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
-    """Check that a subcommand is valid and defined. Raises an error otherwise.
-
-    commands (Sequence[str]): The available commands.
-    subcommand (str): The subcommand.
-    """
-    if subcommand not in commands:
-        msg.fail(
-            f"Can't find command '{subcommand}' in {PROJECT_FILE}. "
-            f"Available commands: {', '.join(commands)}",
-            exits=1,
-        )
-
-
-def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
-    """Download a file using requests.
-
-    url (str): The URL of the file.
-    dest (Path): The destination path.
-    chunk_size (int): The size of chunks to read/write.
-    """
-    response = requests.get(url, stream=True)
-    response.raise_for_status()
-    total = int(response.headers.get("content-length", 0))
-    progress_settings = {
-        "total": total,
-        "unit": "iB",
-        "unit_scale": True,
-        "unit_divisor": chunk_size,
-        "leave": False,
-    }
-    with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
-        for data in response.iter_content(chunk_size=chunk_size):
-            size = f.write(data)
-            bar.update(size)
-
-
-def run_commands(
-    commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
-) -> None:
-    """Run a sequence of commands in a subprocess, in order.
-
-    commands (List[str]): The string commands.
-    variables (Dict[str, str]): Dictionary of variable names, mapped to their
-        values. Will be used to substitute format string variables in the
-        commands.
-    silent (bool): Don't print the commands.
-    """
-    for command in commands:
-        # Substitute variables, e.g. "./{NAME}.json"
-        command = command.format(**variables)
-        command = split_command(command)
-        # Not sure if this is needed or a good idea. Motivation: users may often
-        # use commands in their config that reference "python" and we want to
-        # make sure that it's always executing the same Python that spaCy is
-        # executed with and the pip in the same env, not some other Python/pip.
-        # Also ensures cross-compatibility if user 1 writes "python3" (because
-        # that's how it's set up on their system), and user 2 without the
-        # shortcut tries to re-run the command.
-        if len(command) and command[0] in ("python", "python3"):
-            command[0] = sys.executable
-        elif len(command) and command[0] in ("pip", "pip3"):
-            command = [sys.executable, "-m", "pip", *command[1:]]
-        if not silent:
-            print(f"Running command: {' '.join(command)}")
-        run_command(command)
-
-
-def run_dvc_commands(
-    commands: List[str] = tuple(),
-    variables: Dict[str, str] = {},
-    flags: Dict[str, bool] = {},
-) -> None:
-    """Run a sequence of DVC commands in a subprocess, in order.
-
-    commands (List[str]): The string commands without the leading "dvc".
-    variables (Dict[str, str]): Dictionary of variable names, mapped to their
-        values. Will be used to substitute format string variables in the
-        commands.
-    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
-        easier to pass flags like --quiet that depend on a variable or
-        command-line setting while avoiding lots of nested conditionals.
-    """
-    for command in commands:
-        # Substitute variables, e.g. "./{NAME}.json"
-        command = command.format(**variables)
-        command = split_command(command)
-        run_dvc_command(command, flags=flags)
-
-
-def run_dvc_command(
-    command: Union[str, List[str]], flags: Dict[str, bool] = {}, silent: bool = False
-) -> None:
-    """Run a DVC command in a subprocess. This wrapper gives us a bit more
-    control over how the output and errors are presented. Raises a DVC error if
-    the "dvc" command returns a non-zero exit code and uses the error message
-    logged by DVC.
-
-    command (Union[str, List[str]]): The command, without the leading "dvc".
-    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
-        easier to pass flags like --quiet that depend on a variable or
-        command-line setting while avoiding lots of nested conditionals.
-    silent (bool): Don't print any output.
-    """
-    if isinstance(command, str):
-        command = split_command(command)
-    dvc_command = ["dvc", *command]
-    # Add the flags if they are set to True
-    for flag, is_active in flags.items():
-        if is_active:
-            dvc_command.append(flag)
-    proc = subprocess.Popen(dvc_command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
-    if not silent:
-        lines = proc.stdout.read().decode("utf8").split("\n\n")
-        for line in lines:
-            line = line.strip()
-            if is_relevant_dvc_output(line):
-                print(f"{line}\n")
-    _, err = proc.communicate()  # Important: otherwise returncode will be None!
-    if proc.returncode != 0:
-        if isinstance(err, bytes):
-            err = err.decode("utf8")
-        raise DVCError(err)
-
-
-def is_relevant_dvc_output(line: str) -> bool:
-    """Check whether the output by DVC is something we want to keep.
-
-    line (str): A line written to stdout,.
-    RETURNS (bool): Whether to use/print the line.
-    """
-    # Writing them like this for readability but maybe replace with regex?
-    conditions = [
-        not line,
-        line.startswith("What's next?"),
-        line.startswith("Having any troubles?"),
-    ]
-    return not any(conditions)
-
-
-class DVCError(RuntimeError):
-    """Custom error type for anything produced by the DVC CLI."""
-
-    pass
--- a/spacy/cli/project/__init__.py
+++ b/spacy/cli/project/__init__.py
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@ -0,0 +1,154 @@
+from typing import Optional
+from pathlib import Path
+from wasabi import msg
+import requests
+import tqdm
+import re
+import shutil
+
+from ...util import ensure_path, get_checksum, working_dir
+from .._app import project_cli, Arg
+from .util import PROJECT_FILE, load_project_config
+
+
+# TODO: find a solution for caches
+# CACHES = [
+#     Path.home() / ".torch",
+#     Path.home() / ".caches" / "torch",
+#     os.environ.get("TORCH_HOME"),
+#     Path.home() / ".keras",
+# ]
+
+
+# NOTE: the docstring of this function is what Typer shows as the help text of
+# the "assets" subcommand, so it is written for end users.
+@project_cli.command("assets")
+def project_assets_cli(
+    # fmt: off
+    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Fetch project assets like datasets and pretrained weights. Assets are
+    defined in the "assets" section of the project.yml. If a checksum is
+    provided in the project.yml, the file is only downloaded if no local file
+    with the same checksum exists.
+    """
+    # Thin CLI wrapper: the actual work happens in project_assets, which can
+    # also be called directly from Python.
+    project_assets(project_dir)
+
+
+def project_assets(project_dir: Path) -> None:
+    """Fetch the assets defined in the "assets" section of the project.yml.
+    Assets with a URL are fetched to their destination. Assets without a URL
+    are private files the user has to provide, so they're only checked.
+
+    project_dir (Path): Path to project directory.
+    """
+    project_path = ensure_path(project_dir)
+    config = load_project_config(project_path)
+    assets = config.get("assets", {})
+    if not assets:
+        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
+    msg.info(f"Fetching {len(assets)} asset(s)")
+    variables = config.get("variables", {})
+    for asset in assets:
+        # Destinations and URLs may contain "{variable}" placeholders
+        dest = asset["dest"].format(**variables)
+        url = asset.get("url")
+        checksum = asset.get("checksum")
+        if not url:
+            # project.yml defines asset without URL that the user has to place.
+            # The destination is relative to the project directory, not to the
+            # directory the command happens to be executed from.
+            check_private_asset(project_path / dest, checksum)
+            continue
+        url = url.format(**variables)
+        fetch_asset(project_path, url, dest, checksum)
+
+
+def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
+    """Check and validate assets without a URL (private assets that the user
+    has to provide themselves) and give feedback about the checksum.
+
+    dest (Path): Destination path of the asset.
+    checksum (Optional[str]): Optional checksum of the expected file.
+    """
+    if not Path(dest).exists():
+        err = f"No URL provided for asset. You need to add this file yourself: {dest}"
+        msg.warn(err)
+    elif not checksum:
+        # Nothing to compare against, so an existing file is all we can verify
+        msg.good(f"Asset already exists: {dest}")
+    elif checksum == get_checksum(dest):
+        msg.good(f"Asset exists with matching checksum: {dest}")
+    else:
+        msg.fail(f"Asset available but with incorrect checksum: {dest}")
+
+
+def fetch_asset(
+    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
+) -> Optional[Path]:
+    """Fetch an asset from a given URL or path. If a checksum is provided and a
+    local file exists, it's only re-downloaded if the checksum doesn't match.
+
+    project_path (Path): Path to project directory.
+    url (str): URL or path to asset.
+    dest (Path): Destination of the asset, relative to the project directory.
+    checksum (Optional[str]): Optional expected checksum of local file.
+    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
+        the asset failed.
+    """
+    # TODO: add support for caches
+    dest_path = (project_path / dest).resolve()
+    if dest_path.exists() and checksum:
+        # If there's already a file, check for checksum
+        if checksum == get_checksum(dest_path):
+            msg.good(f"Skipping download with matching checksum: {dest}")
+            return dest_path
+    # The destination may live in a nested directory that doesn't exist yet
+    dest_path.parent.mkdir(parents=True, exist_ok=True)
+    with working_dir(project_path):
+        url = convert_asset_url(url)
+        try:
+            download_file(url, dest_path)
+            msg.good(f"Downloaded asset {dest}")
+        except requests.exceptions.RequestException as e:
+            if Path(url).exists() and Path(url).is_file():
+                # If it's a local file, copy to destination
+                shutil.copy(url, str(dest_path))
+                msg.good(f"Copied local asset {dest}")
+            else:
+                msg.fail(f"Download failed: {dest}", e)
+                return None
+    if checksum and checksum != get_checksum(dest_path):
+        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
+    return dest_path
+
+
+def convert_asset_url(url: str) -> str:
+    """Check and convert the asset URL if needed. Regular GitHub page URLs are
+    converted to raw URLs, all other URLs and paths are returned unchanged.
+
+    url (str): The asset URL.
+    RETURNS (str): The converted URL.
+    """
+    # If the asset URL is a regular GitHub URL it's likely a mistake. The dot is
+    # escaped and the host has to end after "github.com" so that other hosts
+    # like "github.company.com" are left alone.
+    if re.match(r"https?://github\.com(/|$)", url):
+        # Only touch the host and the first tree/blob segment, not anything that
+        # happens to look similar further down in the file path
+        converted = url.replace("github.com", "raw.githubusercontent.com", 1)
+        converted = re.sub(r"/(tree|blob)/", "/", converted, count=1)
+        msg.warn(
+            "Downloading from a regular GitHub URL. This will only download "
+            "the source of the page, not the actual file. Converting the URL "
+            "to a raw URL.",
+            converted,
+        )
+        return converted
+    return url
+
+
+def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
+    """Download a file using requests and show a progress bar.
+
+    url (str): The URL of the file.
+    dest (Path): The destination path.
+    chunk_size (int): The size of chunks to read/write.
+    """
+    # Using the streamed response as a context manager makes sure the
+    # connection is released again, even if writing the file fails
+    with requests.get(url, stream=True) as response:
+        response.raise_for_status()
+        # Servers don't always send the content length: fall back to 0
+        total = int(response.headers.get("content-length", 0))
+        progress_settings = {
+            "total": total,
+            "unit": "iB",
+            "unit_scale": True,
+            # Binary units are always multiples of 1024, independent of the
+            # chunk size used for reading
+            "unit_divisor": 1024,
+            "leave": False,
+        }
+        with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
+            for data in response.iter_content(chunk_size=chunk_size):
+                size = f.write(data)
+                bar.update(size)
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@ -0,0 +1,110 @@
+from pathlib import Path
+from wasabi import msg
+import subprocess
+import shutil
+
+from ... import about
+from ...util import ensure_path, run_command, make_tempdir
+from .._app import project_cli, Arg, Opt, COMMAND
+
+
+# Sub-directories that project_clone creates in the cloned project directory if
+# the template doesn't already contain them.
+DIRS = [
+    "assets",
+    "metas",
+    "configs",
+    "packages",
+    "metrics",
+    "scripts",
+    "notebooks",
+    "training",
+    "corpus",
+]
+
+
+@project_cli.command("clone")
+def project_clone_cli(
+    # fmt: off
+    name: str = Arg(..., help="The name of the template to fetch"),
+    dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
+    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
+    # fmt: on
+):
+    """Clone a project template from a repository. Calls into "git" and will
+    only download the files from the given subdirectory. The GitHub repo
+    defaults to the official spaCy template repo, but can be customized
+    (including using a private repo). If the project is intended to be a Git
+    repo, it should be initialized with Git first, before initializing DVC
+    (Data Version Control). This allows DVC to integrate with Git.
+    """
+    # If no destination is given, clone into a new directory named after the
+    # template inside the current working directory
+    if dest == Path.cwd():
+        dest = dest / name
+    project_clone(name, dest, repo=repo)
+
+
+def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
+    """Clone a project template from a repository and create the default
+    project sub-directories in the cloned project.
+
+    name (str): Name of subdirectory to clone.
+    dest (Path): Destination path of cloned project.
+    repo (str): URL of Git repo containing project templates.
+    """
+    dest = ensure_path(dest)
+    check_clone(name, dest, repo)
+    project_dir = dest.resolve()
+    # We're using Git and sparse checkout to only clone the files we need
+    with make_tempdir() as tmp_dir:
+        # Pass the command as a list so paths containing spaces stay intact
+        cmd = [
+            "git",
+            "clone",
+            repo,
+            str(tmp_dir),
+            "--no-checkout",
+            "--depth",
+            "1",
+            "--config",
+            "core.sparseCheckout=true",
+        ]
+        try:
+            run_command(cmd)
+        except subprocess.CalledProcessError:
+            err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
+            # Without a clone there's nothing to check out, so stop here
+            msg.fail(err, exits=1)
+        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
+            f.write(name)
+        try:
+            run_command(["git", "-C", str(tmp_dir), "fetch"])
+            run_command(["git", "-C", str(tmp_dir), "checkout"])
+        except subprocess.CalledProcessError:
+            err = f"Could not clone '{name}' in the repo '{repo}'."
+            msg.fail(err, exits=1)
+        # The sparse checkout keeps the directory structure of the repo, so a
+        # nested template like "a/b" ends up in tmp_dir/a/b
+        shutil.move(str(tmp_dir / Path(name)), str(project_dir))
+    msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
+    for sub_dir in DIRS:
+        dir_path = project_dir / sub_dir
+        if not dir_path.exists():
+            dir_path.mkdir(parents=True)
+    msg.good(f"Your project is now ready!", dest)
+    print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
+
+
+def check_clone(name: str, dest: Path, repo: str) -> None:
+    """Check and validate that the destination path can be used to clone. Will
+    check that Git is available and that the destination path is suitable.
+    Exits with an error message if one of the checks fails.
+
+    name (str): Name of the directory to clone from the repo.
+    dest (Path): Local destination of cloned directory.
+    repo (str): URL of the repo to clone from.
+    """
+    try:
+        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
+    except Exception:
+        # The command to run is part of the message text: a third positional
+        # argument would be interpreted as the printer's "show" setting and
+        # never be printed.
+        msg.fail(
+            f"Cloning spaCy project templates requires Git and the 'git' command. ",
+            f"To clone a project without Git, copy the files from the '{name}' "
+            f"directory in the {repo} to {dest} manually and then run: "
+            f"{COMMAND} project init {dest}",
+            exits=1,
+        )
+    if not dest:
+        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
+    if dest.exists():
+        # Directory already exists (not allowed, clone needs to create it)
+        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
+    if not dest.parent.exists():
+        # We're not creating parents, parent dir should exist
+        msg.fail(
+            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
+            exits=1,
+        )
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@ -0,0 +1,206 @@
+"""This module contains helpers and subcommands for integrating spaCy projects
+with Data Version Control (DVC). https://dvc.org"""
+from typing import Dict, Any, List, Optional
+import subprocess
+from pathlib import Path
+from wasabi import msg
+
+from .util import PROJECT_FILE, load_project_config
+from .._app import project_cli, Arg, Opt, NAME, COMMAND
+from ...util import get_hash, working_dir, split_command, join_command, run_command
+
+
+# Name of the auto-generated DVC config file in the project directory
+DVC_CONFIG = "dvc.yaml"
+# Name of the directory that marks a project as an initialized DVC project
+DVC_DIR = ".dvc"
+# Name of the "project" subcommand that (re-)generates the DVC config
+UPDATE_COMMAND = "dvc"
+# Comment written to the top of the generated DVC config, below the config hash
+DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
+# edited your {PROJECT_FILE}, you can regenerate this file by running:
+# {COMMAND} project {UPDATE_COMMAND}"""
+
+
+@project_cli.command(UPDATE_COMMAND)
+def project_update_dvc_cli(
+    # fmt: off
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
+    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
+    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
+    # fmt: on
+):
+    """Auto-generate Data Version Control (DVC) config. A DVC
+    project can only define one pipeline, so you need to specify one workflow
+    defined in the project.yml. If no workflow is specified, the first defined
+    workflow is used. The DVC config will only be updated if the project.yml
+    changed, unless the update is forced with --force.
+    """
+    # Thin CLI wrapper around project_update_dvc
+    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
+
+
+def project_update_dvc(
+    project_dir: Path,
+    workflow: Optional[str] = None,
+    *,
+    verbose: bool = False,
+    force: bool = False,
+) -> None:
+    """Regenerate the DVC (Data Version Control) config file of a project from
+    its project.yml and report whether anything was updated. DVC only supports
+    one pipeline per project, so a single workflow is converted.
+
+    project_dir (Path): The project directory.
+    workflow (Optional[str]): Optional name of workflow defined in project.yml.
+        If not set, the first workflow will be used.
+    verbose (bool): Print more info.
+    force (bool): Force update DVC config.
+    """
+    project_config = load_project_config(project_dir)
+    was_updated = update_dvc_config(
+        project_dir, project_config, workflow, verbose=verbose, force=force
+    )
+    # Shown below the status message in both cases
+    hint = "To execute the workflow with DVC, run: dvc repro"
+    if not was_updated:
+        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", hint)
+        return
+    msg.good(f"Updated DVC config from {PROJECT_FILE}", hint)
+
+
+def update_dvc_config(
+    path: Path,
+    config: Dict[str, Any],
+    workflow: Optional[str] = None,
+    verbose: bool = False,
+    silent: bool = False,
+    force: bool = False,
+) -> bool:
+    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
+    project directory. The file is auto-generated based on the config. The
+    first line of the auto-generated file specifies the hash of the config
+    dict, so if any of the config values change, the DVC config is regenerated.
+
+    path (Path): The path to the project directory.
+    config (Dict[str, Any]): The loaded project.yml.
+    workflow (Optional[str]): Name of the workflow to convert. If not set, the
+        first workflow defined in the config is used.
+    verbose (bool): Whether to print additional info (via DVC).
+    silent (bool): Don't output anything (via DVC).
+    force (bool): Force update, even if hashes match.
+    RETURNS (bool): Whether the DVC config file was updated.
+    """
+    ensure_dvc(path)
+    workflows = config.get("workflows", {})
+    workflow_names = list(workflows.keys())
+    check_workflows(workflow_names, workflow)
+    if not workflow:
+        workflow = workflow_names[0]
+    config_hash = get_hash(config)
+    path = path.resolve()
+    dvc_config_path = path / DVC_CONFIG
+    if dvc_config_path.exists():
+        # Check if the file was generated using the current config, if not, redo
+        with dvc_config_path.open("r", encoding="utf8") as f:
+            ref_hash = f.readline().strip().replace("# ", "")
+        if ref_hash == config_hash and not force:
+            return False  # Nothing has changed in project.yml, don't need to update
+    variables = config.get("variables", {})
+    dvc_commands = []
+    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+    for name in workflows[workflow]:
+        command = config_commands[name]
+        deps = command.get("deps", [])
+        outputs = command.get("outputs", [])
+        outputs_no_cache = command.get("outputs_no_cache", [])
+        if not deps and not outputs and not outputs_no_cache:
+            continue
+        # Default to the working dir as the project path since dvc.yaml is auto-generated
+        # and we don't want arbitrary paths in there
+        full_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
+        # One flag/path pair per dependency, output and uncached output
+        for flag, file_paths in (("-d", deps), ("-o", outputs), ("-O", outputs_no_cache)):
+            for file_path in file_paths:
+                full_cmd.extend([flag, file_path])
+        full_cmd.extend(["python", "-m", NAME, "project", "run", name])
+        dvc_commands.append(join_command(full_cmd))
+    if not dvc_commands:
+        # Without any commands, DVC wouldn't create the config file and reading
+        # it back below would fail. Check this before removing the old file.
+        msg.fail(
+            "No usable commands for DVC found. This can happen if none of your "
+            "commands have dependencies or outputs.",
+            exits=1,
+        )
+    if dvc_config_path.exists():
+        # Remove the outdated file so DVC generates it from scratch
+        dvc_config_path.unlink()
+    with working_dir(path):
+        dvc_flags = {"--verbose": verbose, "--quiet": silent}
+        run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
+    # Prepend the config hash and the info comment to the file created by DVC
+    with dvc_config_path.open("r+", encoding="utf8") as f:
+        content = f.read()
+        f.seek(0, 0)
+        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
+    return True
+
+
+def run_dvc_commands(
+    commands: List[str] = tuple(),
+    variables: Dict[str, str] = {},
+    flags: Dict[str, bool] = {},
+) -> None:
+    """Execute DVC commands one after another, each in its own subprocess.
+
+    commands (List[str]): The string commands without the leading "dvc".
+    variables (Dict[str, str]): Dictionary of variable names, mapped to their
+        values. Will be used to substitute format string variables in the
+        commands.
+    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
+        easier to pass flags like --quiet that depend on a variable or
+        command-line setting while avoiding lots of nested conditionals.
+    """
+    # The flags are the same for every command, so only keep the ones that are
+    # set to True once up front
+    active_flags = [flag for flag, is_active in flags.items() if is_active]
+    for command in commands:
+        # Substitute variables, e.g. "./{NAME}.json", before splitting
+        parts = split_command(command.format(**variables))
+        run_command(["dvc", *parts, *active_flags])
+
+
+def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
+    """Make sure the project.yml defines workflows and that the requested
+    workflow, if any, is one of them. Exits with an error otherwise and warns
+    if no workflow was requested.
+
+    workflows (List[str]): Names of the available workflows.
+    workflow (Optional[str]): The name of the workflow to convert.
+    """
+    if not workflows:
+        no_workflows_err = (
+            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
+            f"define at least one list of commands."
+        )
+        msg.fail(no_workflows_err, exits=1)
+    is_unknown = workflow is not None and workflow not in workflows
+    if is_unknown:
+        unknown_err = (
+            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
+            f"Available workflows: {', '.join(workflows)}"
+        )
+        msg.fail(unknown_err, exits=1)
+    if not workflow:
+        # The caller falls back to the first workflow, so let the user know
+        fallback_warning = (
+            f"No workflow specified for DVC pipeline. Using the first workflow "
+            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
+        )
+        msg.warn(fallback_warning)
+
+
+def ensure_dvc(project_dir: Path) -> None:
+    """Check that "dvc" can be executed and that the project directory contains
+    the directory created by initializing a DVC project. Exits with an error
+    message otherwise.
+
+    project_dir (Path): The project directory.
+    """
+    install_title = (
+        "To use spaCy projects with DVC (Data Version Control), DVC needs "
+        "to be installed and the 'dvc' command needs to be available"
+    )
+    install_text = (
+        "You can install the Python package from pip (pip install dvc) or "
+        "conda (conda install -c conda-forge dvc). For more details, see the "
+        "documentation: https://dvc.org/doc/install"
+    )
+    init_text = (
+        "To initialize a DVC project, you can run 'dvc init' in the project "
+        "directory. For more details, see the documentation: "
+        "https://dvc.org/doc/command-reference/init"
+    )
+    try:
+        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
+    except Exception:
+        msg.fail(install_title, install_text, exits=1)
+    dvc_dir = project_dir / DVC_DIR
+    if not dvc_dir.exists():
+        msg.fail("Project not initialized as a DVC project", init_text, exits=1)
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@ -0,0 +1,250 @@
+from typing import Optional, List, Dict, Sequence, Any
+from pathlib import Path
+from wasabi import msg
+import typer
+import sys
+import srsly
+
+from ...util import working_dir, run_command, split_command, is_cwd, get_checksum
+from ...util import get_hash, join_command
+from .._app import project_cli, Arg, Opt, COMMAND
+from .util import PROJECT_FILE, PROJECT_LOCK, load_project_config
+
+
+@project_cli.command(
+    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def project_run_cli(
+    # fmt: off
+    ctx: typer.Context,
+    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
+    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute commands"),
+    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
+    # fmt: on
+):
+    """Run a named script or workflow defined in the project.yml. If a workflow
+    name is specified, all commands in the workflow are run, in order. If
+    commands define inputs and/or outputs, they will only be re-run if state
+    has changed.
+    """
+    if show_help or not subcommand:
+        print_run_help(project_dir, subcommand)
+    else:
+        # Extra arguments collected in ctx.args are not forwarded: project_run
+        # only accepts the project directory and the subcommand as positional
+        # arguments, so passing them on would raise a TypeError.
+        project_run(project_dir, subcommand, force=force, dry=dry)
+
+
+def project_run(
+    project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False
+) -> None:
+    """Run a named command or workflow defined in the project.yml. For a
+    workflow, all of its commands are run in order. A command is skipped if
+    its lockfile entry shows that nothing changed, unless force is set.
+
+    project_dir (Path): Path to project directory.
+    subcommand (str): Name of command or workflow to run.
+    force (bool): Force re-running, even if nothing changed.
+    dry (bool): Perform a dry run and don't execute commands.
+    """
+    config = load_project_config(project_dir)
+    variables = config.get("variables", {})
+    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+    workflows = config.get("workflows", {})
+    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
+    if subcommand in workflows:
+        msg.info(f"Running workflow '{subcommand}'")
+        for cmd in workflows[subcommand]:
+            project_run(project_dir, cmd, force=force, dry=dry)
+    else:
+        cmd = commands[subcommand]
+        for dep in cmd.get("deps", []):
+            dep = dep.format(**variables)
+            if not (project_dir / dep).exists():
+                err = f"Missing dependency specified by command '{subcommand}': {dep}"
+                # In a dry run, only report the problem and keep going
+                err_kwargs = {"exits": 1} if not dry else {}
+                msg.fail(err, **err_kwargs)
+        with working_dir(project_dir) as current_dir:
+            rerun = check_rerun(current_dir, cmd, variables)
+            if not rerun and not force:
+                msg.info(f"Skipping '{cmd['name']}': nothing changed")
+            else:
+                msg.divider(subcommand)
+                run_commands(cmd["script"], variables, dry=dry)
+                if not dry:
+                    # Nothing was executed in a dry run, so recording the
+                    # command in the lockfile would wrongly mark it as done
+                    update_lockfile(current_dir, cmd, variables)
+
+
+def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
+    """Simulate a CLI help prompt using the info available in the project.yml.
+
+    project_dir (Path): The project directory.
+    subcommand (Optional[str]): The subcommand or None. If a subcommand is
+        provided, the help for that command or workflow is shown. Otherwise,
+        the top-level help and a list of available commands and workflows is
+        printed.
+    """
+    config = load_project_config(project_dir)
+    config_commands = config.get("commands", [])
+    commands = {cmd["name"]: cmd for cmd in config_commands}
+    workflows = config.get("workflows", {})
+    # Only show the project location if it's not the current working directory
+    project_loc = "" if is_cwd(project_dir) else project_dir
+    if subcommand:
+        validate_subcommand(commands.keys(), workflows.keys(), subcommand)
+        print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
+        if subcommand in commands:
+            help_text = commands[subcommand].get("help")
+            if help_text:
+                msg.text(f"\n{help_text}\n")
+        else:
+            # Validated above, so the subcommand has to be a workflow
+            steps = workflows[subcommand]
+            msg.text(f"\nWorkflow consisting of {len(steps)} command(s):")
+            msg.table([(step, commands[step].get("help", "")) for step in steps])
+    else:
+        print(f"\nAvailable commands in {PROJECT_FILE}")
+        print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
+        msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
+        if workflows:
+            print(f"\nAvailable workflows in {PROJECT_FILE}")
+            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
+            msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
+
+
+def run_commands(
+    commands: List[str] = tuple(),
+    variables: Dict[str, Any] = {},
+    silent: bool = False,
+    dry: bool = False,
+) -> None:
+    """Execute commands one after another, each in its own subprocess.
+
+    commands (List[str]): The string commands.
+    variables (Dict[str, Any]): Dictionary of variable names, mapped to their
+        values. Will be used to substitute format string variables in the
+        commands.
+    silent (bool): Don't print the commands.
+    dry (bool): Perform a dry run and don't execute anything.
+    """
+    for raw_command in commands:
+        # Substitute variables, e.g. "./{NAME}.json"
+        parts = split_command(raw_command.format(**variables))
+        executable = parts[0] if parts else None
+        # Not sure if this is needed or a good idea. Motivation: commands in
+        # the config often reference "python" or "pip" and should always use
+        # the Python that spaCy itself is executed with (and the pip of the
+        # same environment). It also keeps commands portable between systems
+        # where the interpreter is called "python" and others where it's
+        # "python3".
+        if executable in ("python", "python3"):
+            parts[0] = sys.executable
+        elif executable in ("pip", "pip3"):
+            parts = [sys.executable, "-m", "pip", *parts[1:]]
+        if not silent:
+            print(f"Running command: {join_command(parts)}")
+        if not dry:
+            run_command(parts)
+
+
+def validate_subcommand(
+    commands: Sequence[str], workflows: Sequence[str], subcommand: str
+) -> None:
+    """Make sure a subcommand refers to a defined command or workflow. Prints
+    an error listing the available names and exits otherwise.
+
+    commands (Sequence[str]): The available commands.
+    workflows (Sequence[str]): The available workflows.
+    subcommand (str): The subcommand.
+    """
+    if not (commands or workflows):
+        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
+    if subcommand in commands or subcommand in workflows:
+        return
+    # Unknown name: list what's available to help the user
+    available = []
+    if commands:
+        available.append(f"Available commands: {', '.join(commands)}")
+    if workflows:
+        available.append(f"Available workflows: {', '.join(workflows)}")
+    not_found_err = f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}"
+    msg.fail(not_found_err, ". ".join(available), exits=1)
+
+
+def check_rerun(
+    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
+) -> bool:
+    """Decide whether a command has to be run again by comparing its entry in
+    the lockfile to the entry that would be generated for it right now.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    variables (Dict[str, Any]): The variables defined in the project.yml.
+    RETURNS (bool): Whether to re-run the command.
+    """
+    lock_path = project_dir / PROJECT_LOCK
+    if lock_path.exists():
+        lock_data = srsly.read_yaml(lock_path)
+        name = command["name"]
+        if name in lock_data:
+            # Identical hashes mean that the script, all inputs/outputs and
+            # their checksums are unchanged, so there's no need to re-run
+            current_entry = get_lock_entry(project_dir, command, variables)
+            return get_hash(current_entry) != get_hash(lock_data[name])
+    # No lockfile or no info about this command: always run it
+    return True
+
+
+def update_lockfile(
+    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
+) -> None:
+    """Update the lockfile after running a command. Will create a lockfile if
+    it doesn't yet exist and will add an entry for the current command, its
+    script and dependencies/outputs.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    variables (Dict[str, Any]): The variables defined in the project.yml.
+    """
+    lock_path = project_dir / PROJECT_LOCK
+    # Start from the existing entries, if any. The file is (re-)written in one
+    # go below, so there's no need to create an empty lockfile first.
+    data = srsly.read_yaml(lock_path) if lock_path.exists() else {}
+    data[command["name"]] = get_lock_entry(project_dir, command, variables)
+    srsly.write_yaml(lock_path, data)
+
+
+def get_lock_entry(
+    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
+) -> Dict[str, Any]:
+    """Build the lockfile entry for a command: the command line, the script
+    (command steps) and the dependencies and outputs with their paths and
+    checksums, if available. The format is based on the dvc.lock files, to
+    keep things consistent.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    variables (Dict[str, Any]): The variables defined in the project.yml.
+    RETURNS (Dict[str, Any]): The lockfile entry.
+    """
+
+    def file_info(key: str) -> List[Dict[str, str]]:
+        # File info for the paths listed under the given key of the command
+        return get_fileinfo(project_dir, command.get(key, []), variables)
+
+    entry = {"cmd": f"{COMMAND} run {command['name']}", "script": command["script"]}
+    entry["deps"] = file_info("deps")
+    # Cached and uncached outputs are stored in one list, cached ones first
+    cached_outs = file_info("outputs")
+    uncached_outs = file_info("outputs_no_cache")
+    entry["outs"] = cached_outs + uncached_outs
+    return entry
+
+
+def get_fileinfo(
+    project_dir: Path, paths: List[str], variables: Dict[str, Any]
+) -> List[Dict[str, str]]:
+    """Collect the path and checksum of each file in a list of paths
+    (dependencies, outputs). The checksum is None for files that don't exist.
+
+    project_dir (Path): The current project directory.
+    paths (List[str]): The file paths.
+    variables (Dict[str, Any]): The variables defined in the project.yml.
+    RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
+    """
+    entries = []
+    for raw_path in paths:
+        # Paths may contain "{variable}" placeholders
+        rel_path = raw_path.format(**variables)
+        target = project_dir / rel_path
+        entry = {"path": rel_path, "md5": None}
+        if target.exists():
+            entry["md5"] = get_checksum(target)
+        entries.append(entry)
+    return entries
--- a/spacy/cli/project/util.py
+++ b/spacy/cli/project/util.py
@ -0,0 +1,57 @@
+from typing import Dict, Any
+from pathlib import Path
+from wasabi import msg
+import srsly
+
+from ...schemas import ProjectConfigSchema, validate
+
+
+# Name of the project config file, expected in the project directory
+PROJECT_FILE = "project.yml"
+# Name of the lockfile in the project directory
+PROJECT_LOCK = "project.lock"
+
+
+def load_project_config(path: Path) -> Dict[str, Any]:
+    """Read the project.yml of a project directory, validate it against the
+    project config schema and check its commands and workflows. Exits with an
+    error message if the file is missing or invalid.
+
+    path (Path): The path to the project directory.
+    RETURNS (Dict[str, Any]): The loaded project.yml.
+    """
+    config_path = path / PROJECT_FILE
+    if not config_path.exists():
+        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
+    # Same title for YAML errors and schema errors
+    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
+    try:
+        config = srsly.read_yaml(config_path)
+    except ValueError as e:
+        msg.fail(invalid_err, e, exits=1)
+    schema_errors = validate(ProjectConfigSchema, config)
+    if schema_errors:
+        error_text = "\n".join(schema_errors)
+        msg.fail(invalid_err, error_text, exits=1)
+    validate_project_commands(config)
+    return config
+
+
+def validate_project_commands(config: Dict[str, Any]) -> None:
+    """Validate the commands and workflows of a project config: command names
+    have to be unique, workflow names can't clash with command names and
+    workflows can only refer to commands that exist. Exits with an error
+    message otherwise.
+
+    config (Dict[str, Any]): The loaded config.
+    """
+    command_names = [cmd["name"] for cmd in config.get("commands", [])]
+    duplicates = {name for name in command_names if command_names.count(name) > 1}
+    if duplicates:
+        msg.fail(
+            f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}",
+            exits=1,
+        )
+    for workflow_name, workflow_steps in config.get("workflows", {}).items():
+        if workflow_name in command_names:
+            msg.fail(
+                f"Can't use workflow name '{workflow_name}': name already exists as a command",
+                exits=1,
+            )
+        for step in workflow_steps:
+            if step in command_names:
+                continue
+            msg.fail(
+                f"Unknown command specified in workflow '{workflow_name}': {step}",
+                f"Workflows can only refer to commands defined in the 'commands' "
+                f"section of the {PROJECT_FILE}.",
+                exits=1,
+            )
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@ -523,7 +523,18 @@ class SentenceRecognizer(Tagger):
    def get_loss(self, examples, scores):
        labels = self.labels
        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
-        truths = [[labels[x] for x in eg.get_aligned("sent_start")] for eg in examples]
+        truths = []
+        for eg in examples:
+            eg_truth = []
+            for x in eg.get_aligned("sent_start"):
+                if x == None:
+                    eg_truth.append(None)
+                elif x == 1:
+                    eg_truth.append(labels[1])
+                else:
+                    # anything other than 1: 0, -1, -1 as uint64
+                    eg_truth.append(labels[0])
+            truths.append(eg_truth)
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
            raise ValueError("nan value when computing loss")
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -246,7 +246,7 @@ class ProjectConfigSchema(BaseModel):
    # fmt: off
    variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
    assets: List[ProjectConfigAsset] = Field([], title="Data assets")
-    run: List[StrictStr] = Field([], title="Names of project commands to execute, in order")
+    workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
    commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
    # fmt: on

--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@ -38,6 +38,11 @@ def test_overfitting_IO():
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    # add some cases where SENT_START == -1
+    train_examples[0].reference[10].is_sent_start = False
+    train_examples[1].reference[1].is_sent_start = False
+    train_examples[1].reference[11].is_sent_start = False
+
    nlp.add_pipe(senter)
    optimizer = nlp.begin_training()

--- a/spacy/tests/test_projects.py
+++ b/spacy/tests/test_projects.py
@ -0,0 +1,31 @@
+import pytest
+from spacy.cli.project.util import validate_project_commands
+from spacy.schemas import ProjectConfigSchema, validate
+
+
+@pytest.mark.parametrize(
+    "config",
+    [
+        # Two commands share the name "a".
+        {"commands": [{"name": "a"}, {"name": "a"}]},
+        # Workflow "a" has the same name as command "a".
+        {"commands": [{"name": "a"}], "workflows": {"a": []}},
+        # Workflow "b" lists a step "c" that is not a defined command.
+        {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
+    ],
+)
+def test_project_config_validation1(config):
+    """Each config must make validate_project_commands raise SystemExit."""
+    with pytest.raises(SystemExit):
+        validate_project_commands(config)
+
+
+@pytest.mark.parametrize(
+    "config,n_errors",
+    [
+        # "commands" is given as a mapping instead of a list.
+        ({"commands": {"a": []}}, 1),
+        # Command entry without a "name" key (presumably required; confirm
+        # against the command schema).
+        ({"commands": [{"help": "..."}]}, 1),
+        # Command entry carrying an "extra" key (presumably not allowed by
+        # the command schema; confirm).
+        ({"commands": [{"name": "a", "extra": "b"}]}, 1),
+        # No "name" key and an "extra" key in the same entry: two errors.
+        ({"commands": [{"extra": "b"}]}, 2),
+        # "deps" holds an int (presumably only strings are accepted; confirm).
+        ({"commands": [{"name": "a", "deps": [123]}]}, 1),
+    ],
+)
+def test_project_config_validation2(config, n_errors):
+    """Validating each config against ProjectConfigSchema must return exactly
+    n_errors error entries.
+    """
+    errors = validate(ProjectConfigSchema, config)
+    assert len(errors) == n_errors
--- a/spacy/util.py
+++ b/spacy/util.py
@ -449,6 +449,16 @@ def split_command(command: str) -> List[str]:
    return shlex.split(command, posix=not is_windows)


+def join_command(command: List[str]) -> str:
+    """Turn a list of command parts into one string, quoting every part with
+    shlex.quote and separating the parts with single spaces. Stands in for
+    shlex.join, which is only available for Python 3.8+.
+
+    command (List[str]): The command to join.
+    RETURNS (str): The joined command
+    """
+    quoted_parts = [shlex.quote(part) for part in command]
+    return " ".join(quoted_parts)
+
+
 def run_command(command: Union[str, List[str]]) -> None:
    """Run a command on the command line as a subprocess. If the subprocess
    returns a non-zero exit code, a system exit is performed.
@ -520,6 +530,15 @@ def get_checksum(path: Union[Path, str]) -> str:
    return hashlib.md5(Path(path).read_bytes()).hexdigest()


+def is_cwd(path: Union[Path, str]) -> bool:
+    """Check whether a path is the current working directory. Both paths are
+    resolved and compared as lowercased strings, so the comparison ignores
+    case.
+
+    path (Union[Path, str]): The directory path.
+    RETURNS (bool): Whether the path is the current working directory.
+    """
+    candidate = str(Path(path).resolve()).lower()
+    current = str(Path.cwd().resolve()).lower()
+    return candidate == current
+
+
 def is_in_jupyter():
    """Check if user is running spaCy from a Jupyter notebook by detecting the
    IPython kernel. Mainly used for the displaCy visualizer.