Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)
Allow spacy project to push and pull to/from remote storage (#5949)
* Add utils for working with remote storage
* WIP add remote_cache for project
* WIP add push and pull commands
* Use pathy in remote_cache
* Update util
* Update remote_cache
* Update util
* Update project assets
* Update pull script
* Update push script
* Fix type annotation in util
* Work on remote storage
* Remove site and env hash
* Fix imports
* Fix type annotation
* Require pathy
* Require pathy
* Fix import
* Add a util to handle project variable substitution
* Import push and pull commands
* Fix pull command
* Fix push command
* Fix tarfile in remote_storage
* Improve printing
* Fiddle with status messages
* Set version to v3.0.0a9
* Draft docs for spacy project remote storages
* Update docs [ci skip]
* Use Thinc config to simplify and unify template variables
* Auto-format
* Don't import Pathy globally for now: causes a slow and annoying Google Cloud warning
* Tidy up test
* Tidy up and update tests
* Update to latest Thinc
* Update docs
* variables -> vars
* Update docs [ci skip]
* Update docs [ci skip]

Co-authored-by: Ines Montani <ines@ines.io>
This commit is contained in: parent 9bdc9e81f5, commit e559867605
pyproject.toml
@@ -6,9 +6,10 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a28,<8.0.0a30",
+    "thinc>=8.0.0a29,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
-    "smart_open>=2.0.0,<3.0.0"
+    "smart_open>=2.0.0,<3.0.0",
+    "pathy"
 ]
 build-backend = "setuptools.build_meta"
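The new `pathy` dependency is what the remote-storage code later in this commit builds on: it exposes cloud bucket URLs through a pathlib-style API. A minimal sketch of the calls this commit relies on (bucket and key are hypothetical):

```python
from pathy import Pathy

# Pathlib-style operations on a cloud URL; the bucket and key are hypothetical
url = Pathy("s3://my-spacy-bucket/outputs/model.tar.gz")
if not url.exists():
    with url.open(mode="wb") as output_file:
        output_file.write(b"...archive bytes...")
print(url.parts[-1])  # -> "model.tar.gz"
```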
requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a28,<8.0.0a30
+thinc>=8.0.0a29,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
@@ -9,6 +9,7 @@ wasabi>=0.7.1,<1.1.0
 srsly>=2.1.0,<3.0.0
 catalogue>=0.0.7,<1.1.0
 typer>=0.3.0,<0.4.0
+pathy
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
setup.cfg
@@ -34,18 +34,19 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a28,<8.0.0a30
+    thinc>=8.0.0a29,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a28,<8.0.0a30
+    thinc>=8.0.0a29,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.7.1,<1.1.0
     srsly>=2.1.0,<3.0.0
     catalogue>=0.0.7,<1.1.0
     typer>=0.3.0,<0.4.0
+    pathy
     # Third-party dependencies
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a8"
+__version__ = "3.0.0a9"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
spacy/cli/__init__.py
@@ -21,6 +21,8 @@ from .project.clone import project_clone  # noqa: F401
 from .project.assets import project_assets  # noqa: F401
 from .project.run import project_run  # noqa: F401
 from .project.dvc import project_update_dvc  # noqa: F401
+from .project.push import project_push  # noqa: F401
+from .project.pull import project_pull  # noqa: F401


 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
spacy/cli/_util.py
@@ -1,4 +1,5 @@
-from typing import Dict, Any, Union, List, Optional
+from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING
+import sys
 from pathlib import Path
 from wasabi import msg
 import srsly
@@ -8,11 +9,13 @@ from typer.main import get_command
 from contextlib import contextmanager
 from thinc.config import Config, ConfigValidationError
 from configparser import InterpolationError
-import sys

 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file

+if TYPE_CHECKING:
+    from pathy import Pathy  # noqa: F401
+

 PROJECT_FILE = "project.yml"
 PROJECT_LOCK = "project.lock"
@@ -93,11 +96,12 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
     return result


-def load_project_config(path: Path) -> Dict[str, Any]:
+def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
     """Load the project.yml file from a directory and validate it. Also make
     sure that all directories defined in the config exist.

     path (Path): The path to the project directory.
+    interpolate (bool): Whether to substitute project variables.
     RETURNS (Dict[str, Any]): The loaded project.yml.
     """
     config_path = path / PROJECT_FILE
@@ -119,9 +123,25 @@ def load_project_config(path: Path) -> Dict[str, Any]:
         dir_path = path / subdir
         if not dir_path.exists():
             dir_path.mkdir(parents=True)
+    if interpolate:
+        err = "project.yml validation error"
+        with show_validation_error(title=err, hint_fill=False):
+            config = substitute_project_variables(config)
     return config


+def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
+    key = "vars"
+    config.setdefault(key, {})
+    config[key].update(overrides)
+    # Need to put variables in the top scope again so we can have a top-level
+    # section "project" (otherwise, a list of commands in the top scope wouldn't
+    # be allowed by Thinc's config system)
+    cfg = Config({"project": config, key: config[key]})
+    interpolated = cfg.interpolate()
+    return dict(interpolated["project"])
+
+
 def validate_project_commands(config: Dict[str, Any]) -> None:
     """Check that project commands and workflows are valid, don't contain
     duplicates, don't clash and only refer to commands that exist.
@@ -232,3 +252,39 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
         for name, cfg in config.get("components", {}).items()
         if "factory" not in cfg and "source" in cfg
     ]
+
+
+def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
+    """Upload a file.
+
+    src (Path): The source path.
+    dest (str / Pathy): The destination URL to upload to.
+    """
+    dest = ensure_pathy(dest)
+    with dest.open(mode="wb") as output_file:
+        with src.open(mode="rb") as input_file:
+            output_file.write(input_file.read())
+
+
+def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
+    """Download a file using smart_open.
+
+    src (str / Pathy): The URL of the file.
+    dest (Path): The destination path.
+    force (bool): Whether to force download even if file exists.
+        If False, the download will be skipped.
+    """
+    if dest.exists() and not force:
+        return None
+    src = ensure_pathy(src)
+    with src.open(mode="rb") as input_file:
+        with dest.open(mode="wb") as output_file:
+            output_file.write(input_file.read())
+
+
+def ensure_pathy(path):
+    """Temporary helper to prevent importing Pathy globally (which can cause
+    slow and annoying Google Cloud warning)."""
+    from pathy import Pathy  # noqa: F811
+
+    return Pathy(path)
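The new `substitute_project_variables` helper delegates all the work to Thinc's config interpolation, so its behavior is easiest to see in isolation. A minimal standalone sketch of the mechanism it relies on (values are illustrative):

```python
from thinc.config import Config

# What substitute_project_variables does internally: nest the project under a
# "project" section and expose the variables as a top-level "vars" section, so
# ${vars.x} references can be resolved by Thinc's interpolation.
project = {
    "vars": {"name": "ner_demo", "version": 1},
    "commands": [{"name": "train", "script": ["python train.py ${vars.name}-${vars.version}"]}],
}
cfg = Config({"project": project, "vars": project["vars"]})
resolved = dict(cfg.interpolate()["project"])
print(resolved["commands"][0]["script"][0])  # -> "python train.py ner_demo-1"
```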
spacy/cli/project/assets.py
@@ -4,10 +4,10 @@ from wasabi import msg
 import re
 import shutil
 import requests
-import smart_open

 from ...util import ensure_path, working_dir
 from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
+from .._util import download_file


 # TODO: find a solution for caches
@@ -44,16 +44,14 @@ def project_assets(project_dir: Path) -> None:
     if not assets:
         msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
     msg.info(f"Fetching {len(assets)} asset(s)")
-    variables = config.get("variables", {})
     for asset in assets:
-        dest = asset["dest"].format(**variables)
+        dest = asset["dest"]
         url = asset.get("url")
         checksum = asset.get("checksum")
         if not url:
             # project.yml defines asset without URL that the user has to place
             check_private_asset(dest, checksum)
             continue
-        url = url.format(**variables)
         fetch_asset(project_path, url, dest, checksum)
@@ -132,15 +130,3 @@ def convert_asset_url(url: str) -> str:
         )
         return converted
     return url
-
-
-def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
-    """Download a file using smart_open.
-
-    url (str): The URL of the file.
-    dest (Path): The destination path.
-    chunk_size (int): The size of chunks to read/write.
-    """
-    with smart_open.open(url, mode="rb") as input_file:
-        with dest.open(mode="wb") as output_file:
-            output_file.write(input_file.read())
spacy/cli/project/dvc.py
@@ -99,7 +99,6 @@ def update_dvc_config(
     if ref_hash == config_hash and not force:
         return False  # Nothing has changed in project.yml, don't need to update
     dvc_config_path.unlink()
-    variables = config.get("variables", {})
     dvc_commands = []
     config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
     for name in workflows[workflow]:
@@ -122,7 +121,7 @@ def update_dvc_config(
         dvc_commands.append(join_command(full_cmd))
     with working_dir(path):
         dvc_flags = {"--verbose": verbose, "--quiet": silent}
-        run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
+        run_dvc_commands(dvc_commands, flags=dvc_flags)
     with dvc_config_path.open("r+", encoding="utf8") as f:
         content = f.read()
         f.seek(0, 0)
@@ -131,23 +130,16 @@ def update_dvc_config(


 def run_dvc_commands(
-    commands: List[str] = tuple(),
-    variables: Dict[str, str] = {},
-    flags: Dict[str, bool] = {},
+    commands: List[str] = tuple(), flags: Dict[str, bool] = {},
 ) -> None:
     """Run a sequence of DVC commands in a subprocess, in order.

     commands (List[str]): The string commands without the leading "dvc".
-    variables (Dict[str, str]): Dictionary of variable names, mapped to their
-        values. Will be used to substitute format string variables in the
-        commands.
     flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
         easier to pass flags like --quiet that depend on a variable or
         command-line setting while avoiding lots of nested conditionals.
     """
     for command in commands:
-        # Substitute variables, e.g. "./{NAME}.json"
-        command = command.format(**variables)
         command = split_command(command)
         dvc_command = ["dvc", *command]
         # Add the flags if they are set to True
new file: spacy/cli/project/pull.py (36 lines)
@@ -0,0 +1,36 @@
+from pathlib import Path
+from wasabi import msg
+from .remote_storage import RemoteStorage
+from .remote_storage import get_command_hash
+from .._util import project_cli, Arg
+from .._util import load_project_config
+
+
+@project_cli.command("pull")
+def project_pull_cli(
+    # fmt: off
+    remote: str = Arg("default", help="Name or path of remote storage"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Retrieve any precomputed outputs from a remote storage that are available.
+    You can alias remotes in your project.yml by mapping them to storage paths.
+    A storage can be anything that the smart-open library can upload to, e.g.
+    GCS, AWS, SSH, local directories etc.
+    """
+    for url, output_path in project_pull(project_dir, remote):
+        if url is not None:
+            msg.good(f"Pulled {output_path} from {url}")
+
+
+def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
+    config = load_project_config(project_dir)
+    if remote in config.get("remotes", {}):
+        remote = config["remotes"][remote]
+    storage = RemoteStorage(project_dir, remote)
+    for cmd in config.get("commands", []):
+        deps = [project_dir / dep for dep in cmd.get("deps", [])]
+        cmd_hash = get_command_hash("", "", deps, cmd["script"])
+        for output_path in cmd.get("outputs", []):
+            url = storage.pull(output_path, command_hash=cmd_hash)
+            yield url, output_path
new file: spacy/cli/project/push.py (48 lines)
@@ -0,0 +1,48 @@
+from pathlib import Path
+from wasabi import msg
+from .remote_storage import RemoteStorage
+from .remote_storage import get_content_hash, get_command_hash
+from .._util import load_project_config
+from .._util import project_cli, Arg
+
+
+@project_cli.command("push")
+def project_push_cli(
+    # fmt: off
+    remote: str = Arg("default", help="Name or path of remote storage"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Persist outputs to a remote storage. You can alias remotes in your project.yml
+    by mapping them to storage paths. A storage can be anything that the smart-open
+    library can upload to, e.g. GCS, AWS, SSH, local directories etc.
+    """
+    for output_path, url in project_push(project_dir, remote):
+        if url is None:
+            msg.info(f"Skipping {output_path}")
+        else:
+            msg.good(f"Pushed {output_path} to {url}")
+
+
+def project_push(project_dir: Path, remote: str):
+    """Persist outputs to a remote storage. You can alias remotes in your project.yml
+    by mapping them to storage paths. A storage can be anything that the smart-open
+    library can upload to, e.g. GCS, AWS, SSH, local directories etc.
+    """
+    config = load_project_config(project_dir)
+    if remote in config.get("remotes", {}):
+        remote = config["remotes"][remote]
+    storage = RemoteStorage(project_dir, remote)
+    for cmd in config.get("commands", []):
+        cmd_hash = get_command_hash(
+            "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
+        )
+        for output_path in cmd.get("outputs", []):
+            output_loc = project_dir / output_path
+            if output_loc.exists():
+                url = storage.push(
+                    output_path,
+                    command_hash=cmd_hash,
+                    content_hash=get_content_hash(output_loc),
+                )
+                yield output_path, url
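Both new CLI commands are thin wrappers around generator functions, so the same logic can be driven from Python. A hedged usage sketch (the project directory and remote name are hypothetical):

```python
from pathlib import Path
from spacy.cli import project_push, project_pull

project_dir = Path("my_project")  # hypothetical project directory
# Upload outputs that aren't in the remote yet; url is None for skipped files
for output_path, url in project_push(project_dir, "default"):
    print("skipped" if url is None else f"pushed to {url}", output_path)
# Download matching outputs that aren't present locally yet
for url, output_path in project_pull(project_dir, "default"):
    if url is not None:
        print(f"pulled {output_path} from {url}")
```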
new file: spacy/cli/project/remote_storage.py (169 lines)
@@ -0,0 +1,169 @@
+from typing import Optional, List, Dict, TYPE_CHECKING
+import os
+import site
+import hashlib
+import urllib.parse
+import tarfile
+from pathlib import Path
+
+from .._util import get_hash, get_checksum, download_file, ensure_pathy
+from ...util import make_tempdir
+
+if TYPE_CHECKING:
+    from pathy import Pathy  # noqa: F401
+
+
+class RemoteStorage:
+    """Push and pull outputs to and from a remote file storage.
+
+    Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
+    SSH, etc.
+    """
+
+    def __init__(self, project_root: Path, url: str, *, compression="gz"):
+        self.root = project_root
+        self.url = ensure_pathy(url)
+        self.compression = compression
+
+    def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
+        """Compress a file or directory within a project and upload it to a remote
+        storage. If an object exists at the full URL, nothing is done.
+
+        Within the remote storage, files are addressed by their project path
+        (url encoded) and two user-supplied hashes, representing their creation
+        context and their file contents. If the URL already exists, the data is
+        not uploaded. Paths are archived and compressed prior to upload.
+        """
+        loc = self.root / path
+        if not loc.exists():
+            raise IOError(f"Cannot push {loc}: does not exist.")
+        url = self.make_url(path, command_hash, content_hash)
+        if url.exists():
+            return None
+        tmp: Path
+        with make_tempdir() as tmp:
+            tar_loc = tmp / self.encode_name(str(path))
+            mode_string = f"w:{self.compression}" if self.compression else "w"
+            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
+                tar_file.add(str(loc), arcname=str(path))
+            with tar_loc.open(mode="rb") as input_file:
+                with url.open(mode="wb") as output_file:
+                    output_file.write(input_file.read())
+        return url
+
+    def pull(
+        self,
+        path: Path,
+        *,
+        command_hash: Optional[str] = None,
+        content_hash: Optional[str] = None,
+    ) -> Optional["Pathy"]:
+        """Retrieve a file from the remote cache. If the file already exists,
+        nothing is done.
+
+        If the command_hash and/or content_hash are specified, only matching
+        results are returned. If no results are available, an error is raised.
+        """
+        dest = self.root / path
+        if dest.exists():
+            return None
+        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
+        if url is None:
+            return url
+        else:
+            # Make sure the destination exists
+            if not dest.parent.exists():
+                dest.parent.mkdir(parents=True)
+            tmp: Path
+            with make_tempdir() as tmp:
+                tar_loc = tmp / url.parts[-1]
+                download_file(url, tar_loc)
+                mode_string = f"r:{self.compression}" if self.compression else "r"
+                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
+                    # This requires that the path is added correctly, relative
+                    # to root. This is how we set things up in push()
+                    tar_file.extractall(self.root)
+        return url
+
+    def find(
+        self,
+        path: Path,
+        *,
+        command_hash: Optional[str] = None,
+        content_hash: Optional[str] = None,
+    ) -> Optional["Pathy"]:
+        """Find the best matching version of a file within the storage,
+        or `None` if no match can be found. If both the creation and content hash
+        are specified, only exact matches will be returned. Otherwise, the most
+        recent matching file is preferred.
+        """
+        name = self.encode_name(str(path))
+        if command_hash is not None and content_hash is not None:
+            url = self.make_url(path, command_hash, content_hash)
+            urls = [url] if url.exists() else []
+        elif command_hash is not None:
+            urls = list((self.url / name / command_hash).iterdir())
+        else:
+            urls = list((self.url / name).iterdir())
+            if content_hash is not None:
+                urls = [url for url in urls if url.parts[-1] == content_hash]
+        return urls[-1] if urls else None
+
+    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
+        """Construct a URL from a subpath, a creation hash and a content hash."""
+        return self.url / self.encode_name(str(path)) / command_hash / content_hash
+
+    def encode_name(self, name: str) -> str:
+        """Encode a subpath into a URL-safe name."""
+        return urllib.parse.quote_plus(name)
+
+
+def get_content_hash(loc: Path) -> str:
+    return get_checksum(loc)
+
+
+def get_command_hash(
+    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
+) -> str:
+    """Create a hash representing the execution of a command. This includes the
+    currently installed packages, whatever environment variables have been marked
+    as relevant, and the command.
+    """
+    hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)]
+    hashes.extend(cmd)
+    creation_bytes = "".join(hashes).encode("utf8")
+    return hashlib.md5(creation_bytes).hexdigest()
+
+
+def get_site_hash():
+    """Hash the current Python environment's site-packages contents, including
+    the name and version of the libraries. The list we're hashing is what
+    `pip freeze` would output.
+    """
+    site_dirs = site.getsitepackages()
+    if site.ENABLE_USER_SITE:
+        # getusersitepackages() returns a single string, so append, not extend
+        site_dirs.append(site.getusersitepackages())
+    packages = set()
+    for site_dir in site_dirs:
+        site_dir = Path(site_dir)
+        for subpath in site_dir.iterdir():
+            if subpath.parts[-1].endswith("dist-info"):
+                packages.add(subpath.parts[-1].replace(".dist-info", ""))
+    package_bytes = "".join(sorted(packages)).encode("utf8")
+    return hashlib.md5(package_bytes).hexdigest()
+
+
+def get_env_hash(env: Dict[str, str]) -> str:
+    """Construct a hash of the environment variables that will be passed into
+    the commands.
+
+    Values in the env dict may be references to the current os.environ, using
+    the syntax $ENV_VAR to mean os.environ[ENV_VAR]
+    """
+    env_vars = {}
+    for key, value in env.items():
+        if value.startswith("$"):
+            env_vars[key] = os.environ.get(value[1:], "")
+        else:
+            env_vars[key] = value
+    return get_hash(env_vars)
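The addressing scheme built by `make_url` and `get_command_hash` can be reproduced in a few lines. A sketch with illustrative values (note that `push.py` currently passes empty strings for the site and env hashes):

```python
import hashlib
import urllib.parse

output_path = "training/model-best"          # path relative to the project root
script = ["python train.py"]                 # the command's script lines
dep_checksums = ["d41d8cd98f00b204e9800998ecf8427e"]  # md5 per dependency

# Mirrors get_command_hash("", "", deps, script): site/env hashes are "" for now
hashes = ["", ""] + dep_checksums + script
command_hash = hashlib.md5("".join(hashes).encode("utf8")).hexdigest()
content_hash = "9ab2..."                     # checksum of the archived output (illustrative)

url = "/".join(["s3://my-spacy-bucket", urllib.parse.quote_plus(output_path),
                command_hash, content_hash])
print(url)  # s3://my-spacy-bucket/training%2Fmodel-best/<command-hash>/<content-hash>
```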
spacy/cli/project/run.py
@@ -44,7 +44,6 @@ def project_run(
     dry (bool): Perform a dry run and don't execute commands.
     """
     config = load_project_config(project_dir)
-    variables = config.get("variables", {})
     commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
     workflows = config.get("workflows", {})
     validate_subcommand(commands.keys(), workflows.keys(), subcommand)
@@ -54,22 +53,20 @@ def project_run(
             project_run(project_dir, cmd, force=force, dry=dry)
     else:
         cmd = commands[subcommand]
-        variables = config.get("variables", {})
         for dep in cmd.get("deps", []):
-            dep = dep.format(**variables)
             if not (project_dir / dep).exists():
                 err = f"Missing dependency specified by command '{subcommand}': {dep}"
                 err_kwargs = {"exits": 1} if not dry else {}
                 msg.fail(err, **err_kwargs)
         with working_dir(project_dir) as current_dir:
-            rerun = check_rerun(current_dir, cmd, variables)
+            rerun = check_rerun(current_dir, cmd)
             if not rerun and not force:
                 msg.info(f"Skipping '{cmd['name']}': nothing changed")
             else:
                 msg.divider(subcommand)
-                run_commands(cmd["script"], variables, dry=dry)
+                run_commands(cmd["script"], dry=dry)
                 if not dry:
-                    update_lockfile(current_dir, cmd, variables)
+                    update_lockfile(current_dir, cmd)


 def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
@@ -115,23 +112,15 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:


 def run_commands(
-    commands: List[str] = tuple(),
-    variables: Dict[str, Any] = {},
-    silent: bool = False,
-    dry: bool = False,
+    commands: List[str] = tuple(), silent: bool = False, dry: bool = False,
 ) -> None:
     """Run a sequence of commands in a subprocess, in order.

     commands (List[str]): The string commands.
-    variables (Dict[str, Any]): Dictionary of variable names, mapped to their
-        values. Will be used to substitute format string variables in the
-        commands.
     silent (bool): Don't print the commands.
     dry (bool): Perform a dry run and don't execute anything.
     """
     for command in commands:
-        # Substitute variables, e.g. "./{NAME}.json"
-        command = command.format(**variables)
         command = split_command(command)
         # Not sure if this is needed or a good idea. Motivation: users may often
         # use commands in their config that reference "python" and we want to
@@ -173,15 +162,12 @@ def validate_subcommand(
     )


-def check_rerun(
-    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
-) -> bool:
+def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
     """Check if a command should be rerun because its settings or inputs/outputs
     changed.

     project_dir (Path): The current project directory.
     command (Dict[str, Any]): The command, as defined in the project.yml.
-    variables (Dict[str, Any]): The variables defined in the project.yml.
     RETURNS (bool): Whether to re-run the command.
     """
     lock_path = project_dir / PROJECT_LOCK
@@ -197,19 +183,16 @@ def check_rerun(
     # If the entry in the lockfile matches the lockfile entry that would be
     # generated from the current command, we don't rerun because it means that
     # all inputs/outputs, hashes and scripts are the same and nothing changed
-    return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry)
+    return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)


-def update_lockfile(
-    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
-) -> None:
+def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
     """Update the lockfile after running a command. Will create a lockfile if
     it doesn't yet exist and will add an entry for the current command, its
     script and dependencies/outputs.

     project_dir (Path): The current project directory.
     command (Dict[str, Any]): The command, as defined in the project.yml.
-    variables (Dict[str, Any]): The variables defined in the project.yml.
     """
     lock_path = project_dir / PROJECT_LOCK
     if not lock_path.exists():
@@ -217,13 +200,11 @@ def update_lockfile(
         data = {}
     else:
         data = srsly.read_yaml(lock_path)
-    data[command["name"]] = get_lock_entry(project_dir, command, variables)
+    data[command["name"]] = get_lock_entry(project_dir, command)
     srsly.write_yaml(lock_path, data)


-def get_lock_entry(
-    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
-) -> Dict[str, Any]:
+def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
     """Get a lockfile entry for a given command. An entry includes the command,
     the script (command steps) and a list of dependencies and outputs with
     their paths and file hashes, if available. The format is based on the
@@ -231,12 +212,11 @@ def get_lock_entry(

     project_dir (Path): The current project directory.
     command (Dict[str, Any]): The command, as defined in the project.yml.
-    variables (Dict[str, Any]): The variables defined in the project.yml.
     RETURNS (Dict[str, Any]): The lockfile entry.
     """
-    deps = get_fileinfo(project_dir, command.get("deps", []), variables)
-    outs = get_fileinfo(project_dir, command.get("outputs", []), variables)
-    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables)
+    deps = get_fileinfo(project_dir, command.get("deps", []))
+    outs = get_fileinfo(project_dir, command.get("outputs", []))
+    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
     return {
         "cmd": f"{COMMAND} run {command['name']}",
         "script": command["script"],
@@ -245,20 +225,16 @@ def get_lock_entry(
     }


-def get_fileinfo(
-    project_dir: Path, paths: List[str], variables: Dict[str, Any]
-) -> List[Dict[str, str]]:
+def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, str]]:
     """Generate the file information for a list of paths (dependencies, outputs).
     Includes the file path and the file's checksum.

     project_dir (Path): The current project directory.
     paths (List[str]): The file paths.
-    variables (Dict[str, Any]): The variables defined in the project.yml.
     RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
     """
     data = []
     for path in paths:
-        path = path.format(**variables)
         file_path = project_dir / path
         md5 = get_checksum(file_path) if file_path.exists() else None
         data.append({"path": path, "md5": md5})
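The net effect of dropping `variables` throughout this file is that a command's lockfile entry is fully determined by the command itself, so the rerun check reduces to comparing two hashes. A sketch of that comparison (the digest helper stands in for `spacy.util.get_hash`, assumed here to be a stable digest of the JSON-serialized entry):

```python
import hashlib
import srsly

def entry_digest(entry: dict) -> str:
    # Stand-in for spacy.util.get_hash: stable digest of a JSON-serializable dict
    return hashlib.md5(srsly.json_dumps(entry, sort_keys=True).encode("utf8")).hexdigest()

old_entry = {"cmd": "spacy project run train", "script": ["python train.py"],
             "deps": [{"path": "corpus.json", "md5": "abc123"}]}
new_entry = {"cmd": "spacy project run train", "script": ["python train.py"],
             "deps": [{"path": "corpus.json", "md5": "def456"}]}  # input changed
print(entry_digest(old_entry) != entry_digest(new_entry))  # True -> rerun the command
```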
spacy/schemas.py
@@ -303,7 +303,7 @@ class ProjectConfigCommand(BaseModel):

 class ProjectConfigSchema(BaseModel):
     # fmt: off
-    variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands")
+    vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
     assets: List[ProjectConfigAsset] = Field([], title="Data assets")
     workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
     commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
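With the field loosened from `Union[str, int, float, bool]` to `Any`, nested variable dicts now pass schema validation. A quick sketch using the project's own validation helper (the config values are illustrative):

```python
from spacy.schemas import ProjectConfigSchema, validate

config = {
    "vars": {"model": {"name": "ner_demo", "version": 1}},  # nested dict now allowed
    "commands": [{"name": "train", "script": ["python train.py ${vars.model.name}"]}],
}
errors = validate(ProjectConfigSchema, config)  # returns a list of error strings
assert errors == []
```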
spacy/tests/test_cli.py
@@ -6,9 +6,12 @@ from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.pretrain import make_docs
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
-from spacy.util import get_lang_class
+from spacy.cli._util import load_project_config, substitute_project_variables
+from thinc.config import ConfigValidationError
 import srsly

+from .util import make_tempdir
+

 def test_cli_converters_conllu2json():
     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
@@ -295,6 +298,24 @@ def test_project_config_validation2(config, n_errors):
     assert len(errors) == n_errors


+def test_project_config_interpolation():
+    variables = {"a": 10, "b": {"c": "foo", "d": True}}
+    commands = [
+        {"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]},
+        {"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]},
+    ]
+    project = {"commands": commands, "vars": variables}
+    with make_tempdir() as d:
+        srsly.write_yaml(d / "project.yml", project)
+        cfg = load_project_config(d)
+    assert cfg["commands"][0]["script"][0] == "hello 10 foo"
+    assert cfg["commands"][1]["script"][0] == "foo true"
+    commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}]
+    project = {"commands": commands, "vars": variables}
+    with pytest.raises(ConfigValidationError):
+        substitute_project_variables(project)
+
+
 @pytest.mark.parametrize(
     "args,expected",
     [
spacy/util.py
@@ -1,5 +1,5 @@
 from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple
-from typing import Iterator, Type, Pattern, TYPE_CHECKING
+from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING
 from types import ModuleType
 import os
 import importlib
@@ -610,7 +610,7 @@ def working_dir(path: Union[str, Path]) -> None:


 @contextmanager
-def make_tempdir() -> None:
+def make_tempdir() -> Generator[Path, None, None]:
     """Execute a block in a temporary directory and remove the directory and
     its contents at the end of the with block.
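The corrected return annotation describes the standard `@contextmanager` generator pattern. A minimal self-contained sketch of that pattern (illustrative, not spaCy's exact implementation):

```python
from contextlib import contextmanager
from pathlib import Path
from typing import Generator
import shutil
import tempfile

@contextmanager
def make_tempdir() -> Generator[Path, None, None]:
    """Yield a temporary directory and remove it afterwards."""
    path = Path(tempfile.mkdtemp())
    try:
        yield path
    finally:
        shutil.rmtree(str(path))

with make_tempdir() as tmp:
    (tmp / "example.txt").write_text("hello", encoding="utf8")
```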
website/docs/api/cli.md
@@ -847,6 +847,92 @@ $ python -m spacy project run [subcommand] [project_dir] [--force] [--dry]
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **EXECUTES**   | The command defined in the `project.yml`.                   |

+### project push {#project-push tag="command"}
+
+Upload all available files or directories listed in the `outputs` section of
+commands to a remote storage. Outputs are archived and compressed prior to
+upload, and addressed in the remote storage using the output's relative path
+(URL encoded), a hash of its command string and dependencies, and a hash of its
+file contents. This means `push` should **never overwrite** a file in your
+remote. If all the hashes match, the contents are the same and nothing happens.
+If the contents are different, the new version of the file is uploaded.
+Deleting obsolete files is left up to you.
+
+Remotes can be defined in the `remotes` section of the
+[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
+[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
+communicate with the remote storages, so you can use any protocol that
+`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
+[Google Cloud Storage](https://cloud.google.com/storage), SSH and more,
+although you may need to install extra dependencies to use certain protocols.
+
+```cli
+$ python -m spacy project push [remote] [project_dir]
+```
+
+> #### Example
+>
+> ```cli
+> $ python -m spacy project push my_bucket
+> ```
+>
+> ```yaml
+> ### project.yml
+> remotes:
+>   my_bucket: 's3://my-spacy-bucket'
+> ```
+
+| Name           | Description                                                                              |
+| -------------- | ---------------------------------------------------------------------------------------- |
+| `remote`       | The name of the remote to upload to. Defaults to `"default"`. ~~str (positional)~~       |
+| `project_dir`  | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~                               |
+| **UPLOADS**    | All project outputs that exist and are not already stored in the remote.                 |
+
+### project pull {#project-pull tag="command"}
+
+Download all files or directories listed as `outputs` for commands, unless they
+are already present locally. When searching for files in the remote, `pull`
+won't just look at the output path, but will also consider the **command
+string** and the **hashes of the dependencies**. For instance, let's say you've
+previously pushed a model checkpoint to the remote, but now you've changed some
+hyper-parameters. Because you've changed the inputs to the command, if you run
+`pull`, you won't retrieve the stale result. If you train your model and push
+the outputs to the remote, the outputs will be saved alongside the prior
+outputs, so if you change the config back, you'll be able to fetch back the
+result.
+
+Remotes can be defined in the `remotes` section of the
+[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the
+[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
+communicate with the remote storages, so you can use any protocol that
+`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
+[Google Cloud Storage](https://cloud.google.com/storage), SSH and more,
+although you may need to install extra dependencies to use certain protocols.
+
+```cli
+$ python -m spacy project pull [remote] [project_dir]
+```
+
+> #### Example
+>
+> ```cli
+> $ python -m spacy project pull my_bucket
+> ```
+>
+> ```yaml
+> ### project.yml
+> remotes:
+>   my_bucket: 's3://my-spacy-bucket'
+> ```
+
+| Name           | Description                                                                              |
+| -------------- | ---------------------------------------------------------------------------------------- |
+| `remote`       | The name of the remote to download from. Defaults to `"default"`. ~~str (positional)~~   |
+| `project_dir`  | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~                               |
+| **DOWNLOADS**  | All project outputs that do not exist locally and can be found in the remote.            |
+
 ### project dvc {#project-dvc tag="command"}

 Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls
new file: website/docs/images/projects.svg (91 lines)
(File diff suppressed because one or more lines are too long. Image, 40 KiB.)
website/docs/usage/projects.md
@@ -5,9 +5,12 @@ menu:
   - ['Intro & Workflow', 'intro']
   - ['Directory & Assets', 'directory']
   - ['Custom Projects', 'custom']
+  - ['Remote Storage', 'remote']
   - ['Integrations', 'integrations']
 ---

+## Introduction and workflow {#intro hidden="true"}
+
 > #### 🪐 Project templates
 >
 > Our [`projects`](https://github.com/explosion/projects) repo includes various
@@ -19,20 +22,17 @@ spaCy projects let you manage and share **end-to-end spaCy workflows** for
 different **use cases and domains**, and orchestrate training, packaging and
 serving your custom models. You can start off by cloning a pre-defined project
 template, adjust it to fit your needs, load in your data, train a model, export
-it as a Python package and share the project templates with your team. spaCy
-projects can be used via the new [`spacy project`](/api/cli#project) command.
-For an overview of the available project templates, check out the
-[`projects`](https://github.com/explosion/projects) repo. spaCy projects also
-[integrate](#integrations) with many other cool machine learning and data
-science tools to track and manage your data and experiments, iterate on demos
-and prototypes and ship your models into production.
+it as a Python package, upload your outputs to a remote storage and share your
+results with your team. spaCy projects can be used via the new
+[`spacy project`](/api/cli#project) command and we provide templates in our
+[`projects`](https://github.com/explosion/projects) repo.

 <!-- TODO: mention integrations -->

-## Introduction and workflow {#intro}
-
 <!-- TODO: decide how to introduce concept -->

+![Illustration of project workflow and commands](../images/projects.svg)
+
 <!-- TODO:
 <Project id="some_example_project">
@@ -155,8 +155,8 @@ other. For instance, to generate a packaged model, you might start by converting
 your data, then run [`spacy train`](/api/cli#train) to train your model on the
 converted data and if that's successful, run [`spacy package`](/api/cli#package)
 to turn the best model artifact into an installable Python package. The
-following command runs the workflow named `all` defined in the `project.yml`, and
-executes the commands it specifies, in order:
+following command runs the workflow named `all` defined in the `project.yml`,
+and executes the commands it specifies, in order:

 ```cli
 $ python -m spacy project run all
@@ -171,6 +171,31 @@ advanced data pipelines and track your changes in Git, check out the
 from a workflow defined in your `project.yml` so you can manage your spaCy
 project as a DVC repo.

+### 5. Optional: Push to remote storage {#push}
+
+> ```yaml
+> ### project.yml
+> remotes:
+>   default: 's3://my-spacy-bucket'
+>   local: '/mnt/scratch/cache'
+> ```
+
+After training a model, you can optionally use the
+[`spacy project push`](/api/cli#project-push) command to upload your outputs to
+a remote storage, using protocols like [S3](https://aws.amazon.com/s3/),
+[Google Cloud Storage](https://cloud.google.com/storage) or SSH. This can help
+you **export** your model packages, **share** work with your team, or **cache
+results** to avoid repeating work.
+
+```cli
+$ python -m spacy project push
+```
+
+The `remotes` section in your `project.yml` lets you assign names to the
+different storages. To download state from a remote storage, you can use the
+[`spacy project pull`](/api/cli#project-pull) command. For more details, see the
+docs on [remote storage](#remote).
+
 ## Project directory and assets {#directory}

 ### project.yml {#project-yml}
@@ -190,7 +215,7 @@ https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project.

 | Section       | Description |
 | ------------- | ----------- |
-| `variables`   | A dictionary of variables that can be referenced in paths, URLs and scripts. For example, `{NAME}` will use the value of the variable `NAME`. |
+| `vars`        | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
 | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
 | `assets`      | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. |
 | `workflows`   | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
@@ -349,9 +374,9 @@ if __name__ == "__main__":

 In your `project.yml`, you can then run the script by calling
 `python scripts/custom_evaluation.py` with the function arguments. You can also
-use the `variables` section to define reusable variables that will be
-substituted in commands, paths and URLs. In this example, the `BATCH_SIZE` is
-defined as a variable will be added in place of `{BATCH_SIZE}` in the script.
+use the `vars` section to define reusable variables that will be substituted in
+commands, paths and URLs. In this example, the batch size is defined as a
+variable and will be added in place of `${vars.batch_size}` in the script.

 > #### Calling into Python
 >

@@ -363,13 +388,13 @@ defined as a variable will be added in place of `{BATCH_SIZE}` in the script.
 <!-- prettier-ignore -->
 ```yaml
 ### project.yml
-variables:
-  BATCH_SIZE: 128
+vars:
+  batch_size: 128

 commands:
   - name: evaluate
     script:
-      - 'python scripts/custom_evaluation.py {BATCH_SIZE} ./training/model-best ./corpus/eval.json'
+      - 'python scripts/custom_evaluation.py ${vars.batch_size} ./training/model-best ./corpus/eval.json'
     deps:
       - 'training/model-best'
       - 'corpus/eval.json'

@@ -421,6 +446,114 @@ assets:
     checksum: '5113dc04e03f079525edd8df3f4f39e3'
 ```

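An aside on the `checksum` field above: the 32-character value is consistent with an MD5 digest. A minimal sketch of how such a verification could work — the helper and the asset path are hypothetical, not spaCy's actual code:

```python
import hashlib
from pathlib import Path

def md5_checksum(path: Path) -> str:
    """Compute the MD5 hex digest of a file, reading it in chunks."""
    md5 = hashlib.md5()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            md5.update(chunk)
    return md5.hexdigest()

expected = "5113dc04e03f079525edd8df3f4f39e3"  # checksum from the project.yml above
asset = Path("assets/training.json")  # hypothetical asset destination
if md5_checksum(asset) != expected:
    raise ValueError(f"Checksum mismatch for {asset}")
```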
+## Remote Storage {#remote}
+
+You can persist your project outputs to a remote storage using the
+[`project push`](/api/cli#project-push) command. This can help you **export**
+your model packages, **share** work with your team, or **cache results** to
+avoid repeating work. The [`project pull`](/api/cli#project-pull) command will
+download any outputs that are in the remote storage and aren't available
+locally.
+
+You can list one or more remotes in the `remotes` section of your
+[`project.yml`](#project-yml) by mapping a string name to the URL of the
+storage. Under the hood, spaCy uses the
+[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to
+communicate with the remote storages, so you can use any protocol that
+`smart-open` supports, including [S3](https://aws.amazon.com/s3/),
+[Google Cloud Storage](https://cloud.google.com/storage), SSH and more,
+although you may need to install extra dependencies to use certain protocols
+(for instance, `boto3` for S3).

+> #### Example
+>
+> ```cli
+> $ python -m spacy project pull local
+> ```

+```yaml
+### project.yml
+remotes:
+  default: 's3://my-spacy-bucket'
+  local: '/mnt/scratch/cache'
+  stuff: 'ssh://myserver.example.com/whatever'
+```

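Since transport is delegated to `smart-open`, reading anything back from one of these remotes reduces to that library's `open()` call. A minimal sketch, assuming `smart_open` (and `boto3` for S3) is installed — the object key is made up, since spaCy manages the real paths itself:

```python
from smart_open import open  # deliberately shadows the builtin open

# Hypothetical key inside the "default" remote from the example above
url = "s3://my-spacy-bucket/training%2Fmodel-best/some_hash/another_hash"
with open(url, "rb") as f:
    payload = f.read()
print(f"Fetched {len(payload)} bytes")
```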
+<Infobox title="How it works" emoji="💡">
+
+Inside the remote storage, spaCy uses a clever **directory structure** to avoid
+overwriting files. The top level of the directory structure is a URL-encoded
+version of the output's path. Within this directory are subdirectories named
+according to a hash of the command string and the command's dependencies.
+Finally, within those directories are files, named according to an MD5 hash of
+their contents.
+
+<!-- TODO: update with actual real example? -->
+
+<!-- prettier-ignore -->
+```yaml
+└── urlencoded_file_path            # Path of original file
+    ├── some_command_hash           # Hash of command you ran
+    │   ├── some_content_hash       # Hash of file content
+    │   └── another_content_hash
+    └── another_command_hash
+        └── third_content_hash
+```
+
+</Infobox>

+For instance, let's say you had the following command in your `project.yml`:
+
+```yaml
+### project.yml
+- name: train
+  help: 'Train a spaCy model using the specified corpus and config'
+  script:
+    - 'spacy train ./config.cfg --output training/'
+  deps:
+    - 'corpus/train'
+    - 'corpus/dev'
+    - 'config.cfg'
+  outputs:
+    - 'training/model-best'
+```

+> #### Example
+>
+> ```
+> └── s3://my-spacy-bucket/training%2Fmodel-best
+>     └── 1d8cb33a06cc345ad3761c6050934a1b
+>         └── d8e20c3537a084c5c10d95899fe0b1ff
+> ```

+After you finish training, you run [`project push`](/api/cli#project-push) to
+make sure the `training/model-best` output is saved to remote storage. spaCy
+will then construct a hash from your command script and the listed dependencies,
+`corpus/train`, `corpus/dev` and `config.cfg`, in order to identify the
+execution context of your output. It would then compute an MD5 hash of the
+`training/model-best` directory, and use those three pieces of information to
+construct the storage URL.

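To make the scheme concrete, here is a rough sketch of how such a storage URL could be assembled. This is an illustration under assumptions, not spaCy's actual implementation — in particular, how the command string and its dependencies are combined into the context hash is simplified:

```python
import hashlib
from pathlib import Path
from urllib.parse import quote

def content_hash(path: Path) -> str:
    """MD5 over the contents of a file, or of all files under a directory."""
    md5 = hashlib.md5()
    paths = [path] if path.is_file() else sorted(path.rglob("*"))
    for p in paths:
        if p.is_file():
            md5.update(p.read_bytes())
    return md5.hexdigest()

def storage_url(remote: str, output: str, command: str, deps: list) -> str:
    # Level 1: URL-encoded output path, e.g. "training/model-best" -> "training%2Fmodel-best"
    encoded = quote(output, safe="")
    # Level 2: hash identifying the execution context (command string + dependencies)
    ctx = hashlib.md5(command.encode("utf8"))
    for dep in deps:
        ctx.update(content_hash(Path(dep)).encode("utf8"))
    # Level 3: hash of the output's own contents
    return f"{remote}/{encoded}/{ctx.hexdigest()}/{content_hash(Path(output))}"

print(storage_url(
    "s3://my-spacy-bucket",
    "training/model-best",
    "spacy train ./config.cfg --output training/",
    ["corpus/train", "corpus/dev", "config.cfg"],
))
```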
+```cli
+$ python -m spacy project run train
+$ python -m spacy project push
+```

+If you change the command or one of its dependencies (for instance, by editing
+the [`config.cfg`](/usage/training#config) file to tune the hyperparameters), a
+different creation hash will be calculated, so when you use
+[`project push`](/api/cli#project-push) you won't be overwriting your previous
+file. The system even supports multiple outputs for the same file and the same
+context, which can happen if your training process is not deterministic, or if
+you have dependencies that aren't represented in the command.

+In summary, the [`spacy project`](/api/cli#project) remote storages are designed
+to make a particular set of trade-offs. Priority is placed on **convenience**,
+**correctness** and **avoiding data loss**. You can use
+[`project push`](/api/cli#project-push) freely, as you'll never overwrite remote
+state, and you don't have to come up with names or version numbers. However,
+it's up to you to manage the size of your remote storage, and to remove files
+that are no longer relevant to you.

 ## Integrations {#integrations}

 ### Data Version Control (DVC) {#dvc} <IntegrationLogo name="dvc" title="DVC" width={70} height="auto" align="right" />

@@ -517,16 +650,17 @@ and evaluation set.
 <!-- prettier-ignore -->
 ```yaml
 ### project.yml
-variables:
-  PRODIGY_DATASET: 'ner_articles'
-  PRODIGY_LABELS: 'PERSON,ORG,PRODUCT'
-  PRODIGY_MODEL: 'en_core_web_md'
+vars:
+  prodigy:
+    dataset: 'ner_articles'
+    labels: 'PERSON,ORG,PRODUCT'
+    model: 'en_core_web_md'

 commands:
   - name: annotate
   - script:
-      - 'python -m prodigy ner.correct {PRODIGY_DATASET} ./assets/raw_data.jsonl {PRODIGY_MODEL} --labels {PRODIGY_LABELS}'
-      - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner {PRODIGY_DATASET}'
+      - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ./assets/raw_data.jsonl ${vars.prodigy.model} --labels ${vars.prodigy.labels}'
+      - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}'
       - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
       - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'
   - deps:

@@ -104,11 +104,15 @@ spaCy projects let you manage and share **end-to-end spaCy workflows** for
 different **use cases and domains**, and orchestrate training, packaging and
 serving your custom models. You can start off by cloning a pre-defined project
 template, adjust it to fit your needs, load in your data, train a model, export
-it as a Python package and share the project templates with your team. spaCy
-projects also make it easy to **integrate with other tools** in the data science
-and machine learning ecosystem, including [DVC](/usage/projects#dvc) for data
-version control, [Prodigy](/usage/projects#prodigy) for creating labelled data,
-[Streamlit](/usage/projects#streamlit) for building interactive apps,
+it as a Python package, upload your outputs to a remote storage and share your
+results with your team.
+
+![Illustration of project workflow and commands](../images/projects.svg)
+
+spaCy projects also make it easy to **integrate with other tools** in the data
+science and machine learning ecosystem, including [DVC](/usage/projects#dvc) for
+data version control, [Prodigy](/usage/projects#prodigy) for creating labelled
+data, [Streamlit](/usage/projects#streamlit) for building interactive apps,
 [FastAPI](/usage/projects#fastapi) for serving models in production,
 [Ray](/usage/projects#ray) for parallel training,
 [Weights & Biases](/usage/projects#wandb) for experiment tracking, and more!

@@ -5,6 +5,8 @@ import Icon from './icon'
 import { isString } from './util'
 import classes from '../styles/table.module.sass'

+const FOOT_ROW_REGEX = /^(RETURNS|YIELDS|CREATES|PRINTS|EXECUTES|UPLOADS|DOWNLOADS)/
+
 function isNum(children) {
   return isString(children) && /^\d+[.,]?[\dx]+?(|x|ms|mb|gb|k|m)?$/i.test(children)
 }

@@ -43,7 +45,6 @@ function isDividerRow(children) {
 }

 function isFootRow(children) {
-  const rowRegex = /^(RETURNS|YIELDS|CREATES|PRINTS|EXECUTES)/
   if (children.length && children[0].props.name === 'td') {
     const cellChildren = children[0].props.children
     if (
@@ -52,7 +53,7 @@ function isFootRow(children) {
       cellChildren.props.children &&
       isString(cellChildren.props.children)
     ) {
-      return rowRegex.test(cellChildren.props.children)
+      return FOOT_ROW_REGEX.test(cellChildren.props.children)
     }
   }
   return false