Merge pull request #7000 from explosion/feature/project-yml-overrides

Support env vars and CLI overrides for project.yml
2026-01-09 18:21:14 +03:00 · 2021-02-11 12:31:45 +11:00 · 2021-02-11 12:31:45 +11:00 · 6b9026a219
commit 6b9026a219
parent e21639fa56 c08b3f294c
5 changed files with 105 additions and 31 deletions
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@ -16,7 +16,7 @@ import os

 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, ENV_VARS
+from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
 from .. import about

 if TYPE_CHECKING:
@ -111,26 +111,33 @@ def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
                    value = "true"
                else:
                    value = args.pop(0)
-            # Just like we do in the config, we're calling json.loads on the
-            # values. But since they come from the CLI, it'd be unintuitive to
-            # explicitly mark strings with escaped quotes. So we're working
-            # around that here by falling back to a string if parsing fails.
-            # TODO: improve logic to handle simple types like list of strings?
-            try:
-                result[opt] = srsly.json_loads(value)
-            except ValueError:
-                result[opt] = str(value)
+            result[opt] = _parse_override(value)
        else:
            msg.fail(f"{err}: name should start with --", exits=1)
    return result


-def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
+def _parse_override(value: Any) -> Any:
+    # Just like we do in the config, we're calling json.loads on the
+    # values. But since they come from the CLI, it'd be unintuitive to
+    # explicitly mark strings with escaped quotes. So we're working
+    # around that here by falling back to a string if parsing fails.
+    # TODO: improve logic to handle simple types like list of strings?
+    try:
+        return srsly.json_loads(value)
+    except ValueError:
+        return str(value)
+
+
+def load_project_config(
+    path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
+) -> Dict[str, Any]:
    """Load the project.yml file from a directory and validate it. Also make
    sure that all directories defined in the config exist.

    path (Path): The path to the project directory.
    interpolate (bool): Whether to substitute project variables.
+    overrides (Dict[str, Any]): Optional config overrides.
    RETURNS (Dict[str, Any]): The loaded project.yml.
    """
    config_path = path / PROJECT_FILE
@ -154,20 +161,36 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
        if not dir_path.exists():
            dir_path.mkdir(parents=True)
    if interpolate:
-        err = "project.yml validation error"
+        err = f"{PROJECT_FILE} validation error"
        with show_validation_error(title=err, hint_fill=False):
-            config = substitute_project_variables(config)
+            config = substitute_project_variables(config, overrides)
    return config


-def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
-    key = "vars"
+def substitute_project_variables(
+    config: Dict[str, Any],
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
+    key: str = "vars",
+    env_key: str = "env",
+) -> Dict[str, Any]:
+    """Interpolate variables in the project file using the config system.
+
+    config (Dict[str, Any]): The project config.
+    overrides (Dict[str, Any]): Optional config overrides.
+    key (str): Key containing variables in project config.
+    env_key (str): Key containing environment variable mapping in project config.
+    RETURNS (Dict[str, Any]): The interpolated project config.
+    """
    config.setdefault(key, {})
-    config[key].update(overrides)
+    config.setdefault(env_key, {})
+    # Substitute references to env vars with their values
+    for config_var, env_var in config[env_key].items():
+        config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
    # Need to put variables in the top scope again so we can have a top-level
    # section "project" (otherwise, a list of commands in the top scope wouldn't)
    # be allowed by Thinc's config system
-    cfg = Config({"project": config, key: config[key]})
+    cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
+    cfg = Config().from_str(cfg.to_str(), overrides=overrides)
    interpolated = cfg.interpolate()
    return dict(interpolated["project"])

--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@ -3,19 +3,23 @@ from pathlib import Path
 from wasabi import msg
 import sys
 import srsly
+import typer

 from ... import about
 from ...git_info import GIT_VERSION
 from ...util import working_dir, run_command, split_command, is_cwd, join_command
 from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
-from ...util import check_bool_env_var
+from ...util import check_bool_env_var, SimpleFrozenDict
 from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
-from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
+from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides


-@project_cli.command("run")
+@project_cli.command(
+    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
+)
 def project_run_cli(
    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
@ -33,13 +37,15 @@ def project_run_cli(
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand)
    else:
-        project_run(project_dir, subcommand, force=force, dry=dry)
+        overrides = parse_config_overrides(ctx.args)
+        project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)


 def project_run(
    project_dir: Path,
    subcommand: str,
    *,
+    overrides: Dict[str, Any] = SimpleFrozenDict(),
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
@ -59,7 +65,7 @@ def project_run(
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    """
-    config = load_project_config(project_dir)
+    config = load_project_config(project_dir, overrides=overrides)
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -446,6 +446,7 @@ class ProjectConfigCommand(BaseModel):
 class ProjectConfigSchema(BaseModel):
    # fmt: off
    vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
+    env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
    assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
    workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
    commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -325,6 +325,23 @@ def test_project_config_interpolation():
        substitute_project_variables(project)


+def test_project_config_interpolation_env():
+    variables = {"a": 10}
+    env_var = "SPACY_TEST_FOO"
+    env_vars = {"foo": env_var}
+    commands = [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}]
+    project = {"commands": commands, "vars": variables, "env": env_vars}
+    with make_tempdir() as d:
+        srsly.write_yaml(d / "project.yml", project)
+        cfg = load_project_config(d)
+    assert cfg["commands"][0]["script"][0] == "hello 10 "
+    os.environ[env_var] = "123"
+    with make_tempdir() as d:
+        srsly.write_yaml(d / "project.yml", project)
+        cfg = load_project_config(d)
+    assert cfg["commands"][0]["script"][0] == "hello 10 123"
+
+
@pytest.mark.parametrize(
    "args,expected",
    [
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@ -69,9 +69,9 @@ python -m spacy project clone pipelines/tagger_parser_ud

 By default, the project will be cloned into the current working directory. You
 can specify an optional second argument to define the output directory. The
-`--repo` option lets you define a custom repo to clone from if you don't want
-to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You
-can also use any private repo you have access to with Git.
+`--repo` option lets you define a custom repo to clone from if you don't want to
+use the spaCy [`projects`](https://github.com/explosion/projects) repo. You can
+also use any private repo you have access to with Git.

 ### 2. Fetch the project assets {#assets}

@ -221,6 +221,7 @@ pipelines.
 | `title`         | An optional project title used in `--help` message and [auto-generated docs](#custom-docs).                                                                                                                                                                                                                                                                                                                                                                                                                  |
 | `description`   | An optional project description used in [auto-generated docs](#custom-docs).                                                                                                                                                                                                                                                                                                                                                                                                                                 |
 | `vars`          | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`.                                                                                                                                                |
+| `env`           | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`.                                                                                                                                                                                                                                                                                          |
 | `directories`   | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist.                                                                                                                                                                                                                                                                                                                 |
 | `assets`        | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo.                                                                        |
 | `workflows`     | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command.                                                                                                                                                                                                                                                                                                                                         |
@ -310,8 +311,8 @@ company-internal and not available over the internet. In that case, you can
 specify the destination paths and a checksum, and leave out the URL. When your
 teammates clone and run your project, they can place the files in the respective
 directory themselves. The [`project assets`](/api/cli#project-assets) command
-will alert you about missing files and mismatched checksums, so you can ensure that
-others are running your project with the same data.
+will alert you about missing files and mismatched checksums, so you can ensure
+that others are running your project with the same data.

 ### Dependencies and outputs {#deps-outputs}

@ -358,9 +359,10 @@ graphs based on the dependencies and outputs, and won't re-run previous steps
 automatically. For instance, if you only run the command `train` that depends on
 data created by `preprocess` and those files are missing, spaCy will show an
 error – it won't just re-run `preprocess`. If you're looking for more advanced
-data management, check out the [Data Version Control (DVC) integration](#dvc). If you're planning on integrating your spaCy project with DVC, you
-can also use `outputs_no_cache` instead of `outputs` to define outputs that
-won't be cached or tracked.
+data management, check out the [Data Version Control (DVC) integration](#dvc).
+If you're planning on integrating your spaCy project with DVC, you can also use
+`outputs_no_cache` instead of `outputs` to define outputs that won't be cached
+or tracked.

 ### Files and directory structure {#project-files}

@ -467,7 +469,9 @@ In your `project.yml`, you can then run the script by calling
 `python scripts/custom_evaluation.py` with the function arguments. You can also
 use the `vars` section to define reusable variables that will be substituted in
 commands, paths and URLs. In this example, the batch size is defined as a
-variable will be added in place of `${vars.batch_size}` in the script.
+variable that will be added in place of `${vars.batch_size}` in the script. Just like
+in the [training config](/usage/training#config-overrides), you can also
+override settings on the command line – for example using `--vars.batch_size`.

 > #### Calling into Python
 >
@ -491,6 +495,29 @@ commands:
      - 'corpus/eval.json'
 ```

+You can also use the `env` section to reference **environment variables** and
+make their values available to the commands. This can be useful for overriding
+settings on the command line and passing through system-level settings.
+
+> #### Usage example
+>
+> ```bash
+> export GPU_ID=1
+> BATCH_SIZE=128 python -m spacy project run evaluate
+> ```
+
+```yaml
+### project.yml
+env:
+  batch_size: BATCH_SIZE
+  gpu_id: GPU_ID
+
+commands:
+  - name: evaluate
+    script:
+      - 'python scripts/custom_evaluation.py ${env.batch_size}'
+```
+
 ### Documenting your project {#custom-docs}

 > #### Readme Example