Merge pull request #7000 from explosion/feature/project-yml-overrides

Support env vars and CLI overrides for project.yml
This commit is contained in:
Ines Montani 2021-02-11 12:31:45 +11:00 committed by GitHub
commit 6b9026a219
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 105 additions and 31 deletions

View File

@ -16,7 +16,7 @@ import os
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import is_compatible_version, ENV_VARS
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
from .. import about
if TYPE_CHECKING:
@ -111,26 +111,33 @@ def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
value = "true"
else:
value = args.pop(0)
result[opt] = _parse_override(value)
else:
msg.fail(f"{err}: name should start with --", exits=1)
return result
def _parse_override(value: Any) -> Any:
# Just like we do in the config, we're calling json.loads on the
# values. But since they come from the CLI, it'd be unintuitive to
# explicitly mark strings with escaped quotes. So we're working
# around that here by falling back to a string if parsing fails.
# TODO: improve logic to handle simple types like list of strings?
try:
result[opt] = srsly.json_loads(value)
return srsly.json_loads(value)
except ValueError:
result[opt] = str(value)
else:
msg.fail(f"{err}: name should start with --", exits=1)
return result
return str(value)
def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
def load_project_config(
path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
) -> Dict[str, Any]:
"""Load the project.yml file from a directory and validate it. Also make
sure that all directories defined in the config exist.
path (Path): The path to the project directory.
interpolate (bool): Whether to substitute project variables.
overrides (Dict[str, Any]): Optional config overrides.
RETURNS (Dict[str, Any]): The loaded project.yml.
"""
config_path = path / PROJECT_FILE
@ -154,20 +161,36 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
if not dir_path.exists():
dir_path.mkdir(parents=True)
if interpolate:
err = "project.yml validation error"
err = f"{PROJECT_FILE} validation error"
with show_validation_error(title=err, hint_fill=False):
config = substitute_project_variables(config)
config = substitute_project_variables(config, overrides)
return config
def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
key = "vars"
def substitute_project_variables(
config: Dict[str, Any],
overrides: Dict[str, Any] = SimpleFrozenDict(),
key: str = "vars",
env_key: str = "env",
) -> Dict[str, Any]:
"""Interpolate variables in the project file using the config system.
config (Dict[str, Any]): The project config.
overrides (Dict[str, Any]): Optional config overrides.
key (str): Key containing variables in project config.
env_key (str): Key containing environment variable mapping in project config.
RETURNS (Dict[str, Any]): The interpolated project config.
"""
config.setdefault(key, {})
config[key].update(overrides)
config.setdefault(env_key, {})
# Substitute references to env vars with their values
for config_var, env_var in config[env_key].items():
config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
# Need to put variables in the top scope again so we can have a top-level
# section "project" (otherwise, a list of commands in the top scope wouldn't
# be allowed by Thinc's config system)
cfg = Config({"project": config, key: config[key]})
cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
cfg = Config().from_str(cfg.to_str(), overrides=overrides)
interpolated = cfg.interpolate()
return dict(interpolated["project"])

View File

@ -3,19 +3,23 @@ from pathlib import Path
from wasabi import msg
import sys
import srsly
import typer
from ... import about
from ...git_info import GIT_VERSION
from ...util import working_dir, run_command, split_command, is_cwd, join_command
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
from ...util import check_bool_env_var
from ...util import check_bool_env_var, SimpleFrozenDict
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
@project_cli.command("run")
@project_cli.command(
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
def project_run_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
@ -33,13 +37,15 @@ def project_run_cli(
if show_help or not subcommand:
print_run_help(project_dir, subcommand)
else:
project_run(project_dir, subcommand, force=force, dry=dry)
overrides = parse_config_overrides(ctx.args)
project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
def project_run(
project_dir: Path,
subcommand: str,
*,
overrides: Dict[str, Any] = SimpleFrozenDict(),
force: bool = False,
dry: bool = False,
capture: bool = False,
@ -59,7 +65,7 @@ def project_run(
when you want to turn over execution to the command, and capture=True
when you want to run the command more like a function.
"""
config = load_project_config(project_dir)
config = load_project_config(project_dir, overrides=overrides)
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
workflows = config.get("workflows", {})
validate_subcommand(commands.keys(), workflows.keys(), subcommand)

View File

@ -446,6 +446,7 @@ class ProjectConfigCommand(BaseModel):
class ProjectConfigSchema(BaseModel):
# fmt: off
vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")

View File

@ -325,6 +325,23 @@ def test_project_config_interpolation():
substitute_project_variables(project)
def test_project_config_interpolation_env():
    """Check that `env` entries in project.yml are filled from os.environ.

    The project maps the config variable `foo` to the environment variable
    SPACY_TEST_FOO. The script is loaded once with the variable unset (the
    reference is expected to interpolate to an empty string) and once with it
    set to "123".
    """
    variables = {"a": 10}
    env_var = "SPACY_TEST_FOO"
    env_vars = {"foo": env_var}
    commands = [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}]
    project = {"commands": commands, "vars": variables, "env": env_vars}
    # Start from a known state (variable unset) and remember any value that
    # was already there, so the test neither depends on nor leaks
    # process-wide environment state.
    previous = os.environ.pop(env_var, None)
    try:
        with make_tempdir() as d:
            srsly.write_yaml(d / "project.yml", project)
            cfg = load_project_config(d)
        # Unset environment variable: the reference becomes an empty string
        assert cfg["commands"][0]["script"][0] == "hello 10 "
        os.environ[env_var] = "123"
        with make_tempdir() as d:
            srsly.write_yaml(d / "project.yml", project)
            cfg = load_project_config(d)
        assert cfg["commands"][0]["script"][0] == "hello 10 123"
    finally:
        # Restore the environment exactly as it was before the test ran
        if previous is None:
            os.environ.pop(env_var, None)
        else:
            os.environ[env_var] = previous
@pytest.mark.parametrize(
"args,expected",
[

View File

@ -69,9 +69,9 @@ python -m spacy project clone pipelines/tagger_parser_ud
By default, the project will be cloned into the current working directory. You
can specify an optional second argument to define the output directory. The
`--repo` option lets you define a custom repo to clone from if you don't want
to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You
can also use any private repo you have access to with Git.
`--repo` option lets you define a custom repo to clone from if you don't want to
use the spaCy [`projects`](https://github.com/explosion/projects) repo. You can
also use any private repo you have access to with Git.
### 2. Fetch the project assets {#assets}
@ -221,6 +221,7 @@ pipelines.
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `env` | A dictionary of variable names, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable that `name` is mapped to. |
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
@ -310,8 +311,8 @@ company-internal and not available over the internet. In that case, you can
specify the destination paths and a checksum, and leave out the URL. When your
teammates clone and run your project, they can place the files in the respective
directory themselves. The [`project assets`](/api/cli#project-assets) command
will alert you about missing files and mismatched checksums, so you can ensure that
others are running your project with the same data.
will alert you about missing files and mismatched checksums, so you can ensure
that others are running your project with the same data.
### Dependencies and outputs {#deps-outputs}
@ -358,9 +359,10 @@ graphs based on the dependencies and outputs, and won't re-run previous steps
automatically. For instance, if you only run the command `train` that depends on
data created by `preprocess` and those files are missing, spaCy will show an
error – it won't just re-run `preprocess`. If you're looking for more advanced
data management, check out the [Data Version Control (DVC) integration](#dvc). If you're planning on integrating your spaCy project with DVC, you
can also use `outputs_no_cache` instead of `outputs` to define outputs that
won't be cached or tracked.
data management, check out the [Data Version Control (DVC) integration](#dvc).
If you're planning on integrating your spaCy project with DVC, you can also use
`outputs_no_cache` instead of `outputs` to define outputs that won't be cached
or tracked.
### Files and directory structure {#project-files}
@ -467,7 +469,9 @@ In your `project.yml`, you can then run the script by calling
`python scripts/custom_evaluation.py` with the function arguments. You can also
use the `vars` section to define reusable variables that will be substituted in
commands, paths and URLs. In this example, the batch size is defined as a
variable will be added in place of `${vars.batch_size}` in the script.
variable and will be added in place of `${vars.batch_size}` in the script. Just
like in the [training config](/usage/training#config-overrides), you can also
override settings on the command line – for example using `--vars.batch_size`.
> #### Calling into Python
>
@ -491,6 +495,29 @@ commands:
- 'corpus/eval.json'
```
You can also use the `env` section to reference **environment variables** and
make their values available to the commands. This can be useful for overriding
settings on the command line and passing through system-level settings.
> #### Usage example
>
> ```bash
> export GPU_ID=1
> BATCH_SIZE=128 python -m spacy project run evaluate
> ```
```yaml
### project.yml
env:
batch_size: BATCH_SIZE
gpu_id: GPU_ID
commands:
- name: evaluate
script:
- 'python scripts/custom_evaluation.py ${env.batch_size}'
```
### Documenting your project {#custom-docs}
> #### Readme Example