Merge pull request #7000 from explosion/feature/project-yml-overrides

Support env vars and CLI overrides for project.yml
This commit is contained in:
Ines Montani 2021-02-11 12:31:45 +11:00 committed by GitHub
commit 6b9026a219
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 105 additions and 31 deletions

View File

@ -16,7 +16,7 @@ import os
from ..schemas import ProjectConfigSchema, validate from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import is_compatible_version, ENV_VARS from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
from .. import about from .. import about
if TYPE_CHECKING: if TYPE_CHECKING:
@ -111,26 +111,33 @@ def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
value = "true" value = "true"
else: else:
value = args.pop(0) value = args.pop(0)
# Just like we do in the config, we're calling json.loads on the result[opt] = _parse_override(value)
# values. But since they come from the CLI, it'd be unintuitive to
# explicitly mark strings with escaped quotes. So we're working
# around that here by falling back to a string if parsing fails.
# TODO: improve logic to handle simple types like list of strings?
try:
result[opt] = srsly.json_loads(value)
except ValueError:
result[opt] = str(value)
else: else:
msg.fail(f"{err}: name should start with --", exits=1) msg.fail(f"{err}: name should start with --", exits=1)
return result return result
def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: def _parse_override(value: Any) -> Any:
# Just like we do in the config, we're calling json.loads on the
# values. But since they come from the CLI, it'd be unintuitive to
# explicitly mark strings with escaped quotes. So we're working
# around that here by falling back to a string if parsing fails.
# TODO: improve logic to handle simple types like list of strings?
try:
return srsly.json_loads(value)
except ValueError:
return str(value)
def load_project_config(
path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
) -> Dict[str, Any]:
"""Load the project.yml file from a directory and validate it. Also make """Load the project.yml file from a directory and validate it. Also make
sure that all directories defined in the config exist. sure that all directories defined in the config exist.
path (Path): The path to the project directory. path (Path): The path to the project directory.
interpolate (bool): Whether to substitute project variables. interpolate (bool): Whether to substitute project variables.
overrides (Dict[str, Any]): Optional config overrides.
RETURNS (Dict[str, Any]): The loaded project.yml. RETURNS (Dict[str, Any]): The loaded project.yml.
""" """
config_path = path / PROJECT_FILE config_path = path / PROJECT_FILE
@ -154,20 +161,36 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
if not dir_path.exists(): if not dir_path.exists():
dir_path.mkdir(parents=True) dir_path.mkdir(parents=True)
if interpolate: if interpolate:
err = "project.yml validation error" err = f"{PROJECT_FILE} validation error"
with show_validation_error(title=err, hint_fill=False): with show_validation_error(title=err, hint_fill=False):
config = substitute_project_variables(config) config = substitute_project_variables(config, overrides)
return config return config
def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}): def substitute_project_variables(
key = "vars" config: Dict[str, Any],
overrides: Dict[str, Any] = SimpleFrozenDict(),
key: str = "vars",
env_key: str = "env",
) -> Dict[str, Any]:
"""Interpolate variables in the project file using the config system.
config (Dict[str, Any]): The project config.
overrides (Dict[str, Any]): Optional config overrides.
key (str): Key containing variables in project config.
env_key (str): Key containing environment variable mapping in project config.
RETURNS (Dict[str, Any]): The interpolated project config.
"""
config.setdefault(key, {}) config.setdefault(key, {})
config[key].update(overrides) config.setdefault(env_key, {})
# Substitute references to env vars with their values
for config_var, env_var in config[env_key].items():
config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
# Need to put variables in the top scope again so we can have a top-level # Need to put variables in the top scope again so we can have a top-level
# section "project" (otherwise, a list of commands in the top scope wouldn't) # section "project" (otherwise, a list of commands in the top scope wouldn't)
# be allowed by Thinc's config system # be allowed by Thinc's config system
cfg = Config({"project": config, key: config[key]}) cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
cfg = Config().from_str(cfg.to_str(), overrides=overrides)
interpolated = cfg.interpolate() interpolated = cfg.interpolate()
return dict(interpolated["project"]) return dict(interpolated["project"])

View File

@ -3,19 +3,23 @@ from pathlib import Path
from wasabi import msg from wasabi import msg
import sys import sys
import srsly import srsly
import typer
from ... import about from ... import about
from ...git_info import GIT_VERSION from ...git_info import GIT_VERSION
from ...util import working_dir, run_command, split_command, is_cwd, join_command from ...util import working_dir, run_command, split_command, is_cwd, join_command
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
from ...util import check_bool_env_var from ...util import check_bool_env_var, SimpleFrozenDict
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
@project_cli.command("run") @project_cli.command(
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
def project_run_cli( def project_run_cli(
# fmt: off # fmt: off
ctx: typer.Context, # This is only used to read additional arguments
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"), force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
@ -33,13 +37,15 @@ def project_run_cli(
if show_help or not subcommand: if show_help or not subcommand:
print_run_help(project_dir, subcommand) print_run_help(project_dir, subcommand)
else: else:
project_run(project_dir, subcommand, force=force, dry=dry) overrides = parse_config_overrides(ctx.args)
project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
def project_run( def project_run(
project_dir: Path, project_dir: Path,
subcommand: str, subcommand: str,
*, *,
overrides: Dict[str, Any] = SimpleFrozenDict(),
force: bool = False, force: bool = False,
dry: bool = False, dry: bool = False,
capture: bool = False, capture: bool = False,
@ -59,7 +65,7 @@ def project_run(
when you want to turn over execution to the command, and capture=True when you want to turn over execution to the command, and capture=True
when you want to run the command more like a function. when you want to run the command more like a function.
""" """
config = load_project_config(project_dir) config = load_project_config(project_dir, overrides=overrides)
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
workflows = config.get("workflows", {}) workflows = config.get("workflows", {})
validate_subcommand(commands.keys(), workflows.keys(), subcommand) validate_subcommand(commands.keys(), workflows.keys(), subcommand)

View File

@ -446,6 +446,7 @@ class ProjectConfigCommand(BaseModel):
class ProjectConfigSchema(BaseModel): class ProjectConfigSchema(BaseModel):
# fmt: off # fmt: off
vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands") vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets") assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")

View File

@ -325,6 +325,23 @@ def test_project_config_interpolation():
substitute_project_variables(project) substitute_project_variables(project)
def test_project_config_interpolation_env():
variables = {"a": 10}
env_var = "SPACY_TEST_FOO"
env_vars = {"foo": env_var}
commands = [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}]
project = {"commands": commands, "vars": variables, "env": env_vars}
with make_tempdir() as d:
srsly.write_yaml(d / "project.yml", project)
cfg = load_project_config(d)
assert cfg["commands"][0]["script"][0] == "hello 10 "
os.environ[env_var] = "123"
with make_tempdir() as d:
srsly.write_yaml(d / "project.yml", project)
cfg = load_project_config(d)
assert cfg["commands"][0]["script"][0] == "hello 10 123"
@pytest.mark.parametrize( @pytest.mark.parametrize(
"args,expected", "args,expected",
[ [

View File

@ -69,9 +69,9 @@ python -m spacy project clone pipelines/tagger_parser_ud
By default, the project will be cloned into the current working directory. You By default, the project will be cloned into the current working directory. You
can specify an optional second argument to define the output directory. The can specify an optional second argument to define the output directory. The
`--repo` option lets you define a custom repo to clone from if you don't want `--repo` option lets you define a custom repo to clone from if you don't want to
to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You use the spaCy [`projects`](https://github.com/explosion/projects) repo. You can
can also use any private repo you have access to with Git. also use any private repo you have access to with Git.
### 2. Fetch the project assets {#assets} ### 2. Fetch the project assets {#assets}
@ -221,6 +221,7 @@ pipelines.
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | | `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
| `description` | An optional project description used in [auto-generated docs](#custom-docs). | | `description` | An optional project description used in [auto-generated docs](#custom-docs). |
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | | `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. | | `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
@ -310,8 +311,8 @@ company-internal and not available over the internet. In that case, you can
specify the destination paths and a checksum, and leave out the URL. When your specify the destination paths and a checksum, and leave out the URL. When your
teammates clone and run your project, they can place the files in the respective teammates clone and run your project, they can place the files in the respective
directory themselves. The [`project assets`](/api/cli#project-assets) command directory themselves. The [`project assets`](/api/cli#project-assets) command
will alert you about missing files and mismatched checksums, so you can ensure that will alert you about missing files and mismatched checksums, so you can ensure
others are running your project with the same data. that others are running your project with the same data.
### Dependencies and outputs {#deps-outputs} ### Dependencies and outputs {#deps-outputs}
@ -358,9 +359,10 @@ graphs based on the dependencies and outputs, and won't re-run previous steps
automatically. For instance, if you only run the command `train` that depends on automatically. For instance, if you only run the command `train` that depends on
data created by `preprocess` and those files are missing, spaCy will show an data created by `preprocess` and those files are missing, spaCy will show an
error it won't just re-run `preprocess`. If you're looking for more advanced error it won't just re-run `preprocess`. If you're looking for more advanced
data management, check out the [Data Version Control (DVC) integration](#dvc). If you're planning on integrating your spaCy project with DVC, you data management, check out the [Data Version Control (DVC) integration](#dvc).
can also use `outputs_no_cache` instead of `outputs` to define outputs that If you're planning on integrating your spaCy project with DVC, you can also use
won't be cached or tracked. `outputs_no_cache` instead of `outputs` to define outputs that won't be cached
or tracked.
### Files and directory structure {#project-files} ### Files and directory structure {#project-files}
@ -467,7 +469,9 @@ In your `project.yml`, you can then run the script by calling
`python scripts/custom_evaluation.py` with the function arguments. You can also `python scripts/custom_evaluation.py` with the function arguments. You can also
use the `vars` section to define reusable variables that will be substituted in use the `vars` section to define reusable variables that will be substituted in
commands, paths and URLs. In this example, the batch size is defined as a commands, paths and URLs. In this example, the batch size is defined as a
variable will be added in place of `${vars.batch_size}` in the script. variable will be added in place of `${vars.batch_size}` in the script. Just like
in the [training config](/usage/training##config-overrides), you can also
override settings on the command line for example using `--vars.batch_size`.
> #### Calling into Python > #### Calling into Python
> >
@ -491,6 +495,29 @@ commands:
- 'corpus/eval.json' - 'corpus/eval.json'
``` ```
You can also use the `env` section to reference **environment variables** and
make their values available to the commands. This can be useful for overriding
settings on the command line and passing through system-level settings.
> #### Usage example
>
> ```bash
> export GPU_ID=1
> BATCH_SIZE=128 python -m spacy project run evaluate
> ```
```yaml
### project.yml
env:
batch_size: BATCH_SIZE
gpu_id: GPU_ID
commands:
- name: evaluate
script:
- 'python scripts/custom_evaluation.py ${env.batch_size}'
```
### Documenting your project {#custom-docs} ### Documenting your project {#custom-docs}
> #### Readme Example > #### Readme Example