mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Merge pull request #7000 from explosion/feature/project-yml-overrides
Support env vars and CLI overrides for project.yml
This commit is contained in:
commit
6b9026a219
|
@ -16,7 +16,7 @@ import os
|
|||
|
||||
from ..schemas import ProjectConfigSchema, validate
|
||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||
from ..util import is_compatible_version, ENV_VARS
|
||||
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
||||
from .. import about
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
@ -111,26 +111,33 @@ def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
|
|||
value = "true"
|
||||
else:
|
||||
value = args.pop(0)
|
||||
# Just like we do in the config, we're calling json.loads on the
|
||||
# values. But since they come from the CLI, it'd be unintuitive to
|
||||
# explicitly mark strings with escaped quotes. So we're working
|
||||
# around that here by falling back to a string if parsing fails.
|
||||
# TODO: improve logic to handle simple types like list of strings?
|
||||
try:
|
||||
result[opt] = srsly.json_loads(value)
|
||||
except ValueError:
|
||||
result[opt] = str(value)
|
||||
result[opt] = _parse_override(value)
|
||||
else:
|
||||
msg.fail(f"{err}: name should start with --", exits=1)
|
||||
return result
|
||||
|
||||
|
||||
def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
|
||||
def _parse_override(value: Any) -> Any:
|
||||
# Just like we do in the config, we're calling json.loads on the
|
||||
# values. But since they come from the CLI, it'd be unintuitive to
|
||||
# explicitly mark strings with escaped quotes. So we're working
|
||||
# around that here by falling back to a string if parsing fails.
|
||||
# TODO: improve logic to handle simple types like list of strings?
|
||||
try:
|
||||
return srsly.json_loads(value)
|
||||
except ValueError:
|
||||
return str(value)
|
||||
|
||||
|
||||
def load_project_config(
|
||||
path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
|
||||
) -> Dict[str, Any]:
|
||||
"""Load the project.yml file from a directory and validate it. Also make
|
||||
sure that all directories defined in the config exist.
|
||||
|
||||
path (Path): The path to the project directory.
|
||||
interpolate (bool): Whether to substitute project variables.
|
||||
overrides (Dict[str, Any]): Optional config overrides.
|
||||
RETURNS (Dict[str, Any]): The loaded project.yml.
|
||||
"""
|
||||
config_path = path / PROJECT_FILE
|
||||
|
@ -154,20 +161,36 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
|
|||
if not dir_path.exists():
|
||||
dir_path.mkdir(parents=True)
|
||||
if interpolate:
|
||||
err = "project.yml validation error"
|
||||
err = f"{PROJECT_FILE} validation error"
|
||||
with show_validation_error(title=err, hint_fill=False):
|
||||
config = substitute_project_variables(config)
|
||||
config = substitute_project_variables(config, overrides)
|
||||
return config
|
||||
|
||||
|
||||
def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
|
||||
key = "vars"
|
||||
def substitute_project_variables(
|
||||
config: Dict[str, Any],
|
||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||
key: str = "vars",
|
||||
env_key: str = "env",
|
||||
) -> Dict[str, Any]:
|
||||
"""Interpolate variables in the project file using the config system.
|
||||
|
||||
config (Dict[str, Any]): The project config.
|
||||
overrides (Dict[str, Any]): Optional config overrides.
|
||||
key (str): Key containing variables in project config.
|
||||
env_key (str): Key containing environment variable mapping in project config.
|
||||
RETURNS (Dict[str, Any]): The interpolated project config.
|
||||
"""
|
||||
config.setdefault(key, {})
|
||||
config[key].update(overrides)
|
||||
config.setdefault(env_key, {})
|
||||
# Substitute references to env vars with their values
|
||||
for config_var, env_var in config[env_key].items():
|
||||
config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
|
||||
# Need to put variables in the top scope again so we can have a top-level
|
||||
# section "project" (otherwise, a list of commands in the top scope wouldn't)
|
||||
# be allowed by Thinc's config system
|
||||
cfg = Config({"project": config, key: config[key]})
|
||||
cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
|
||||
cfg = Config().from_str(cfg.to_str(), overrides=overrides)
|
||||
interpolated = cfg.interpolate()
|
||||
return dict(interpolated["project"])
|
||||
|
||||
|
|
|
@ -3,19 +3,23 @@ from pathlib import Path
|
|||
from wasabi import msg
|
||||
import sys
|
||||
import srsly
|
||||
import typer
|
||||
|
||||
from ... import about
|
||||
from ...git_info import GIT_VERSION
|
||||
from ...util import working_dir, run_command, split_command, is_cwd, join_command
|
||||
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
|
||||
from ...util import check_bool_env_var
|
||||
from ...util import check_bool_env_var, SimpleFrozenDict
|
||||
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
|
||||
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
|
||||
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
|
||||
|
||||
|
||||
@project_cli.command("run")
|
||||
@project_cli.command(
|
||||
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
|
||||
)
|
||||
def project_run_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
|
||||
|
@ -33,13 +37,15 @@ def project_run_cli(
|
|||
if show_help or not subcommand:
|
||||
print_run_help(project_dir, subcommand)
|
||||
else:
|
||||
project_run(project_dir, subcommand, force=force, dry=dry)
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
|
||||
|
||||
|
||||
def project_run(
|
||||
project_dir: Path,
|
||||
subcommand: str,
|
||||
*,
|
||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||
force: bool = False,
|
||||
dry: bool = False,
|
||||
capture: bool = False,
|
||||
|
@ -59,7 +65,7 @@ def project_run(
|
|||
when you want to turn over execution to the command, and capture=True
|
||||
when you want to run the command more like a function.
|
||||
"""
|
||||
config = load_project_config(project_dir)
|
||||
config = load_project_config(project_dir, overrides=overrides)
|
||||
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||
workflows = config.get("workflows", {})
|
||||
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
|
||||
|
|
|
@ -446,6 +446,7 @@ class ProjectConfigCommand(BaseModel):
|
|||
class ProjectConfigSchema(BaseModel):
|
||||
# fmt: off
|
||||
vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
|
||||
env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
|
||||
assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
|
||||
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
|
||||
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
|
||||
|
|
|
@ -325,6 +325,23 @@ def test_project_config_interpolation():
|
|||
substitute_project_variables(project)
|
||||
|
||||
|
||||
def test_project_config_interpolation_env():
|
||||
variables = {"a": 10}
|
||||
env_var = "SPACY_TEST_FOO"
|
||||
env_vars = {"foo": env_var}
|
||||
commands = [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}]
|
||||
project = {"commands": commands, "vars": variables, "env": env_vars}
|
||||
with make_tempdir() as d:
|
||||
srsly.write_yaml(d / "project.yml", project)
|
||||
cfg = load_project_config(d)
|
||||
assert cfg["commands"][0]["script"][0] == "hello 10 "
|
||||
os.environ[env_var] = "123"
|
||||
with make_tempdir() as d:
|
||||
srsly.write_yaml(d / "project.yml", project)
|
||||
cfg = load_project_config(d)
|
||||
assert cfg["commands"][0]["script"][0] == "hello 10 123"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"args,expected",
|
||||
[
|
||||
|
|
|
@ -69,9 +69,9 @@ python -m spacy project clone pipelines/tagger_parser_ud
|
|||
|
||||
By default, the project will be cloned into the current working directory. You
|
||||
can specify an optional second argument to define the output directory. The
|
||||
`--repo` option lets you define a custom repo to clone from if you don't want
|
||||
to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You
|
||||
can also use any private repo you have access to with Git.
|
||||
`--repo` option lets you define a custom repo to clone from if you don't want to
|
||||
use the spaCy [`projects`](https://github.com/explosion/projects) repo. You can
|
||||
also use any private repo you have access to with Git.
|
||||
|
||||
### 2. Fetch the project assets {#assets}
|
||||
|
||||
|
@ -221,6 +221,7 @@ pipelines.
|
|||
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
||||
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
|
||||
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
||||
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
|
||||
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
||||
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
||||
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
|
||||
|
@ -310,8 +311,8 @@ company-internal and not available over the internet. In that case, you can
|
|||
specify the destination paths and a checksum, and leave out the URL. When your
|
||||
teammates clone and run your project, they can place the files in the respective
|
||||
directory themselves. The [`project assets`](/api/cli#project-assets) command
|
||||
will alert you about missing files and mismatched checksums, so you can ensure that
|
||||
others are running your project with the same data.
|
||||
will alert you about missing files and mismatched checksums, so you can ensure
|
||||
that others are running your project with the same data.
|
||||
|
||||
### Dependencies and outputs {#deps-outputs}
|
||||
|
||||
|
@ -358,9 +359,10 @@ graphs based on the dependencies and outputs, and won't re-run previous steps
|
|||
automatically. For instance, if you only run the command `train` that depends on
|
||||
data created by `preprocess` and those files are missing, spaCy will show an
|
||||
error – it won't just re-run `preprocess`. If you're looking for more advanced
|
||||
data management, check out the [Data Version Control (DVC) integration](#dvc). If you're planning on integrating your spaCy project with DVC, you
|
||||
can also use `outputs_no_cache` instead of `outputs` to define outputs that
|
||||
won't be cached or tracked.
|
||||
data management, check out the [Data Version Control (DVC) integration](#dvc).
|
||||
If you're planning on integrating your spaCy project with DVC, you can also use
|
||||
`outputs_no_cache` instead of `outputs` to define outputs that won't be cached
|
||||
or tracked.
|
||||
|
||||
### Files and directory structure {#project-files}
|
||||
|
||||
|
@ -467,7 +469,9 @@ In your `project.yml`, you can then run the script by calling
|
|||
`python scripts/custom_evaluation.py` with the function arguments. You can also
|
||||
use the `vars` section to define reusable variables that will be substituted in
|
||||
commands, paths and URLs. In this example, the batch size is defined as a
|
||||
variable will be added in place of `${vars.batch_size}` in the script.
|
||||
variable will be added in place of `${vars.batch_size}` in the script. Just like
|
||||
in the [training config](/usage/training##config-overrides), you can also
|
||||
override settings on the command line – for example using `--vars.batch_size`.
|
||||
|
||||
> #### Calling into Python
|
||||
>
|
||||
|
@ -491,6 +495,29 @@ commands:
|
|||
- 'corpus/eval.json'
|
||||
```
|
||||
|
||||
You can also use the `env` section to reference **environment variables** and
|
||||
make their values available to the commands. This can be useful for overriding
|
||||
settings on the command line and passing through system-level settings.
|
||||
|
||||
> #### Usage example
|
||||
>
|
||||
> ```bash
|
||||
> export GPU_ID=1
|
||||
> BATCH_SIZE=128 python -m spacy project run evaluate
|
||||
> ```
|
||||
|
||||
```yaml
|
||||
### project.yml
|
||||
env:
|
||||
batch_size: BATCH_SIZE
|
||||
gpu_id: GPU_ID
|
||||
|
||||
commands:
|
||||
- name: evaluate
|
||||
script:
|
||||
- 'python scripts/custom_evaluation.py ${env.batch_size}'
|
||||
```
|
||||
|
||||
### Documenting your project {#custom-docs}
|
||||
|
||||
> #### Readme Example
|
||||
|
|
Loading…
Reference in New Issue
Block a user