mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Merge pull request #7000 from explosion/feature/project-yml-overrides
Support env vars and CLI overrides for project.yml
This commit is contained in:
commit
6b9026a219
|
@ -16,7 +16,7 @@ import os
|
||||||
|
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
from ..schemas import ProjectConfigSchema, validate
|
||||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||||
from ..util import is_compatible_version, ENV_VARS
|
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
@ -111,26 +111,33 @@ def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
|
||||||
value = "true"
|
value = "true"
|
||||||
else:
|
else:
|
||||||
value = args.pop(0)
|
value = args.pop(0)
|
||||||
# Just like we do in the config, we're calling json.loads on the
|
result[opt] = _parse_override(value)
|
||||||
# values. But since they come from the CLI, it'd be unintuitive to
|
|
||||||
# explicitly mark strings with escaped quotes. So we're working
|
|
||||||
# around that here by falling back to a string if parsing fails.
|
|
||||||
# TODO: improve logic to handle simple types like list of strings?
|
|
||||||
try:
|
|
||||||
result[opt] = srsly.json_loads(value)
|
|
||||||
except ValueError:
|
|
||||||
result[opt] = str(value)
|
|
||||||
else:
|
else:
|
||||||
msg.fail(f"{err}: name should start with --", exits=1)
|
msg.fail(f"{err}: name should start with --", exits=1)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
|
def _parse_override(value: Any) -> Any:
|
||||||
|
# Just like we do in the config, we're calling json.loads on the
|
||||||
|
# values. But since they come from the CLI, it'd be unintuitive to
|
||||||
|
# explicitly mark strings with escaped quotes. So we're working
|
||||||
|
# around that here by falling back to a string if parsing fails.
|
||||||
|
# TODO: improve logic to handle simple types like list of strings?
|
||||||
|
try:
|
||||||
|
return srsly.json_loads(value)
|
||||||
|
except ValueError:
|
||||||
|
return str(value)
|
||||||
|
|
||||||
|
|
||||||
|
def load_project_config(
|
||||||
|
path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
|
||||||
|
) -> Dict[str, Any]:
|
||||||
"""Load the project.yml file from a directory and validate it. Also make
|
"""Load the project.yml file from a directory and validate it. Also make
|
||||||
sure that all directories defined in the config exist.
|
sure that all directories defined in the config exist.
|
||||||
|
|
||||||
path (Path): The path to the project directory.
|
path (Path): The path to the project directory.
|
||||||
interpolate (bool): Whether to substitute project variables.
|
interpolate (bool): Whether to substitute project variables.
|
||||||
|
overrides (Dict[str, Any]): Optional config overrides.
|
||||||
RETURNS (Dict[str, Any]): The loaded project.yml.
|
RETURNS (Dict[str, Any]): The loaded project.yml.
|
||||||
"""
|
"""
|
||||||
config_path = path / PROJECT_FILE
|
config_path = path / PROJECT_FILE
|
||||||
|
@ -154,20 +161,36 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
|
||||||
if not dir_path.exists():
|
if not dir_path.exists():
|
||||||
dir_path.mkdir(parents=True)
|
dir_path.mkdir(parents=True)
|
||||||
if interpolate:
|
if interpolate:
|
||||||
err = "project.yml validation error"
|
err = f"{PROJECT_FILE} validation error"
|
||||||
with show_validation_error(title=err, hint_fill=False):
|
with show_validation_error(title=err, hint_fill=False):
|
||||||
config = substitute_project_variables(config)
|
config = substitute_project_variables(config, overrides)
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|
||||||
def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
|
def substitute_project_variables(
|
||||||
key = "vars"
|
config: Dict[str, Any],
|
||||||
|
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||||
|
key: str = "vars",
|
||||||
|
env_key: str = "env",
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Interpolate variables in the project file using the config system.
|
||||||
|
|
||||||
|
config (Dict[str, Any]): The project config.
|
||||||
|
overrides (Dict[str, Any]): Optional config overrides.
|
||||||
|
key (str): Key containing variables in project config.
|
||||||
|
env_key (str): Key containing environment variable mapping in project config.
|
||||||
|
RETURNS (Dict[str, Any]): The interpolated project config.
|
||||||
|
"""
|
||||||
config.setdefault(key, {})
|
config.setdefault(key, {})
|
||||||
config[key].update(overrides)
|
config.setdefault(env_key, {})
|
||||||
|
# Substitute references to env vars with their values
|
||||||
|
for config_var, env_var in config[env_key].items():
|
||||||
|
config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
|
||||||
# Need to put variables in the top scope again so we can have a top-level
|
# Need to put variables in the top scope again so we can have a top-level
|
||||||
# section "project" (otherwise, a list of commands in the top scope wouldn't)
|
# section "project" (otherwise, a list of commands in the top scope wouldn't)
|
||||||
# be allowed by Thinc's config system
|
# be allowed by Thinc's config system
|
||||||
cfg = Config({"project": config, key: config[key]})
|
cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
|
||||||
|
cfg = Config().from_str(cfg.to_str(), overrides=overrides)
|
||||||
interpolated = cfg.interpolate()
|
interpolated = cfg.interpolate()
|
||||||
return dict(interpolated["project"])
|
return dict(interpolated["project"])
|
||||||
|
|
||||||
|
|
|
@ -3,19 +3,23 @@ from pathlib import Path
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
import sys
|
import sys
|
||||||
import srsly
|
import srsly
|
||||||
|
import typer
|
||||||
|
|
||||||
from ... import about
|
from ... import about
|
||||||
from ...git_info import GIT_VERSION
|
from ...git_info import GIT_VERSION
|
||||||
from ...util import working_dir, run_command, split_command, is_cwd, join_command
|
from ...util import working_dir, run_command, split_command, is_cwd, join_command
|
||||||
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
|
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
|
||||||
from ...util import check_bool_env_var
|
from ...util import check_bool_env_var, SimpleFrozenDict
|
||||||
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
|
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
|
||||||
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
|
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("run")
|
@project_cli.command(
|
||||||
|
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
|
||||||
|
)
|
||||||
def project_run_cli(
|
def project_run_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
|
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||||
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
|
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
|
||||||
|
@ -33,13 +37,15 @@ def project_run_cli(
|
||||||
if show_help or not subcommand:
|
if show_help or not subcommand:
|
||||||
print_run_help(project_dir, subcommand)
|
print_run_help(project_dir, subcommand)
|
||||||
else:
|
else:
|
||||||
project_run(project_dir, subcommand, force=force, dry=dry)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
|
project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
|
||||||
|
|
||||||
|
|
||||||
def project_run(
|
def project_run(
|
||||||
project_dir: Path,
|
project_dir: Path,
|
||||||
subcommand: str,
|
subcommand: str,
|
||||||
*,
|
*,
|
||||||
|
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||||
force: bool = False,
|
force: bool = False,
|
||||||
dry: bool = False,
|
dry: bool = False,
|
||||||
capture: bool = False,
|
capture: bool = False,
|
||||||
|
@ -59,7 +65,7 @@ def project_run(
|
||||||
when you want to turn over execution to the command, and capture=True
|
when you want to turn over execution to the command, and capture=True
|
||||||
when you want to run the command more like a function.
|
when you want to run the command more like a function.
|
||||||
"""
|
"""
|
||||||
config = load_project_config(project_dir)
|
config = load_project_config(project_dir, overrides=overrides)
|
||||||
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||||
workflows = config.get("workflows", {})
|
workflows = config.get("workflows", {})
|
||||||
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
|
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
|
||||||
|
|
|
@ -446,6 +446,7 @@ class ProjectConfigCommand(BaseModel):
|
||||||
class ProjectConfigSchema(BaseModel):
|
class ProjectConfigSchema(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
|
vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
|
||||||
|
env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
|
||||||
assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
|
assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
|
||||||
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
|
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
|
||||||
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
|
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
|
||||||
|
|
|
@ -325,6 +325,23 @@ def test_project_config_interpolation():
|
||||||
substitute_project_variables(project)
|
substitute_project_variables(project)
|
||||||
|
|
||||||
|
|
||||||
|
def test_project_config_interpolation_env():
|
||||||
|
variables = {"a": 10}
|
||||||
|
env_var = "SPACY_TEST_FOO"
|
||||||
|
env_vars = {"foo": env_var}
|
||||||
|
commands = [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}]
|
||||||
|
project = {"commands": commands, "vars": variables, "env": env_vars}
|
||||||
|
with make_tempdir() as d:
|
||||||
|
srsly.write_yaml(d / "project.yml", project)
|
||||||
|
cfg = load_project_config(d)
|
||||||
|
assert cfg["commands"][0]["script"][0] == "hello 10 "
|
||||||
|
os.environ[env_var] = "123"
|
||||||
|
with make_tempdir() as d:
|
||||||
|
srsly.write_yaml(d / "project.yml", project)
|
||||||
|
cfg = load_project_config(d)
|
||||||
|
assert cfg["commands"][0]["script"][0] == "hello 10 123"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"args,expected",
|
"args,expected",
|
||||||
[
|
[
|
||||||
|
|
|
@ -69,9 +69,9 @@ python -m spacy project clone pipelines/tagger_parser_ud
|
||||||
|
|
||||||
By default, the project will be cloned into the current working directory. You
|
By default, the project will be cloned into the current working directory. You
|
||||||
can specify an optional second argument to define the output directory. The
|
can specify an optional second argument to define the output directory. The
|
||||||
`--repo` option lets you define a custom repo to clone from if you don't want
|
`--repo` option lets you define a custom repo to clone from if you don't want to
|
||||||
to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You
|
use the spaCy [`projects`](https://github.com/explosion/projects) repo. You can
|
||||||
can also use any private repo you have access to with Git.
|
also use any private repo you have access to with Git.
|
||||||
|
|
||||||
### 2. Fetch the project assets {#assets}
|
### 2. Fetch the project assets {#assets}
|
||||||
|
|
||||||
|
@ -221,6 +221,7 @@ pipelines.
|
||||||
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
||||||
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
|
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
|
||||||
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
||||||
|
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable that the key `name` is mapped to. |
|
||||||
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
||||||
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
||||||
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
|
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
|
||||||
|
@ -310,8 +311,8 @@ company-internal and not available over the internet. In that case, you can
|
||||||
specify the destination paths and a checksum, and leave out the URL. When your
|
specify the destination paths and a checksum, and leave out the URL. When your
|
||||||
teammates clone and run your project, they can place the files in the respective
|
teammates clone and run your project, they can place the files in the respective
|
||||||
directory themselves. The [`project assets`](/api/cli#project-assets) command
|
directory themselves. The [`project assets`](/api/cli#project-assets) command
|
||||||
will alert you about missing files and mismatched checksums, so you can ensure that
|
will alert you about missing files and mismatched checksums, so you can ensure
|
||||||
others are running your project with the same data.
|
that others are running your project with the same data.
|
||||||
|
|
||||||
### Dependencies and outputs {#deps-outputs}
|
### Dependencies and outputs {#deps-outputs}
|
||||||
|
|
||||||
|
@ -358,9 +359,10 @@ graphs based on the dependencies and outputs, and won't re-run previous steps
|
||||||
automatically. For instance, if you only run the command `train` that depends on
|
automatically. For instance, if you only run the command `train` that depends on
|
||||||
data created by `preprocess` and those files are missing, spaCy will show an
|
data created by `preprocess` and those files are missing, spaCy will show an
|
||||||
error – it won't just re-run `preprocess`. If you're looking for more advanced
|
error – it won't just re-run `preprocess`. If you're looking for more advanced
|
||||||
data management, check out the [Data Version Control (DVC) integration](#dvc). If you're planning on integrating your spaCy project with DVC, you
|
data management, check out the [Data Version Control (DVC) integration](#dvc).
|
||||||
can also use `outputs_no_cache` instead of `outputs` to define outputs that
|
If you're planning on integrating your spaCy project with DVC, you can also use
|
||||||
won't be cached or tracked.
|
`outputs_no_cache` instead of `outputs` to define outputs that won't be cached
|
||||||
|
or tracked.
|
||||||
|
|
||||||
### Files and directory structure {#project-files}
|
### Files and directory structure {#project-files}
|
||||||
|
|
||||||
|
@ -467,7 +469,9 @@ In your `project.yml`, you can then run the script by calling
|
||||||
`python scripts/custom_evaluation.py` with the function arguments. You can also
|
`python scripts/custom_evaluation.py` with the function arguments. You can also
|
||||||
use the `vars` section to define reusable variables that will be substituted in
|
use the `vars` section to define reusable variables that will be substituted in
|
||||||
commands, paths and URLs. In this example, the batch size is defined as a
|
commands, paths and URLs. In this example, the batch size is defined as a
|
||||||
variable will be added in place of `${vars.batch_size}` in the script.
|
variable and will be added in place of `${vars.batch_size}` in the script. Just like
|
||||||
|
in the [training config](/usage/training#config-overrides), you can also
|
||||||
|
override settings on the command line – for example using `--vars.batch_size`.
|
||||||
|
|
||||||
> #### Calling into Python
|
> #### Calling into Python
|
||||||
>
|
>
|
||||||
|
@ -491,6 +495,29 @@ commands:
|
||||||
- 'corpus/eval.json'
|
- 'corpus/eval.json'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
You can also use the `env` section to reference **environment variables** and
|
||||||
|
make their values available to the commands. This can be useful for overriding
|
||||||
|
settings on the command line and passing through system-level settings.
|
||||||
|
|
||||||
|
> #### Usage example
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> export GPU_ID=1
|
||||||
|
> BATCH_SIZE=128 python -m spacy project run evaluate
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
### project.yml
|
||||||
|
env:
|
||||||
|
batch_size: BATCH_SIZE
|
||||||
|
gpu_id: GPU_ID
|
||||||
|
|
||||||
|
commands:
|
||||||
|
- name: evaluate
|
||||||
|
script:
|
||||||
|
- 'python scripts/custom_evaluation.py ${env.batch_size}'
|
||||||
|
```
|
||||||
|
|
||||||
### Documenting your project {#custom-docs}
|
### Documenting your project {#custom-docs}
|
||||||
|
|
||||||
> #### Readme Example
|
> #### Readme Example
|
||||||
|
|
Loading…
Reference in New Issue
Block a user