Secondary functionality and documentation

This commit is contained in:
richardpaulhudson 2022-05-10 09:40:10 +02:00
parent e3b4ee7b15
commit a2bd489a8c
4 changed files with 35 additions and 8 deletions

View File

@ -14,8 +14,8 @@ can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-ru
Commands are only re-run if their inputs have changed."""
INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
and will run the specified commands in order. Commands are only re-run if their
inputs have changed."""
and will run the specified commands in order. Commands grouped within square brackets
are run in parallel. Commands are only re-run if their inputs have changed."""
INTRO_ASSETS = f"""The following assets are defined by the project. They can
be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
in the project directory."""
@ -69,7 +69,15 @@ def project_document(
md.add(md.table(data, ["Command", "Description"]))
# Workflows
wfs = config.get("workflows", {}).items()
data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs]
data = []
for n, steps in wfs:
rendered_steps = []
for step in steps:
if isinstance(step, str):
rendered_steps.append(md.code(step))
else:
rendered_steps.append('[' + ', '.join(md.code(p_step) for p_step in step) + ']')
data.append([md.code(n), " → ".join(rendered_steps)])
if data:
md.add(md.title(3, "Workflows", ""))
md.add(INTRO_WORKFLOWS)

View File

@ -1,5 +1,5 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Controk (DVC). https://dvc.org"""
with Data Version Control (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional, Iterable
import subprocess
from pathlib import Path
@ -105,7 +105,13 @@ def update_dvc_config(
dvc_config_path.unlink()
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
for name in workflows[workflow]:
names = []
for cmdOrMultiprocessingGroup in workflows[workflow]:
if isinstance(cmdOrMultiprocessingGroup, str):
names.append(cmdOrMultiprocessingGroup)
else:
names.extend(cmdOrMultiprocessingGroup)
for name in names:
command = config_commands[name]
deps = command.get("deps", [])
outputs = command.get("outputs", [])

View File

@ -154,7 +154,12 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
if help_text:
print(f"\n{help_text}\n")
elif subcommand in workflows:
steps = workflows[subcommand]
steps = []
for cmdOrMultiprocessingGroup in workflows[subcommand]:
if isinstance(cmdOrMultiprocessingGroup, str):
steps.append(cmdOrMultiprocessingGroup)
else:
steps.extend(cmdOrMultiprocessingGroup)
print(f"\nWorkflow consisting of {len(steps)} commands:")
steps_data = [
(f"{i + 1}. {step}", commands[step].get("help", ""))

View File

@ -144,7 +144,7 @@ skipped. You can also set `--force` to force re-running a command, or `--dry` to
perform a "dry run" and see what would happen (without actually running the
script).
### 4. Run a workflow {#run-workfow}
### 4. Run a workflow {#run-workflow}
> #### project.yml
>
@ -153,6 +153,9 @@ script).
> all:
> - preprocess
> - train
> -
> - multiprocessingGroupCommand1
> - multiprocessingGroupCommand2
> - package
> ```
@ -168,6 +171,11 @@ defined in the `project.yml`, and executes the commands it specifies, in order:
$ python -m spacy project run all
```
Sometimes it makes sense to execute two or more commands in parallel. A group
of commands executed at once is known as a multiprocessing group. A multiprocessing
group is defined as a nested list within the workflow, i.e. by indenting the commands
it contains beneath their own list item. You are responsible for making sure that no
deadlocks, race conditions or other issues can arise from the parallel execution.
Using the expected [dependencies and outputs](#deps-outputs) defined in the
commands, spaCy can determine whether to re-run a command (if its inputs or
outputs have changed) or whether to skip it. If you're looking to implement more
@ -231,7 +239,7 @@ pipelines.
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Nested lists represent groups of commands to execute concurrently. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the files the command depends on and the files it produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
| `spacy_version` | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. If it's loaded with an incompatible version, an error is raised when the project is loaded. |