From a2bd489a8cb528b0ff9e10452f0a03a3cde521b4 Mon Sep 17 00:00:00 2001 From: richardpaulhudson Date: Tue, 10 May 2022 09:40:10 +0200 Subject: [PATCH] Secondary functionality and documentation --- spacy/cli/project/document.py | 14 +++++++++++--- spacy/cli/project/dvc.py | 10 ++++++++-- spacy/cli/project/run.py | 7 ++++++- website/docs/usage/projects.md | 12 ++++++++++-- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py index 1ba43a958..3813b083d 100644 --- a/spacy/cli/project/document.py +++ b/spacy/cli/project/document.py @@ -14,8 +14,8 @@ can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-ru Commands are only re-run if their inputs have changed.""" INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run) -and will run the specified commands in order. Commands are only re-run if their -inputs have changed.""" +and will run the specified commands in order. Commands grouped within square brackets +are run in parallel. Commands are only re-run if their inputs have changed.""" INTRO_ASSETS = f"""The following assets are defined by the project. 
They can be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets) in the project directory.""" @@ -69,7 +69,15 @@ def project_document( md.add(md.table(data, ["Command", "Description"])) # Workflows wfs = config.get("workflows", {}).items() - data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs] + data = [] + for n, steps in wfs: + rendered_steps = [] + for step in steps: + if isinstance(step, str): + rendered_steps.append(md.code(step)) + else: + rendered_steps.append('[' + ', '.join(md.code(p_step) for p_step in step) + ']') + data.append([md.code(n), " → ".join(rendered_steps)]) if data: md.add(md.title(3, "Workflows", "⏭")) md.add(INTRO_WORKFLOWS) diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py index 83dc5efbf..b03a95635 100644 --- a/spacy/cli/project/dvc.py +++ b/spacy/cli/project/dvc.py @@ -1,5 +1,5 @@ """This module contains helpers and subcommands for integrating spaCy projects -with Data Version Controk (DVC). https://dvc.org""" +with Data Version Control (DVC). 
https://dvc.org""" from typing import Dict, Any, List, Optional, Iterable import subprocess from pathlib import Path @@ -105,7 +105,13 @@ def update_dvc_config( dvc_config_path.unlink() dvc_commands = [] config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} - for name in workflows[workflow]: + names = [] + for cmdOrMultiprocessingGroup in workflows[workflow]: + if isinstance(cmdOrMultiprocessingGroup, str): + names.append(cmdOrMultiprocessingGroup) + else: + names.extend(cmdOrMultiprocessingGroup) + for name in names: command = config_commands[name] deps = command.get("deps", []) outputs = command.get("outputs", []) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index da1bdef90..8f04006b5 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -154,7 +154,12 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: if help_text: print(f"\n{help_text}\n") elif subcommand in workflows: - steps = workflows[subcommand] + steps = [] + for cmdOrMultiprocessingGroup in workflows[subcommand]: + if isinstance(cmdOrMultiprocessingGroup, str): + steps.append(cmdOrMultiprocessingGroup) + else: + steps.extend(cmdOrMultiprocessingGroup) print(f"\nWorkflow consisting of {len(steps)} commands:") steps_data = [ (f"{i + 1}. {step}", commands[step].get("help", "")) diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 57d226913..839d589b0 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -144,7 +144,7 @@ skipped. You can also set `--force` to force re-running a command, or `--dry` to perform a "dry run" and see what would happen (without actually running the script). -### 4. Run a workflow {#run-workfow} +### 4. Run a workflow {#run-workflow} > #### project.yml > @@ -153,6 +153,9 @@ script). 
> all: > - preprocess > - train +> - +> - multiprocessingGroupCommand1 +> - multiprocessingGroupCommand2 > - package > ``` @@ -168,6 +171,11 @@ defined in the `project.yml`, and executes the commands it specifies, in order: $ python -m spacy project run all ``` +Sometimes it makes sense to execute two or more commands in parallel. A group +of commands executed at once is known as a multiprocessing group; a multiprocessing group +is defined as a nested list within the workflow, written by indenting the commands it contains beneath a bare `-` entry. You are responsible for making sure that no +deadlocks, race conditions or other issues can arise from the parallel execution. + Using the expected [dependencies and outputs](#deps-outputs) defined in the commands, spaCy can determine whether to re-run a command (if its inputs or outputs have changed) or whether to skip it. If you're looking to implement more @@ -231,7 +239,7 @@ pipelines. | `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | -| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. 
| +| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Nested lists represent groups of commands to execute concurrently. Workflows can be run with the [`project run`](/api/cli#project-run) command. | | `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. | | `spacy_version` | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. If it's loaded with an incompatible version, an error is raised when the project is loaded. |