mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-10 15:14:56 +03:00
Secondary functionality and documentation
This commit is contained in:
parent
e3b4ee7b15
commit
a2bd489a8c
|
@ -14,8 +14,8 @@ can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-ru
|
|||
Commands are only re-run if their inputs have changed."""
|
||||
INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
|
||||
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
|
||||
and will run the specified commands in order. Commands are only re-run if their
|
||||
inputs have changed."""
|
||||
and will run the specified commands in order. Commands grouped within square brackets
|
||||
are run in parallel. Commands are only re-run if their inputs have changed."""
|
||||
INTRO_ASSETS = f"""The following assets are defined by the project. They can
|
||||
be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
|
||||
in the project directory."""
|
||||
|
@ -69,7 +69,15 @@ def project_document(
|
|||
md.add(md.table(data, ["Command", "Description"]))
|
||||
# Workflows
|
||||
wfs = config.get("workflows", {}).items()
|
||||
data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs]
|
||||
data = []
|
||||
for n, steps in wfs:
|
||||
rendered_steps = []
|
||||
for step in steps:
|
||||
if isinstance(step, str):
|
||||
rendered_steps.append(md.code(step))
|
||||
else:
|
||||
rendered_steps.append('[' + ', '.join(md.code(p_step) for p_step in step) + ']')
|
||||
data.append([md.code(n), " → ".join(rendered_steps)])
|
||||
if data:
|
||||
md.add(md.title(3, "Workflows", "⏭"))
|
||||
md.add(INTRO_WORKFLOWS)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
"""This module contains helpers and subcommands for integrating spaCy projects
|
||||
with Data Version Controk (DVC). https://dvc.org"""
|
||||
with Data Version Control (DVC). https://dvc.org"""
|
||||
from typing import Dict, Any, List, Optional, Iterable
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
@ -105,7 +105,13 @@ def update_dvc_config(
|
|||
dvc_config_path.unlink()
|
||||
dvc_commands = []
|
||||
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||
for name in workflows[workflow]:
|
||||
names = []
|
||||
for cmdOrMultiprocessingGroup in workflows[workflow]:
|
||||
if isinstance(cmdOrMultiprocessingGroup, str):
|
||||
names.append(cmdOrMultiprocessingGroup)
|
||||
else:
|
||||
names.extend(cmdOrMultiprocessingGroup)
|
||||
for name in names:
|
||||
command = config_commands[name]
|
||||
deps = command.get("deps", [])
|
||||
outputs = command.get("outputs", [])
|
||||
|
|
|
@ -154,7 +154,12 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
|||
if help_text:
|
||||
print(f"\n{help_text}\n")
|
||||
elif subcommand in workflows:
|
||||
steps = workflows[subcommand]
|
||||
steps = []
|
||||
for cmdOrMultiprocessingGroup in workflows[subcommand]:
|
||||
if isinstance(cmdOrMultiprocessingGroup, str):
|
||||
steps.append(cmdOrMultiprocessingGroup)
|
||||
else:
|
||||
steps.extend(cmdOrMultiprocessingGroup)
|
||||
print(f"\nWorkflow consisting of {len(steps)} commands:")
|
||||
steps_data = [
|
||||
(f"{i + 1}. {step}", commands[step].get("help", ""))
|
||||
|
|
|
@ -144,7 +144,7 @@ skipped. You can also set `--force` to force re-running a command, or `--dry` to
|
|||
perform a "dry run" and see what would happen (without actually running the
|
||||
script).
|
||||
|
||||
### 4. Run a workflow {#run-workfow}
|
||||
### 4. Run a workflow {#run-workflow}
|
||||
|
||||
> #### project.yml
|
||||
>
|
||||
|
@ -153,6 +153,9 @@ script).
|
|||
> all:
|
||||
> - preprocess
|
||||
> - train
|
||||
> -
|
||||
> - multiprocessingGroupCommand1
|
||||
> - multiprocessingGroupCommand2
|
||||
> - package
|
||||
> ```
|
||||
|
||||
|
@ -168,6 +171,11 @@ defined in the `project.yml`, and executes the commands it specifies, in order:
|
|||
$ python -m spacy project run all
|
||||
```
|
||||
|
||||
Sometimes it makes sense to execute two or more commands in parallel. A group
|
||||
of commands executed at once is known as a multiprocessing group; a multiprocessing group
|
||||
is defined by nesting the commands it contains in an indented sub-list. You are responsible for making sure that no
|
||||
deadlocks, race conditions or other issues can arise from the parallel execution.
|
||||
|
||||
Using the expected [dependencies and outputs](#deps-outputs) defined in the
|
||||
commands, spaCy can determine whether to re-run a command (if its inputs or
|
||||
outputs have changed) or whether to skip it. If you're looking to implement more
|
||||
|
@ -231,7 +239,7 @@ pipelines.
|
|||
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
|
||||
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
||||
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
||||
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
|
||||
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Nested lists represent groups of commands to execute concurrently. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
|
||||
| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
|
||||
| `spacy_version` | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. If it's loaded with an incompatible version, an error is raised when the project is loaded. |
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user