mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Update docs [ci skip]
This commit is contained in:
parent
2e3d067a7b
commit
47acb45850
|
@ -1141,9 +1141,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
|
|||
The `spacy ray` CLI includes commands for parallel and distributed computing via
|
||||
[Ray](https://ray.io).
|
||||
|
||||
<!-- TODO: add links to parallel training docs and project template -->
|
||||
|
||||
<Infobox variant="warning" title="Important note">
|
||||
<Infobox variant="warning">
|
||||
|
||||
To use this command, you need the
|
||||
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
|
||||
|
@ -1155,10 +1153,13 @@ CLI.
|
|||
### ray train {#ray-train tag="command"}
|
||||
|
||||
Train a spaCy pipeline using [Ray](https://ray.io) for parallel training. The
|
||||
command works just like [`spacy train`](/api/cli#train).
|
||||
command works just like [`spacy train`](/api/cli#train). For more details and
|
||||
examples, see the usage guide on
|
||||
[parallel training](/usage/training#parallel-training) and the spaCy project
|
||||
[integration](/usage/projects#ray).
|
||||
|
||||
```cli
|
||||
$ python -m spacy ray train [config_path] [--code-path] [--strategy] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
|
||||
$ python -m spacy ray train [config_path] [--code-path] [--output] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
|
||||
```
|
||||
|
||||
> #### Example
|
||||
|
@ -1171,8 +1172,9 @@ $ python -m spacy ray train [config_path] [--code-path] [--strategy] [--n-worker
|
|||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
|
||||
| `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ |
|
||||
| `--address`, `-a` | Optional address of the Ray cluster. Defaults to `None`. ~~Optional[str] \(option)~~ |
|
||||
| `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--verbose`, `-V` | Display more information for debugging purposes. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
|
|
55
website/docs/images/spacy-ray.svg
Normal file
55
website/docs/images/spacy-ray.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 67 KiB |
|
@ -815,7 +815,7 @@ full embedded visualizer, as well as individual components.
|
|||
> #### Installation
|
||||
>
|
||||
> ```bash
|
||||
> $ pip install "spacy_streamlit>=1.0.0a0"
|
||||
> $ pip install "spacy-streamlit>=1.0.0a0"
|
||||
> ```
|
||||
|
||||
![](../images/spacy-streamlit.png)
|
||||
|
@ -913,7 +913,39 @@ https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main.
|
|||
<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
|
||||
</Infobox>
|
||||
|
||||
<!-- TODO: document -->
|
||||
> #### Installation
|
||||
>
|
||||
> ```cli
|
||||
> $ pip install spacy-ray
|
||||
> # Check that the CLI is registered
|
||||
> $ python -m spacy ray --help
|
||||
> ```
|
||||
|
||||
[Ray](https://ray.io/) is a fast and simple framework for building and running
|
||||
**distributed applications**. You can use Ray for parallel and distributed
|
||||
training with spaCy via our lightweight
|
||||
[`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. If the
|
||||
package is installed in the same environment as spaCy, it will automatically add
|
||||
[`spacy ray`](/api/cli#ray) commands to your spaCy CLI.
|
||||
|
||||
You can integrate [`spacy ray train`](/api/cli#ray-train) into your
|
||||
`project.yml` just like the regular training command:
|
||||
|
||||
<!-- prettier-ignore -->
|
||||
```yaml
|
||||
### project.yml
|
||||
- name: "ray"
|
||||
help: "Train a model via parallel training with Ray"
|
||||
script:
|
||||
- "python -m spacy ray train configs/config.cfg --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy"
|
||||
deps:
|
||||
- "corpus/train.spacy"
|
||||
- "corpus/dev.spacy"
|
||||
```
|
||||
|
||||
<!-- TODO: <Project id="integrations/ray">
|
||||
|
||||
</Project> -->
|
||||
|
||||
---
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ menu:
|
|||
- ['Quickstart', 'quickstart']
|
||||
- ['Config System', 'config']
|
||||
- ['Custom Functions', 'custom-functions']
|
||||
# - ['Parallel Training', 'parallel-training']
|
||||
- ['Parallel Training', 'parallel-training']
|
||||
- ['Internal API', 'api']
|
||||
---
|
||||
|
||||
|
@ -832,6 +832,73 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
|
|||
return create_model(output_width)
|
||||
```
|
||||
|
||||
## Parallel & distributed training with Ray {#parallel-training}
|
||||
|
||||
> #### Installation
|
||||
>
|
||||
> ```cli
|
||||
> $ pip install spacy-ray
|
||||
> # Check that the CLI is registered
|
||||
> $ python -m spacy ray --help
|
||||
> ```
|
||||
|
||||
[Ray](https://ray.io/) is a fast and simple framework for building and running
|
||||
**distributed applications**. You can use Ray to train spaCy on one or more
|
||||
remote machines, potentially speeding up your training process. Parallel
|
||||
training won't always be faster though – it depends on your batch size, models,
|
||||
and hardware.
|
||||
|
||||
<Infobox variant="warning">
|
||||
|
||||
To use Ray with spaCy, you need the
|
||||
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
|
||||
Installing the package will automatically add the `ray` command to the spaCy
|
||||
CLI.
|
||||
|
||||
</Infobox>
|
||||
|
||||
The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
|
||||
[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
|
||||
setup. You can optionally set the `--address` option to point to your Ray
|
||||
cluster. If it's not set, Ray will run locally.
|
||||
|
||||
```cli
|
||||
python -m spacy ray train config.cfg --n-workers 2
|
||||
```
|
||||
|
||||
<!-- TODO: <Project id="integrations/ray">
|
||||
|
||||
</Project> -->
|
||||
|
||||
### How parallel training works {#parallel-training-details}
|
||||
|
||||
Each worker receives a shard of the **data** and builds a copy of the **model
|
||||
and optimizer** from the [`config.cfg`](#config). It also has a communication
|
||||
channel to **pass gradients and parameters** to the other workers. Additionally,
|
||||
each worker is given ownership of a subset of the parameter arrays. Every
|
||||
parameter array is owned by exactly one worker, and the workers are given a
|
||||
mapping so they know which worker owns which parameter.
|
||||
|
||||
![Illustration of setup](../images/spacy-ray.svg)
|
||||
|
||||
As training proceeds, every worker will be computing gradients for **all** of
|
||||
the model parameters. When they compute gradients for parameters they don't own,
|
||||
they'll **send them to the worker** that does own that parameter, along with a
|
||||
version identifier so that the owner can decide whether to discard the
|
||||
gradient. Workers use the gradients they receive and the ones they compute
|
||||
locally to update the parameters they own, and then broadcast the updated array
|
||||
and a new version ID to the other workers.
|
||||
|
||||
This training procedure is **asynchronous** and **non-blocking**. Workers always
|
||||
push their gradient increments and parameter updates, they do not have to pull
|
||||
them and block on the result, so the transfers can happen in the background,
|
||||
overlapped with the actual training work. The workers also do not have to stop
|
||||
and wait for each other ("synchronize") at the start of each batch. This is very
|
||||
useful for spaCy, because spaCy is often trained on long documents, which means
|
||||
**batches can vary in size** significantly. Uneven workloads make synchronous
|
||||
gradient descent inefficient, because if one batch is slow, all of the other
|
||||
workers are stuck waiting for it to complete before they can continue.
|
||||
|
||||
## Internal training API {#api}
|
||||
|
||||
<Infobox variant="warning">
|
||||
|
|
|
@ -34,6 +34,7 @@ to clone and adapt best-practice projects for your own use cases.
|
|||
- [Training & config system](#features-training)
|
||||
- [Custom models](#features-custom-models)
|
||||
- [End-to-end project workflows](#features-projects)
|
||||
- [Parallel training with Ray](#features-parallel-training)
|
||||
- [New built-in components](#features-pipeline-components)
|
||||
- [New custom component API](#features-components)
|
||||
- [Dependency matching](#features-dep-matcher)
|
||||
|
@ -223,6 +224,39 @@ workflows, from data preprocessing to training and packaging your pipeline.
|
|||
|
||||
</Infobox>
|
||||
|
||||
### Parallel and distributed training with Ray {#features-parallel-training}
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```cli
|
||||
> $ pip install spacy-ray
|
||||
> # Check that the CLI is registered
|
||||
> $ python -m spacy ray --help
|
||||
> # Train a pipeline
|
||||
> $ python -m spacy ray train config.cfg --n-workers 2
|
||||
> ```
|
||||
|
||||
[Ray](https://ray.io/) is a fast and simple framework for building and running
|
||||
**distributed applications**. You can use Ray to train spaCy on one or more
|
||||
remote machines, potentially speeding up your training process. The Ray
|
||||
integration is powered by a lightweight extension package,
|
||||
[`spacy-ray`](https://github.com/explosion/spacy-ray), that automatically adds
|
||||
the [`ray`](/api/cli#ray) command to your spaCy CLI if it's installed in the
|
||||
same environment. You can then run [`spacy ray train`](/api/cli#ray-train) for
|
||||
parallel training.
|
||||
|
||||
![Illustration of setup](../images/spacy-ray.svg)
|
||||
|
||||
<Infobox title="Details & Documentation" emoji="📖" list>
|
||||
|
||||
- **Usage:**
|
||||
[Parallel and distributed training](/usage/training#parallel-training),
|
||||
[spaCy Projects integration](/usage/projects#ray)
|
||||
- **CLI:** [`ray`](/api/cli#ray), [`ray train`](/api/cli#ray-train)
|
||||
- **Implementation:** [`spacy-ray`](https://github.com/explosion/spacy-ray)
|
||||
|
||||
</Infobox>
|
||||
|
||||
### New built-in pipeline components {#features-pipeline-components}
|
||||
|
||||
spaCy v3.0 includes several new trainable and rule-based components that you can
|
||||
|
@ -390,6 +424,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
|
|||
| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
|
||||
| [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
|
||||
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
|
||||
| [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. |
|
||||
|
||||
### New and updated documentation {#new-docs}
|
||||
|
||||
|
|
|
@ -1,5 +1,16 @@
|
|||
{
|
||||
"resources": [
|
||||
{
|
||||
"id": "spacy-ray",
|
||||
"title": "spacy-ray",
|
||||
"slogan": "Parallel and distributed training with spaCy and Ray",
|
||||
"description": "[Ray](https://ray.io/) is a fast and simple framework for building and running **distributed applications**. This very lightweight extension package lets you use Ray for parallel and distributed training with spaCy. If `spacy-ray` is installed in the same environment as spaCy, it will automatically add `spacy ray` commands to your spaCy CLI.",
|
||||
"github": "explosion/spacy-ray",
|
||||
"pip": "spacy-ray",
|
||||
"category": ["training"],
|
||||
"author": "Explosion / Anyscale",
|
||||
"thumb": "https://i.imgur.com/7so6ZpS.png"
|
||||
},
|
||||
{
|
||||
"id": "spacy-sentence-bert",
|
||||
"title": "spaCy - sentence-transformers",
|
||||
|
@ -2518,14 +2529,14 @@
|
|||
"description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.",
|
||||
"pip": "cov-bsv",
|
||||
"code_example": [
|
||||
"import cov_bsv",
|
||||
"",
|
||||
"nlp = cov_bsv.load()",
|
||||
"text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
|
||||
"",
|
||||
"print(doc.ents)",
|
||||
"print(doc._.cov_classification)",
|
||||
"cov_bsv.visualize_doc(doc)"
|
||||
"import cov_bsv",
|
||||
"",
|
||||
"nlp = cov_bsv.load()",
|
||||
"text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
|
||||
"",
|
||||
"print(doc.ents)",
|
||||
"print(doc._.cov_classification)",
|
||||
"cov_bsv.visualize_doc(doc)"
|
||||
],
|
||||
"category": ["pipeline", "standalone", "biomedical", "scientific"],
|
||||
"tags": ["clinical", "epidemiology", "covid-19", "surveillance"],
|
||||
|
|
|
@ -108,7 +108,12 @@ function parseArgs(raw) {
|
|||
const isFlag = !args.length || (args[0].length > 1 && args[0].startsWith('-'))
|
||||
result[opt] = isFlag ? true : args.shift()
|
||||
} else {
|
||||
const key = CLI_GROUPS.includes(opt) ? `${opt} ${args.shift()}` : opt
|
||||
let key = opt
|
||||
if (CLI_GROUPS.includes(opt)) {
|
||||
if (args.length && !args[0].startsWith('-')) {
|
||||
key = `${opt} ${args.shift()}`
|
||||
}
|
||||
}
|
||||
result[key] = null
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user