Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26 17:24:41 +03:00)

Update docs [ci skip]

This commit is contained in:
parent 2e3d067a7b
commit 47acb45850
@@ -1141,9 +1141,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
 The `spacy ray` CLI includes commands for parallel and distributed computing via
 [Ray](https://ray.io).
 
-<!-- TODO: add links to parallel training docs and project template -->
-
-<Infobox variant="warning" title="Important note">
+<Infobox variant="warning">
 
 To use this command, you need the
 [`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
@@ -1155,10 +1153,13 @@ CLI.
 ### ray train {#ray-train tag="command"}
 
 Train a spaCy pipeline using [Ray](https://ray.io) for parallel training. The
-command works just like [`spacy train`](/api/cli#train).
+command works just like [`spacy train`](/api/cli#train). For more details and
+examples, see the usage guide on
+[parallel training](/usage/training#parallel-training) and the spaCy project
+[integration](/usage/projects#ray).
 
 ```cli
-$ python -m spacy ray train [config_path] [--code-path] [--strategy] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
+$ python -m spacy ray train [config_path] [--code-path] [--output] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
 ```
 
 > #### Example
@@ -1171,8 +1172,9 @@ $ python -m spacy ray train [config_path] [--code-path] [--strategy] [--n-worker
 | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`       | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
 | `--code`, `-c`      | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--output`, `-o`    | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(option)~~ |
 | `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ |
-| `--address`, `-a`   | Optional address of the Ray cluster. Defaults to `None`. ~~Optional[str] \(option)~~ |
+| `--address`, `-a`   | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ |
 | `--gpu-id`, `-g`    | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
 | `--verbose`, `-V`   | Display more information for debugging purposes. ~~bool (flag)~~ |
 | `--help`, `-h`      | Show help message and available arguments. ~~bool (flag)~~ |
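The options above map directly onto a command-line invocation. As a minimal sketch, with placeholder values for the config path, output directory and worker count, and assuming `spacy-ray` is installed in the same environment as spaCy:

```python
# Launch `spacy ray train` programmatically via the CLI. Placeholder paths;
# requires spacy and spacy-ray to be installed.
import subprocess

subprocess.run(
    [
        "python", "-m", "spacy", "ray", "train", "config.cfg",
        "--output", "training/",  # created if it doesn't exist
        "--n-workers", "2",       # number of parallel workers
        "--gpu-id", "-1",         # -1 = train on CPU
    ],
    check=True,
)
```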
website/docs/images/spacy-ray.svg (new file, 55 lines)
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 67 KiB
@@ -815,7 +815,7 @@ full embedded visualizer, as well as individual components.
 > #### Installation
 >
 > ```bash
-> $ pip install "spacy_streamlit>=1.0.0a0"
+> $ pip install "spacy-streamlit>=1.0.0a0"
 > ```
 
 ![](../images/spacy-streamlit.png)
@@ -913,7 +913,39 @@ https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main.
 <Infobox title="This section is still under construction" emoji="🚧" variant="warning">
 </Infobox>
 
-<!-- TODO: document -->
+> #### Installation
+>
+> ```cli
+> $ pip install spacy-ray
+> # Check that the CLI is registered
+> $ python -m spacy ray --help
+> ```
+
+[Ray](https://ray.io/) is a fast and simple framework for building and running
+**distributed applications**. You can use Ray for parallel and distributed
+training with spaCy via our lightweight
+[`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. If the
+package is installed in the same environment as spaCy, it will automatically add
+[`spacy ray`](/api/cli#ray) commands to your spaCy CLI.
+
+You can integrate [`spacy ray train`](/api/cli#ray-train) into your
+`project.yml` just like the regular training command:
+
+<!-- prettier-ignore -->
+```yaml
+### project.yml
+- name: "ray"
+  help: "Train a model via parallel training with Ray"
+  script:
+    - "python -m spacy ray train configs/config.cfg --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy"
+  deps:
+    - "corpus/train.spacy"
+    - "corpus/dev.spacy"
+```
+
+<!-- TODO: <Project id="integrations/ray">
+
+</Project> -->
 
 ---
 
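Once the `ray` command is registered in the project file as above, it runs like any other project workflow step. A small sketch, assuming the `project.yml` snippet above sits in the current working directory:

```python
# Run the "ray" command defined in project.yml, equivalent to typing
# `python -m spacy project run ray` in the project directory.
import subprocess

subprocess.run(["python", "-m", "spacy", "project", "run", "ray"], check=True)
```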
@@ -7,7 +7,7 @@ menu:
   - ['Quickstart', 'quickstart']
   - ['Config System', 'config']
   - ['Custom Functions', 'custom-functions']
-  # - ['Parallel Training', 'parallel-training']
+  - ['Parallel Training', 'parallel-training']
   - ['Internal API', 'api']
 ---
 
@@ -832,6 +832,73 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
     return create_model(output_width)
 ```
 
+## Parallel & distributed training with Ray {#parallel-training}
+
+> #### Installation
+>
+> ```cli
+> $ pip install spacy-ray
+> # Check that the CLI is registered
+> $ python -m spacy ray --help
+> ```
+
+[Ray](https://ray.io/) is a fast and simple framework for building and running
+**distributed applications**. You can use Ray to train spaCy on one or more
+remote machines, potentially speeding up your training process. Parallel
+training won't always be faster though – it depends on your batch size, models,
+and hardware.
+
+<Infobox variant="warning">
+
+To use Ray with spaCy, you need the
+[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
+Installing the package will automatically add the `ray` command to the spaCy
+CLI.
+
+</Infobox>
+
+The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
+[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
+setup. You can optionally set the `--address` option to point to your Ray
+cluster. If it's not set, Ray will run locally.
+
+```cli
+python -m spacy ray train config.cfg --n-workers 2
+```
+
+<!-- TODO: <Project id="integrations/ray">
+
+</Project> -->
+
+### How parallel training works {#parallel-training-details}
+
+Each worker receives a shard of the **data** and builds a copy of the **model
+and optimizer** from the [`config.cfg`](#config). It also has a communication
+channel to **pass gradients and parameters** to the other workers. Additionally,
+each worker is given ownership of a subset of the parameter arrays. Every
+parameter array is owned by exactly one worker, and the workers are given a
+mapping so they know which worker owns which parameter.
+
+![Illustration of setup](../images/spacy-ray.svg)
+
+As training proceeds, every worker will be computing gradients for **all** of
+the model parameters. When they compute gradients for parameters they don't own,
+they'll **send them to the worker** that does own that parameter, along with a
+version identifier so that the owner can decide whether to discard the
+gradient. Workers use the gradients they receive and the ones they compute
+locally to update the parameters they own, and then broadcast the updated array
+and a new version ID to the other workers.
+
+This training procedure is **asynchronous** and **non-blocking**. Workers always
+push their gradient increments and parameter updates, they do not have to pull
+them and block on the result, so the transfers can happen in the background,
+overlapped with the actual training work. The workers also do not have to stop
+and wait for each other ("synchronize") at the start of each batch. This is very
+useful for spaCy, because spaCy is often trained on long documents, which means
+**batches can vary in size** significantly. Uneven workloads make synchronous
+gradient descent inefficient, because if one batch is slow, all of the other
+workers are stuck waiting for it to complete before they can continue.
+
 ## Internal training API {#api}
 
 <Infobox variant="warning">
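The ownership scheme described in the new section lends itself to a small toy illustration. The sketch below is not spacy-ray's actual implementation; the class, method names and shapes are invented. It shows the key ideas: each parameter array has exactly one owning worker, gradients arrive tagged with a version ID, and stale gradients are dropped.

```python
# Toy sketch of asynchronous parameter ownership with version IDs.
# Invented names/shapes; illustrates the scheme, not spacy-ray itself.
import numpy as np


class Worker:
    def __init__(self, rank: int, n_workers: int, param_shapes: dict):
        # Deterministic mapping shared by all workers, so every worker
        # knows which worker owns which parameter array.
        self.owner = {
            name: i % n_workers for i, name in enumerate(sorted(param_shapes))
        }
        # Only materialize the arrays this worker owns.
        self.params = {
            name: np.zeros(shape)
            for name, shape in param_shapes.items()
            if self.owner[name] == rank
        }
        self.versions = {name: 0 for name in self.params}

    def receive_gradient(self, name, grad, version, lr=0.001):
        """Apply a gradient to an owned parameter, dropping stale updates."""
        if version != self.versions[name]:
            return None  # stale: computed against an outdated parameter copy
        self.params[name] -= lr * grad  # simple SGD step for illustration
        self.versions[name] += 1
        # A real system would now broadcast (name, array, new version) to the
        # other workers in the background, without blocking training.
        return self.params[name], self.versions[name]


# Example: worker 0 owns "embed" in a 2-worker setup and applies one update.
w0 = Worker(rank=0, n_workers=2, param_shapes={"embed": (4,), "output": (3,)})
w0.receive_gradient("embed", np.ones(4), version=0)  # accepted, version -> 1
w0.receive_gradient("embed", np.ones(4), version=0)  # stale, discarded
```

Single ownership means each array has exactly one writer, so no locking is needed, and the version check gives owners a cheap way to reject gradients that raced with a newer broadcast.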
@@ -34,6 +34,7 @@ to clone and adapt best-practice projects for your own use cases.
 - [Training & config system](#features-training)
 - [Custom models](#features-custom-models)
 - [End-to-end project workflows](#features-projects)
+- [Parallel training with Ray](#features-parallel-training)
 - [New built-in components](#features-pipeline-components)
 - [New custom component API](#features-components)
 - [Dependency matching](#features-dep-matcher)
@@ -223,6 +224,39 @@ workflows, from data preprocessing to training and packaging your pipeline.
 
 </Infobox>
 
+### Parallel and distributed training with Ray {#features-parallel-training}
+
+> #### Example
+>
+> ```cli
+> $ pip install spacy-ray
+> # Check that the CLI is registered
+> $ python -m spacy ray --help
+> # Train a pipeline
+> $ python -m spacy ray train config.cfg --n-workers 2
+> ```
+
+[Ray](https://ray.io/) is a fast and simple framework for building and running
+**distributed applications**. You can use Ray to train spaCy on one or more
+remote machines, potentially speeding up your training process. The Ray
+integration is powered by a lightweight extension package,
+[`spacy-ray`](https://github.com/explosion/spacy-ray), that automatically adds
+the [`ray`](/api/cli#ray) command to your spaCy CLI if it's installed in the
+same environment. You can then run [`spacy ray train`](/api/cli#ray-train) for
+parallel training.
+
+![Illustration of setup](../images/spacy-ray.svg)
+
+<Infobox title="Details & Documentation" emoji="📖" list>
+
+- **Usage:**
+  [Parallel and distributed training](/usage/training#parallel-training),
+  [spaCy Projects integration](/usage/projects#ray)
+- **CLI:** [`ray`](/api/cli#ray), [`ray train`](/api/cli#ray-train)
+- **Implementation:** [`spacy-ray`](https://github.com/explosion/spacy-ray)
+
+</Infobox>
+
 ### New built-in pipeline components {#features-pipeline-components}
 
 spaCy v3.0 includes several new trainable and rule-based components that you can
@@ -390,6 +424,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
 | [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
 | [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
+| [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. |
 
 ### New and updated documentation {#new-docs}
 
@@ -1,5 +1,16 @@
 {
     "resources": [
+        {
+            "id": "spacy-ray",
+            "title": "spacy-ray",
+            "slogan": "Parallel and distributed training with spaCy and Ray",
+            "description": "[Ray](https://ray.io/) is a fast and simple framework for building and running **distributed applications**. This very lightweight extension package lets you use Ray for parallel and distributed training with spaCy. If `spacy-ray` is installed in the same environment as spaCy, it will automatically add `spacy ray` commands to your spaCy CLI.",
+            "github": "explosion/spacy-ray",
+            "pip": "spacy-ray",
+            "category": ["training"],
+            "author": "Explosion / Anyscale",
+            "thumb": "https://i.imgur.com/7so6ZpS.png"
+        },
         {
             "id": "spacy-sentence-bert",
             "title": "spaCy - sentence-transformers",
@@ -2518,14 +2529,14 @@
             "description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.",
             "pip": "cov-bsv",
             "code_example": [
                 "import cov_bsv",
                 "",
                 "nlp = cov_bsv.load()",
                 "text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
                 "doc = nlp(text)",
                 "",
                 "print(doc.ents)",
                 "print(doc._.cov_classification)",
                 "cov_bsv.visualize_doc(doc)"
             ],
             "category": ["pipeline", "standalone", "biomedical", "scientific"],
             "tags": ["clinical", "epidemiology", "covid-19", "surveillance"],
@@ -108,7 +108,12 @@ function parseArgs(raw) {
             const isFlag = !args.length || (args[0].length > 1 && args[0].startsWith('-'))
             result[opt] = isFlag ? true : args.shift()
         } else {
-            const key = CLI_GROUPS.includes(opt) ? `${opt} ${args.shift()}` : opt
+            let key = opt
+            if (CLI_GROUPS.includes(opt)) {
+                if (args.length && !args[0].startsWith('-')) {
+                    key = `${opt} ${args.shift()}`
+                }
+            }
             result[key] = null
         }
     }