Update docs [ci skip]

Ines Montani 2020-09-13 22:30:33 +02:00
parent 2e3d067a7b
commit 47acb45850
7 changed files with 225 additions and 18 deletions


@@ -1141,9 +1141,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
The `spacy ray` CLI includes commands for parallel and distributed computing via
[Ray](https://ray.io).
<!-- TODO: add links to parallel training docs and project template -->
<Infobox variant="warning" title="Important note">
<Infobox variant="warning">
To use this command, you need the
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
@@ -1155,10 +1153,13 @@ CLI.
### ray train {#ray-train tag="command"}
Train a spaCy pipeline using [Ray](https://ray.io) for parallel training. The
command works just like [`spacy train`](/api/cli#train).
command works just like [`spacy train`](/api/cli#train). For more details and
examples, see the usage guide on
[parallel training](/usage/training#parallel-training) and the spaCy project
[integration](/usage/projects#ray).
```cli
$ python -m spacy ray train [config_path] [--code-path] [--strategy] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
$ python -m spacy ray train [config_path] [--code-path] [--output] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
```
> #### Example
@@ -1171,8 +1172,9 @@ $ python -m spacy ray train [config_path] [--code-path] [--strategy] [--n-worker
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--output`, `-o`   | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(option)~~ |
| `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ |
| `--address`, `-a` | Optional address of the Ray cluster. Defaults to `None`. ~~Optional[str] \(option)~~ |
| `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--verbose`, `-V` | Display more information for debugging purposes. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |

File diff suppressed because one or more lines are too long



@@ -815,7 +815,7 @@ full embedded visualizer, as well as individual components.
> #### Installation
>
> ```bash
> $ pip install "spacy_streamlit>=1.0.0a0"
> $ pip install "spacy-streamlit>=1.0.0a0"
> ```
![](../images/spacy-streamlit.png)
@@ -913,7 +913,39 @@ https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main.
<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
</Infobox>
<!-- TODO: document -->
> #### Installation
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray for parallel and distributed
training with spaCy via our lightweight
[`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. If the
package is installed in the same environment as spaCy, it will automatically add
[`spacy ray`](/api/cli#ray) commands to your spaCy CLI.
You can integrate [`spacy ray train`](/api/cli#ray-train) into your
`project.yml` just like the regular training command:
<!-- prettier-ignore -->
```yaml
### project.yml
- name: "ray"
  help: "Train a model via parallel training with Ray"
  script:
    - "python -m spacy ray train configs/config.cfg --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy"
  deps:
    - "corpus/train.spacy"
    - "corpus/dev.spacy"
```
<!-- TODO: <Project id="integrations/ray">
</Project> -->
---


@@ -7,7 +7,7 @@ menu:
- ['Quickstart', 'quickstart']
- ['Config System', 'config']
- ['Custom Functions', 'custom-functions']
# - ['Parallel Training', 'parallel-training']
- ['Parallel Training', 'parallel-training']
- ['Internal API', 'api']
---
@@ -832,6 +832,73 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
return create_model(output_width)
```
## Parallel & distributed training with Ray {#parallel-training}
> #### Installation
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. Parallel
training won't always be faster, though: it depends on your batch size, models,
and hardware.
<Infobox variant="warning">
To use Ray with spaCy, you need the
[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
Installing the package will automatically add the `ray` command to the spaCy
CLI.
</Infobox>
The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
setup. You can optionally set the `--address` option to point to your Ray
cluster. If it's not set, Ray will run locally.
```cli
$ python -m spacy ray train config.cfg --n-workers 2
```
<!-- TODO: <Project id="integrations/ray">
</Project> -->
### How parallel training works {#parallel-training-details}
Each worker receives a shard of the **data** and builds a copy of the **model
and optimizer** from the [`config.cfg`](#config). It also has a communication
channel to **pass gradients and parameters** to the other workers. Additionally,
each worker is given ownership of a subset of the parameter arrays. Every
parameter array is owned by exactly one worker, and the workers are given a
mapping so they know which worker owns which parameter.
![Illustration of setup](../images/spacy-ray.svg)
As training proceeds, every worker will be computing gradients for **all** of
the model parameters. When they compute gradients for parameters they don't own,
they'll **send them to the worker** that does own that parameter, along with a
version identifier so that the owner can decide whether to discard the
gradient. Workers use the gradients they receive and the ones they compute
locally to update the parameters they own, and then broadcast the updated array
and a new version ID to the other workers.
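
The following is a minimal sketch of that versioning scheme, using plain NumPy
arrays instead of real model parameters. The parameter names, the `owner`
mapping and the `receive_gradient` helper are illustrative only, not the actual
`spacy-ray` internals.

```python
import numpy as np

# Toy setup: two named parameter arrays, each owned by exactly one worker.
params = {"embed": np.zeros(4), "output": np.zeros(4)}
versions = {name: 0 for name in params}  # current version of each array
owner = {"embed": 0, "output": 1}        # which worker owns which array

def receive_gradient(my_id, name, grad, version, learn_rate=0.1):
    """What the owning worker does with a gradient pushed by another worker."""
    assert my_id == owner[name], "gradients are pushed to the owning worker"
    if version != versions[name]:
        return  # stale: computed against an old version of the array, discard
    params[name] -= learn_rate * grad  # update the owned parameter array
    versions[name] += 1                # bump the version ...
    # ... and broadcast params[name] and versions[name] to the other workers

# Worker 1 computed a gradient for "embed", which worker 0 owns, and pushes it
# together with the version it was computed against:
receive_gradient(my_id=0, name="embed", grad=np.ones(4), version=0)
```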
This training procedure is **asynchronous** and **non-blocking**. Workers always
push their gradient increments and parameter updates; they do not have to pull
them and block on the result, so the transfers can happen in the background,
overlapped with the actual training work. The workers also do not have to stop
and wait for each other ("synchronize") at the start of each batch. This is very
useful for spaCy, because spaCy is often trained on long documents, which means
**batches can vary in size** significantly. Uneven workloads make synchronous
gradient descent inefficient, because if one batch is slow, all of the other
workers are stuck waiting for it to complete before they can continue.
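
As a rough analogy (using plain Python threads rather than Ray), the
non-blocking push can be pictured as an outbox queue that a background sender
drains while the training loop keeps going:

```python
import queue
import threading
import time

# Rough analogy only: a background thread delivers updates while the training
# loop keeps computing, so transfers overlap with the actual training work.
outbox: queue.Queue = queue.Queue()

def sender_loop() -> None:
    while True:
        msg = outbox.get()
        if msg is None:   # sentinel: stop the sender
            break
        time.sleep(0.01)  # stand-in for shipping the update over the network

threading.Thread(target=sender_loop, daemon=True).start()

for batch_id in range(5):
    # ... compute gradients for this batch (omitted) ...
    outbox.put(("gradient-update", batch_id))  # push and move on, never wait

outbox.put(None)  # shut down the sender; the training loop never blocked
```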
## Internal training API {#api}
<Infobox variant="warning">


@@ -34,6 +34,7 @@ to clone and adapt best-practice projects for your own use cases.
- [Training & config system](#features-training)
- [Custom models](#features-custom-models)
- [End-to-end project workflows](#features-projects)
- [Parallel training with Ray](#features-parallel-training)
- [New built-in components](#features-pipeline-components)
- [New custom component API](#features-components)
- [Dependency matching](#features-dep-matcher)
@@ -223,6 +224,39 @@ workflows, from data preprocessing to training and packaging your pipeline.
</Infobox>
### Parallel and distributed training with Ray {#features-parallel-training}
> #### Example
>
> ```cli
> $ pip install spacy-ray
> # Check that the CLI is registered
> $ python -m spacy ray --help
> # Train a pipeline
> $ python -m spacy ray train config.cfg --n-workers 2
> ```
[Ray](https://ray.io/) is a fast and simple framework for building and running
**distributed applications**. You can use Ray to train spaCy on one or more
remote machines, potentially speeding up your training process. The Ray
integration is powered by a lightweight extension package,
[`spacy-ray`](https://github.com/explosion/spacy-ray), that automatically adds
the [`ray`](/api/cli#ray) command to your spaCy CLI if it's installed in the
same environment. You can then run [`spacy ray train`](/api/cli#ray-train) for
parallel training.
![Illustration of setup](../images/spacy-ray.svg)
<Infobox title="Details & Documentation" emoji="📖" list>
- **Usage:**
[Parallel and distributed training](/usage/training#parallel-training),
[spaCy Projects integration](/usage/projects#ray)
- **CLI:** [`ray`](/api/cli#ray), [`ray train`](/api/cli#ray-train)
- **Implementation:** [`spacy-ray`](https://github.com/explosion/spacy-ray)
</Infobox>
### New built-in pipeline components {#features-pipeline-components}
spaCy v3.0 includes several new trainable and rule-based components that you can
@@ -390,6 +424,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all pipeline packages installed in the environment. |
| [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). |
| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). |
| [`ray`](/api/cli#ray) | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package. |
### New and updated documentation {#new-docs}


@@ -1,5 +1,16 @@
{
  "resources": [
    {
      "id": "spacy-ray",
      "title": "spacy-ray",
      "slogan": "Parallel and distributed training with spaCy and Ray",
      "description": "[Ray](https://ray.io/) is a fast and simple framework for building and running **distributed applications**. This very lightweight extension package lets you use Ray for parallel and distributed training with spaCy. If `spacy-ray` is installed in the same environment as spaCy, it will automatically add `spacy ray` commands to your spaCy CLI.",
      "github": "explosion/spacy-ray",
      "pip": "spacy-ray",
      "category": ["training"],
      "author": "Explosion / Anyscale",
      "thumb": "https://i.imgur.com/7so6ZpS.png"
    },
    {
      "id": "spacy-sentence-bert",
      "title": "spaCy - sentence-transformers",
@@ -2518,14 +2529,14 @@
      "description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.",
      "pip": "cov-bsv",
      "code_example": [
"import cov_bsv",
"",
"nlp = cov_bsv.load()",
"text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
"",
"print(doc.ents)",
"print(doc._.cov_classification)",
"cov_bsv.visualize_doc(doc)"
"import cov_bsv",
"",
"nlp = cov_bsv.load()",
"text = 'Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected'",
"",
"print(doc.ents)",
"print(doc._.cov_classification)",
"cov_bsv.visualize_doc(doc)"
      ],
      "category": ["pipeline", "standalone", "biomedical", "scientific"],
      "tags": ["clinical", "epidemiology", "covid-19", "surveillance"],


@@ -108,7 +108,12 @@ function parseArgs(raw) {
      const isFlag = !args.length || (args[0].length > 1 && args[0].startsWith('-'))
      result[opt] = isFlag ? true : args.shift()
    } else {
      const key = CLI_GROUPS.includes(opt) ? `${opt} ${args.shift()}` : opt
      let key = opt
      if (CLI_GROUPS.includes(opt)) {
        if (args.length && !args[0].startsWith('-')) {
          key = `${opt} ${args.shift()}`
        }
      }
      result[key] = null
    }
  }