diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 664fc2aaf..cc22cbba6 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,4 +1,4 @@
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, Union
 from pathlib import Path
 from wasabi import msg
 import typer
@@ -46,12 +46,14 @@ def train_cli(
 
 
 def train(
-    config_path: Path,
-    output_path: Optional[Path] = None,
+    config_path: Union[str, Path],
+    output_path: Optional[Union[str, Path]] = None,
     *,
     use_gpu: int = -1,
     overrides: Dict[str, Any] = util.SimpleFrozenDict(),
 ):
+    config_path = util.ensure_path(config_path)
+    output_path = util.ensure_path(output_path)
     # Make sure all files and paths exists if they are needed
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 268ea0703..a4462af56 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -819,6 +819,29 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
 | overrides   | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
 | **CREATES** | The final trained pipeline and the best trained pipeline.                                                                                                                                  |
 
+### Calling the training function from Python {#train-function new="3.2"}
+
+The training CLI exposes a `train` helper function that lets you run the
+training just like `spacy train`. Usually it's easier to use the command line
+directly, but if you need to kick off training from code this is how to do it.
+
+> #### Example
+>
+> ```python
+> from spacy.cli.train import train
+>
+> train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
+>
+> ```
+
+| Name           | Description                                                                                                                    |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path`  | Path to the config to use for training. ~~Union[str, Path]~~                                                                   |
+| `output_path`  | Optional name of directory to save output model in. If not provided a model will not be saved. ~~Optional[Union[str, Path]]~~  |
+| _keyword-only_ |                                                                                                                                |
+| `use_gpu`      | Which GPU to use. Defaults to -1 for no GPU. ~~int~~                                                                           |
+| `overrides`    | Values to override config settings. ~~Dict[str, Any]~~                                                                         |
+
 ## pretrain {#pretrain new="2.1" tag="command,experimental"}
 
 Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index f6910bd5b..c78a1de03 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -826,17 +826,17 @@ from the specified model. Intended for use in `[initialize.before_init]`.
 > after_pipeline_creation = {"@callbacks":"spacy.models_with_nvtx_range.v1"}
 > ```
 
-Recursively wrap the models in each pipe using [NVTX](https://nvidia.github.io/NVTX/)
-range markers. These markers aid in GPU profiling by attributing specific operations
-to a ~~Model~~'s forward or backprop passes.
+Recursively wrap the models in each pipe using
+[NVTX](https://nvidia.github.io/NVTX/) range markers. These markers aid in GPU
+profiling by attributing specific operations to a ~~Model~~'s forward or
+backprop passes.
 
 | Name             | Description                                                                                                                   |
-|------------------|-------------------------------------------------------------------------------------------------------------------------------|
+| ---------------- | ----------------------------------------------------------------------------------------------------------------------------- |
 | `forward_color`  | Color identifier for forward passes. Defaults to `-1`. ~~int~~                                                                |
 | `backprop_color` | Color identifier for backpropagation passes. Defaults to `-1`. ~~int~~                                                        |
 | **CREATES**      | A function that takes the current `nlp` and wraps forward/backprop passes in NVTX ranges. ~~Callable[[Language], Language]~~  |
-
 
 ## Training data and alignment {#gold source="spacy/training"}
 
 ### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 94fdad209..bd5ea7751 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -301,8 +301,6 @@ fly without having to save to and load from disk.
 $ python -m spacy init config - --lang en --pipeline ner,textcat --optimize accuracy | python -m spacy train - --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy
 ```
 
-
-
 ### Using variable interpolation {#config-interpolation}
 
 Another very useful feature of the config system is that it supports variable
@@ -1647,7 +1645,7 @@ workers are stuck waiting for it to complete before they can continue.
 
 ## Internal training API {#api}
 
-<Infobox variant="warning">
+<Infobox variant="danger">
 
 spaCy gives you full control over the training loop. However, for most use
 cases, it's recommended to train your pipelines via the
@@ -1659,6 +1657,32 @@ typically give you everything you need to train fully custom pipelines with
 
 </Infobox>
 
+### Training from a Python script {#api-train new="3.2"}
+
+If you want to run the training from a Python script instead of using the
+[`spacy train`](/api/cli#train) CLI command, you can call into the
+[`train`](/api/cli#train-function) helper function directly. It takes the path
+to the config file, an optional output directory and an optional dictionary of
+[config overrides](#config-overrides).
+
+```python
+from spacy.cli.train import train
+
+train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
+```
+
+### Internal training loop API {#api-loop}
+
+<Infobox>
+
+This section documents how the training loop and updates to the `nlp` object
+work internally. You typically shouldn't have to implement this in Python unless
+you're writing your own trainable components. To train a pipeline, use
+[`spacy train`](/api/cli#train) or the [`train`](/api/cli#train-function) helper
+function instead.
+
+</Infobox>
+
 The [`Example`](/api/example) object contains annotated training data, also
 called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object
 that will hold the predictions, and another `Doc` object that holds the
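
Below is a quick usage sketch of the patched helper, for reviewers who want to try the change end to end. It is not part of the diff: the config and corpus paths are placeholders, and it assumes a spaCy install that includes this patch.

```python
from spacy.cli.train import train

# After this patch, plain strings are accepted wherever a Path was required
# before: train() normalizes both path arguments via util.ensure_path().
train(
    "./config.cfg",   # str or pathlib.Path both work now
    "./output",       # optional; omit it and no model is saved
    use_gpu=-1,       # default: -1 trains on CPU
    overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"},
)
```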
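
The two new `util.ensure_path` calls are what make the `Union[str, Path]` annotations safe. For reviewers without the spaCy source at hand, the helper behaves roughly like the sketch below, which is an assumption based on how it's used in this diff: strings become `Path` objects, and any other value (including the `None` default for `output_path`) passes through unchanged.

```python
from pathlib import Path

def ensure_path(path):
    # Rough sketch of spacy.util.ensure_path: convert a str to a Path and
    # return any other value (Path, None, ...) unchanged.
    if isinstance(path, str):
        return Path(path)
    return path
```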