mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Merge remote-tracking branch 'upstream/develop' into feature/update-more-docs
# Conflicts: # website/docs/api/data-formats.md
This commit is contained in:
		
						commit
						f728c00cbb
					
				| 
						 | 
				
			
			@ -5,5 +5,5 @@ include README.md
 | 
			
		|||
include pyproject.toml
 | 
			
		||||
recursive-exclude spacy/lang *.json
 | 
			
		||||
recursive-include spacy/lang *.json.gz
 | 
			
		||||
recursive-include spacy/cli *.json
 | 
			
		||||
recursive-include spacy/cli *.json *.yml
 | 
			
		||||
recursive-include licenses *
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -6,7 +6,7 @@ requires = [
 | 
			
		|||
    "cymem>=2.0.2,<2.1.0",
 | 
			
		||||
    "preshed>=3.0.2,<3.1.0",
 | 
			
		||||
    "murmurhash>=0.28.0,<1.1.0",
 | 
			
		||||
    "thinc>=8.0.0a27,<8.0.0a30",
 | 
			
		||||
    "thinc>=8.0.0a28,<8.0.0a30",
 | 
			
		||||
    "blis>=0.4.0,<0.5.0",
 | 
			
		||||
    "pytokenizations",
 | 
			
		||||
    "smart_open>=2.0.0,<3.0.0"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,7 +1,7 @@
 | 
			
		|||
# Our libraries
 | 
			
		||||
cymem>=2.0.2,<2.1.0
 | 
			
		||||
preshed>=3.0.2,<3.1.0
 | 
			
		||||
thinc>=8.0.0a27,<8.0.0a30
 | 
			
		||||
thinc>=8.0.0a28,<8.0.0a30
 | 
			
		||||
blis>=0.4.0,<0.5.0
 | 
			
		||||
ml_datasets>=0.1.1
 | 
			
		||||
murmurhash>=0.28.0,<1.1.0
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -34,13 +34,13 @@ setup_requires =
 | 
			
		|||
    cymem>=2.0.2,<2.1.0
 | 
			
		||||
    preshed>=3.0.2,<3.1.0
 | 
			
		||||
    murmurhash>=0.28.0,<1.1.0
 | 
			
		||||
    thinc>=8.0.0a27,<8.0.0a30
 | 
			
		||||
    thinc>=8.0.0a28,<8.0.0a30
 | 
			
		||||
install_requires =
 | 
			
		||||
    # Our libraries
 | 
			
		||||
    murmurhash>=0.28.0,<1.1.0
 | 
			
		||||
    cymem>=2.0.2,<2.1.0
 | 
			
		||||
    preshed>=3.0.2,<3.1.0
 | 
			
		||||
    thinc>=8.0.0a27,<8.0.0a30
 | 
			
		||||
    thinc>=8.0.0a28,<8.0.0a30
 | 
			
		||||
    blis>=0.4.0,<0.5.0
 | 
			
		||||
    wasabi>=0.7.1,<1.1.0
 | 
			
		||||
    srsly>=2.1.0,<3.0.0
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -105,7 +105,7 @@ factory = "tok2vec"
 | 
			
		|||
 | 
			
		||||
[components.tok2vec.model.embed]
 | 
			
		||||
@architectures = "spacy.MultiHashEmbed.v1"
 | 
			
		||||
width = ${components.tok2vec.model.encode:width}
 | 
			
		||||
width = ${components.tok2vec.model.encode.width}
 | 
			
		||||
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
 | 
			
		||||
also_embed_subwords = {{ true if has_letters else false }}
 | 
			
		||||
also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
 | 
			
		||||
| 
						 | 
				
			
			@ -127,7 +127,7 @@ nO = null
 | 
			
		|||
 | 
			
		||||
[components.tagger.model.tok2vec]
 | 
			
		||||
@architectures = "spacy.Tok2VecListener.v1"
 | 
			
		||||
width = ${components.tok2vec.model.encode:width}
 | 
			
		||||
width = ${components.tok2vec.model.encode.width}
 | 
			
		||||
{%- endif %}
 | 
			
		||||
 | 
			
		||||
{% if "parser" in components -%}
 | 
			
		||||
| 
						 | 
				
			
			@ -144,7 +144,7 @@ nO = null
 | 
			
		|||
 | 
			
		||||
[components.parser.model.tok2vec]
 | 
			
		||||
@architectures = "spacy.Tok2VecListener.v1"
 | 
			
		||||
width = ${components.tok2vec.model.encode:width}
 | 
			
		||||
width = ${components.tok2vec.model.encode.width}
 | 
			
		||||
{%- endif %}
 | 
			
		||||
 | 
			
		||||
{% if "ner" in components %}
 | 
			
		||||
| 
						 | 
				
			
			@ -161,7 +161,7 @@ nO = null
 | 
			
		|||
 | 
			
		||||
[components.ner.model.tok2vec]
 | 
			
		||||
@architectures = "spacy.Tok2VecListener.v1"
 | 
			
		||||
width = ${components.tok2vec.model.encode:width}
 | 
			
		||||
width = ${components.tok2vec.model.encode.width}
 | 
			
		||||
{% endif %}
 | 
			
		||||
{% endif %}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -194,12 +194,12 @@ initial_rate = 5e-5
 | 
			
		|||
 | 
			
		||||
[training.train_corpus]
 | 
			
		||||
@readers = "spacy.Corpus.v1"
 | 
			
		||||
path = ${paths:train}
 | 
			
		||||
path = ${paths.train}
 | 
			
		||||
max_length = {{ 500 if hardware == "gpu" else 0 }}
 | 
			
		||||
 | 
			
		||||
[training.dev_corpus]
 | 
			
		||||
@readers = "spacy.Corpus.v1"
 | 
			
		||||
path = ${paths:dev}
 | 
			
		||||
path = ${paths.dev}
 | 
			
		||||
max_length = 0
 | 
			
		||||
 | 
			
		||||
{% if use_transformer %}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -23,12 +23,12 @@ after_pipeline_creation = null
 | 
			
		|||
 | 
			
		||||
# Training hyper-parameters and additional features.
 | 
			
		||||
[training]
 | 
			
		||||
seed = ${system:seed}
 | 
			
		||||
seed = ${system.seed}
 | 
			
		||||
dropout = 0.1
 | 
			
		||||
accumulate_gradient = 1
 | 
			
		||||
# Extra resources for transfer-learning or pseudo-rehearsal
 | 
			
		||||
init_tok2vec = ${paths:init_tok2vec}
 | 
			
		||||
raw_text = ${paths:raw}
 | 
			
		||||
init_tok2vec = ${paths.init_tok2vec}
 | 
			
		||||
raw_text = ${paths.raw}
 | 
			
		||||
vectors = null
 | 
			
		||||
# Controls early-stopping. 0 or -1 mean unlimited.
 | 
			
		||||
patience = 1600
 | 
			
		||||
| 
						 | 
				
			
			@ -42,7 +42,7 @@ frozen_components = []
 | 
			
		|||
 | 
			
		||||
[training.train_corpus]
 | 
			
		||||
@readers = "spacy.Corpus.v1"
 | 
			
		||||
path = ${paths:train}
 | 
			
		||||
path = ${paths.train}
 | 
			
		||||
# Whether to train on sequences with 'gold standard' sentence boundaries
 | 
			
		||||
# and tokens. If you set this to true, take care to ensure your run-time
 | 
			
		||||
# data is passed in sentence-by-sentence via some prior preprocessing.
 | 
			
		||||
| 
						 | 
				
			
			@ -54,7 +54,7 @@ limit = 0
 | 
			
		|||
 | 
			
		||||
[training.dev_corpus]
 | 
			
		||||
@readers = "spacy.Corpus.v1"
 | 
			
		||||
path = ${paths:dev}
 | 
			
		||||
path = ${paths.dev}
 | 
			
		||||
# Whether to train on sequences with 'gold standard' sentence boundaries
 | 
			
		||||
# and tokens. If you set this to true, take care to ensure your run-time
 | 
			
		||||
# data is passed in sentence-by-sentence via some prior preprocessing.
 | 
			
		||||
| 
						 | 
				
			
			@ -98,8 +98,8 @@ max_length = 500
 | 
			
		|||
dropout = 0.2
 | 
			
		||||
n_save_every = null
 | 
			
		||||
batch_size = 3000
 | 
			
		||||
seed = ${system:seed}
 | 
			
		||||
use_pytorch_for_gpu_memory = ${system:use_pytorch_for_gpu_memory}
 | 
			
		||||
seed = ${system.seed}
 | 
			
		||||
use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory}
 | 
			
		||||
tok2vec_model = "components.tok2vec.model"
 | 
			
		||||
 | 
			
		||||
[pretraining.objective]
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -20,11 +20,11 @@ dev = ""
 | 
			
		|||
 | 
			
		||||
[training.train_corpus]
 | 
			
		||||
@readers = "spacy.Corpus.v1"
 | 
			
		||||
path = ${paths:train}
 | 
			
		||||
path = ${paths.train}
 | 
			
		||||
 | 
			
		||||
[training.dev_corpus]
 | 
			
		||||
@readers = "spacy.Corpus.v1"
 | 
			
		||||
path = ${paths:dev}
 | 
			
		||||
path = ${paths.dev}
 | 
			
		||||
 | 
			
		||||
[training.batcher]
 | 
			
		||||
@batchers = "batch_by_words.v1"
 | 
			
		||||
| 
						 | 
				
			
			@ -57,7 +57,7 @@ factory = "tagger"
 | 
			
		|||
 | 
			
		||||
[components.tagger.model.tok2vec]
 | 
			
		||||
@architectures = "spacy.Tok2VecListener.v1"
 | 
			
		||||
width = ${components.tok2vec.model:width}
 | 
			
		||||
width = ${components.tok2vec.model.width}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -284,13 +284,13 @@ def test_config_overrides():
 | 
			
		|||
 | 
			
		||||
def test_config_interpolation():
 | 
			
		||||
    config = Config().from_str(nlp_config_string, interpolate=False)
 | 
			
		||||
    assert config["training"]["train_corpus"]["path"] == "${paths:train}"
 | 
			
		||||
    assert config["training"]["train_corpus"]["path"] == "${paths.train}"
 | 
			
		||||
    interpolated = config.interpolate()
 | 
			
		||||
    assert interpolated["training"]["train_corpus"]["path"] == ""
 | 
			
		||||
    nlp = English.from_config(config)
 | 
			
		||||
    assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}"
 | 
			
		||||
    assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}"
 | 
			
		||||
    # Ensure that variables are preserved in nlp config
 | 
			
		||||
    width = "${components.tok2vec.model:width}"
 | 
			
		||||
    width = "${components.tok2vec.model.width}"
 | 
			
		||||
    assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
 | 
			
		||||
    assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
 | 
			
		||||
    interpolated2 = nlp.config.interpolate()
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -94,7 +94,7 @@ blog post for background.
 | 
			
		|||
>
 | 
			
		||||
> [components.tagger.model.tok2vec]
 | 
			
		||||
> @architectures = "spacy.Tok2VecListener.v1"
 | 
			
		||||
> width = ${components.tok2vec.model:width}
 | 
			
		||||
> width = ${components.tok2vec.model.width}
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
A listener is used as a sublayer within a component such as a
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -28,7 +28,7 @@ streaming.
 | 
			
		|||
>
 | 
			
		||||
> [training.train_corpus]
 | 
			
		||||
> @readers = "spacy.Corpus.v1"
 | 
			
		||||
> path = ${paths:train}
 | 
			
		||||
> path = ${paths.train}
 | 
			
		||||
> gold_preproc = false
 | 
			
		||||
> max_length = 0
 | 
			
		||||
> limit = 0
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -111,7 +111,7 @@ model to copy components from). See the docs on
 | 
			
		|||
### paths, system {#config-variables tag="variables"}
 | 
			
		||||
 | 
			
		||||
These sections define variables that can be referenced across the other sections
 | 
			
		||||
as variables. For example `${paths:train}` uses the value of `train` defined in
 | 
			
		||||
as variables. For example `${paths.train}` uses the value of `train` defined in
 | 
			
		||||
the block `[paths]`. If your config includes custom registered functions that
 | 
			
		||||
need paths, you can define them here. All config values can also be
 | 
			
		||||
[overwritten](/usage/training#config-overrides) on the CLI when you run
 | 
			
		||||
| 
						 | 
				
			
			@ -131,11 +131,11 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | 
			
		|||
 | 
			
		||||
| Name                  | Description                                                                                                                                                                                                  |
 | 
			
		||||
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
			
		||||
| `seed`                | The random seed. Defaults to variable `${system:seed}`. ~~int~~                                                                                                                                              |
 | 
			
		||||
| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                              |
 | 
			
		||||
| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                               |
 | 
			
		||||
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                       |
 | 
			
		||||
| `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths:init_tok2vec}`. ~~Optional[str]~~                                              |
 | 
			
		||||
| `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsel](/api/language#rehearse) step. Defaults to variable `${paths:raw}`. ~~Optional[str]~~                                            |
 | 
			
		||||
| `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                              |
 | 
			
		||||
| `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~                                          |
 | 
			
		||||
| `vectors`             | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~                                          |
 | 
			
		||||
| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~                                                                                                              |
 | 
			
		||||
| `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                              |
 | 
			
		||||
| 
						 | 
				
			
			@ -162,8 +162,8 @@ run [`spacy pretrain`](/api/cli#pretrain).
 | 
			
		|||
| `dropout`                    | The dropout rate. Defaults to `0.2`. ~~float~~                                                                                  |
 | 
			
		||||
| `n_save_every`               | Saving frequency. Defaults to `null`. ~~Optional[int]~~                                                                         |
 | 
			
		||||
| `batch_size`                 | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ |
 | 
			
		||||
| `seed`                       | The random seed. Defaults to variable `${system:seed}`. ~~int~~                                                                 |
 | 
			
		||||
| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system:use_pytorch_for_gpu_memory}`. ~~bool~~                              |
 | 
			
		||||
| `seed`                       | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                 |
 | 
			
		||||
| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system.use_pytorch_for_gpu_memory}`. ~~bool~~                              |
 | 
			
		||||
| `tok2vec_model`              | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~                   |
 | 
			
		||||
| `objective`                  | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~                          |
 | 
			
		||||
| `optimizer`                  | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                   |
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -341,7 +341,7 @@ See the [`Transformer`](/api/transformer) API reference and
 | 
			
		|||
 | 
			
		||||
## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
 | 
			
		||||
 | 
			
		||||
A batcher implements a batching strategy that essentially turns a stream of
 | 
			
		||||
A data batcher implements a batching strategy that essentially turns a stream of
 | 
			
		||||
items into a stream of batches, with each batch consisting of one item or a list
 | 
			
		||||
of items. During training, the models update their weights after processing one
 | 
			
		||||
batch at a time. Typical batching strategies include presenting the training
 | 
			
		||||
| 
						 | 
				
			
			@ -613,7 +613,7 @@ components are created, as well as all training settings and hyperparameters.
 | 
			
		|||
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `path`        | Path to the model's `config.cfg`. ~~Union[str, Path]~~                                                                                                                      |
 | 
			
		||||
| `overrides`   | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ |
 | 
			
		||||
| `interpolate` | Whether to interpolate the config and replace variables like `${paths:train}` with their values. Defaults to `False`. ~~bool~~                                              |
 | 
			
		||||
| `interpolate` | Whether to interpolate the config and replace variables like `${paths.train}` with their values. Defaults to `False`. ~~bool~~                                              |
 | 
			
		||||
| **RETURNS**   | The model's config. ~~Config~~                                                                                                                                              |
 | 
			
		||||
 | 
			
		||||
### util.load_meta {#util.load_meta tag="function" new="3"}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -157,8 +157,8 @@ sections of a config file are:
 | 
			
		|||
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `nlp`         | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names.                                           |
 | 
			
		||||
| `components`  | Definitions of the [pipeline components](/usage/processing-pipelines) and their models.                                                                         |
 | 
			
		||||
| `paths`       | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths:train}`, and can be [overwritten](#config-overrides) on the CLI.          |
 | 
			
		||||
| `system`      | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system:seed}`, and can be [overwritten](#config-overrides) on the CLI. |
 | 
			
		||||
| `paths`       | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI.          |
 | 
			
		||||
| `system`      | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
 | 
			
		||||
| `training`    | Settings and controls for the training and evaluation process.                                                                                                  |
 | 
			
		||||
| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining).                                                                              |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -325,19 +325,9 @@ compound = 1.001
 | 
			
		|||
Another very useful feature of the config system is that it supports variable
 | 
			
		||||
interpolation for both **values and sections**. This means that you only need to
 | 
			
		||||
define a setting once and can reference it across your config using the
 | 
			
		||||
`${section:value}` or `${section.block}` syntax. In this example, the value of
 | 
			
		||||
`seed` is reused within the `[training]` block, and the whole block of
 | 
			
		||||
`[training.optimizer]` is reused in `[pretraining]` and will become
 | 
			
		||||
`pretraining.optimizer`.
 | 
			
		||||
 | 
			
		||||
> #### Note on syntax
 | 
			
		||||
>
 | 
			
		||||
> There are two different ways to format your variables, depending on whether
 | 
			
		||||
> you want to reference a single value or a block. Values are specified after a
 | 
			
		||||
> `:`, while blocks are specified with a `.`:
 | 
			
		||||
>
 | 
			
		||||
> 1. `${section:value}`, `${section.subsection:value}`
 | 
			
		||||
> 2. `${section.block}`, `${section.subsection.block}`
 | 
			
		||||
`${section.value}` syntax. In this example, the value of `seed` is reused within
 | 
			
		||||
the `[training]` block, and the whole block of `[training.optimizer]` is reused
 | 
			
		||||
in `[pretraining]` and will become `pretraining.optimizer`.
 | 
			
		||||
 | 
			
		||||
```ini
 | 
			
		||||
### config.cfg (excerpt) {highlight="5,18"}
 | 
			
		||||
| 
						 | 
				
			
			@ -345,7 +335,7 @@ define a setting once and can reference it across your config using the
 | 
			
		|||
seed = 0
 | 
			
		||||
 | 
			
		||||
[training]
 | 
			
		||||
seed = ${system:seed}
 | 
			
		||||
seed = ${system.seed}
 | 
			
		||||
 | 
			
		||||
[training.optimizer]
 | 
			
		||||
@optimizers = "Adam.v1"
 | 
			
		||||
| 
						 | 
				
			
			@ -369,7 +359,7 @@ to a string.
 | 
			
		|||
[paths]
 | 
			
		||||
version = 5
 | 
			
		||||
root = "/Users/you/data"
 | 
			
		||||
train = "${paths:root}/train_${paths:version}.spacy"
 | 
			
		||||
train = "${paths.root}/train_${paths.version}.spacy"
 | 
			
		||||
# Result: /Users/you/data/train_5.spacy
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -484,20 +474,21 @@ still look good.
 | 
			
		|||
## Custom Functions {#custom-functions}
 | 
			
		||||
 | 
			
		||||
Registered functions in the training config files can refer to built-in
 | 
			
		||||
implementations, but you can also plug in fully custom implementations. To do
 | 
			
		||||
so, you first write your own implementation of a custom architectures, data
 | 
			
		||||
reader or any other functionality, and then register this function with the
 | 
			
		||||
correct [registry](/api/top-level#registry). This allows you to plug in models
 | 
			
		||||
defined in PyTorch or Tensorflow, make custom modifications to the `nlp` object,
 | 
			
		||||
create custom optimizers or schedules, or write a function that streams in data
 | 
			
		||||
and preprocesses it on the fly while training.
 | 
			
		||||
implementations, but you can also plug in fully **custom implementations**. All
 | 
			
		||||
you need to do is register your function using the `@spacy.registry` decorator
 | 
			
		||||
with the name of the respective [registry](/api/top-level#registry), e.g.
 | 
			
		||||
`@spacy.registry.architectures`, and a string name to assign to your function.
 | 
			
		||||
Registering custom functions allows you to **plug in models** defined in PyTorch
 | 
			
		||||
or TensorFlow, make **custom modifications** to the `nlp` object, create custom
 | 
			
		||||
optimizers or schedules, or **stream in data** and preprocess it on the fly
 | 
			
		||||
while training.
 | 
			
		||||
 | 
			
		||||
Each custom function can have any numbers of arguments that should be passed
 | 
			
		||||
into them through the config similar as with the built-in functions. If your
 | 
			
		||||
function defines **default argument values**, spaCy is able to auto-fill your
 | 
			
		||||
config when you run [`init fill-config`](/api/cli#init-fill-config). If you want
 | 
			
		||||
to make sure that a given parameter is always explicitely set in the config,
 | 
			
		||||
avoid setting a default value for it.
 | 
			
		||||
Each custom function can have any number of arguments that are passed in via
 | 
			
		||||
the [config](#config), just like the built-in functions. If your function defines
 | 
			
		||||
**default argument values**, spaCy is able to auto-fill your config when you run
 | 
			
		||||
[`init fill-config`](/api/cli#init-fill-config). If you want to make sure that a
 | 
			
		||||
given parameter is always explicitly set in the config, avoid setting a default
 | 
			
		||||
value for it.
 | 
			
		||||
 | 
			
		||||
<!-- TODO: possibly link to new (not yet created) page on creating models ? -->
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user