Merge remote-tracking branch 'upstream/develop' into feature/update-more-docs

# Conflicts:
#	website/docs/api/data-formats.md
svlandeg 2020-08-20 10:02:13 +02:00
commit f728c00cbb
12 changed files with 55 additions and 64 deletions

View File

@@ -5,5 +5,5 @@ include README.md
 include pyproject.toml
 recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
-recursive-include spacy/cli *.json
+recursive-include spacy/cli *.json *.yml
 recursive-include licenses *

View File

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a27,<8.0.0a30",
+    "thinc>=8.0.0a28,<8.0.0a30",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "smart_open>=2.0.0,<3.0.0"

View File

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a27,<8.0.0a30
+thinc>=8.0.0a28,<8.0.0a30
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0

View File

@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a27,<8.0.0a30
+    thinc>=8.0.0a28,<8.0.0a30
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a27,<8.0.0a30
+    thinc>=8.0.0a28,<8.0.0a30
     blis>=0.4.0,<0.5.0
     wasabi>=0.7.1,<1.1.0
     srsly>=2.1.0,<3.0.0

View File

@@ -105,7 +105,7 @@ factory = "tok2vec"

 [components.tok2vec.model.embed]
 @architectures = "spacy.MultiHashEmbed.v1"
-width = ${components.tok2vec.model.encode:width}
+width = ${components.tok2vec.model.encode.width}
 rows = {{ 2000 if optimize == "efficiency" else 7000 }}
 also_embed_subwords = {{ true if has_letters else false }}
 also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
@@ -127,7 +127,7 @@ nO = null

 [components.tagger.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
+width = ${components.tok2vec.model.encode.width}
 {%- endif %}

 {% if "parser" in components -%}
@@ -144,7 +144,7 @@ nO = null

 [components.parser.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
+width = ${components.tok2vec.model.encode.width}
 {%- endif %}

 {% if "ner" in components %}
@@ -161,7 +161,7 @@ nO = null

 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
+width = ${components.tok2vec.model.encode.width}
 {% endif %}
 {% endif %}
@@ -194,12 +194,12 @@ initial_rate = 5e-5

 [training.train_corpus]
 @readers = "spacy.Corpus.v1"
-path = ${paths:train}
+path = ${paths.train}
 max_length = {{ 500 if hardware == "gpu" else 0 }}

 [training.dev_corpus]
 @readers = "spacy.Corpus.v1"
-path = ${paths:dev}
+path = ${paths.dev}
 max_length = 0

 {% if use_transformer %}

View File

@@ -23,12 +23,12 @@ after_pipeline_creation = null

 # Training hyper-parameters and additional features.
 [training]
-seed = ${system:seed}
+seed = ${system.seed}
 dropout = 0.1
 accumulate_gradient = 1
 # Extra resources for transfer-learning or pseudo-rehearsal
-init_tok2vec = ${paths:init_tok2vec}
-raw_text = ${paths:raw}
+init_tok2vec = ${paths.init_tok2vec}
+raw_text = ${paths.raw}
 vectors = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
@@ -42,7 +42,7 @@ frozen_components = []

 [training.train_corpus]
 @readers = "spacy.Corpus.v1"
-path = ${paths:train}
+path = ${paths.train}
 # Whether to train on sequences with 'gold standard' sentence boundaries
 # and tokens. If you set this to true, take care to ensure your run-time
 # data is passed in sentence-by-sentence via some prior preprocessing.
@@ -54,7 +54,7 @@ limit = 0

 [training.dev_corpus]
 @readers = "spacy.Corpus.v1"
-path = ${paths:dev}
+path = ${paths.dev}
 # Whether to train on sequences with 'gold standard' sentence boundaries
 # and tokens. If you set this to true, take care to ensure your run-time
 # data is passed in sentence-by-sentence via some prior preprocessing.
@@ -98,8 +98,8 @@ max_length = 500
 dropout = 0.2
 n_save_every = null
 batch_size = 3000
-seed = ${system:seed}
-use_pytorch_for_gpu_memory = ${system:use_pytorch_for_gpu_memory}
+seed = ${system.seed}
+use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory}
 tok2vec_model = "components.tok2vec.model"

 [pretraining.objective]

View File

@@ -20,11 +20,11 @@ dev = ""

 [training.train_corpus]
 @readers = "spacy.Corpus.v1"
-path = ${paths:train}
+path = ${paths.train}

 [training.dev_corpus]
 @readers = "spacy.Corpus.v1"
-path = ${paths:dev}
+path = ${paths.dev}

 [training.batcher]
 @batchers = "batch_by_words.v1"
@@ -57,7 +57,7 @@ factory = "tagger"

 [components.tagger.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model:width}
+width = ${components.tok2vec.model.width}
 """
@@ -284,13 +284,13 @@ def test_config_overrides():
 def test_config_interpolation():
     config = Config().from_str(nlp_config_string, interpolate=False)
-    assert config["training"]["train_corpus"]["path"] == "${paths:train}"
+    assert config["training"]["train_corpus"]["path"] == "${paths.train}"
     interpolated = config.interpolate()
     assert interpolated["training"]["train_corpus"]["path"] == ""
     nlp = English.from_config(config)
-    assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}"
+    assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}"
     # Ensure that variables are preserved in nlp config
-    width = "${components.tok2vec.model:width}"
+    width = "${components.tok2vec.model.width}"
     assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
     assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
     interpolated2 = nlp.config.interpolate()
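For context, the round-trip behavior the updated test asserts can be sketched in isolation with Thinc's `Config` class and the new dot-notation syntax. A minimal sketch; the section and value names are invented for illustration:

```python
from thinc.api import Config

# Illustrative config using the new ${section.value} variable syntax
cfg_str = """
[paths]
train = "corpus/train.spacy"

[training]
path = ${paths.train}
"""

config = Config().from_str(cfg_str, interpolate=False)
assert config["training"]["path"] == "${paths.train}"  # variable preserved

resolved = config.interpolate()
assert resolved["training"]["path"] == "corpus/train.spacy"  # variable resolved
```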

View File

@@ -94,7 +94,7 @@ blog post for background.
 >
 > [components.tagger.model.tok2vec]
 > @architectures = "spacy.Tok2VecListener.v1"
-> width = ${components.tok2vec.model:width}
+> width = ${components.tok2vec.model.width}
 > ```

 A listener is used as a sublayer within a component such as a

View File

@@ -28,7 +28,7 @@ streaming.
 >
 > [training.train_corpus]
 > @readers = "spacy.Corpus.v1"
-> path = ${paths:train}
+> path = ${paths.train}
 > gold_preproc = false
 > max_length = 0
 > limit = 0

View File

@@ -111,7 +111,7 @@ model to copy components from). See the docs on

 ### paths, system {#config-variables tag="variables"}

 These sections define variables that can be referenced across the other sections
-as variables. For example `${paths:train}` uses the value of `train` defined in
+as variables. For example `${paths.train}` uses the value of `train` defined in
 the block `[paths]`. If your config includes custom registered functions that
 need paths, you can define them here. All config values can also be
 [overwritten](/usage/training#config-overrides) on the CLI when you run
@@ -131,11 +131,11 @@ process that are used when you run [`spacy train`](/api/cli#train).

 | Name | Description |
 | --------------------- | ----------- |
-| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ |
+| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
 | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
-| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths:init_tok2vec}`. ~~Optional[str]~~ |
-| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths:raw}`. ~~Optional[str]~~ |
+| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
+| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
 | `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ |
 | `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
 | `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
@@ -162,8 +162,8 @@ run [`spacy pretrain`](/api/cli#pretrain).
 | `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ |
 | `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
 | `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ |
-| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ |
-| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system:use_pytorch_for_gpu_memory}`. ~~bool~~ |
+| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
+| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system.use_pytorch_for_gpu_memory}`. ~~bool~~ |
 | `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ |
 | `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
 | `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
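(The defaults documented in these tables correspond to the default-config hunk earlier in this commit; as a cross-reference, the `[training]` block they describe looks roughly like this excerpt:)

```ini
[training]
seed = ${system.seed}
dropout = 0.1
accumulate_gradient = 1
init_tok2vec = ${paths.init_tok2vec}
raw_text = ${paths.raw}
vectors = null
patience = 1600
max_epochs = 0
```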

View File

@@ -341,7 +341,7 @@ See the [`Transformer`](/api/transformer) API reference and

 ## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}

-A batcher implements a batching strategy that essentially turns a stream of
+A data batcher implements a batching strategy that essentially turns a stream of
 items into a stream of batches, with each batch consisting of one item or a list
 of items. During training, the models update their weights after processing one
 batch at a time. Typical batching strategies include presenting the training
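As a concrete point of reference for the batching strategies described above, a batcher is configured in the `[training.batcher]` block. A minimal sketch, assuming the `batch_by_words.v1` batcher referenced in the test config earlier in this commit; the setting values are illustrative:

```ini
[training.batcher]
@batchers = "batch_by_words.v1"
size = 3000
tolerance = 0.2
discard_oversize = false
```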
@@ -613,7 +613,7 @@ components are created, as well as all training settings and hyperparameters.
 | ----------- | ----------- |
 | `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ |
 | `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ |
-| `interpolate` | Whether to interpolate the config and replace variables like `${paths:train}` with their values. Defaults to `False`. ~~bool~~ |
+| `interpolate` | Whether to interpolate the config and replace variables like `${paths.train}` with their values. Defaults to `False`. ~~bool~~ |
 | **RETURNS** | The model's config. ~~Config~~ |

 ### util.load_meta {#util.load_meta tag="function" new="3"}
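A minimal usage sketch of the `util.load_config` parameters documented in the hunk above; the `config.cfg` path is illustrative:

```python
from spacy import util

# Variables like ${paths.train} stay unresolved by default
config = util.load_config("config.cfg")

# interpolate=True replaces variables with their values on load
resolved = util.load_config("config.cfg", interpolate=True)
```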

View File

@@ -157,8 +157,8 @@ sections of a config file are:
 | ------------- | ----------- |
 | `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. |
 | `components` | Definitions of the [pipeline components](/usage/processing-pipelines) and their models. |
-| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths:train}`, and can be [overwritten](#config-overrides) on the CLI. |
-| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system:seed}`, and can be [overwritten](#config-overrides) on the CLI. |
+| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI. |
+| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `training` | Settings and controls for the training and evaluation process. |
 | `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
@ -325,19 +325,9 @@ compound = 1.001
Another very useful feature of the config system is that it supports variable Another very useful feature of the config system is that it supports variable
interpolation for both **values and sections**. This means that you only need to interpolation for both **values and sections**. This means that you only need to
define a setting once and can reference it across your config using the define a setting once and can reference it across your config using the
`${section:value}` or `${section.block}` syntax. In this example, the value of `${section.value}` syntax. In this example, the value of `seed` is reused within
`seed` is reused within the `[training]` block, and the whole block of the `[training]` block, and the whole block of `[training.optimizer]` is reused
`[training.optimizer]` is reused in `[pretraining]` and will become in `[pretraining]` and will become `pretraining.optimizer`.
`pretraining.optimizer`.
> #### Note on syntax
>
> There are two different ways to format your variables, depending on whether
> you want to reference a single value or a block. Values are specified after a
> `:`, while blocks are specified with a `.`:
>
> 1. `${section:value}`, `${section.subsection:value}`
> 2. `${section.block}`, `${section.subsection.block}`
```ini ```ini
### config.cfg (excerpt) {highlight="5,18"} ### config.cfg (excerpt) {highlight="5,18"}
@@ -345,7 +335,7 @@ define a setting once and can reference it across your config using the
 seed = 0

 [training]
-seed = ${system:seed}
+seed = ${system.seed}

 [training.optimizer]
 @optimizers = "Adam.v1"
@@ -369,7 +359,7 @@ to a string.
 [paths]
 version = 5
 root = "/Users/you/data"
-train = "${paths:root}/train_${paths:version}.spacy"
+train = "${paths.root}/train_${paths.version}.spacy"
 # Result: /Users/you/data/train_5.spacy
 ```
@@ -484,20 +474,21 @@ still look good.

 ## Custom Functions {#custom-functions}

-Registered functions in the training config files can refer to built-in
-implementations, but you can also plug in fully custom implementations. To do
-so, you first write your own implementation of a custom architecture, data
-reader or any other functionality, and then register this function with the
-correct [registry](/api/top-level#registry). This allows you to plug in models
-defined in PyTorch or Tensorflow, make custom modifications to the `nlp` object,
-create custom optimizers or schedules, or write a function that streams in data
-and preprocesses it on the fly while training.
+Registered functions in the training config files can refer to built-in
+implementations, but you can also plug in fully **custom implementations**. All
+you need to do is register your function using the `@spacy.registry` decorator
+with the name of the respective [registry](/api/top-level#registry), e.g.
+`@spacy.registry.architectures`, and a string name to assign to your function.
+Registering custom functions allows you to **plug in models** defined in PyTorch
+or TensorFlow, make **custom modifications** to the `nlp` object, create custom
+optimizers or schedules, or **stream in data** and preprocess it on the fly
+while training.

-Each custom function can have any number of arguments that should be passed
-into them through the config, similar to the built-in functions. If your
-function defines **default argument values**, spaCy is able to auto-fill your
-config when you run [`init fill-config`](/api/cli#init-fill-config). If you want
-to make sure that a given parameter is always explicitly set in the config,
-avoid setting a default value for it.
+Each custom function can have any number of arguments that are passed in via
+the [config](#config), just like the built-in functions. If your function
+defines **default argument values**, spaCy is able to auto-fill your config when
+you run [`init fill-config`](/api/cli#init-fill-config). If you want to make
+sure that a given parameter is always explicitly set in the config, avoid
+setting a default value for it.

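Following the registration pattern described above, a minimal hypothetical sketch; the function, its string name `"custom_linear.v1"` and its arguments are invented for illustration:

```python
import spacy
from thinc.api import Linear, Model

# Register an invented architecture under an illustrative string name
@spacy.registry.architectures("custom_linear.v1")
def create_custom_linear(nO: int, nI: int) -> Model:
    # A real architecture would typically build something more elaborate,
    # e.g. a wrapped PyTorch model; here we return a built-in Thinc layer
    return Linear(nO, nI)
```

A config could then reference it via `@architectures = "custom_linear.v1"` and pass `nO` and `nI` as arguments; since neither defines a default, both would always have to be set explicitly.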
 <!-- TODO: possibly link to new (not yet created) page on creating models ? -->