Merge remote-tracking branch 'upstream/develop' into feature/update-more-docs

# Conflicts:
#	website/docs/api/data-formats.md
commit f728c00cbb by svlandeg, 2020-08-20 10:02:13 +02:00
12 changed files with 55 additions and 64 deletions

View File

@@ -5,5 +5,5 @@ include README.md
include pyproject.toml
recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz
-recursive-include spacy/cli *.json
+recursive-include spacy/cli *.json *.yml
recursive-include licenses *

View File

@@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a27,<8.0.0a30",
"thinc>=8.0.0a28,<8.0.0a30",
"blis>=0.4.0,<0.5.0",
"pytokenizations",
"smart_open>=2.0.0,<3.0.0"

View File

@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a27,<8.0.0a30
+thinc>=8.0.0a28,<8.0.0a30
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0

View File

@@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
-thinc>=8.0.0a27,<8.0.0a30
+thinc>=8.0.0a28,<8.0.0a30
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a27,<8.0.0a30
+thinc>=8.0.0a28,<8.0.0a30
blis>=0.4.0,<0.5.0
wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0

View File

@@ -105,7 +105,7 @@ factory = "tok2vec"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
-width = ${components.tok2vec.model.encode:width}
+width = ${components.tok2vec.model.encode.width}
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
also_embed_subwords = {{ true if has_letters else false }}
also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
@@ -127,7 +127,7 @@ nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
+width = ${components.tok2vec.model.encode.width}
{%- endif %}
{% if "parser" in components -%}
@@ -144,7 +144,7 @@ nO = null
[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
+width = ${components.tok2vec.model.encode.width}
{%- endif %}
{% if "ner" in components %}
@@ -161,7 +161,7 @@ nO = null
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
+width = ${components.tok2vec.model.encode.width}
{% endif %}
{% endif %}
@@ -194,12 +194,12 @@ initial_rate = 5e-5
[training.train_corpus]
@readers = "spacy.Corpus.v1"
-path = ${paths:train}
+path = ${paths.train}
max_length = {{ 500 if hardware == "gpu" else 0 }}
[training.dev_corpus]
@readers = "spacy.Corpus.v1"
-path = ${paths:dev}
+path = ${paths.dev}
max_length = 0
{% if use_transformer %}

View File

@@ -23,12 +23,12 @@ after_pipeline_creation = null
# Training hyper-parameters and additional features.
[training]
-seed = ${system:seed}
+seed = ${system.seed}
dropout = 0.1
accumulate_gradient = 1
# Extra resources for transfer-learning or pseudo-rehearsal
-init_tok2vec = ${paths:init_tok2vec}
-raw_text = ${paths:raw}
+init_tok2vec = ${paths.init_tok2vec}
+raw_text = ${paths.raw}
vectors = null
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
@@ -42,7 +42,7 @@ frozen_components = []
[training.train_corpus]
@readers = "spacy.Corpus.v1"
-path = ${paths:train}
+path = ${paths.train}
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
@@ -54,7 +54,7 @@ limit = 0
[training.dev_corpus]
@readers = "spacy.Corpus.v1"
-path = ${paths:dev}
+path = ${paths.dev}
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
@@ -98,8 +98,8 @@ max_length = 500
dropout = 0.2
n_save_every = null
batch_size = 3000
-seed = ${system:seed}
-use_pytorch_for_gpu_memory = ${system:use_pytorch_for_gpu_memory}
+seed = ${system.seed}
+use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory}
tok2vec_model = "components.tok2vec.model"
[pretraining.objective]

View File

@@ -20,11 +20,11 @@ dev = ""
[training.train_corpus]
@readers = "spacy.Corpus.v1"
-path = ${paths:train}
+path = ${paths.train}
[training.dev_corpus]
@readers = "spacy.Corpus.v1"
-path = ${paths:dev}
+path = ${paths.dev}
[training.batcher]
@batchers = "batch_by_words.v1"
@@ -57,7 +57,7 @@ factory = "tagger"
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model:width}
+width = ${components.tok2vec.model.width}
"""
@@ -284,13 +284,13 @@ def test_config_overrides():
def test_config_interpolation():
config = Config().from_str(nlp_config_string, interpolate=False)
assert config["training"]["train_corpus"]["path"] == "${paths:train}"
assert config["training"]["train_corpus"]["path"] == "${paths.train}"
interpolated = config.interpolate()
assert interpolated["training"]["train_corpus"]["path"] == ""
nlp = English.from_config(config)
assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}"
assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}"
# Ensure that variables are preserved in nlp config
width = "${components.tok2vec.model:width}"
width = "${components.tok2vec.model.width}"
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
interpolated2 = nlp.config.interpolate()
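
To make the new dot-only syntax concrete, here is a minimal standalone sketch of what the updated assertions check (assuming thinc's `Config` API, which these tests exercise):

```python
from thinc.api import Config

cfg_str = """
[system]
seed = 0

[training]
seed = ${system.seed}
"""

# With interpolate=False the raw variable reference is preserved as a string...
config = Config().from_str(cfg_str, interpolate=False)
assert config["training"]["seed"] == "${system.seed}"

# ...and interpolate() resolves it to the value defined in [system].
interpolated = config.interpolate()
assert interpolated["training"]["seed"] == 0
```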

View File

@@ -94,7 +94,7 @@ blog post for background.
>
> [components.tagger.model.tok2vec]
> @architectures = "spacy.Tok2VecListener.v1"
-> width = ${components.tok2vec.model:width}
+> width = ${components.tok2vec.model.width}
> ```
A listener is used as a sublayer within a component such as a

View File

@@ -28,7 +28,7 @@ streaming.
>
> [training.train_corpus]
> @readers = "spacy.Corpus.v1"
-> path = ${paths:train}
+> path = ${paths.train}
> gold_preproc = false
> max_length = 0
> limit = 0
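
For context, the block above wires up the registered corpus reader. A rough Python equivalent (the path is hypothetical, and this assumes the reader can be looked up via `spacy.util.registry` as of this commit):

```python
from spacy.util import registry

# Look up the registered reader that the @readers line refers to.
make_corpus = registry.readers.get("spacy.Corpus.v1")

# Hypothetical path; the remaining settings mirror the config excerpt above.
train_corpus = make_corpus(
    path="corpus/train.spacy",
    gold_preproc=False,
    max_length=0,
    limit=0,
)
```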

View File

@@ -111,7 +111,7 @@ model to copy components from). See the docs on
### paths, system {#config-variables tag="variables"}
These sections define variables that can be referenced across the other sections
-as variables. For example `${paths:train}` uses the value of `train` defined in
+as variables. For example `${paths.train}` uses the value of `train` defined in
the block `[paths]`. If your config includes custom registered functions that
need paths, you can define them here. All config values can also be
[overwritten](/usage/training#config-overrides) on the CLI when you run
@@ -131,11 +131,11 @@ process that are used when you run [`spacy train`](/api/cli#train).
| Name | Description |
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ |
+| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
-| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths:init_tok2vec}`. ~~Optional[str]~~ |
-| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsel](/api/language#rehearse) step. Defaults to variable `${paths:raw}`. ~~Optional[str]~~ |
+| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
+| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
| `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ |
| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
@@ -162,8 +162,8 @@ run [`spacy pretrain`](/api/cli#pretrain).
| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ |
| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
| `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ |
-| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ |
-| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system:use_pytorch_for_gpu_memory}`. ~~bool~~ |
+| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
+| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system.use_pytorch_for_gpu_memory}`. ~~bool~~ |
| `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ |
| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |

View File

@@ -341,7 +341,7 @@ See the [`Transformer`](/api/transformer) API reference and
## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
-A batcher implements a batching strategy that essentially turns a stream of
+A data batcher implements a batching strategy that essentially turns a stream of
items into a stream of batches, with each batch consisting of one item or a list
of items. During training, the models update their weights after processing one
batch at a time. Typical batching strategies include presenting the training
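
As a toy illustration of the strategy described above (not spaCy's actual `batch_by_words.v1` implementation), a batcher that groups tokenized texts until a word budget is reached could look like this:

```python
from typing import Iterable, Iterator, List

def batch_by_word_count(
    docs: Iterable[List[str]], size: int
) -> Iterator[List[List[str]]]:
    """Group tokenized texts into batches of roughly `size` words."""
    batch: List[List[str]] = []
    n_words = 0
    for doc in docs:
        batch.append(doc)
        n_words += len(doc)
        if n_words >= size:
            yield batch
            batch, n_words = [], 0
    if batch:  # emit the final partial batch
        yield batch
```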
@@ -613,7 +613,7 @@ components are created, as well as all training settings and hyperparameters.
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ |
| `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ |
-| `interpolate` | Whether to interpolate the config and replace variables like `${paths:train}` with their values. Defaults to `False`. ~~bool~~ |
+| `interpolate` | Whether to interpolate the config and replace variables like `${paths.train}` with their values. Defaults to `False`. ~~bool~~ |
| **RETURNS** | The model's config. ~~Config~~ |
### util.load_meta {#util.load_meta tag="function" new="3"}

View File

@@ -157,8 +157,8 @@ sections of a config file are:
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. |
| `components` | Definitions of the [pipeline components](/usage/processing-pipelines) and their models. |
-| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths:train}`, and can be [overwritten](#config-overrides) on the CLI. |
-| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system:seed}`, and can be [overwritten](#config-overrides) on the CLI. |
+| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI. |
+| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
| `training` | Settings and controls for the training and evaluation process. |
| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
@@ -325,19 +325,9 @@ compound = 1.001
Another very useful feature of the config system is that it supports variable
interpolation for both **values and sections**. This means that you only need to
define a setting once and can reference it across your config using the
-`${section:value}` or `${section.block}` syntax. In this example, the value of
-`seed` is reused within the `[training]` block, and the whole block of
-`[training.optimizer]` is reused in `[pretraining]` and will become
-`pretraining.optimizer`.
-
-> #### Note on syntax
->
-> There are two different ways to format your variables, depending on whether
-> you want to reference a single value or a block. Values are specified after a
-> `:`, while blocks are specified with a `.`:
->
-> 1. `${section:value}`, `${section.subsection:value}`
-> 2. `${section.block}`, `${section.subsection.block}`
+`${section.value}` syntax. In this example, the value of `seed` is reused within
+the `[training]` block, and the whole block of `[training.optimizer]` is reused
+in `[pretraining]` and will become `pretraining.optimizer`.
```ini
### config.cfg (excerpt) {highlight="5,18"}
@@ -345,7 +335,7 @@ define a setting once and can reference it across your config using the
seed = 0
[training]
-seed = ${system:seed}
+seed = ${system.seed}
[training.optimizer]
@optimizers = "Adam.v1"
@@ -369,7 +359,7 @@ to a string.
[paths]
version = 5
root = "/Users/you/data"
train = "${paths:root}/train_${paths:version}.spacy"
train = "${paths.root}/train_${paths.version}.spacy"
# Result: /Users/you/data/train_5.spacy
```
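
A quick way to verify the string interpolation shown above (a sketch assuming thinc's `Config`, which spaCy's config system is built on):

```python
from thinc.api import Config

cfg_str = """
[paths]
version = 5
root = "/Users/you/data"
train = "${paths.root}/train_${paths.version}.spacy"
"""

# from_str interpolates by default, so the variables are resolved on load.
config = Config().from_str(cfg_str)
assert config["paths"]["train"] == "/Users/you/data/train_5.spacy"
```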
@@ -484,20 +474,21 @@ still look good.
## Custom Functions {#custom-functions}
Registered functions in the training config files can refer to built-in
-implementations, but you can also plug in fully custom implementations. To do
-so, you first write your own implementation of a custom architectures, data
-reader or any other functionality, and then register this function with the
-correct [registry](/api/top-level#registry). This allows you to plug in models
-defined in PyTorch or Tensorflow, make custom modifications to the `nlp` object,
-create custom optimizers or schedules, or write a function that streams in data
-and preprocesses it on the fly while training.
+implementations, but you can also plug in fully **custom implementations**. All
+you need to do is register your function using the `@spacy.registry` decorator
+with the name of the respective [registry](/api/top-level#registry), e.g.
+`@spacy.registry.architectures`, and a string name to assign to your function.
+Registering custom functions allows you to **plug in models** defined in PyTorch
+or TensorFlow, make **custom modifications** to the `nlp` object, create custom
+optimizers or schedules, or **stream in data** and preprocess it on the fly
+while training.
-Each custom function can have any numbers of arguments that should be passed
-into them through the config similar as with the built-in functions. If your
-function defines **default argument values**, spaCy is able to auto-fill your
-config when you run [`init fill-config`](/api/cli#init-fill-config). If you want
-to make sure that a given parameter is always explicitely set in the config,
-avoid setting a default value for it.
+Each custom function can have any number of arguments that are passed in via
+the [config](#config), just like the built-in functions. If your function defines
+**default argument values**, spaCy is able to auto-fill your config when you run
+[`init fill-config`](/api/cli#init-fill-config). If you want to make sure that a
+given parameter is always explicitly set in the config, avoid setting a default
+value for it.
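
For example, a custom schedule could be registered and then referenced from the config like any built-in function. This is a hedged sketch: `my_decay.v1` and its arguments are made-up names for illustration, not part of spaCy:

```python
import spacy

@spacy.registry.schedules("my_decay.v1")  # made-up name for illustration
def configure_my_decay(base_rate: float, decay: float = 0.999):
    """Yield an exponentially decaying learning rate."""
    def schedule():
        rate = base_rate
        while True:
            yield rate
            rate *= decay
    return schedule()

# In the config, `decay` can be auto-filled by `init fill-config` because it
# has a default, while `base_rate` must always be set explicitly:
#
# [training.optimizer.learn_rate]
# @schedules = "my_decay.v1"
# base_rate = 0.001
```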
<!-- TODO: possibly link to new (not yet created) page on creating models ? -->