Merge branch 'develop' of https://github.com/explosion/spaCy into develop [ci skip]

This commit is contained in:
Ines Montani 2020-08-20 11:20:58 +02:00
parent fb51b55eb9
commit 6ad59d59fe
11 changed files with 39 additions and 49 deletions

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0", "cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0", "preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0", "murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a27,<8.0.0a30", "thinc>=8.0.0a28,<8.0.0a30",
"blis>=0.4.0,<0.5.0", "blis>=0.4.0,<0.5.0",
"pytokenizations", "pytokenizations",
"smart_open>=2.0.0,<3.0.0" "smart_open>=2.0.0,<3.0.0"

View File

@ -1,7 +1,7 @@
# Our libraries # Our libraries
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.0a27,<8.0.0a30 thinc>=8.0.0a28,<8.0.0a30
blis>=0.4.0,<0.5.0 blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1 ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0

View File

@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a27,<8.0.0a30 thinc>=8.0.0a28,<8.0.0a30
install_requires = install_requires =
# Our libraries # Our libraries
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.0.0a27,<8.0.0a30 thinc>=8.0.0a28,<8.0.0a30
blis>=0.4.0,<0.5.0 blis>=0.4.0,<0.5.0
wasabi>=0.7.1,<1.1.0 wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0 srsly>=2.1.0,<3.0.0

View File

@ -105,7 +105,7 @@ factory = "tok2vec"
[components.tok2vec.model.embed] [components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1" @architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width} width = ${components.tok2vec.model.encode.width}
rows = {{ 2000 if optimize == "efficiency" else 7000 }} rows = {{ 2000 if optimize == "efficiency" else 7000 }}
also_embed_subwords = {{ true if has_letters else false }} also_embed_subwords = {{ true if has_letters else false }}
also_use_static_vectors = {{ true if optimize == "accuracy" else false }} also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
@ -127,7 +127,7 @@ nO = null
[components.tagger.model.tok2vec] [components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1" @architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width} width = ${components.tok2vec.model.encode.width}
{%- endif %} {%- endif %}
{% if "parser" in components -%} {% if "parser" in components -%}
@ -144,7 +144,7 @@ nO = null
[components.parser.model.tok2vec] [components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1" @architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width} width = ${components.tok2vec.model.encode.width}
{%- endif %} {%- endif %}
{% if "ner" in components %} {% if "ner" in components %}
@ -161,7 +161,7 @@ nO = null
[components.ner.model.tok2vec] [components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1" @architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width} width = ${components.tok2vec.model.encode.width}
{% endif %} {% endif %}
{% endif %} {% endif %}
@ -194,12 +194,12 @@ initial_rate = 5e-5
[training.train_corpus] [training.train_corpus]
@readers = "spacy.Corpus.v1" @readers = "spacy.Corpus.v1"
path = ${paths:train} path = ${paths.train}
max_length = {{ 500 if hardware == "gpu" else 0 }} max_length = {{ 500 if hardware == "gpu" else 0 }}
[training.dev_corpus] [training.dev_corpus]
@readers = "spacy.Corpus.v1" @readers = "spacy.Corpus.v1"
path = ${paths:dev} path = ${paths.dev}
max_length = 0 max_length = 0
{% if use_transformer %} {% if use_transformer %}

View File

@ -23,12 +23,12 @@ after_pipeline_creation = null
# Training hyper-parameters and additional features. # Training hyper-parameters and additional features.
[training] [training]
seed = ${system:seed} seed = ${system.seed}
dropout = 0.1 dropout = 0.1
accumulate_gradient = 1 accumulate_gradient = 1
# Extra resources for transfer-learning or pseudo-rehearsal # Extra resources for transfer-learning or pseudo-rehearsal
init_tok2vec = ${paths:init_tok2vec} init_tok2vec = ${paths.init_tok2vec}
raw_text = ${paths:raw} raw_text = ${paths.raw}
vectors = null vectors = null
# Controls early-stopping. 0 or -1 mean unlimited. # Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600 patience = 1600
@ -42,7 +42,7 @@ frozen_components = []
[training.train_corpus] [training.train_corpus]
@readers = "spacy.Corpus.v1" @readers = "spacy.Corpus.v1"
path = ${paths:train} path = ${paths.train}
# Whether to train on sequences with 'gold standard' sentence boundaries # Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time # and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing. # data is passed in sentence-by-sentence via some prior preprocessing.
@ -54,7 +54,7 @@ limit = 0
[training.dev_corpus] [training.dev_corpus]
@readers = "spacy.Corpus.v1" @readers = "spacy.Corpus.v1"
path = ${paths:dev} path = ${paths.dev}
# Whether to train on sequences with 'gold standard' sentence boundaries # Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time # and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing. # data is passed in sentence-by-sentence via some prior preprocessing.
@ -98,8 +98,8 @@ max_length = 500
dropout = 0.2 dropout = 0.2
n_save_every = null n_save_every = null
batch_size = 3000 batch_size = 3000
seed = ${system:seed} seed = ${system.seed}
use_pytorch_for_gpu_memory = ${system:use_pytorch_for_gpu_memory} use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory}
tok2vec_model = "components.tok2vec.model" tok2vec_model = "components.tok2vec.model"
[pretraining.objective] [pretraining.objective]

View File

@ -20,11 +20,11 @@ dev = ""
[training.train_corpus] [training.train_corpus]
@readers = "spacy.Corpus.v1" @readers = "spacy.Corpus.v1"
path = ${paths:train} path = ${paths.train}
[training.dev_corpus] [training.dev_corpus]
@readers = "spacy.Corpus.v1" @readers = "spacy.Corpus.v1"
path = ${paths:dev} path = ${paths.dev}
[training.batcher] [training.batcher]
@batchers = "batch_by_words.v1" @batchers = "batch_by_words.v1"
@ -57,7 +57,7 @@ factory = "tagger"
[components.tagger.model.tok2vec] [components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1" @architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model:width} width = ${components.tok2vec.model.width}
""" """
@ -284,13 +284,13 @@ def test_config_overrides():
def test_config_interpolation(): def test_config_interpolation():
config = Config().from_str(nlp_config_string, interpolate=False) config = Config().from_str(nlp_config_string, interpolate=False)
assert config["training"]["train_corpus"]["path"] == "${paths:train}" assert config["training"]["train_corpus"]["path"] == "${paths.train}"
interpolated = config.interpolate() interpolated = config.interpolate()
assert interpolated["training"]["train_corpus"]["path"] == "" assert interpolated["training"]["train_corpus"]["path"] == ""
nlp = English.from_config(config) nlp = English.from_config(config)
assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}" assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}"
# Ensure that variables are preserved in nlp config # Ensure that variables are preserved in nlp config
width = "${components.tok2vec.model:width}" width = "${components.tok2vec.model.width}"
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
interpolated2 = nlp.config.interpolate() interpolated2 = nlp.config.interpolate()

View File

@ -94,7 +94,7 @@ blog post for background.
> >
> [components.tagger.model.tok2vec] > [components.tagger.model.tok2vec]
> @architectures = "spacy.Tok2VecListener.v1" > @architectures = "spacy.Tok2VecListener.v1"
> width = ${components.tok2vec.model:width} > width = ${components.tok2vec.model.width}
> ``` > ```
A listener is used as a sublayer within a component such as a A listener is used as a sublayer within a component such as a

View File

@ -28,7 +28,7 @@ streaming.
> >
> [training.train_corpus] > [training.train_corpus]
> @readers = "spacy.Corpus.v1" > @readers = "spacy.Corpus.v1"
> path = ${paths:train} > path = ${paths.train}
> gold_preproc = false > gold_preproc = false
> max_length = 0 > max_length = 0
> limit = 0 > limit = 0

View File

@ -111,7 +111,7 @@ model to copy components from). See the docs on
### paths, system {#config-variables tag="variables"} ### paths, system {#config-variables tag="variables"}
These sections define variables that can be referenced across the other sections These sections define variables that can be referenced across the other sections
as variables. For example `${paths:train}` uses the value of `train` defined in as variables. For example `${paths.train}` uses the value of `train` defined in
the block `[paths]`. If your config includes custom registered functions that the block `[paths]`. If your config includes custom registered functions that
need paths, you can define them here. All config values can also be need paths, you can define them here. All config values can also be
[overwritten](/usage/training#config-overrides) on the CLI when you run [overwritten](/usage/training#config-overrides) on the CLI when you run
@ -131,11 +131,11 @@ process that are used when you run [`spacy train`](/api/cli#train).
| Name | Description | | Name | Description |
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ | | `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths:init_tok2vec}`. ~~Optional[str]~~ | | `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
| `raw_text` | TODO: ... Defaults to variable `${paths:raw}`. ~~Optional[str]~~ | | `raw_text` | TODO: ... Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
| `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ | | `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ |
| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | | `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | | `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
@ -162,8 +162,8 @@ run [`spacy pretrain`](/api/cli#pretrain).
| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | | `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ |
| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | | `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
| `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ | | `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ |
| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ | | `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system:use_pytorch_for_gpu_memory}`. ~~bool~~ | | `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system.use_pytorch_for_gpu_memory}`. ~~bool~~ |
| `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ | | `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ |
| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | | `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | | `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |

View File

@ -612,7 +612,7 @@ components are created, as well as all training settings and hyperparameters.
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ | | `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ |
| `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ | | `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ |
| `interpolate` | Whether to interpolate the config and replace variables like `${paths:train}` with their values. Defaults to `False`. ~~bool~~ | | `interpolate` | Whether to interpolate the config and replace variables like `${paths.train}` with their values. Defaults to `False`. ~~bool~~ |
| **RETURNS** | The model's config. ~~Config~~ | | **RETURNS** | The model's config. ~~Config~~ |
### util.load_meta {#util.load_meta tag="function" new="3"} ### util.load_meta {#util.load_meta tag="function" new="3"}

View File

@ -157,8 +157,8 @@ sections of a config file are:
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. | | `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. |
| `components` | Definitions of the [pipeline components](/usage/processing-pipelines) and their models. | | `components` | Definitions of the [pipeline components](/usage/processing-pipelines) and their models. |
| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths:train}`, and can be [overwritten](#config-overrides) on the CLI. | | `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI. |
| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system:seed}`, and can be [overwritten](#config-overrides) on the CLI. | | `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
| `training` | Settings and controls for the training and evaluation process. | | `training` | Settings and controls for the training and evaluation process. |
| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). | | `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
@ -325,19 +325,9 @@ compound = 1.001
Another very useful feature of the config system is that it supports variable Another very useful feature of the config system is that it supports variable
interpolation for both **values and sections**. This means that you only need to interpolation for both **values and sections**. This means that you only need to
define a setting once and can reference it across your config using the define a setting once and can reference it across your config using the
`${section:value}` or `${section.block}` syntax. In this example, the value of `${section.value}` syntax. In this example, the value of `seed` is reused within
`seed` is reused within the `[training]` block, and the whole block of the `[training]` block, and the whole block of `[training.optimizer]` is reused
`[training.optimizer]` is reused in `[pretraining]` and will become in `[pretraining]` and will become `pretraining.optimizer`.
`pretraining.optimizer`.
> #### Note on syntax
>
> There are two different ways to format your variables, depending on whether
> you want to reference a single value or a block. Values are specified after a
> `:`, while blocks are specified with a `.`:
>
> 1. `${section:value}`, `${section.subsection:value}`
> 2. `${section.block}`, `${section.subsection.block}`
```ini ```ini
### config.cfg (excerpt) {highlight="5,18"} ### config.cfg (excerpt) {highlight="5,18"}
@ -345,7 +335,7 @@ define a setting once and can reference it across your config using the
seed = 0 seed = 0
[training] [training]
seed = ${system:seed} seed = ${system.seed}
[training.optimizer] [training.optimizer]
@optimizers = "Adam.v1" @optimizers = "Adam.v1"
@ -369,7 +359,7 @@ to a string.
[paths] [paths]
version = 5 version = 5
root = "/Users/you/data" root = "/Users/you/data"
train = "${paths:root}/train_${paths:version}.spacy" train = "${paths.root}/train_${paths.version}.spacy"
# Result: /Users/you/data/train_5.spacy # Result: /Users/you/data/train_5.spacy
``` ```