Update Thinc and config variables

This commit is contained in:
Ines Montani 2020-08-19 19:46:12 +02:00
parent cb9a2402ee
commit 3dd390b1a1
11 changed files with 39 additions and 49 deletions

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a27,<8.0.0a30",
"thinc>=8.0.0a28,<8.0.0a30",
"blis>=0.4.0,<0.5.0",
"pytokenizations",
"smart_open>=2.0.0,<3.0.0"

View File

@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a27,<8.0.0a30
thinc>=8.0.0a28,<8.0.0a30
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0

View File

@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a27,<8.0.0a30
thinc>=8.0.0a28,<8.0.0a30
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a27,<8.0.0a30
thinc>=8.0.0a28,<8.0.0a30
blis>=0.4.0,<0.5.0
wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0

View File

@ -105,7 +105,7 @@ factory = "tok2vec"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
width = ${components.tok2vec.model.encode.width}
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
also_embed_subwords = {{ true if has_letters else false }}
also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
@ -127,7 +127,7 @@ nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
width = ${components.tok2vec.model.encode.width}
{%- endif %}
{% if "parser" in components -%}
@ -144,7 +144,7 @@ nO = null
[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
width = ${components.tok2vec.model.encode.width}
{%- endif %}
{% if "ner" in components %}
@ -161,7 +161,7 @@ nO = null
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
width = ${components.tok2vec.model.encode.width}
{% endif %}
{% endif %}
@ -194,12 +194,12 @@ initial_rate = 5e-5
[training.train_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
path = ${paths.train}
max_length = {{ 500 if hardware == "gpu" else 0 }}
[training.dev_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:dev}
path = ${paths.dev}
max_length = 0
{% if use_transformer %}

View File

@ -23,12 +23,12 @@ after_pipeline_creation = null
# Training hyper-parameters and additional features.
[training]
seed = ${system:seed}
seed = ${system.seed}
dropout = 0.1
accumulate_gradient = 1
# Extra resources for transfer-learning or pseudo-rehearsal
init_tok2vec = ${paths:init_tok2vec}
raw_text = ${paths:raw}
init_tok2vec = ${paths.init_tok2vec}
raw_text = ${paths.raw}
vectors = null
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
@ -42,7 +42,7 @@ frozen_components = []
[training.train_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
path = ${paths.train}
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
@ -54,7 +54,7 @@ limit = 0
[training.dev_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:dev}
path = ${paths.dev}
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
@ -98,8 +98,8 @@ max_length = 500
dropout = 0.2
n_save_every = null
batch_size = 3000
seed = ${system:seed}
use_pytorch_for_gpu_memory = ${system:use_pytorch_for_gpu_memory}
seed = ${system.seed}
use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory}
tok2vec_model = "components.tok2vec.model"
[pretraining.objective]

View File

@ -20,11 +20,11 @@ dev = ""
[training.train_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
path = ${paths.train}
[training.dev_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:dev}
path = ${paths.dev}
[training.batcher]
@batchers = "batch_by_words.v1"
@ -57,7 +57,7 @@ factory = "tagger"
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model:width}
width = ${components.tok2vec.model.width}
"""
@ -284,13 +284,13 @@ def test_config_overrides():
def test_config_interpolation():
config = Config().from_str(nlp_config_string, interpolate=False)
assert config["training"]["train_corpus"]["path"] == "${paths:train}"
assert config["training"]["train_corpus"]["path"] == "${paths.train}"
interpolated = config.interpolate()
assert interpolated["training"]["train_corpus"]["path"] == ""
nlp = English.from_config(config)
assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}"
assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}"
# Ensure that variables are preserved in nlp config
width = "${components.tok2vec.model:width}"
width = "${components.tok2vec.model.width}"
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
interpolated2 = nlp.config.interpolate()

View File

@ -94,7 +94,7 @@ blog post for background.
>
> [components.tagger.model.tok2vec]
> @architectures = "spacy.Tok2VecListener.v1"
> width = ${components.tok2vec.model:width}
> width = ${components.tok2vec.model.width}
> ```
A listener is used as a sublayer within a component such as a

View File

@ -28,7 +28,7 @@ streaming.
>
> [training.train_corpus]
> @readers = "spacy.Corpus.v1"
> path = ${paths:train}
> path = ${paths.train}
> gold_preproc = false
> max_length = 0
> limit = 0

View File

@ -111,7 +111,7 @@ model to copy components from). See the docs on
### paths, system {#config-variables tag="variables"}
These sections define variables that can be referenced across the other sections
as variables. For example `${paths:train}` uses the value of `train` defined in
as variables. For example `${paths.train}` uses the value of `train` defined in
the block `[paths]`. If your config includes custom registered functions that
need paths, you can define them here. All config values can also be
[overwritten](/usage/training#config-overrides) on the CLI when you run
@ -131,11 +131,11 @@ process that are used when you run [`spacy train`](/api/cli#train).
| Name | Description |
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ |
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths:init_tok2vec}`. ~~Optional[str]~~ |
| `raw_text` | TODO: ... Defaults to variable `${paths:raw}`. ~~Optional[str]~~ |
| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
| `raw_text` | TODO: ... Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
| `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ |
| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
@ -162,8 +162,8 @@ run [`spacy pretrain`](/api/cli#pretrain).
| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ |
| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
| `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ |
| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ |
| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system:use_pytorch_for_gpu_memory}`. ~~bool~~ |
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system.use_pytorch_for_gpu_memory}`. ~~bool~~ |
| `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ |
| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |

View File

@ -602,7 +602,7 @@ components are created, as well as all training settings and hyperparameters.
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ |
| `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ |
| `interpolate` | Whether to interpolate the config and replace variables like `${paths:train}` with their values. Defaults to `False`. ~~bool~~ |
| `interpolate` | Whether to interpolate the config and replace variables like `${paths.train}` with their values. Defaults to `False`. ~~bool~~ |
| **RETURNS** | The model's config. ~~Config~~ |
### util.load_meta {#util.load_meta tag="function" new="3"}

View File

@ -157,8 +157,8 @@ sections of a config file are:
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. |
| `components` | Definitions of the [pipeline components](/usage/processing-pipelines) and their models. |
| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths:train}`, and can be [overwritten](#config-overrides) on the CLI. |
| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system:seed}`, and can be [overwritten](#config-overrides) on the CLI. |
| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI. |
| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
| `training` | Settings and controls for the training and evaluation process. |
| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
@ -325,19 +325,9 @@ compound = 1.001
Another very useful feature of the config system is that it supports variable
interpolation for both **values and sections**. This means that you only need to
define a setting once and can reference it across your config using the
`${section:value}` or `${section.block}` syntax. In this example, the value of
`seed` is reused within the `[training]` block, and the whole block of
`[training.optimizer]` is reused in `[pretraining]` and will become
`pretraining.optimizer`.
> #### Note on syntax
>
> There are two different ways to format your variables, depending on whether
> you want to reference a single value or a block. Values are specified after a
> `:`, while blocks are specified with a `.`:
>
> 1. `${section:value}`, `${section.subsection:value}`
> 2. `${section.block}`, `${section.subsection.block}`
`${section.value}` syntax. In this example, the value of `seed` is reused within
the `[training]` block, and the whole block of `[training.optimizer]` is reused
in `[pretraining]` and will become `pretraining.optimizer`.
```ini
### config.cfg (excerpt) {highlight="5,18"}
@ -345,7 +335,7 @@ define a setting once and can reference it across your config using the
seed = 0
[training]
seed = ${system:seed}
seed = ${system.seed}
[training.optimizer]
@optimizers = "Adam.v1"
@ -369,7 +359,7 @@ to a string.
[paths]
version = 5
root = "/Users/you/data"
train = "${paths:root}/train_${paths:version}.spacy"
train = "${paths.root}/train_${paths.version}.spacy"
# Result: /Users/you/data/train_5.spacy
```