From 6ad59d59fe5f923ce23cb66d7fb71ca511fd656b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 20 Aug 2020 11:20:58 +0200 Subject: [PATCH] Merge branch 'develop' of https://github.com/explosion/spaCy into develop [ci skip] --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- spacy/cli/templates/quickstart_training.jinja | 12 +++++----- spacy/default_config.cfg | 14 +++++------ .../tests/serialize/test_serialize_config.py | 12 +++++----- website/docs/api/architectures.md | 2 +- website/docs/api/corpus.md | 2 +- website/docs/api/data-formats.md | 12 +++++----- website/docs/api/top-level.md | 2 +- website/docs/usage/training.md | 24 ++++++------------- 11 files changed, 39 insertions(+), 49 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1b4972bd5..9a646d0d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a27,<8.0.0a30", + "thinc>=8.0.0a28,<8.0.0a30", "blis>=0.4.0,<0.5.0", "pytokenizations", "smart_open>=2.0.0,<3.0.0" diff --git a/requirements.txt b/requirements.txt index b4901a692..181cb2101 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a27,<8.0.0a30 +thinc>=8.0.0a28,<8.0.0a30 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index a34c34e23..d56eab3a6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a27,<8.0.0a30 + thinc>=8.0.0a28,<8.0.0a30 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a27,<8.0.0a30 + thinc>=8.0.0a28,<8.0.0a30 blis>=0.4.0,<0.5.0 wasabi>=0.7.1,<1.1.0 srsly>=2.1.0,<3.0.0 diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 4f5a2226e..674099abc 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -105,7 +105,7 @@ factory = "tok2vec" [components.tok2vec.model.embed] @architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode:width} +width = ${components.tok2vec.model.encode.width} rows = {{ 2000 if optimize == "efficiency" else 7000 }} also_embed_subwords = {{ true if has_letters else false }} also_use_static_vectors = {{ true if optimize == "accuracy" else false }} @@ -127,7 +127,7 @@ nO = null [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} +width = ${components.tok2vec.model.encode.width} {%- endif %} {% if "parser" in components -%} @@ -144,7 +144,7 @@ nO = null [components.parser.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} +width = ${components.tok2vec.model.encode.width} {%- endif %} {% if "ner" in components %} @@ -161,7 +161,7 @@ nO = null [components.ner.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} +width = ${components.tok2vec.model.encode.width} {% endif %} {% endif %} @@ -194,12 +194,12 @@ initial_rate = 5e-5 [training.train_corpus] @readers = "spacy.Corpus.v1" -path = ${paths:train} +path = ${paths.train} max_length = {{ 500 if hardware == "gpu" else 0 }} [training.dev_corpus] @readers = "spacy.Corpus.v1" -path = ${paths:dev} +path = ${paths.dev} max_length = 0 {% if use_transformer %} diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 8aadad668..3eab21888 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -23,12 +23,12 @@ after_pipeline_creation = null # Training hyper-parameters and additional features. [training] -seed = ${system:seed} +seed = ${system.seed} dropout = 0.1 accumulate_gradient = 1 # Extra resources for transfer-learning or pseudo-rehearsal -init_tok2vec = ${paths:init_tok2vec} -raw_text = ${paths:raw} +init_tok2vec = ${paths.init_tok2vec} +raw_text = ${paths.raw} vectors = null # Controls early-stopping. 0 or -1 mean unlimited. patience = 1600 @@ -42,7 +42,7 @@ frozen_components = [] [training.train_corpus] @readers = "spacy.Corpus.v1" -path = ${paths:train} +path = ${paths.train} # Whether to train on sequences with 'gold standard' sentence boundaries # and tokens. If you set this to true, take care to ensure your run-time # data is passed in sentence-by-sentence via some prior preprocessing. @@ -54,7 +54,7 @@ limit = 0 [training.dev_corpus] @readers = "spacy.Corpus.v1" -path = ${paths:dev} +path = ${paths.dev} # Whether to train on sequences with 'gold standard' sentence boundaries # and tokens. If you set this to true, take care to ensure your run-time # data is passed in sentence-by-sentence via some prior preprocessing. @@ -98,8 +98,8 @@ max_length = 500 dropout = 0.2 n_save_every = null batch_size = 3000 -seed = ${system:seed} -use_pytorch_for_gpu_memory = ${system:use_pytorch_for_gpu_memory} +seed = ${system.seed} +use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory} tok2vec_model = "components.tok2vec.model" [pretraining.objective] diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 1de137e81..f2b496d71 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -20,11 +20,11 @@ dev = "" [training.train_corpus] @readers = "spacy.Corpus.v1" -path = ${paths:train} +path = ${paths.train} [training.dev_corpus] @readers = "spacy.Corpus.v1" -path = ${paths:dev} +path = ${paths.dev} [training.batcher] @batchers = "batch_by_words.v1" @@ -57,7 +57,7 @@ factory = "tagger" [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model:width} +width = ${components.tok2vec.model.width} """ @@ -284,13 +284,13 @@ def test_config_overrides(): def test_config_interpolation(): config = Config().from_str(nlp_config_string, interpolate=False) - assert config["training"]["train_corpus"]["path"] == "${paths:train}" + assert config["training"]["train_corpus"]["path"] == "${paths.train}" interpolated = config.interpolate() assert interpolated["training"]["train_corpus"]["path"] == "" nlp = English.from_config(config) - assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}" + assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}" # Ensure that variables are preserved in nlp config - width = "${components.tok2vec.model:width}" + width = "${components.tok2vec.model.width}" assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width interpolated2 = nlp.config.interpolate() diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 25a44245d..acdf4cb19 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -94,7 +94,7 @@ blog post for background. > > [components.tagger.model.tok2vec] > @architectures = "spacy.Tok2VecListener.v1" -> width = ${components.tok2vec.model:width} +> width = ${components.tok2vec.model.width} > ``` A listener is used as a sublayer within a component such as a diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 8c530ab6d..86cfa9121 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -28,7 +28,7 @@ streaming. > > [training.train_corpus] > @readers = "spacy.Corpus.v1" -> path = ${paths:train} +> path = ${paths.train} > gold_preproc = false > max_length = 0 > limit = 0 diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index ff106b229..87f3ecbf2 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -111,7 +111,7 @@ model to copy components from). See the docs on ### paths, system {#config-variables tag="variables"} These sections define variables that can be referenced across the other sections -as variables. For example `${paths:train}` uses the value of `train` defined in +as variables. For example `${paths.train}` uses the value of `train` defined in the block `[paths]`. If your config includes custom registered functions that need paths, you can define them here. All config values can also be [overwritten](/usage/training#config-overrides) on the CLI when you run @@ -131,11 +131,11 @@ process that are used when you run [`spacy train`](/api/cli#train). | Name | Description | | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths:init_tok2vec}`. ~~Optional[str]~~ | -| `raw_text` | TODO: ... Defaults to variable `${paths:raw}`. ~~Optional[str]~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `raw_text` | TODO: ... Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | | `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ | | `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | | `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | @@ -162,8 +162,8 @@ run [`spacy pretrain`](/api/cli#pretrain). | `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | | `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | | `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ | -| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ | -| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system:use_pytorch_for_gpu_memory}`. ~~bool~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system.use_pytorch_for_gpu_memory}`. ~~bool~~ | | `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ | | `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | | `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index b33d7f022..325a94f5c 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -612,7 +612,7 @@ components are created, as well as all training settings and hyperparameters. | ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ | | `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ | -| `interpolate` | Whether to interpolate the config and replace variables like `${paths:train}` with their values. Defaults to `False`. ~~bool~~ | +| `interpolate` | Whether to interpolate the config and replace variables like `${paths.train}` with their values. Defaults to `False`. ~~bool~~ | | **RETURNS** | The model's config. ~~Config~~ | ### util.load_meta {#util.load_meta tag="function" new="3"} diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index d4f380c10..348e42b41 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -157,8 +157,8 @@ sections of a config file are: | ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. | | `components` | Definitions of the [pipeline components](/usage/processing-pipelines) and their models. | -| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths:train}`, and can be [overwritten](#config-overrides) on the CLI. | -| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system:seed}`, and can be [overwritten](#config-overrides) on the CLI. | +| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI. | +| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. | | `training` | Settings and controls for the training and evaluation process. | | `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). | @@ -325,19 +325,9 @@ compound = 1.001 Another very useful feature of the config system is that it supports variable interpolation for both **values and sections**. This means that you only need to define a setting once and can reference it across your config using the -`${section:value}` or `${section.block}` syntax. In this example, the value of -`seed` is reused within the `[training]` block, and the whole block of -`[training.optimizer]` is reused in `[pretraining]` and will become -`pretraining.optimizer`. - -> #### Note on syntax -> -> There are two different ways to format your variables, depending on whether -> you want to reference a single value or a block. Values are specified after a -> `:`, while blocks are specified with a `.`: -> -> 1. `${section:value}`, `${section.subsection:value}` -> 2. `${section.block}`, `${section.subsection.block}` +`${section.value}` syntax. In this example, the value of `seed` is reused within +the `[training]` block, and the whole block of `[training.optimizer]` is reused +in `[pretraining]` and will become `pretraining.optimizer`. ```ini ### config.cfg (excerpt) {highlight="5,18"} @@ -345,7 +335,7 @@ define a setting once and can reference it across your config using the seed = 0 [training] -seed = ${system:seed} +seed = ${system.seed} [training.optimizer] @optimizers = "Adam.v1" @@ -369,7 +359,7 @@ to a string. [paths] version = 5 root = "/Users/you/data" -train = "${paths:root}/train_${paths:version}.spacy" +train = "${paths.root}/train_${paths.version}.spacy" # Result: /Users/you/data/train_5.spacy ```