Merge remote-tracking branch 'upstream/develop' into feature/update-more-docs

# Conflicts:
#	website/docs/api/data-formats.md
svlandeg 2020-08-20 10:02:13 +02:00
commit f728c00cbb
12 changed files with 55 additions and 64 deletions

View File

@@ -5,5 +5,5 @@ include README.md
 include pyproject.toml
 recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
-recursive-include spacy/cli *.json
+recursive-include spacy/cli *.json *.yml
 recursive-include licenses *

View File

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a27,<8.0.0a30",
+    "thinc>=8.0.0a28,<8.0.0a30",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "smart_open>=2.0.0,<3.0.0"

View File

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a27,<8.0.0a30
+thinc>=8.0.0a28,<8.0.0a30
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0

View File

@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a27,<8.0.0a30
+    thinc>=8.0.0a28,<8.0.0a30
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a27,<8.0.0a30
+    thinc>=8.0.0a28,<8.0.0a30
     blis>=0.4.0,<0.5.0
     wasabi>=0.7.1,<1.1.0
     srsly>=2.1.0,<3.0.0

View File

@@ -105,7 +105,7 @@ factory = "tok2vec"

 [components.tok2vec.model.embed]
 @architectures = "spacy.MultiHashEmbed.v1"
-width = ${components.tok2vec.model.encode:width}
+width = ${components.tok2vec.model.encode.width}
 rows = {{ 2000 if optimize == "efficiency" else 7000 }}
 also_embed_subwords = {{ true if has_letters else false }}
 also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
@@ -127,7 +127,7 @@ nO = null

 [components.tagger.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
+width = ${components.tok2vec.model.encode.width}
 {%- endif %}

 {% if "parser" in components -%}
@@ -144,7 +144,7 @@ nO = null

 [components.parser.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
+width = ${components.tok2vec.model.encode.width}
 {%- endif %}

 {% if "ner" in components %}
@@ -161,7 +161,7 @@ nO = null

 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
+width = ${components.tok2vec.model.encode.width}
 {% endif %}
 {% endif %}
@@ -194,12 +194,12 @@ initial_rate = 5e-5

 [training.train_corpus]
 @readers = "spacy.Corpus.v1"
-path = ${paths:train}
+path = ${paths.train}
 max_length = {{ 500 if hardware == "gpu" else 0 }}

 [training.dev_corpus]
 @readers = "spacy.Corpus.v1"
-path = ${paths:dev}
+path = ${paths.dev}
 max_length = 0

 {% if use_transformer %}

View File

@@ -23,12 +23,12 @@ after_pipeline_creation = null

 # Training hyper-parameters and additional features.
 [training]
-seed = ${system:seed}
+seed = ${system.seed}
 dropout = 0.1
 accumulate_gradient = 1
 # Extra resources for transfer-learning or pseudo-rehearsal
-init_tok2vec = ${paths:init_tok2vec}
-raw_text = ${paths:raw}
+init_tok2vec = ${paths.init_tok2vec}
+raw_text = ${paths.raw}
 vectors = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
@@ -42,7 +42,7 @@ frozen_components = []

 [training.train_corpus]
 @readers = "spacy.Corpus.v1"
-path = ${paths:train}
+path = ${paths.train}
 # Whether to train on sequences with 'gold standard' sentence boundaries
 # and tokens. If you set this to true, take care to ensure your run-time
 # data is passed in sentence-by-sentence via some prior preprocessing.
@@ -54,7 +54,7 @@ limit = 0

 [training.dev_corpus]
 @readers = "spacy.Corpus.v1"
-path = ${paths:dev}
+path = ${paths.dev}
 # Whether to train on sequences with 'gold standard' sentence boundaries
 # and tokens. If you set this to true, take care to ensure your run-time
 # data is passed in sentence-by-sentence via some prior preprocessing.
@@ -98,8 +98,8 @@ max_length = 500
 dropout = 0.2
 n_save_every = null
 batch_size = 3000
-seed = ${system:seed}
-use_pytorch_for_gpu_memory = ${system:use_pytorch_for_gpu_memory}
+seed = ${system.seed}
+use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory}
 tok2vec_model = "components.tok2vec.model"

 [pretraining.objective]

View File

@@ -20,11 +20,11 @@ dev = ""

 [training.train_corpus]
 @readers = "spacy.Corpus.v1"
-path = ${paths:train}
+path = ${paths.train}

 [training.dev_corpus]
 @readers = "spacy.Corpus.v1"
-path = ${paths:dev}
+path = ${paths.dev}

 [training.batcher]
 @batchers = "batch_by_words.v1"
@@ -57,7 +57,7 @@ factory = "tagger"

 [components.tagger.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model:width}
+width = ${components.tok2vec.model.width}
 """
@@ -284,13 +284,13 @@ def test_config_overrides():
 def test_config_interpolation():
     config = Config().from_str(nlp_config_string, interpolate=False)
-    assert config["training"]["train_corpus"]["path"] == "${paths:train}"
+    assert config["training"]["train_corpus"]["path"] == "${paths.train}"
     interpolated = config.interpolate()
     assert interpolated["training"]["train_corpus"]["path"] == ""
     nlp = English.from_config(config)
-    assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}"
+    assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}"
     # Ensure that variables are preserved in nlp config
-    width = "${components.tok2vec.model:width}"
+    width = "${components.tok2vec.model.width}"
     assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
     assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
     interpolated2 = nlp.config.interpolate()
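For context, the round-trip behavior the updated test asserts can be sketched in isolation with Thinc's `Config` class and the new dot-notation syntax. A minimal sketch; the section and value names are invented for illustration:

```python
from thinc.api import Config

# Illustrative config using the new ${section.value} variable syntax
cfg_str = """
[paths]
train = "corpus/train.spacy"

[training]
path = ${paths.train}
"""

config = Config().from_str(cfg_str, interpolate=False)
assert config["training"]["path"] == "${paths.train}"  # variable preserved

resolved = config.interpolate()
assert resolved["training"]["path"] == "corpus/train.spacy"  # variable resolved
```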

View File

@@ -94,7 +94,7 @@ blog post for background.
 >
 > [components.tagger.model.tok2vec]
 > @architectures = "spacy.Tok2VecListener.v1"
-> width = ${components.tok2vec.model:width}
+> width = ${components.tok2vec.model.width}
 > ```

 A listener is used as a sublayer within a component such as a

View File

@@ -28,7 +28,7 @@ streaming.
 >
 > [training.train_corpus]
 > @readers = "spacy.Corpus.v1"
-> path = ${paths:train}
+> path = ${paths.train}
 > gold_preproc = false
 > max_length = 0
 > limit = 0

View File

@@ -111,7 +111,7 @@ model to copy components from). See the docs on

 ### paths, system {#config-variables tag="variables"}

 These sections define variables that can be referenced across the other sections
-as variables. For example `${paths:train}` uses the value of `train` defined in
+as variables. For example `${paths.train}` uses the value of `train` defined in
 the block `[paths]`. If your config includes custom registered functions that
 need paths, you can define them here. All config values can also be
 [overwritten](/usage/training#config-overrides) on the CLI when you run
@@ -131,11 +131,11 @@ process that are used when you run [`spacy train`](/api/cli#train).

 | Name | Description |
 | --------------------- | ----------- |
-| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ |
+| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
 | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
-| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths:init_tok2vec}`. ~~Optional[str]~~ |
-| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths:raw}`. ~~Optional[str]~~ |
+| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
+| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
 | `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ |
 | `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
 | `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
@@ -162,8 +162,8 @@ run [`spacy pretrain`](/api/cli#pretrain).
 | `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ |
 | `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
 | `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ |
-| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ |
-| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system:use_pytorch_for_gpu_memory}`. ~~bool~~ |
+| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
+| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system.use_pytorch_for_gpu_memory}`. ~~bool~~ |
 | `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ |
 | `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
 | `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
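(The defaults documented in these tables correspond to the default-config hunk earlier in this commit; as a cross-reference, the `[training]` block they describe looks roughly like this excerpt:)

```ini
[training]
seed = ${system.seed}
dropout = 0.1
accumulate_gradient = 1
init_tok2vec = ${paths.init_tok2vec}
raw_text = ${paths.raw}
vectors = null
patience = 1600
max_epochs = 0
```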

View File

@@ -341,7 +341,7 @@ See the [`Transformer`](/api/transformer) API reference and

 ## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}

-A batcher implements a batching strategy that essentially turns a stream of
+A data batcher implements a batching strategy that essentially turns a stream of
 items into a stream of batches, with each batch consisting of one item or a list
 of items. During training, the models update their weights after processing one
 batch at a time. Typical batching strategies include presenting the training
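As a concrete point of reference for the batching strategies described above, a batcher is configured in the `[training.batcher]` block. A minimal sketch, assuming the `batch_by_words.v1` batcher referenced in the test config earlier in this commit; the setting values are illustrative:

```ini
[training.batcher]
@batchers = "batch_by_words.v1"
size = 3000
tolerance = 0.2
discard_oversize = false
```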
@@ -613,7 +613,7 @@ components are created, as well as all training settings and hyperparameters.
 | ----------- | ----------- |
 | `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ |
 | `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ |
-| `interpolate` | Whether to interpolate the config and replace variables like `${paths:train}` with their values. Defaults to `False`. ~~bool~~ |
+| `interpolate` | Whether to interpolate the config and replace variables like `${paths.train}` with their values. Defaults to `False`. ~~bool~~ |
 | **RETURNS** | The model's config. ~~Config~~ |

 ### util.load_meta {#util.load_meta tag="function" new="3"}
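A minimal usage sketch of the `util.load_config` parameters documented in the hunk above; the `config.cfg` path is illustrative:

```python
from spacy import util

# Variables like ${paths.train} stay unresolved by default
config = util.load_config("config.cfg")

# interpolate=True replaces variables with their values on load
resolved = util.load_config("config.cfg", interpolate=True)
```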

View File

@@ -157,8 +157,8 @@ sections of a config file are:
 | ------------- | ----------- |
 | `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. |
 | `components` | Definitions of the [pipeline components](/usage/processing-pipelines) and their models. |
-| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths:train}`, and can be [overwritten](#config-overrides) on the CLI. |
-| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system:seed}`, and can be [overwritten](#config-overrides) on the CLI. |
+| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI. |
+| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `training` | Settings and controls for the training and evaluation process. |
 | `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
@ -325,19 +325,9 @@ compound = 1.001
Another very useful feature of the config system is that it supports variable Another very useful feature of the config system is that it supports variable
interpolation for both **values and sections**. This means that you only need to interpolation for both **values and sections**. This means that you only need to
define a setting once and can reference it across your config using the define a setting once and can reference it across your config using the
`${section:value}` or `${section.block}` syntax. In this example, the value of `${section.value}` syntax. In this example, the value of `seed` is reused within
`seed` is reused within the `[training]` block, and the whole block of the `[training]` block, and the whole block of `[training.optimizer]` is reused
`[training.optimizer]` is reused in `[pretraining]` and will become in `[pretraining]` and will become `pretraining.optimizer`.
`pretraining.optimizer`.
> #### Note on syntax
>
> There are two different ways to format your variables, depending on whether
> you want to reference a single value or a block. Values are specified after a
> `:`, while blocks are specified with a `.`:
>
> 1. `${section:value}`, `${section.subsection:value}`
> 2. `${section.block}`, `${section.subsection.block}`
```ini ```ini
### config.cfg (excerpt) {highlight="5,18"} ### config.cfg (excerpt) {highlight="5,18"}
@@ -345,7 +335,7 @@ define a setting once and can reference it across your config using the
 seed = 0

 [training]
-seed = ${system:seed}
+seed = ${system.seed}

 [training.optimizer]
 @optimizers = "Adam.v1"
@@ -369,7 +359,7 @@ to a string.
 [paths]
 version = 5
 root = "/Users/you/data"
-train = "${paths:root}/train_${paths:version}.spacy"
+train = "${paths.root}/train_${paths.version}.spacy"
 # Result: /Users/you/data/train_5.spacy
 ```
@@ -484,20 +474,21 @@ still look good.

 ## Custom Functions {#custom-functions}

-Registered functions in the training config files can refer to built-in
-implementations, but you can also plug in fully custom implementations. To do
-so, you first write your own implementation of a custom architecture, data
-reader or any other functionality, and then register this function with the
-correct [registry](/api/top-level#registry). This allows you to plug in models
-defined in PyTorch or Tensorflow, make custom modifications to the `nlp` object,
-create custom optimizers or schedules, or write a function that streams in data
-and preprocesses it on the fly while training.
+Registered functions in the training config files can refer to built-in
+implementations, but you can also plug in fully **custom implementations**. All
+you need to do is register your function using the `@spacy.registry` decorator
+with the name of the respective [registry](/api/top-level#registry), e.g.
+`@spacy.registry.architectures`, and a string name to assign to your function.
+Registering custom functions allows you to **plug in models** defined in PyTorch
+or TensorFlow, make **custom modifications** to the `nlp` object, create custom
+optimizers or schedules, or **stream in data** and preprocess it on the fly
+while training.

-Each custom function can have any number of arguments that should be passed
-into them through the config, similar to the built-in functions. If your
-function defines **default argument values**, spaCy is able to auto-fill your
-config when you run [`init fill-config`](/api/cli#init-fill-config). If you want
-to make sure that a given parameter is always explicitly set in the config,
-avoid setting a default value for it.
+Each custom function can have any number of arguments that are passed in via
+the [config](#config), just like the built-in functions. If your function
+defines **default argument values**, spaCy is able to auto-fill your config when
+you run [`init fill-config`](/api/cli#init-fill-config). If you want to make
+sure that a given parameter is always explicitly set in the config, avoid
+setting a default value for it.

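Following the registration pattern described above, a minimal hypothetical sketch; the function, its string name `"custom_linear.v1"` and its arguments are invented for illustration:

```python
import spacy
from thinc.api import Linear, Model

# Register an invented architecture under an illustrative string name
@spacy.registry.architectures("custom_linear.v1")
def create_custom_linear(nO: int, nI: int) -> Model:
    # A real architecture would typically build something more elaborate,
    # e.g. a wrapped PyTorch model; here we return a built-in Thinc layer
    return Linear(nO, nI)
```

A config could then reference it via `@architectures = "custom_linear.v1"` and pass `nO` and `nI` as arguments; since neither defines a default, both would always have to be set explicitly.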
 <!-- TODO: possibly link to new (not yet created) page on creating models ? -->