From ba6cf9821f0ba4174fe91a840688785fbaa5ed98 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 4 Sep 2020 14:28:28 +0200
Subject: [PATCH 001/516] Replace docs analytics [ci skip]

---
 website/gatsby-config.js | 9 ---------
 website/meta/site.json   | 1 -
 website/package.json     | 1 -
 3 files changed, 11 deletions(-)
diff --git a/website/gatsby-config.js b/website/gatsby-config.js
index 2a5f957f4..144b8e93e 100644
--- a/website/gatsby-config.js
+++ b/website/gatsby-config.js
@@ -131,15 +131,6 @@ module.exports = {
                 icon: `src/images/icon.png`,
             },
         },
-        {
-            resolve: `gatsby-plugin-google-analytics`,
-            options: {
-                trackingId: site.analytics,
-                head: false,
-                anonymize: true,
-                respectDNT: true,
-            },
-        },
         {
             resolve: `gatsby-plugin-plausible`,
             options: {
diff --git a/website/meta/site.json b/website/meta/site.json
index 4d12a4c46..31f2f2f68 100644
--- a/website/meta/site.json
+++ b/website/meta/site.json
@@ -14,7 +14,6 @@
         "github": "explosion"
     },
     "theme": "#09a3d5",
-    "analytics": "UA-58931649-1",
     "newsletter": {
         "user": "spacy.us12",
         "id": "83b0498b1e7fa3c91ce68c3f1",
diff --git a/website/package.json b/website/package.json
index a59bc9bdc..8d8ba6408 100644
--- a/website/package.json
+++ b/website/package.json
@@ -20,7 +20,6 @@
         "gatsby-image": "^2.0.29",
         "gatsby-mdx": "^0.3.6",
         "gatsby-plugin-catch-links": "^2.0.11",
-        "gatsby-plugin-google-analytics": "^2.0.14",
         "gatsby-plugin-manifest": "^2.0.17",
         "gatsby-plugin-offline": "^2.0.24",
         "gatsby-plugin-plausible": "0.0.6",

From 33d9c649771cf03122ccb9fe7544e8c14ed788fa Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 4 Sep 2020 14:44:38 +0200
Subject: [PATCH 002/516] Fix outbound link and update package lock [ci skip]

---
 website/package-lock.json      |  8 --------
 website/src/components/link.js | 11 ++---------
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/website/package-lock.json b/website/package-lock.json
index dded33fb0..63e67ebd2 100644
--- a/website/package-lock.json
+++ b/website/package-lock.json
@@ -7441,14 +7441,6 @@
         "escape-string-regexp": "^1.0.5"
       }
     },
-    "gatsby-plugin-google-analytics": {
-      "version": "2.0.14",
-      "resolved": "https://registry.npmjs.org/gatsby-plugin-google-analytics/-/gatsby-plugin-google-analytics-2.0.14.tgz",
-      "integrity": "sha512-sFD73d9isJQknnDAAkDidaybHJx6VIaLfy3nO3DwbFaitvZ08RimbynYOkcWAeA0zwwix2RgAvbq/9pAmtTb/A==",
-      "requires": {
-        "@babel/runtime": "^7.0.0"
-      }
-    },
     "gatsby-plugin-manifest": {
       "version": "2.0.17",
       "resolved": "https://registry.npmjs.org/gatsby-plugin-manifest/-/gatsby-plugin-manifest-2.0.17.tgz",
diff --git a/website/src/components/link.js b/website/src/components/link.js
index 4c4aa9492..dc0cfda8e 100644
--- a/website/src/components/link.js
+++ b/website/src/components/link.js
@@ -1,7 +1,6 @@
 import React, { Fragment } from 'react'
 import PropTypes from 'prop-types'
 import { Link as GatsbyLink } from 'gatsby'
-import { OutboundLink } from 'gatsby-plugin-google-analytics'
 import classNames from 'classnames'
 
 import Icon from './icon'
@@ -74,15 +73,9 @@ const Link = ({
     const rel = isInternal ? null : 'noopener nofollow noreferrer'
     return (
         <Wrapper>
-            <OutboundLink
-                href={dest}
-                className={linkClassNames}
-                target="_blank"
-                rel={rel}
-                {...other}
-            >
+            <a href={dest} className={linkClassNames} target="_blank" rel={rel} {...other}>
                 {content}
-            </OutboundLink>
+            </a>
         </Wrapper>
     )
 }

From a26f864ed3c227fab1d2a506e27cb4b5b5d831d2 Mon Sep 17 00:00:00 2001
From: Marek Grzenkowicz <chopeen@gmail.com>
Date: Tue, 8 Sep 2020 21:13:50 +0200
Subject: [PATCH 003/516] Clarify how to choose pretrained weights files
 (closes #6027) [ci skip] (#6039)

---
 website/docs/api/cli.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 779fa7695..b97308aab 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -445,7 +445,8 @@ an approximate language-modeling objective. Specifically, we load pretrained
 vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which
 match the pretrained ones. The weights are saved to a directory after each
 epoch. You can then pass a path to one of these pretrained weights files to the
-`spacy train` command.
+`spacy train` command. Try a few of the weights files for which low `Loss`
+values are reported in the output.
 
 This technique may be especially helpful if you have little labelled data.
 However, it's still quite experimental, so your mileage may vary. To load the

From bd87e8686e05487116c3a0c631bcb789059b2636 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 15 Sep 2020 21:40:38 +0200
Subject: [PATCH 004/516] move tests to correct subdir

---
 spacy/tests/{ => pipeline}/test_tok2vec.py  | 2 +-
 spacy/tests/training/__init__.py            | 0
 spacy/tests/{ => training}/test_training.py | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename spacy/tests/{ => pipeline}/test_tok2vec.py (99%)
 create mode 100644 spacy/tests/training/__init__.py
 rename spacy/tests/{ => training}/test_training.py (99%)

diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
similarity index 99%
rename from spacy/tests/test_tok2vec.py
rename to spacy/tests/pipeline/test_tok2vec.py
index fb30c6ae5..0365554bc 100644
--- a/spacy/tests/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -9,7 +9,7 @@ from spacy.tokens import Doc
 from spacy.training import Example
 from spacy import util
 from spacy.lang.en import English
-from .util import get_batch
+from ..util import get_batch
 
 from thinc.api import Config
 
diff --git a/spacy/tests/training/__init__.py b/spacy/tests/training/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/test_training.py b/spacy/tests/training/test_training.py
similarity index 99%
rename from spacy/tests/test_training.py
rename to spacy/tests/training/test_training.py
index 1926aca1f..67cc37b1c 100644
--- a/spacy/tests/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -12,7 +12,7 @@ from thinc.api import compounding
 import pytest
 import srsly
 
-from .util import make_tempdir
+from ..util import make_tempdir
 
 
 @pytest.fixture

From 51fa929f47120272bd6b8dfbba1f000833446f0f Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 15 Sep 2020 21:58:04 +0200
Subject: [PATCH 005/516] rewrite train_corpus to corpus.train in config

---
 extra/experiments/onto-joint/defaults.cfg     |  6 ++--
 .../ptb-joint-pos-dep/defaults.cfg            |  6 ++--
 spacy/cli/templates/quickstart_training.jinja |  6 ++--
 spacy/cli/train.py                            |  4 +--
 spacy/default_config.cfg                      |  6 ++--
 spacy/schemas.py                              |  3 +-
 .../tests/serialize/test_serialize_config.py  | 16 +++++----
 website/docs/api/corpus.md                    |  2 +-
 website/docs/api/data-formats.md              | 35 +++++++++----------
 website/docs/api/top-level.md                 |  4 +--
 website/docs/usage/projects.md                |  2 +-
 website/docs/usage/training.md                |  2 +-
 12 files changed, 50 insertions(+), 42 deletions(-)

diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg
index 7954b57b5..97eebe6b4 100644
--- a/extra/experiments/onto-joint/defaults.cfg
+++ b/extra/experiments/onto-joint/defaults.cfg
@@ -21,14 +21,16 @@ eval_frequency = 200
 score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2}
 frozen_components = []
 
-[training.train_corpus]
+[training.corpus]
+
+[training.corpus.train]
 @readers = "spacy.Corpus.v1"
 path = ${paths:train}
 gold_preproc = true
 max_length = 0
 limit = 0
 
-[training.dev_corpus]
+[training.corpus.dev]
 @readers = "spacy.Corpus.v1"
 path = ${paths:dev}
 gold_preproc = ${training.read_train:gold_preproc}
diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg
index 8f9c5666e..03e2f5bd7 100644
--- a/extra/experiments/ptb-joint-pos-dep/defaults.cfg
+++ b/extra/experiments/ptb-joint-pos-dep/defaults.cfg
@@ -20,14 +20,16 @@ patience = 10000
 eval_frequency = 200
 score_weights = {"dep_las": 0.8, "tag_acc": 0.2}
 
-[training.read_train]
+[training.corpus]
+
+[training.corpus.train]
 @readers = "spacy.Corpus.v1"
 path = ${paths:train}
 gold_preproc = true
 max_length = 0
 limit = 0
 
-[training.read_dev]
+[training.corpus.dev]
 @readers = "spacy.Corpus.v1"
 path = ${paths:dev}
 gold_preproc = ${training.read_train:gold_preproc}
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 199aae217..39d4d875d 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -195,12 +195,14 @@ total_steps = 20000
 initial_rate = 5e-5
 {% endif %}
 
-[training.train_corpus]
+[training.corpus]
+
+[training.corpus.train]
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
 max_length = {{ 500 if hardware == "gpu" else 2000 }}
 
-[training.dev_corpus]
+[training.corpus.dev]
 @readers = "spacy.Corpus.v1"
 path = ${paths.dev}
 max_length = 0
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index ae4a8455e..2c2eeb88b 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -92,8 +92,8 @@ def train(
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
     T_cfg = config["training"]
     optimizer = T_cfg["optimizer"]
-    train_corpus = T_cfg["train_corpus"]
-    dev_corpus = T_cfg["dev_corpus"]
+    train_corpus = T_cfg["corpus"]["train"]
+    dev_corpus = T_cfg["corpus"]["dev"]
     batcher = T_cfg["batcher"]
     train_logger = T_cfg["logger"]
     # Components that shouldn't be updated during training
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 7cd71453f..61f3dfe25 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -44,7 +44,9 @@ frozen_components = []
 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"
 
-[training.train_corpus]
+[training.corpus]
+
+[training.corpus.train]
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
 # Whether to train on sequences with 'gold standard' sentence boundaries
@@ -56,7 +58,7 @@ max_length = 0
 # Limitation on number of training examples
 limit = 0
 
-[training.dev_corpus]
+[training.corpus.dev]
 @readers = "spacy.Corpus.v1"
 path = ${paths.dev}
 # Whether to train on sequences with 'gold standard' sentence boundaries
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 0dd2b9204..d8bcf3c1d 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -198,8 +198,7 @@ class ModelMetaSchema(BaseModel):
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
-    train_corpus: Reader = Field(..., title="Reader for the training data")
-    dev_corpus: Reader = Field(..., title="Reader for the dev data")
+    corpus: Reader = Field(..., title="Reader for the training and dev data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
     dropout: StrictFloat = Field(..., title="Dropout rate")
     patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score")
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 0ab212fda..d113ac2a5 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -19,11 +19,13 @@ dev = ""
 
 [training]
 
-[training.train_corpus]
+[training.corpus]
+
+[training.corpus.train]
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
 
-[training.dev_corpus]
+[training.corpus.dev]
 @readers = "spacy.Corpus.v1"
 path = ${paths.dev}
 
@@ -300,20 +302,20 @@ def test_config_overrides():
 
 def test_config_interpolation():
     config = Config().from_str(nlp_config_string, interpolate=False)
-    assert config["training"]["train_corpus"]["path"] == "${paths.train}"
+    assert config["training"]["corpus"]["train"]["path"] == "${paths.train}"
     interpolated = config.interpolate()
-    assert interpolated["training"]["train_corpus"]["path"] == ""
+    assert interpolated["training"]["corpus"]["train"]["path"] == ""
     nlp = English.from_config(config)
-    assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}"
+    assert nlp.config["training"]["corpus"]["train"]["path"] == "${paths.train}"
     # Ensure that variables are preserved in nlp config
     width = "${components.tok2vec.model.width}"
     assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
     assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
     interpolated2 = nlp.config.interpolate()
-    assert interpolated2["training"]["train_corpus"]["path"] == ""
+    assert interpolated2["training"]["corpus"]["train"]["path"] == ""
     assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
     nlp2 = English.from_config(interpolated)
-    assert nlp2.config["training"]["train_corpus"]["path"] == ""
+    assert nlp2.config["training"]["corpus"]["train"]["path"] == ""
     assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
 
 
diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md
index 0f49b02e3..c25ce1651 100644
--- a/website/docs/api/corpus.md
+++ b/website/docs/api/corpus.md
@@ -26,7 +26,7 @@ streaming.
 > [paths]
 > train = "corpus/train.spacy"
 >
-> [training.train_corpus]
+> [training.corpus.train]
 > @readers = "spacy.Corpus.v1"
 > path = ${paths.train}
 > gold_preproc = false
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 79ecb08b3..74d612862 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -126,24 +126,23 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
 This section defines settings and controls for the training and evaluation
 process that are used when you run [`spacy train`](/api/cli#train).
 
-| Name                  | Description                                                                                                                                                                                                  |
-| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                       |
-| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
-| `dev_corpus`          | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~                        |
-| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                               |
-| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                    |
-| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                              |
-| `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                              |
-| `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                              |
-| `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                    |
-| `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                      |
-| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~                                                                                                              |
-| `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~                                          |
-| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                |
-| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                              |
-| `train_corpus`        | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~                        |
-| `vectors`             | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~                                             |
+| Name                  | Description                                                                                                                                                                                                                                           |
+| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                                                                |
+| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                                          |
+| `corpus`              | Dictionary with `train` and `dev` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~     |
+| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                                                                        |
+| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                             |
+| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                                       |
+| `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                                                                       |
+| `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                                                                       |
+| `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                                                             |
+| `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                               |
+| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~                                                                                                                                                       |
+| `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~                                                                                   |
+| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                                         |
+| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                                       |
+| `vectors`             | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~                                                                                      |
 
 ### pretraining {#config-pretraining tag="section,optional"}
 
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index f52c63f18..be7994d5d 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -448,7 +448,7 @@ remain in the config file stored on your local system.
 > [training.logger]
 > @loggers = "spacy.WandbLogger.v1"
 > project_name = "monitor_spacy_training"
-> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
+> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"]
 > ```
 
 | Name                   | Description                                                                                                                           |
@@ -478,7 +478,7 @@ the [`Corpus`](/api/corpus) class.
 > [paths]
 > train = "corpus/train.spacy"
 >
-> [training.train_corpus]
+> [training.corpus.train]
 > @readers = "spacy.Corpus.v1"
 > path = ${paths.train}
 > gold_preproc = false
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 9776dab1b..3a6bd4551 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -969,7 +969,7 @@ your results.
 > [training.logger]
 > @loggers = "spacy.WandbLogger.v1"
 > project_name = "monitor_spacy_training"
-> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
+> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"]
 > ```
 
 ![Screenshot: Visualized training results](../images/wandb1.jpg)
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 65cfb563b..bba2e2853 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -746,7 +746,7 @@ as **config settings** – in this case, `source`.
 > #### config.cfg
 >
 > ```ini
-> [training.train_corpus]
+> [training.corpus.train]
 > @readers = "corpus_variants.v1"
 > source = "s3://your_bucket/path/data.csv"
 > ```

From 733665766205f350398d3216e94ab8a5ac6c3751 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 15 Sep 2020 22:07:16 +0200
Subject: [PATCH 006/516] corpus is a Dict

---
 spacy/schemas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/schemas.py b/spacy/schemas.py
index d8bcf3c1d..2030048d8 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -198,7 +198,7 @@ class ModelMetaSchema(BaseModel):
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
-    corpus: Reader = Field(..., title="Reader for the training and dev data")
+    corpus: Dict[str, Reader] = Field(..., title="Reader for the training and dev data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
     dropout: StrictFloat = Field(..., title="Dropout rate")
     patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score")

From 55f8d5478ecb5fd913a3a5fe7c469e8bc8a4f038 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 15 Sep 2020 22:09:30 +0200
Subject: [PATCH 007/516] fix example output

---
 website/docs/api/cli.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 8449d23e1..7dd6e6184 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -272,7 +272,7 @@ training -> dropout     field required
 training -> optimizer   field required
 training -> optimize    extra fields not permitted
 
-{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'dev_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}, 'train_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}}
+{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'corpus': {'train': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'dev': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}}
 
 If your config contains missing values, you can run the 'init fill-config'
 command to fill in all the defaults, if possible:
@@ -370,7 +370,12 @@ Registry   @schedules
 Name       compounding.v1
 Module     thinc.schedules
 File       /path/to/thinc/thinc/schedules.py (line 43)
-ℹ [training.dev_corpus]
+ℹ [training.corpus.dev]
+Registry   @readers
+Name       spacy.Corpus.v1
+Module     spacy.training.corpus
+File       /path/to/spacy/training/corpus.py (line 18)
+ℹ [training.corpus.train]
 Registry   @readers
 Name       spacy.Corpus.v1
 Module     spacy.training.corpus
@@ -385,11 +390,6 @@ Registry   @schedules
 Name       warmup_linear.v1
 Module     thinc.schedules
 File       /path/to/thinc/thinc/schedules.py (line 91)
-ℹ [training.train_corpus]
-Registry   @readers
-Name       spacy.Corpus.v1
-Module     spacy.training.corpus
-File       /path/to/spacy/training/corpus.py (line 18)
 ```
 
 </Accordion>

From f420aa1138f52c732102b6ad00825bab797792ec Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 15 Sep 2020 22:30:09 +0200
Subject: [PATCH 008/516] use e.value to get to the ExceptionInfo value

---
 spacy/tests/test_language.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index fba362b76..2a24d368a 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -298,4 +298,4 @@ def test_language_init_invalid_vocab(value):
     err_fragment = "invalid value"
     with pytest.raises(ValueError) as e:
         Language(value)
-    assert err_fragment in str(e)
+    assert err_fragment in str(e.value)

From 714a5a05c65e28b5264d16e7dba202126de2cbfb Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 16 Sep 2020 16:39:55 +0200
Subject: [PATCH 009/516] test for custom readers with ml_datasets >= 0.2

---
 spacy/pipeline/textcat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 3f6250680..e7cb62a0d 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -181,9 +181,9 @@ class TextCategorizer(Pipe):
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#predict
         """
-        tensors = [doc.tensor for doc in docs]
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
+            tensors = [doc.tensor for doc in docs]
             xp = get_array_module(tensors)
             scores = xp.zeros((len(docs), len(self.labels)))
             return scores

From 1040e250d8f740db7d0a6b012962b25ce7f95ffb Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 16 Sep 2020 16:41:28 +0200
Subject: [PATCH 010/516] actual commit with test for custom readers with
 ml_datasets >= 0.2

---
 requirements.txt                     |  2 +-
 spacy/tests/training/test_readers.py | 58 ++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/training/test_readers.py

diff --git a/requirements.txt b/requirements.txt
index db6eae2ef..a67ade640 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.0a31,<8.0.0a40
 blis>=0.4.0,<0.5.0
-ml_datasets>=0.1.1
+ml_datasets>=0.2.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
 srsly>=2.1.0,<3.0.0
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
new file mode 100644
index 000000000..c81ec0897
--- /dev/null
+++ b/spacy/tests/training/test_readers.py
@@ -0,0 +1,58 @@
+import pytest
+from thinc.api import Config
+from spacy.util import load_model_from_config
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "reader,additional_config",
+    [
+        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),
+        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),
+        ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
+    ],
+)
+def test_cat_readers(reader, additional_config):
+    nlp_config_string = """
+    [training]
+    
+    [training.corpus]
+    @readers = "PLACEHOLDER"
+
+    [nlp]
+    lang = "en"
+    pipeline = ["tok2vec", "textcat"]
+    
+    [components]
+    
+    [components.tok2vec]
+    factory = "tok2vec"
+    
+    [components.textcat]
+    factory = "textcat"
+    """
+    config = Config().from_str(nlp_config_string)
+    config["training"]["corpus"]["@readers"] = reader
+    config["training"]["corpus"].update(additional_config)
+    nlp, resolved = load_model_from_config(config, auto_fill=True)
+
+    train_corpus = resolved["training"]["corpus"]["train"]
+    optimizer = resolved["training"]["optimizer"]
+    # simulate a training loop
+    nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    for example in train_corpus(nlp):
+        assert example.y.cats
+        # this shouldn't fail if each training example has at least one positive label
+        assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
+        nlp.update([example], sgd=optimizer)
+    # simulate performance benchmark on dev corpus
+    dev_corpus = resolved["training"]["corpus"]["dev"]
+    dev_examples = list(dev_corpus(nlp))
+    for example in dev_examples:
+        # this shouldn't fail if each dev example has at least one positive label
+        assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
+    scores = nlp.evaluate(dev_examples)
+    assert scores["cats_score"]
+    # ensure the pipeline runs
+    doc = nlp("Quick test")
+    assert doc.cats

From 0dc914b667706b4e598b61e3cfff0a85e820118f Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 16 Sep 2020 16:42:58 +0200
Subject: [PATCH 011/516] bump thinc to 8.0.0a33

---
 pyproject.toml   | 2 +-
 requirements.txt | 2 +-
 setup.cfg        | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e610e603e..a413a099c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a31,<8.0.0a40",
+    "thinc>=8.0.0a33,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index a67ade640..69477c2d3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a31,<8.0.0a40
+thinc>=8.0.0a33,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.2.0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 10a8972b0..359e63172 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a31,<8.0.0a40
+    thinc>=8.0.0a33,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a31,<8.0.0a40
+    thinc>=8.0.0a33,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0

From 87c329c7114767d8788090a3838fce0bf36822b7 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 16 Sep 2020 17:37:29 +0200
Subject: [PATCH 012/516] Set rule-based lemmatizers as default (#6076)

For languages without provided models and with lemmatizer rules in
`spacy-lookups-data`, make the rule-based lemmatizer the default:
Bengali, Persian, Norwegian, Swedish
---
 spacy/lang/bn/__init__.py            | 22 ++++++++++++++++++++++
 spacy/lang/fa/__init__.py            | 22 ++++++++++++++++++++++
 spacy/lang/nb/__init__.py            | 22 ++++++++++++++++++++++
 spacy/lang/sv/__init__.py            | 23 +++++++++++++++++++++++
 spacy/tests/lang/test_lemmatizers.py |  2 +-
 5 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index 6c1d66cba..270185a4b 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -1,7 +1,11 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer
 
 
 class BengaliDefaults(Language.Defaults):
@@ -17,4 +21,22 @@ class Bengali(Language):
     Defaults = BengaliDefaults
 
 
+@Bengali.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Bengali"]
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index 7fdb9d065..244534120 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -1,9 +1,13 @@
+from typing import Optional
+from thinc.api import Model
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer
 
 
 class PersianDefaults(Language.Defaults):
@@ -20,4 +24,22 @@ class Persian(Language):
     Defaults = PersianDefaults
 
 
+@Persian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Persian"]
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index d2bb92072..28a2f0bf2 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -1,9 +1,13 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer
 
 
 class NorwegianDefaults(Language.Defaults):
@@ -20,4 +24,22 @@ class Norwegian(Language):
     Defaults = NorwegianDefaults
 
 
+@Norwegian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Norwegian"]
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 0c6a1b9f4..6db74cd39 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -1,8 +1,13 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer
+
 
 # Punctuation stolen from Danish
 from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
@@ -22,4 +27,22 @@ class Swedish(Language):
     Defaults = SwedishDefaults
 
 
+@Swedish.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+
+
 __all__ = ["Swedish"]
diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py
index 14c59659a..6e7f82341 100644
--- a/spacy/tests/lang/test_lemmatizers.py
+++ b/spacy/tests/lang/test_lemmatizers.py
@@ -8,7 +8,7 @@ from spacy.util import get_lang_class
 # Only include languages with no external dependencies
 # excluded: ru, uk
 # excluded for custom tables: pl
-LANGUAGES = ["el", "en", "fr", "nl"]
+LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
 # fmt: on
 
 

From d722a439aa3bef5d4b4fa677aa6b427f7186a673 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 16 Sep 2020 17:39:41 +0200
Subject: [PATCH 013/516] Remove unneeded methods in senter and morphologizer
 (#6074)

Now that the tagger doesn't manage the tag map, the child classes senter
and morphologizer don't need to override the serialization methods.
---
 spacy/pipeline/morphologizer.pyx | 76 --------------------------------
 spacy/pipeline/senter.pyx        | 76 --------------------------------
 2 files changed, 152 deletions(-)

diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 57bdb28d7..0e0791004 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -259,79 +259,3 @@ class Morphologizer(Tagger):
         results.update(Scorer.score_token_attr_per_feat(examples,
             "morph", **kwargs))
         return results
-
-    def to_bytes(self, *, exclude=tuple()):
-        """Serialize the pipe to a bytestring.
-
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized object.
-
-        DOCS: https://nightly.spacy.io/api/morphologizer#to_bytes
-        """
-        serialize = {}
-        serialize["model"] = self.model.to_bytes
-        serialize["vocab"] = self.vocab.to_bytes
-        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-        return util.to_bytes(serialize, exclude)
-
-    def from_bytes(self, bytes_data, *, exclude=tuple()):
-        """Load the pipe from a bytestring.
-
-        bytes_data (bytes): The serialized pipe.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Morphologizer): The loaded Morphologizer.
-
-        DOCS: https://nightly.spacy.io/api/morphologizer#from_bytes
-        """
-        def load_model(b):
-            try:
-                self.model.from_bytes(b)
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize = {
-            "vocab": lambda b: self.vocab.from_bytes(b),
-            "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
-            "model": lambda b: load_model(b),
-        }
-        util.from_bytes(bytes_data, deserialize, exclude)
-        return self
-
-    def to_disk(self, path, *, exclude=tuple()):
-        """Serialize the pipe to disk.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-
-        DOCS: https://nightly.spacy.io/api/morphologizer#to_disk
-        """
-        serialize = {
-            "vocab": lambda p: self.vocab.to_disk(p),
-            "model": lambda p: p.open("wb").write(self.model.to_bytes()),
-            "cfg": lambda p: srsly.write_json(p, self.cfg),
-        }
-        util.to_disk(path, serialize, exclude)
-
-    def from_disk(self, path, *, exclude=tuple()):
-        """Load the pipe from disk. Modifies the object in place and returns it.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Morphologizer): The modified Morphologizer object.
-
-        DOCS: https://nightly.spacy.io/api/morphologizer#from_disk
-        """
-        def load_model(p):
-            with p.open("rb") as file_:
-                try:
-                    self.model.from_bytes(file_.read())
-                except AttributeError:
-                    raise ValueError(Errors.E149) from None
-
-        deserialize = {
-            "vocab": lambda p: self.vocab.from_disk(p),
-            "cfg": lambda p: self.cfg.update(deserialize_config(p)),
-            "model": load_model,
-        }
-        util.from_disk(path, deserialize, exclude)
-        return self
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 00664131b..a7eb721fd 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -170,79 +170,3 @@ class SentenceRecognizer(Tagger):
         results = Scorer.score_spans(examples, "sents", **kwargs)
         del results["sents_per_type"]
         return results
-
-    def to_bytes(self, *, exclude=tuple()):
-        """Serialize the pipe to a bytestring.
-
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized object.
-
-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_bytes
-        """
-        serialize = {}
-        serialize["model"] = self.model.to_bytes
-        serialize["vocab"] = self.vocab.to_bytes
-        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-        return util.to_bytes(serialize, exclude)
-
-    def from_bytes(self, bytes_data, *, exclude=tuple()):
-        """Load the pipe from a bytestring.
-
-        bytes_data (bytes): The serialized pipe.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Tagger): The loaded SentenceRecognizer.
-
-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_bytes
-        """
-        def load_model(b):
-            try:
-                self.model.from_bytes(b)
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize = {
-            "vocab": lambda b: self.vocab.from_bytes(b),
-            "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
-            "model": lambda b: load_model(b),
-        }
-        util.from_bytes(bytes_data, deserialize, exclude)
-        return self
-
-    def to_disk(self, path, *, exclude=tuple()):
-        """Serialize the pipe to disk.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-
-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_disk
-        """
-        serialize = {
-            "vocab": lambda p: self.vocab.to_disk(p),
-            "model": lambda p: p.open("wb").write(self.model.to_bytes()),
-            "cfg": lambda p: srsly.write_json(p, self.cfg),
-        }
-        util.to_disk(path, serialize, exclude)
-
-    def from_disk(self, path, *, exclude=tuple()):
-        """Load the pipe from disk. Modifies the object in place and returns it.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Tagger): The modified SentenceRecognizer object.
-
-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_disk
-        """
-        def load_model(p):
-            with p.open("rb") as file_:
-                try:
-                    self.model.from_bytes(file_.read())
-                except AttributeError:
-                    raise ValueError(Errors.E149) from None
-
-        deserialize = {
-            "vocab": lambda p: self.vocab.from_disk(p),
-            "cfg": lambda p: self.cfg.update(deserialize_config(p)),
-            "model": load_model,
-        }
-        util.from_disk(path, deserialize, exclude)
-        return self

From f3db3f6fe00455f69bf05135f941ba88d307738b Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 16 Sep 2020 17:45:04 +0200
Subject: [PATCH 014/516] Add vectors option to CharacterEmbed (#6069)

* Add vectors option to CharacterEmbed

* Update spacy/pipeline/morphologizer.pyx

* Adjust default morphologizer config

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
---
 spacy/ml/models/tok2vec.py       | 39 +++++++++++++++++++++++---------
 spacy/pipeline/morphologizer.pyx |  1 +
 spacy/tests/test_tok2vec.py      |  4 ++--
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 2e5f8a802..7ced4bd04 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -164,7 +164,7 @@ def MultiHashEmbed(
 
 
 @registry.architectures.register("spacy.CharacterEmbed.v1")
-def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
+def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool):
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
     each word, taken from the beginning and end of the word equally. Padding is
@@ -188,18 +188,35 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values
         are between 3 and 8, although it may depend on the length of words in the
         language.
+    also_use_static_vectors (bool): Whether to also use static word vectors.
+        Requires a vectors table to be loaded in the Doc objects' vocab.
     """
-    model = chain(
-        concatenate(
-            chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
-            chain(
-                FeatureExtractor([NORM]),
-                list2ragged(),
-                with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
+    if also_use_static_vectors:
+        model = chain(
+            concatenate(
+                chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
+                chain(
+                    FeatureExtractor([NORM]),
+                    list2ragged(),
+                    with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
+                ),
+                StaticVectors(width, dropout=0.0),
             ),
-        ),
-        with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
-        ragged2list(),
+            with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)),
+            ragged2list(),
+    )
+    else:
+        model = chain(
+            concatenate(
+                chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
+                chain(
+                    FeatureExtractor([NORM]),
+                    list2ragged(),
+                    with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
+                ),
+            ),
+            with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
+            ragged2list(),
     )
     return model
 
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 0e0791004..bb68a358c 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -32,6 +32,7 @@ width = 128
 rows = 7000
 nM = 64
 nC = 8
+also_use_static_vectors = false
 
 [model.tok2vec.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v1"
diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py
index fb30c6ae5..f3f35e4a7 100644
--- a/spacy/tests/test_tok2vec.py
+++ b/spacy/tests/test_tok2vec.py
@@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
     [
         (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
         (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
     ],
 )
 # fmt: on

From d31afc833485fb6fd347fd41d94a4050a69dfa96 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 16 Sep 2020 17:49:48 +0200
Subject: [PATCH 015/516] Fix Language.link_components when model is None

---
 spacy/language.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 8f7cb1973..4c0a6d7e6 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1450,8 +1450,8 @@ class Language:
         """
         for i, (name1, proc1) in enumerate(self.pipeline):
             if hasattr(proc1, "find_listeners"):
-                for name2, proc2 in self.pipeline[i:]:
-                    if hasattr(proc2, "model"):
+                for name2, proc2 in self.pipeline[i+1:]:
+                    if isinstance(getattr(proc2, "model", None), Model):
                         proc1.find_listeners(proc2.model)
 
     @classmethod

From 4a573d18b3a818d3f9de3115d5376bf564337ba5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 16 Sep 2020 17:51:29 +0200
Subject: [PATCH 016/516] Add comment

---
 spacy/language.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/spacy/language.py b/spacy/language.py
index 4c0a6d7e6..3f0f850c2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1448,6 +1448,11 @@ class Language:
         """Register 'listeners' within pipeline components, to allow them to
         effectively share weights.
         """
+        # I had though, "Why do we do this inside the Language object? Shouldn't
+        # it be the tok2vec/transformer/etc's job?
+        # The problem is we need to do it during deserialization...And the
+        # components don't receive the pipeline then. So this does have to be
+        # here :(
         for i, (name1, proc1) in enumerate(self.pipeline):
             if hasattr(proc1, "find_listeners"):
                 for name2, proc2 in self.pipeline[i+1:]:

From c776594ab1a27f51ddb6e5ea1ea815f515ad5213 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 16 Sep 2020 18:15:14 +0200
Subject: [PATCH 017/516] Fix missing Model import in spacy/language.py

---
 spacy/language.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index 3f0f850c2..d530e6b92 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -8,7 +8,7 @@ from contextlib import contextmanager
 from copy import deepcopy
 from pathlib import Path
 import warnings
-from thinc.api import get_current_ops, Config, require_gpu, Optimizer
+from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer
 import srsly
 import multiprocessing as mp
 from itertools import chain, cycle

From a119667a36cced2ae5db6333e1539eb407fff70d Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 16 Sep 2020 20:32:38 +0200
Subject: [PATCH 018/516] Clean up spacy.tokens (#6046)

* Clean up spacy.tokens

* Update `set_children_from_heads`:
  * Don't check `dep` when setting lr_* or sentence starts
  * Set all non-sentence starts to `False`

* Use `set_children_from_heads` in `Token.head` setter
  * Reduce similar/duplicate code (admittedly adds a bit of overhead)
  * Update sentence starts consistently

* Remove unused `Doc.set_parse`

* Minor changes:
  * Declare cython variables (to avoid cython warnings)
  * Clean up imports

* Modify set_children_from_heads to set token range

Modify `set_children_from_heads` so that it adjusts tokens within a
specified range rather than the whole document.

Modify the `Token.head` setter to adjust only the tokens affected by the
new head assignment.
---
 .../pipeline/_parser_internals/arc_eager.pyx  |  2 +-
 spacy/pipeline/_parser_internals/nonproj.pyx  |  2 +-
 spacy/tests/doc/test_doc_api.py               | 12 +--
 spacy/tests/doc/test_token_api.py             | 35 ++++++-
 spacy/tests/parser/test_parse.py              |  2 +-
 spacy/tests/regression/test_issue2501-3000.py |  2 +-
 spacy/tokens/_retokenize.pyx                  |  5 +-
 spacy/tokens/doc.pxd                          |  9 +-
 spacy/tokens/doc.pyx                          | 63 +++++--------
 spacy/tokens/span.pyx                         |  3 -
 spacy/tokens/token.pyx                        | 92 +++----------------
 11 files changed, 85 insertions(+), 142 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index bb0bf35b8..a5fc2ea0e 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -680,7 +680,7 @@ cdef class ArcEager(TransitionSystem):
 
     def finalize_doc(self, Doc doc):
         doc.is_parsed = True
-        set_children_from_heads(doc.c, doc.length)
+        set_children_from_heads(doc.c, 0, doc.length)
 
     def has_gold(self, Example eg, start=0, end=None):
         for word in eg.y[start:end]:
diff --git a/spacy/pipeline/_parser_internals/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx
index 8f5fdaa71..82070cd27 100644
--- a/spacy/pipeline/_parser_internals/nonproj.pyx
+++ b/spacy/pipeline/_parser_internals/nonproj.pyx
@@ -119,7 +119,7 @@ cpdef deprojectivize(Doc doc):
             new_head = _find_new_head(doc[i], head_label)
             doc.c[i].head = new_head.i - i
             doc.c[i].dep = doc.vocab.strings.add(new_label)
-    set_children_from_heads(doc.c, doc.length)
+    set_children_from_heads(doc.c, 0, doc.length)
     return doc
 
 
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index b37a31e43..31dbad9ca 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -265,17 +265,11 @@ def test_doc_is_nered(en_vocab):
 
 def test_doc_from_array_sent_starts(en_vocab):
     words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
-    heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
+    heads = [0, -1, -2, -3, -4, -5, 0, -1, -2, -3]
     # fmt: off
-    deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
+    deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"]
     # fmt: on
-    doc = Doc(en_vocab, words=words)
-    for i, (dep, head) in enumerate(zip(deps, heads)):
-        doc[i].dep_ = dep
-        doc[i].head = doc[head]
-        if head == i:
-            doc[i].is_sent_start = True
-    doc.is_parsed
+    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
 
     attrs = [SENT_START, HEAD]
     arr = doc.to_array(attrs)
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index be56c9b71..28ef0dd7f 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -112,7 +112,6 @@ def test_doc_token_api_ancestors(en_tokenizer):
 
 
 def test_doc_token_api_head_setter(en_tokenizer):
-    # the structure of this sentence depends on the English annotation scheme
     text = "Yesterday I saw a dog that barked loudly."
     heads = [2, 1, 0, 1, -2, 1, -2, -1, -6]
     tokens = en_tokenizer(text)
@@ -169,6 +168,40 @@ def test_doc_token_api_head_setter(en_tokenizer):
     with pytest.raises(ValueError):
         doc[0].head = doc2[0]
 
+    # test sentence starts when two sentences are joined
+    text = "This is one sentence. This is another sentence."
+    heads = [0, -1, -2, -3, -4, 0, -1, -2, -3, -4]
+    tokens = en_tokenizer(text)
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=heads,
+        deps=["dep"] * len(heads),
+    )
+    # initially two sentences
+    assert doc[0].is_sent_start
+    assert doc[5].is_sent_start
+    assert doc[0].left_edge == doc[0]
+    assert doc[0].right_edge == doc[4]
+    assert doc[5].left_edge == doc[5]
+    assert doc[5].right_edge == doc[9]
+
+    # modifying with a sentence doesn't change sent starts
+    doc[2].head = doc[3]
+    assert doc[0].is_sent_start
+    assert doc[5].is_sent_start
+    assert doc[0].left_edge == doc[0]
+    assert doc[0].right_edge == doc[4]
+    assert doc[5].left_edge == doc[5]
+    assert doc[5].right_edge == doc[9]
+
+    # attach the second sentence to the first, resulting in one sentence
+    doc[5].head = doc[0]
+    assert doc[0].is_sent_start
+    assert not doc[5].is_sent_start
+    assert doc[0].left_edge == doc[0]
+    assert doc[0].right_edge == doc[9]
+
 
 def test_is_sent_start(en_tokenizer):
     doc = en_tokenizer("This is a sentence. This is another.")
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 8d45e2132..691a7c3aa 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -184,7 +184,7 @@ def test_parser_set_sent_starts(en_vocab):
         if i == 0 or i == 3:
             assert doc[i].is_sent_start is True
         else:
-            assert doc[i].is_sent_start is None
+            assert not doc[i].is_sent_start
     for sent in doc.sents:
         for token in sent:
             assert token.head in sent
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index beb8faca1..859e4d80e 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -123,7 +123,7 @@ def test_issue2772(en_vocab):
     heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
     deps = ["dep"] * len(heads)
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
-    assert doc[1].is_sent_start is None
+    assert not doc[1].is_sent_start
 
 
 @pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 9323bb579..cd1e73a2b 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -274,7 +274,7 @@ def _merge(Doc doc, merges):
     for i in range(doc.length):
         doc.c[i].head -= i
     # Set the left/right children, left/right edges
-    set_children_from_heads(doc.c, doc.length)
+    set_children_from_heads(doc.c, 0, doc.length)
     # Make sure ent_iob remains consistent
     make_iob_consistent(doc.c, doc.length)
     # Return the merged Python object
@@ -381,7 +381,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     for i in range(doc.length):
         doc.c[i].head -= i
     # set children from head
-    set_children_from_heads(doc.c, doc.length)
+    set_children_from_heads(doc.c, 0, doc.length)
 
 
 def _validate_extensions(extensions):
@@ -408,7 +408,6 @@ cdef make_iob_consistent(TokenC* tokens, int length):
 def normalize_token_attrs(Vocab vocab, attrs):
     if "_" in attrs:  # Extension attributes
         extensions = attrs["_"]
-        print("EXTENSIONS", extensions)
         _validate_extensions(extensions)
         attrs = {key: value for key, value in attrs.items() if key != "_"}
         attrs = intify_attrs(attrs, strings_map=vocab.strings)
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 2775aa97e..9b382d687 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -19,10 +19,10 @@ ctypedef fused LexemeOrToken:
     const_TokenC_ptr
 
 
-cdef int set_children_from_heads(TokenC* tokens, int length) except -1
+cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1
 
 
-cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1
+cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1
 
 
 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2
@@ -31,9 +31,6 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except
 cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
 
 
-cdef int set_children_from_heads(TokenC* tokens, int length) except -1
-
-
 cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
 
 cdef class Doc:
@@ -74,5 +71,3 @@ cdef class Doc:
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
 
     cpdef np.ndarray to_array(self, object features)
-
-    cdef void set_parse(self, const TokenC* parsed) nogil
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 93520aeda..62a6dd6db 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1,32 +1,27 @@
 # cython: infer_types=True, bounds_check=False, profile=True
 cimport cython
 cimport numpy as np
-from libc.string cimport memcpy, memset
+from libc.string cimport memcpy
 from libc.math cimport sqrt
 from libc.stdint cimport int32_t, uint64_t
 
 import copy
 from collections import Counter
 import numpy
-import numpy.linalg
-import struct
 import srsly
 from thinc.api import get_array_module
 from thinc.util import copy_array
 import warnings
-import copy
 
 from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
-from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
+from ..attrs cimport attr_id_t
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
-from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
-from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
+from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
 
-from ..attrs import intify_attr, intify_attrs, IDS
-from ..util import normalize_slice
+from ..attrs import intify_attr, IDS
 from ..compat import copy_reg, pickle
 from ..errors import Errors, Warnings
 from .. import util
@@ -291,7 +286,7 @@ cdef class Doc:
         DOCS: https://nightly.spacy.io/api/doc#getitem
         """
         if isinstance(i, slice):
-            start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
+            start, stop = util.normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self, start, stop, label=0)
         if i < 0:
             i = self.length + i
@@ -627,10 +622,7 @@ cdef class Doc:
     @property
     def sents(self):
         """Iterate over the sentences in the document. Yields sentence `Span`
-        objects. Sentence spans have no label. To improve accuracy on informal
-        texts, spaCy calculates sentence boundaries from the syntactic
-        dependency parse. If the parser is disabled, the `sents` iterator will
-        be unavailable.
+        objects. Sentence spans have no label.
 
         YIELDS (Span): Sentences in the document.
 
@@ -786,14 +778,6 @@ cdef class Doc:
         for i in range(self.length, self.max_length + PADDING):
             self.c[i].lex = &EMPTY_LEXEME
 
-    cdef void set_parse(self, const TokenC* parsed) nogil:
-        # TODO: This method is fairly misleading atm. It's used by Parser
-        # to actually apply the parse calculated. Need to rethink this.
-        # Probably we should use from_array?
-        self.is_parsed = True
-        for i in range(self.length):
-            self.c[i] = parsed[i]
-
     def from_array(self, attrs, array):
         """Load attributes from a numpy array. Write to a `Doc` object, from an
         `(M, N)` array of attributes.
@@ -884,7 +868,7 @@ cdef class Doc:
         self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
         # If document is parsed, set children
         if self.is_parsed:
-            set_children_from_heads(self.c, length)
+            set_children_from_heads(self.c, 0, length)
         return self
 
     @staticmethod
@@ -1321,13 +1305,13 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2
             return mid
     return -1
 
-
-cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
+cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
+    # note: end is exclusive
     cdef TokenC* head
     cdef TokenC* child
     cdef int i
     # Set number of left/right children to 0. We'll increment it in the loops.
-    for i in range(length):
+    for i in range(start, end):
         tokens[i].l_kids = 0
         tokens[i].r_kids = 0
         tokens[i].l_edge = i
@@ -1341,38 +1325,40 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
     # without risking getting stuck in an infinite loop if something is
     # terribly malformed.
     while not heads_within_sents:
-        heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count)
+        heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count)
         if loop_count > 10:
             warnings.warn(Warnings.W026)
             break
         loop_count += 1
     # Set sentence starts
-    for i in range(length):
-        if tokens[i].head == 0 and tokens[i].dep != 0:
+    for i in range(start, end):
+        tokens[i].sent_start = -1
+    for i in range(start, end):
+        if tokens[i].head == 0:
             tokens[tokens[i].l_edge].sent_start = True
 
 
-cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1:
+cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1:
     # May be called multiple times due to non-projectivity. See issues #3170
     # and #4688.
     # Set left edges
     cdef TokenC* head
     cdef TokenC* child
     cdef int i, j
-    for i in range(length):
+    for i in range(start, end):
         child = &tokens[i]
         head = &tokens[i + child.head]
-        if child < head and loop_count == 0:
+        if loop_count == 0 and child < head:
             head.l_kids += 1
         if child.l_edge < head.l_edge:
             head.l_edge = child.l_edge
         if child.r_edge > head.r_edge:
             head.r_edge = child.r_edge
     # Set right edges - same as above, but iterate in reverse
-    for i in range(length-1, -1, -1):
+    for i in range(end-1, start-1, -1):
         child = &tokens[i]
         head = &tokens[i + child.head]
-        if child > head and loop_count == 0:
+        if loop_count == 0 and child > head:
             head.r_kids += 1
         if child.r_edge > head.r_edge:
             head.r_edge = child.r_edge
@@ -1380,14 +1366,14 @@ cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) exce
             head.l_edge = child.l_edge
     # Get sentence start positions according to current state
     sent_starts = set()
-    for i in range(length):
-        if tokens[i].head == 0 and tokens[i].dep != 0:
+    for i in range(start, end):
+        if tokens[i].head == 0:
             sent_starts.add(tokens[i].l_edge)
     cdef int curr_sent_start = 0
     cdef int curr_sent_end = 0
     # Check whether any heads are not within the current sentence
-    for i in range(length):
-        if (i > 0 and i in sent_starts) or i == length - 1:
+    for i in range(start, end):
+        if (i > 0 and i in sent_starts) or i == end - 1:
             curr_sent_end = i
             for j in range(curr_sent_start, curr_sent_end):
                 if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1:
@@ -1436,6 +1422,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
         with shape (n, n), where n = len(doc).
     """
     cdef int [:,:] lca_matrix
+    cdef int j, k
     n_tokens= end - start
     lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
     lca_mat.fill(-1)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index f06f3307d..1f42c84ee 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -4,13 +4,10 @@ cimport numpy as np
 from libc.math cimport sqrt
 
 import numpy
-import numpy.linalg
 from thinc.api import get_array_module
-from collections import defaultdict
 import warnings
 
 from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix
-from .token cimport TokenC
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 2474f0637..35142c35e 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -1,6 +1,4 @@
 # cython: infer_types=True
-from libc.string cimport memcpy
-from cpython.mem cimport PyMem_Malloc, PyMem_Free
 # Compiler crashes on memory view coercion without this. Should report bug.
 from cython.view cimport array as cvarray
 cimport numpy as np
@@ -14,14 +12,13 @@ from ..typedefs cimport hash_t
 from ..lexeme cimport Lexeme
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
-from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
-from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
-from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
+from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, IS_STOP
+from ..attrs cimport LIKE_URL, LIKE_NUM, LIKE_EMAIL
 from ..symbols cimport conj
 from .morphanalysis cimport MorphAnalysis
+from .doc cimport set_children_from_heads
 
 from .. import parts_of_speech
-from .. import util
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
 
@@ -658,78 +655,19 @@ cdef class Token:
             # Do nothing if old head is new head
             if self.i + self.c.head == new_head.i:
                 return
-            cdef Token old_head = self.head
-            cdef int rel_newhead_i = new_head.i - self.i
-            # Is the new head a descendant of the old head
-            cdef bint is_desc = old_head.is_ancestor(new_head)
-            cdef int new_edge
-            cdef Token anc, child
-            # Update number of deps of old head
-            if self.c.head > 0:  # left dependent
-                old_head.c.l_kids -= 1
-                if self.c.l_edge == old_head.c.l_edge:
-                    # The token dominates the left edge so the left edge of
-                    # the head may change when the token is reattached, it may
-                    # not change if the new head is a descendant of the current
-                    # head.
-                    new_edge = self.c.l_edge
-                    # The new l_edge is the left-most l_edge on any of the
-                    # other dependents where the l_edge is left of the head,
-                    # otherwise it is the head
-                    if not is_desc:
-                        new_edge = old_head.i
-                        for child in old_head.children:
-                            if child == self:
-                                continue
-                            if child.c.l_edge < new_edge:
-                                new_edge = child.c.l_edge
-                        old_head.c.l_edge = new_edge
-                    # Walk up the tree from old_head and assign new l_edge to
-                    # ancestors until an ancestor already has an l_edge that's
-                    # further left
-                    for anc in old_head.ancestors:
-                        if anc.c.l_edge <= new_edge:
-                            break
-                        anc.c.l_edge = new_edge
-            elif self.c.head < 0:  # right dependent
-                old_head.c.r_kids -= 1
-                # Do the same thing as for l_edge
-                if self.c.r_edge == old_head.c.r_edge:
-                    new_edge = self.c.r_edge
-                    if not is_desc:
-                        new_edge = old_head.i
-                        for child in old_head.children:
-                            if child == self:
-                                continue
-                            if child.c.r_edge > new_edge:
-                                new_edge = child.c.r_edge
-                        old_head.c.r_edge = new_edge
-                    for anc in old_head.ancestors:
-                        if anc.c.r_edge >= new_edge:
-                            break
-                        anc.c.r_edge = new_edge
-            # Update number of deps of new head
-            if rel_newhead_i > 0:  # left dependent
-                new_head.c.l_kids += 1
-                # Walk up the tree from new head and set l_edge to self.l_edge
-                # until you hit a token with an l_edge further to the left
-                if self.c.l_edge < new_head.c.l_edge:
-                    new_head.c.l_edge = self.c.l_edge
-                    for anc in new_head.ancestors:
-                        if anc.c.l_edge <= self.c.l_edge:
-                            break
-                        anc.c.l_edge = self.c.l_edge
-            elif rel_newhead_i < 0:  # right dependent
-                new_head.c.r_kids += 1
-                # Do the same as for l_edge
-                if self.c.r_edge > new_head.c.r_edge:
-                    new_head.c.r_edge = self.c.r_edge
-                    for anc in new_head.ancestors:
-                        if anc.c.r_edge >= self.c.r_edge:
-                            break
-                        anc.c.r_edge = self.c.r_edge
+            # Find the widest l/r_edges of the roots of the two tokens involved
+            # to limit the number of tokens for set_children_from_heads
+            cdef Token self_root, new_head_root
+            self_ancestors = list(self.ancestors)
+            new_head_ancestors = list(new_head.ancestors)
+            self_root = self_ancestors[-1] if self_ancestors else self
+            new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head
+            start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge
+            end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge
             # Set new head
-            self.c.head = rel_newhead_i
+            self.c.head = new_head.i - self.i
+            # Adjust parse properties and sentence starts
+            set_children_from_heads(self.doc.c, start, end + 1)
 
     @property
     def conjuncts(self):

From 7e4cd7575c33929bca0d3f7d932b0968803e4a71 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 17 Sep 2020 00:14:01 +0200
Subject: [PATCH 019/516] Refactor Docs.is_ flags (#6044)

* Refactor Doc.is_ flags

* Add derived `Doc.has_annotation` method

  * `Doc.has_annotation(attr)` returns `True` for partial annotation

  * `Doc.has_annotation(attr, require_complete=True)` returns `True` for
    complete annotation

* Add deprecation warnings to `is_tagged`, `is_parsed`, `is_sentenced`
and `is_nered`

* Add `Doc._get_array_attrs()`, which returns a full list of `Doc` attrs
for use with `Doc.to_array`, `Doc.to_bytes` and `Doc.from_docs`. The
list is the `DocBin` attributes list plus `SPACY` and `LENGTH`.

Notes on `Doc.has_annotation`:

* `HEAD` is converted to `DEP` because heads don't have an unset state

* Accept `IS_SENT_START` as a synonym of `SENT_START`

Additional changes:

* Add `NORM`, `ENT_ID` and `SENT_START` to default attributes for
`DocBin`

* In `Doc.from_array()` the presence of `DEP` causes `HEAD` to override
`SENT_START`

* In `Doc.from_array()` using `attrs` other than
`Doc._get_array_attrs()` (i.e., a user's custom list rather than our
default internal list) with both `HEAD` and `SENT_START` shows a warning
that `HEAD` will override `SENT_START`

* `set_children_from_heads` does not require dependency labels to set
sentence boundaries and sets `sent_start` for all non-sentence starts to
`-1`

* Fix call to set_children_from_heads

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
---
 spacy/displacy/__init__.py                    |   2 +-
 spacy/errors.py                               |  14 +-
 spacy/lang/de/syntax_iterators.py             |   2 +-
 spacy/lang/el/syntax_iterators.py             |   2 +-
 spacy/lang/en/syntax_iterators.py             |   2 +-
 spacy/lang/es/syntax_iterators.py             |   2 +-
 spacy/lang/fa/syntax_iterators.py             |   2 +-
 spacy/lang/fr/syntax_iterators.py             |   2 +-
 spacy/lang/id/syntax_iterators.py             |   2 +-
 spacy/lang/nb/syntax_iterators.py             |   2 +-
 spacy/lang/sv/syntax_iterators.py             |   2 +-
 spacy/matcher/matcher.pyx                     |  15 +-
 spacy/matcher/phrasematcher.pyx               |  20 ++-
 .../pipeline/_parser_internals/arc_eager.pyx  |   1 -
 spacy/pipeline/functions.py                   |   2 +-
 spacy/pipeline/morphologizer.pyx              |   2 -
 spacy/pipeline/tagger.pyx                     |   1 -
 spacy/tests/doc/test_doc_api.py               |  89 ++++++++--
 spacy/tests/doc/test_span.py                  |   6 +-
 spacy/tests/doc/test_token_api.py             |   9 +-
 spacy/tests/lang/de/test_noun_chunks.py       |   4 -
 spacy/tests/lang/el/test_noun_chunks.py       |   4 -
 spacy/tests/lang/en/test_noun_chunks.py       |   4 -
 spacy/tests/lang/en/test_sbd.py               |   3 +-
 spacy/tests/lang/es/test_noun_chunks.py       |   4 -
 spacy/tests/lang/fa/test_noun_chunks.py       |   4 -
 spacy/tests/lang/fr/test_noun_chunks.py       |   4 -
 spacy/tests/lang/id/test_noun_chunks.py       |   4 -
 spacy/tests/lang/nb/test_noun_chunks.py       |   4 -
 spacy/tests/lang/sv/test_noun_chunks.py       |   4 -
 spacy/tests/matcher/test_matcher_api.py       |  11 +-
 spacy/tests/matcher/test_phrase_matcher.py    |  17 +-
 spacy/tests/parser/test_parse.py              |   5 +-
 spacy/tests/parser/test_parse_navigate.py     |   2 +-
 spacy/tests/parser/test_space_attachment.py   |   3 +-
 spacy/tests/pipeline/test_attributeruler.py   |   6 +
 spacy/tests/pipeline/test_functions.py        |   2 -
 spacy/tests/pipeline/test_sentencizer.py      |  12 +-
 spacy/tests/regression/test_issue1-1000.py    |   5 +-
 spacy/tests/regression/test_issue1501-2000.py |  27 ++-
 spacy/tests/regression/test_issue2001-2500.py |   5 +-
 spacy/tests/regression/test_issue2501-3000.py |   8 +-
 spacy/tests/regression/test_issue3001-3500.py |  18 +-
 spacy/tests/regression/test_issue3501-4000.py |   2 -
 spacy/tests/regression/test_issue4001-4500.py |   5 +-
 spacy/tests/test_scorer.py                    |   1 -
 spacy/tests/test_training.py                  |  20 +--
 spacy/tokens/_serialize.py                    |   2 +-
 spacy/tokens/doc.pxd                          |   4 -
 spacy/tokens/doc.pyx                          | 157 +++++++++---------
 spacy/tokens/span.pyx                         |  17 +-
 spacy/tokens/token.pyx                        |   2 +-
 spacy/training/converters/conllu2docs.py      |   4 -
 spacy/training/gold_io.pyx                    |  12 +-
 website/docs/api/doc.md                       |  47 +++---
 website/docs/usage/v3.md                      |  20 +++
 56 files changed, 350 insertions(+), 282 deletions(-)

diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index 0e80c3b5f..48229572b 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -121,7 +121,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
     doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         warnings.warn(Warnings.W005)
     if options.get("collapse_phrases", False):
         with doc.retokenize() as retokenizer:
diff --git a/spacy/errors.py b/spacy/errors.py
index 3bdeeccbe..173aedab9 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -119,6 +119,11 @@ class Warnings:
     W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
             "need to match on a stream of documents, you can use nlp.pipe and "
             "call the {matcher} on each Doc object.")
+    W106 = ("Both HEAD and SENT_START are included as attributes in "
+            "doc.from_array(). The parse trees based on the HEAD attribute "
+            "will override the values in SENT_START.")
+    W107 = ("The property Doc.{prop} is deprecated. Use "
+            "Doc.has_annotation(\"{attr}\") instead.")
 
 
 @add_codes
@@ -192,11 +197,6 @@ class Errors:
             "Alternatively, add the dependency parser, or set sentence "
             "boundaries by setting doc[i].is_sent_start.")
     E031 = ("Invalid token: empty string ('') at position {i}.")
-    E032 = ("Conflicting attributes specified in doc.from_array(): "
-            "(HEAD, SENT_START). The HEAD attribute currently sets sentence "
-            "boundaries implicitly, based on the tree structure. This means "
-            "the HEAD attribute would potentially override the sentence "
-            "boundaries set by SENT_START.")
     E033 = ("Cannot load into non-empty Doc of length {length}.")
     E035 = ("Error creating span with start {start} and end {end} for Doc of "
             "length {length}.")
@@ -397,8 +397,8 @@ class Errors:
     E154 = ("One of the attributes or values is not supported for token "
             "patterns. Please use the option validate=True with Matcher, "
             "PhraseMatcher, or EntityRuler for more details.")
-    E155 = ("The pipeline needs to include a tagger in order to use "
-            "Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. "
+    E155 = ("The pipeline needs to include a {pipe} in order to use "
+            "Matcher or PhraseMatcher with the attribute {attr}. "
             "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
             "instead of list(nlp.tokenizer.pipe()).")
     E156 = ("The pipeline needs to include a parser in order to use "
diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py
index bd495f792..bd75a61eb 100644
--- a/spacy/lang/de/syntax_iterators.py
+++ b/spacy/lang/de/syntax_iterators.py
@@ -16,7 +16,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_label = doc.vocab.strings.add("NP")
     np_deps = set(doc.vocab.strings.add(label) for label in labels)
diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py
index 0a13edcc0..89cfd8b72 100644
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     # Further improvement of the models will eliminate the need for this tag.
     labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
index 59ae733bd..2a1b0867e 100644
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py
index 427f1f203..ad0a1b838 100644
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@@ -8,7 +8,7 @@ from ...tokens import Doc, Span, Token
 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     doc = doclike.doc
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     if not len(doc):
         return
diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py
index b63db3539..0be06e73c 100644
--- a/spacy/lang/fa/syntax_iterators.py
+++ b/spacy/lang/fa/syntax_iterators.py
@@ -19,7 +19,7 @@ def noun_chunks(doclike):
     ]
     doc = doclike.doc  # Ensure works on both Doc and Span.
 
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
 
     np_deps = [doc.vocab.strings.add(label) for label in labels]
diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py
index d297203e3..68117a54d 100644
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py
index f6d261643..0f29bfe16 100644
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py
index d297203e3..68117a54d 100644
--- a/spacy/lang/nb/syntax_iterators.py
+++ b/spacy/lang/nb/syntax_iterators.py
@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py
index 662b508ed..d5ae47853 100644
--- a/spacy/lang/sv/syntax_iterators.py
+++ b/spacy/lang/sv/syntax_iterators.py
@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
     labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
     # fmt: on
     doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 079cac788..d83f58181 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -17,7 +17,7 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA
+from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
 
 from ..schemas import validate_token_pattern
 from ..errors import Errors, MatchPatternError, Warnings
@@ -215,10 +215,15 @@ cdef class Matcher:
         else:
             raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
         cdef Pool tmp_pool = Pool()
-        if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
-          and not doc.is_tagged:
-            raise ValueError(Errors.E155.format())
-        if DEP in self._seen_attrs and not doc.is_parsed:
+        if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
+            raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
+        if POS in self._seen_attrs and not doc.has_annotation("POS"):
+            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
+        if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
+            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
+        if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
+            raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
+        if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
             raise ValueError(Errors.E156.format())
         matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
                                 extensions=self._extensions, predicates=self._extra_predicates)
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index fae513367..b00ba157f 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -4,7 +4,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
 
 import warnings
 
-from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
+from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
 from ..structs cimport TokenC
 from ..tokens.token cimport Token
 from ..tokens.span cimport Span
@@ -184,12 +184,20 @@ cdef class PhraseMatcher:
             if len(doc) == 0:
                 continue
             if isinstance(doc, Doc):
-                if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged:
-                    raise ValueError(Errors.E155.format())
-                if self.attr == DEP and not doc.is_parsed:
+                attrs = (TAG, POS, MORPH, LEMMA, DEP)
+                has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
+                if self.attr == TAG and not has_annotation[TAG]:
+                    raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
+                if self.attr == POS and not has_annotation[POS]:
+                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
+                if self.attr == MORPH and not has_annotation[MORPH]:
+                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
+                if self.attr == LEMMA and not has_annotation[LEMMA]:
+                    raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
+                if self.attr == DEP and not has_annotation[DEP]:
                     raise ValueError(Errors.E156.format())
-                if self._validate and (doc.is_tagged or doc.is_parsed) \
-                  and self.attr not in (DEP, POS, TAG, LEMMA):
+                if self._validate and any(has_annotation.values()) \
+                        and self.attr not in attrs:
                     string_attr = self.vocab.strings[self.attr]
                     warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
                 keyword = self._convert_to_array(doc)
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index a5fc2ea0e..dafa99bdd 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -679,7 +679,6 @@ cdef class ArcEager(TransitionSystem):
                 st._sent[i].dep = self.root_label
 
     def finalize_doc(self, Doc doc):
-        doc.is_parsed = True
         set_children_from_heads(doc.c, 0, doc.length)
 
     def has_gold(self, Example eg, start=0, end=None):
diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index 7e68ea369..614608b25 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -17,7 +17,7 @@ def merge_noun_chunks(doc: Doc) -> Doc:
 
     DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_noun_chunks
     """
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
         return doc
     with doc.retokenize() as retokenizer:
         for np in doc.noun_chunks:
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index bb68a358c..62ad9e0eb 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -204,8 +204,6 @@ class Morphologizer(Tagger):
                 doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
                 doc.c[j].pos = self.cfg["labels_pos"][morph]
 
-            doc.is_morphed = True
-
     def get_loss(self, examples, scores):
         """Find the loss and gradient of loss for the batch of documents and
         their predicted scores.
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 1f8b4eb7a..0d78047ae 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -168,7 +168,6 @@ class Tagger(Pipe):
                 # Don't clobber preset POS tags
                 if doc.c[j].tag == 0:
                     doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
-            doc.is_tagged = True
 
     def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
         """Learn from a batch of documents and gold-standard information,
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 31dbad9ca..ce979d3d1 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -106,6 +106,7 @@ def test_doc_api_serialize(en_tokenizer, text):
     tokens = en_tokenizer(text)
     tokens[0].lemma_ = "lemma"
     tokens[0].norm_ = "norm"
+    tokens.ents = [(tokens.vocab.strings["PRODUCT"], 0, 1)]
     tokens[0].ent_kb_id_ = "ent_kb_id"
     new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
     assert tokens.text == new_tokens.text
@@ -144,7 +145,6 @@ def test_doc_api_set_ents(en_tokenizer):
 
 def test_doc_api_sents_empty_string(en_tokenizer):
     doc = en_tokenizer("")
-    doc.is_parsed = True
     sents = list(doc.sents)
     assert len(sents) == 0
 
@@ -181,10 +181,11 @@ def test_doc_api_right_edge(en_tokenizer):
     text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
     heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
              -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
+    deps = ["dep"] * len(heads)
     # fmt: on
 
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     assert doc[6].text == "for"
     subtree = [w.text for w in doc[6].subtree]
     # fmt: off
@@ -240,7 +241,9 @@ def test_doc_api_similarity_match():
 )
 def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
     tokens = en_tokenizer(sentence)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
+    doc = get_doc(
+        tokens.vocab, [t.text for t in tokens], heads=heads, deps=["dep"] * len(heads)
+    )
     lca = doc.get_lca_matrix()
     assert (lca == lca_matrix).all()
     assert lca[1, 1] == 1
@@ -251,16 +254,16 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
 def test_doc_is_nered(en_vocab):
     words = ["I", "live", "in", "New", "York"]
     doc = Doc(en_vocab, words=words)
-    assert not doc.is_nered
+    assert not doc.has_annotation("ENT_IOB")
     doc.ents = [Span(doc, 3, 5, label="GPE")]
-    assert doc.is_nered
+    assert doc.has_annotation("ENT_IOB")
     # Test creating doc from array with unknown values
     arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
     doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
-    assert doc.is_nered
+    assert doc.has_annotation("ENT_IOB")
     # Test serialization
     new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
-    assert new_doc.is_nered
+    assert new_doc.has_annotation("ENT_IOB")
 
 
 def test_doc_from_array_sent_starts(en_vocab):
@@ -271,25 +274,35 @@ def test_doc_from_array_sent_starts(en_vocab):
     # fmt: on
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
 
+    # HEAD overrides SENT_START with warning
     attrs = [SENT_START, HEAD]
     arr = doc.to_array(attrs)
     new_doc = Doc(en_vocab, words=words)
-    with pytest.raises(ValueError):
+    with pytest.warns(UserWarning):
         new_doc.from_array(attrs, arr)
 
-    attrs = [SENT_START, DEP]
+    # no warning using default attrs
+    attrs = doc._get_array_attrs()
+    arr = doc.to_array(attrs)
+    with pytest.warns(None) as record:
+        new_doc.from_array(attrs, arr)
+        assert len(record) == 0
+
+    # only SENT_START uses SENT_START
+    attrs = [SENT_START]
     arr = doc.to_array(attrs)
     new_doc = Doc(en_vocab, words=words)
     new_doc.from_array(attrs, arr)
     assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
-    assert not new_doc.is_parsed
+    assert not new_doc.has_annotation("DEP")
 
+    # only HEAD uses HEAD
     attrs = [HEAD, DEP]
     arr = doc.to_array(attrs)
     new_doc = Doc(en_vocab, words=words)
     new_doc.from_array(attrs, arr)
     assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
-    assert new_doc.is_parsed
+    assert new_doc.has_annotation("DEP")
 
 
 def test_doc_from_array_morph(en_vocab):
@@ -359,9 +372,6 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert m_doc[9].idx == think_idx
 
     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
-    with pytest.raises(ValueError):
-        # important attributes from sentenziser or parser are missing
-        assert list(m_doc.sents)
     assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
     # space delimiter considered, although spacy attribute was missing
     assert str(m_doc) == " ".join(en_texts_without_empty)
@@ -373,6 +383,15 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert m_doc[9].idx == think_idx
 
 
+def test_doc_api_from_docs_ents(en_tokenizer):
+    texts = ["Merging the docs is fun.", "They don't think alike."]
+    docs = [en_tokenizer(t) for t in texts]
+    docs[0].ents = ()
+    docs[1].ents = (Span(docs[1], 0, 1, label="foo"),)
+    doc = Doc.from_docs(docs)
+    assert len(doc.ents) == 1
+
+
 def test_doc_lang(en_vocab):
     doc = Doc(en_vocab, words=["Hello", "world"])
     assert doc.lang_ == "en"
@@ -393,3 +412,45 @@ def test_token_lexeme(en_vocab):
     assert isinstance(token.lex, Lexeme)
     assert token.lex.text == token.text
     assert en_vocab[token.orth] == token.lex
+
+
+def test_has_annotation(en_vocab):
+    doc = Doc(en_vocab, words=["Hello", "world"])
+    attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
+    for attr in attrs:
+        assert not doc.has_annotation(attr)
+
+    doc[0].tag_ = "A"
+    doc[0].pos_ = "X"
+    doc[0].morph_ = "Feat=Val"
+    doc[0].lemma_ = "a"
+    doc[0].dep_ = "dep"
+    doc[0].head = doc[1]
+    doc.ents = [Span(doc, 0, 1, label="HELLO")]
+
+    for attr in attrs:
+        assert doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)
+
+    doc[1].tag_ = "A"
+    doc[1].pos_ = "X"
+    doc[1].morph_ = ""
+    doc[1].lemma_ = "a"
+    doc[1].dep_ = "dep"
+    doc.ents = [Span(doc, 0, 2, label="HELLO")]
+
+    for attr in attrs:
+        assert doc.has_annotation(attr)
+        assert doc.has_annotation(attr, require_complete=True)
+
+
+def test_is_flags_deprecated(en_tokenizer):
+    doc = en_tokenizer("test")
+    with pytest.deprecated_call():
+        doc.is_tagged
+    with pytest.deprecated_call():
+        doc.is_parsed
+    with pytest.deprecated_call():
+        doc.is_nered
+    with pytest.deprecated_call():
+        doc.is_sentenced
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 1e9623484..ad4f49042 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -24,7 +24,6 @@ def doc_not_parsed(en_tokenizer):
     text = "This is a sentence. This is another sentence. And a third."
     tokens = en_tokenizer(text)
     doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    doc.is_parsed = False
     return doc
 
 
@@ -71,8 +70,9 @@ def test_spans_string_fn(doc):
 def test_spans_root2(en_tokenizer):
     text = "through North and South Carolina"
     heads = [0, 3, -1, -2, -4]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     assert doc[-2:].root.text == "Carolina"
 
 
@@ -92,7 +92,7 @@ def test_spans_span_sent(doc, doc_not_parsed):
 def test_spans_lca_matrix(en_tokenizer):
     """Test span's lca matrix generation"""
     tokens = en_tokenizer("the lazy dog slept")
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0])
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4)
     lca = doc[:2].get_lca_matrix()
     assert lca.shape == (2, 2)
     assert lca[0, 0] == 0  # the & the -> the
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index 28ef0dd7f..1308df67b 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -114,8 +114,9 @@ def test_doc_token_api_ancestors(en_tokenizer):
 def test_doc_token_api_head_setter(en_tokenizer):
     text = "Yesterday I saw a dog that barked loudly."
     heads = [2, 1, 0, 1, -2, 1, -2, -1, -6]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
 
     assert doc[6].n_lefts == 1
     assert doc[6].n_rights == 1
@@ -208,7 +209,6 @@ def test_is_sent_start(en_tokenizer):
     assert doc[5].is_sent_start is None
     doc[5].is_sent_start = True
     assert doc[5].is_sent_start is True
-    doc.is_parsed = True
     assert len(list(doc.sents)) == 2
 
 
@@ -217,7 +217,6 @@ def test_is_sent_end(en_tokenizer):
     assert doc[4].is_sent_end is None
     doc[5].is_sent_start = True
     assert doc[4].is_sent_end is True
-    doc.is_parsed = True
     assert len(list(doc.sents)) == 2
 
 
@@ -242,14 +241,14 @@ def test_token0_has_sent_start_true():
     doc = Doc(Vocab(), words=["hello", "world"])
     assert doc[0].is_sent_start is True
     assert doc[1].is_sent_start is None
-    assert not doc.is_sentenced
+    assert not doc.has_annotation("SENT_START")
 
 
 def test_tokenlast_has_sent_end_true():
     doc = Doc(Vocab(), words=["hello", "world"])
     assert doc[0].is_sent_end is None
     assert doc[1].is_sent_end is True
-    assert not doc.is_sentenced
+    assert not doc.has_annotation("SENT_START")
 
 
 def test_token_api_conjuncts_chain(en_vocab):
diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py
index ff9f8d5e5..0ed12d208 100644
--- a/spacy/tests/lang/de/test_noun_chunks.py
+++ b/spacy/tests/lang/de/test_noun_chunks.py
@@ -3,11 +3,7 @@ import pytest
 
 def test_noun_chunks_is_parsed_de(de_tokenizer):
     """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
     """
     doc = de_tokenizer("Er lag auf seinem")
-    doc.is_parsed = False
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py
index 38e72b0b2..2d376c612 100644
--- a/spacy/tests/lang/el/test_noun_chunks.py
+++ b/spacy/tests/lang/el/test_noun_chunks.py
@@ -3,11 +3,7 @@ import pytest
 
 def test_noun_chunks_is_parsed_el(el_tokenizer):
     """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
     """
     doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
-    doc.is_parsed = False
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py
index 5395dbabe..fa3a134bd 100644
--- a/spacy/tests/lang/en/test_noun_chunks.py
+++ b/spacy/tests/lang/en/test_noun_chunks.py
@@ -11,12 +11,8 @@ from ...util import get_doc
 
 def test_noun_chunks_is_parsed(en_tokenizer):
     """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
     """
     doc = en_tokenizer("This is a sentence")
-    doc.is_parsed = False
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
 
diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py
index 38c8d94d8..ee1e6be17 100644
--- a/spacy/tests/lang/en/test_sbd.py
+++ b/spacy/tests/lang/en/test_sbd.py
@@ -7,8 +7,9 @@ from ...util import get_doc, apply_transition_sequence
 @pytest.mark.parametrize("punct", [".", "!", "?", ""])
 def test_en_sbd_single_punct(en_tokenizer, text, punct):
     heads = [2, 1, 0, -1] if punct else [2, 1, 0]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text + punct)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     assert len(doc) == 4 if punct else 3
     assert len(list(doc.sents)) == 1
     assert sum(len(sent) for sent in doc.sents) == len(doc)
diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py
index a7ec4e562..db89fd903 100644
--- a/spacy/tests/lang/es/test_noun_chunks.py
+++ b/spacy/tests/lang/es/test_noun_chunks.py
@@ -3,11 +3,7 @@ import pytest
 
 def test_noun_chunks_is_parsed_es(es_tokenizer):
     """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
     """
     doc = es_tokenizer("en Oxford este verano")
-    doc.is_parsed = False
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/fa/test_noun_chunks.py b/spacy/tests/lang/fa/test_noun_chunks.py
index 767e91f6b..53b39d9a1 100644
--- a/spacy/tests/lang/fa/test_noun_chunks.py
+++ b/spacy/tests/lang/fa/test_noun_chunks.py
@@ -3,12 +3,8 @@ import pytest
 
 def test_noun_chunks_is_parsed_fa(fa_tokenizer):
     """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
     """
 
     doc = fa_tokenizer("این یک جمله نمونه می باشد.")
-    doc.is_parsed = False
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py
index 5fd6897f7..d81199a3e 100644
--- a/spacy/tests/lang/fr/test_noun_chunks.py
+++ b/spacy/tests/lang/fr/test_noun_chunks.py
@@ -3,11 +3,7 @@ import pytest
 
 def test_noun_chunks_is_parsed_fr(fr_tokenizer):
     """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
     """
     doc = fr_tokenizer("trouver des travaux antérieurs")
-    doc.is_parsed = False
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py
index 445643933..fef1524f1 100644
--- a/spacy/tests/lang/id/test_noun_chunks.py
+++ b/spacy/tests/lang/id/test_noun_chunks.py
@@ -3,11 +3,7 @@ import pytest
 
 def test_noun_chunks_is_parsed_id(id_tokenizer):
     """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
     """
     doc = id_tokenizer("sebelas")
-    doc.is_parsed = False
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py
index c6a00354b..9965fcd14 100644
--- a/spacy/tests/lang/nb/test_noun_chunks.py
+++ b/spacy/tests/lang/nb/test_noun_chunks.py
@@ -3,11 +3,7 @@ import pytest
 
 def test_noun_chunks_is_parsed_nb(nb_tokenizer):
     """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
     """
     doc = nb_tokenizer("Smørsausen brukes bl.a. til")
-    doc.is_parsed = False
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py
index f352ca648..458cdadd5 100644
--- a/spacy/tests/lang/sv/test_noun_chunks.py
+++ b/spacy/tests/lang/sv/test_noun_chunks.py
@@ -5,12 +5,8 @@ from ...util import get_doc
 
 def test_noun_chunks_is_parsed_sv(sv_tokenizer):
     """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
     """
     doc = sv_tokenizer("Studenten läste den bästa boken")
-    doc.is_parsed = False
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
 
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index e0f335a19..04f9585f1 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -301,11 +301,14 @@ def test_matcher_basic_check(en_vocab):
 
 def test_attr_pipeline_checks(en_vocab):
     doc1 = Doc(en_vocab, words=["Test"])
-    doc1.is_parsed = True
+    doc1[0].dep_ = "ROOT"
     doc2 = Doc(en_vocab, words=["Test"])
-    doc2.is_tagged = True
+    doc2[0].tag_ = "TAG"
+    doc2[0].pos_ = "X"
+    doc2[0].morph_ = "Feat=Val"
+    doc2[0].lemma_ = "LEMMA"
     doc3 = Doc(en_vocab, words=["Test"])
-    # DEP requires is_parsed
+    # DEP requires DEP
     matcher = Matcher(en_vocab)
     matcher.add("TEST", [[{"DEP": "a"}]])
     matcher(doc1)
@@ -313,7 +316,7 @@ def test_attr_pipeline_checks(en_vocab):
         matcher(doc2)
     with pytest.raises(ValueError):
         matcher(doc3)
-    # TAG, POS, LEMMA require is_tagged
+    # TAG, POS, LEMMA require those values
     for attr in ("TAG", "POS", "LEMMA"):
         matcher = Matcher(en_vocab)
         matcher.add("TEST", [[{attr: "a"}]])
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 4b7027f87..9caf284a3 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -187,9 +187,11 @@ def test_phrase_matcher_bool_attrs(en_vocab):
 
 def test_phrase_matcher_validation(en_vocab):
     doc1 = Doc(en_vocab, words=["Test"])
-    doc1.is_parsed = True
+    doc1[0].dep_ = "ROOT"
     doc2 = Doc(en_vocab, words=["Test"])
-    doc2.is_tagged = True
+    doc2[0].tag_ = "TAG"
+    doc2[0].pos_ = "X"
+    doc2[0].morph_ = "Feat=Val"
     doc3 = Doc(en_vocab, words=["Test"])
     matcher = PhraseMatcher(en_vocab, validate=True)
     with pytest.warns(UserWarning):
@@ -212,18 +214,21 @@ def test_attr_validation(en_vocab):
 
 def test_attr_pipeline_checks(en_vocab):
     doc1 = Doc(en_vocab, words=["Test"])
-    doc1.is_parsed = True
+    doc1[0].dep_ = "ROOT"
     doc2 = Doc(en_vocab, words=["Test"])
-    doc2.is_tagged = True
+    doc2[0].tag_ = "TAG"
+    doc2[0].pos_ = "X"
+    doc2[0].morph_ = "Feat=Val"
+    doc2[0].lemma_ = "LEMMA"
     doc3 = Doc(en_vocab, words=["Test"])
-    # DEP requires is_parsed
+    # DEP requires DEP
     matcher = PhraseMatcher(en_vocab, attr="DEP")
     matcher.add("TEST1", [doc1])
     with pytest.raises(ValueError):
         matcher.add("TEST2", [doc2])
     with pytest.raises(ValueError):
         matcher.add("TEST3", [doc3])
-    # TAG, POS, LEMMA require is_tagged
+    # TAG, POS, LEMMA require those values
     for attr in ("TAG", "POS", "LEMMA"):
         matcher = PhraseMatcher(en_vocab, attr=attr)
         matcher.add("TEST2", [doc2])
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 691a7c3aa..9e760c1e7 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -67,8 +67,9 @@ def test_parser_initial(en_tokenizer, en_parser):
 def test_parser_parse_subtrees(en_tokenizer, en_parser):
     text = "The four wheels on the bus turned quickly"
     heads = [2, 1, 4, -1, 1, -2, 0, -1]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     assert len(list(doc[2].lefts)) == 2
     assert len(list(doc[2].rights)) == 1
     assert len(list(doc[2].children)) == 3
@@ -184,7 +185,7 @@ def test_parser_set_sent_starts(en_vocab):
         if i == 0 or i == 3:
             assert doc[i].is_sent_start is True
         else:
-            assert not doc[i].is_sent_start
+            assert doc[i].is_sent_start is False
     for sent in doc.sents:
         for token in sent:
             assert token.head in sent
diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py
index f42601a85..db1e98ba0 100644
--- a/spacy/tests/parser/test_parse_navigate.py
+++ b/spacy/tests/parser/test_parse_navigate.py
@@ -63,7 +63,7 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
 
 def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads))
 
     lefts = {}
     rights = {}
diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py
index 3a0a6b943..3672dabea 100644
--- a/spacy/tests/parser/test_space_attachment.py
+++ b/spacy/tests/parser/test_space_attachment.py
@@ -8,8 +8,9 @@ from ..util import get_doc, apply_transition_sequence
 def test_parser_space_attachment(en_tokenizer):
     text = "This is a test.\nTo ensure  spaces are attached well."
     heads = [1, 0, 1, -2, -3, -1, 1, 4, -1, 2, 1, 0, -1, -2]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     for sent in doc.sents:
         if len(sent) == 1:
             assert not sent[-1].is_space
diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py
index 9254688cc..a66b34bc0 100644
--- a/spacy/tests/pipeline/test_attributeruler.py
+++ b/spacy/tests/pipeline/test_attributeruler.py
@@ -72,6 +72,8 @@ def test_attributeruler_init(nlp, pattern_dicts):
     assert doc[2].morph_ == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
     assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert doc.has_annotation("LEMMA")
+    assert doc.has_annotation("MORPH")
 
 
 def test_attributeruler_init_patterns(nlp, pattern_dicts):
@@ -82,6 +84,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     assert doc[2].morph_ == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
     assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert doc.has_annotation("LEMMA")
+    assert doc.has_annotation("MORPH")
     nlp.remove_pipe("attribute_ruler")
     # initialize with patterns from asset
     nlp.add_pipe(
@@ -93,6 +97,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     assert doc[2].morph_ == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
     assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert doc.has_annotation("LEMMA")
+    assert doc.has_annotation("MORPH")
 
 
 def test_attributeruler_score(nlp, pattern_dicts):
diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py
index 0ec8a5ec2..ee9e34df3 100644
--- a/spacy/tests/pipeline/test_functions.py
+++ b/spacy/tests/pipeline/test_functions.py
@@ -35,8 +35,6 @@ def doc2(en_tokenizer):
         deps=deps,
     )
     doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])]
-    doc.is_parsed = True
-    doc.is_tagged = True
     return doc
 
 
diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py
index 1b1c51f34..5dd0fef43 100644
--- a/spacy/tests/pipeline/test_sentencizer.py
+++ b/spacy/tests/pipeline/test_sentencizer.py
@@ -9,7 +9,7 @@ def test_sentencizer(en_vocab):
     doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
     sentencizer = Sentencizer(punct_chars=None)
     doc = sentencizer(doc)
-    assert doc.is_sentenced
+    assert doc.has_annotation("SENT_START")
     sent_starts = [t.is_sent_start for t in doc]
     sent_ends = [t.is_sent_end for t in doc]
     assert sent_starts == [True, False, True, False, False, False, False]
@@ -22,13 +22,13 @@ def test_sentencizer_pipe():
     nlp = English()
     nlp.add_pipe("sentencizer")
     for doc in nlp.pipe(texts):
-        assert doc.is_sentenced
+        assert doc.has_annotation("SENT_START")
         sent_starts = [t.is_sent_start for t in doc]
         assert sent_starts == [True, False, True, False, False, False, False]
         assert len(list(doc.sents)) == 2
     for ex in nlp.pipe(texts):
         doc = ex.doc
-        assert doc.is_sentenced
+        assert doc.has_annotation("SENT_START")
         sent_starts = [t.is_sent_start for t in doc]
         assert sent_starts == [True, False, True, False, False, False, False]
         assert len(list(doc.sents)) == 2
@@ -42,7 +42,7 @@ def test_sentencizer_empty_docs():
     nlp.add_pipe("sentencizer")
     for texts in [one_empty_text, many_empty_texts, some_empty_texts]:
         for doc in nlp.pipe(texts):
-            assert doc.is_sentenced
+            assert doc.has_annotation("SENT_START")
             sent_starts = [t.is_sent_start for t in doc]
             if len(doc) == 0:
                 assert sent_starts == []
@@ -82,7 +82,7 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
     doc = Doc(en_vocab, words=words)
     sentencizer = Sentencizer(punct_chars=None)
     doc = sentencizer(doc)
-    assert doc.is_sentenced
+    assert doc.has_annotation("SENT_START")
     assert [t.is_sent_start for t in doc] == sent_starts
     assert [t.is_sent_end for t in doc] == sent_ends
     assert len(list(doc.sents)) == n_sents
@@ -115,7 +115,7 @@ def test_sentencizer_custom_punct(
     doc = Doc(en_vocab, words=words)
     sentencizer = Sentencizer(punct_chars=punct_chars)
     doc = sentencizer(doc)
-    assert doc.is_sentenced
+    assert doc.has_annotation("SENT_START")
     assert [t.is_sent_start for t in doc] == sent_starts
     assert [t.is_sent_end for t in doc] == sent_ends
     assert len(list(doc.sents)) == n_sents
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index ed5bcc1a5..30f66fb1d 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -94,7 +94,6 @@ def test_issue309(en_tokenizer):
     doc = get_doc(
         tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
     )
-    doc.is_parsed = True
     assert len(doc) == 1
     sents = list(doc.sents)
     assert len(sents) == 1
@@ -170,11 +169,9 @@ def test_issue595():
 
 def test_issue599(en_vocab):
     doc = Doc(en_vocab)
-    doc.is_tagged = True
-    doc.is_parsed = True
     doc2 = Doc(doc.vocab)
     doc2.from_bytes(doc.to_bytes())
-    assert doc2.is_parsed
+    assert doc2.has_annotation("DEP")
 
 
 def test_issue600():
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index c1d726db6..e226c8524 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -14,7 +14,7 @@ from spacy.tokens import Doc, Span, Token
 from spacy.attrs import HEAD, DEP
 from spacy.matcher import Matcher
 
-from ..util import make_tempdir
+from ..util import make_tempdir, get_doc
 
 
 def test_issue1506():
@@ -198,17 +198,25 @@ def test_issue1834():
     """Test that sentence boundaries & parse/tag flags are not lost
     during serialization."""
     string = "This is a first sentence . And another one"
-    doc = Doc(Vocab(), words=string.split())
-    doc[6].sent_start = True
+    words = string.split()
+    doc = get_doc(Vocab(), words=words)
+    doc[6].is_sent_start = True
     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
     assert new_doc[6].sent_start
-    assert not new_doc.is_parsed
-    assert not new_doc.is_tagged
-    doc.is_parsed = True
-    doc.is_tagged = True
+    assert not new_doc.has_annotation("DEP")
+    assert not new_doc.has_annotation("TAG")
+    # Rebuild with tags and a two-root parse (roots at tokens 0 and 6) so that
+    # TAG/DEP are annotated and token 6 still starts a sentence.
+    doc = get_doc(
+        Vocab(),
+        words=words,
+        tags=["TAG"] * len(words),
+        heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
+        deps=["dep"] * len(words),
+    )
     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
-    assert new_doc.is_parsed
-    assert new_doc.is_tagged
+    assert new_doc[6].sent_start
+    assert new_doc.has_annotation("DEP")
+    assert new_doc.has_annotation("TAG")
 
 
 def test_issue1868():
diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py
index 357fbb84e..3bea5d3f6 100644
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ b/spacy/tests/regression/test_issue2001-2500.py
@@ -72,8 +72,6 @@ def test_issue2219(en_vocab):
 def test_issue2361(de_tokenizer):
     chars = ("&lt;", "&gt;", "&amp;", "&quot;")
     doc = de_tokenizer('< > & " ')
-    doc.is_parsed = True
-    doc.is_tagged = True
     html = render(doc)
     for char in chars:
         assert char in html
@@ -108,6 +106,7 @@ def test_issue2385_biluo(tags):
 def test_issue2396(en_vocab):
     words = ["She", "created", "a", "test", "for", "spacy"]
     heads = [1, 0, 1, -2, -1, -1]
+    deps = ["dep"] * len(heads)
     matrix = numpy.array(
         [
             [0, 1, 1, 1, 1, 1],
@@ -119,7 +118,7 @@ def test_issue2396(en_vocab):
         ],
         dtype=numpy.int32,
     )
-    doc = get_doc(en_vocab, words=words, heads=heads)
+    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
     span = doc[:]
     assert (doc.get_lca_matrix() == matrix).all()
     assert (span.get_lca_matrix() == matrix).all()
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index 859e4d80e..9267a7346 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -16,16 +16,16 @@ from ..util import get_doc
 
 
 def test_issue2564():
-    """Test the tagger sets is_tagged correctly when used via Language.pipe."""
+    """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
     nlp.begin_training()
     doc = nlp("hello world")
-    assert doc.is_tagged
+    assert doc.has_annotation("TAG")
     docs = nlp.pipe(["hello", "world"])
     piped_doc = next(docs)
-    assert piped_doc.is_tagged
+    assert piped_doc.has_annotation("TAG")
 
 
 def test_issue2569(en_tokenizer):
@@ -123,7 +123,7 @@ def test_issue2772(en_vocab):
     heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
     deps = ["dep"] * len(heads)
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
-    assert not doc[1].is_sent_start
+    assert doc[1].is_sent_start is False
 
 
 @pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index 3059eb5ab..d848467dd 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -63,7 +63,7 @@ def test_issue3012(en_vocab):
     pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
     ents = [(2, 4, "PERCENT")]
     doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
-    assert doc.is_tagged
+    assert doc.has_annotation("TAG")
 
     expected = ("10", "NUM", "CD", "PERCENT")
     assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
@@ -83,10 +83,14 @@ def test_issue3012(en_vocab):
 def test_issue3199():
     """Test that Span.noun_chunks works correctly if no noun chunks iterator
     is available. To make this test future-proof, we're constructing a Doc
-    with a new Vocab here and setting is_parsed to make sure the noun chunks run.
+    with a new Vocab here and a parse tree to make sure the noun chunks run.
     """
-    doc = Doc(Vocab(), words=["This", "is", "a", "sentence"])
-    doc.is_parsed = True
+    doc = get_doc(
+        Vocab(),
+        words=["This", "is", "a", "sentence"],
+        heads=[0, -1, -2, -3],
+        deps=["dep"] * 4,
+    )
     assert list(doc[0:3].noun_chunks) == []
 
 
@@ -250,16 +254,16 @@ def test_issue3456():
 
 
 def test_issue3468():
-    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
+    """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
     be restored after serialization."""
     nlp = English()
     nlp.add_pipe("sentencizer")
     doc = nlp("Hello world")
     assert doc[0].is_sent_start
-    assert doc.is_sentenced
+    assert doc.has_annotation("SENT_START")
     assert len(list(doc.sents)) == 1
     doc_bytes = doc.to_bytes()
     new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
     assert new_doc[0].is_sent_start
-    assert new_doc.is_sentenced
+    assert new_doc.has_annotation("SENT_START")
     assert len(list(new_doc.sents)) == 1
diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py
index d36e693c7..8c483d877 100644
--- a/spacy/tests/regression/test_issue3501-4000.py
+++ b/spacy/tests/regression/test_issue3501-4000.py
@@ -356,7 +356,6 @@ def test_issue3882(en_vocab):
     copy of the Doc.
     """
     doc = Doc(en_vocab, words=["Hello", "world"])
-    doc.is_parsed = True
     doc.user_data["test"] = set()
     parse_deps(doc)
 
@@ -386,7 +385,6 @@ def test_issue3959():
     doc[0].pos_ = "NOUN"
     assert doc[0].pos_ == "NOUN"
     # usually this is already True when starting from proper models instead of blank English
-    doc.is_tagged = True
     with make_tempdir() as tmp_dir:
         file_path = tmp_dir / "my_doc"
         doc.to_disk(file_path)
diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py
index 2beccedcf..4e58c347e 100644
--- a/spacy/tests/regression/test_issue4001-4500.py
+++ b/spacy/tests/regression/test_issue4001-4500.py
@@ -189,7 +189,6 @@ def test_issue4133(en_vocab):
     for i, token in enumerate(doc):
         token.pos_ = pos[i]
     # usually this is already True when starting from proper models instead of blank English
-    doc.is_tagged = True
     doc_bytes = doc.to_bytes()
     vocab = Vocab()
     vocab = vocab.from_bytes(vocab_bytes)
@@ -249,7 +248,7 @@ def test_issue4267():
     assert "ner" in nlp.pipe_names
     # assert that we have correct IOB annotations
     doc1 = nlp("hi")
-    assert doc1.is_nered
+    assert doc1.has_annotation("ENT_IOB")
     for token in doc1:
         assert token.ent_iob == 2
     # add entity ruler and run again
@@ -260,7 +259,7 @@ def test_issue4267():
     assert "ner" in nlp.pipe_names
     # assert that we still have correct IOB annotations
     doc2 = nlp("hi")
-    assert doc2.is_nered
+    assert doc2.has_annotation("ENT_IOB")
     for token in doc2:
         assert token.ent_iob == 2
 
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index fb96c0361..6e3604ce8 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -80,7 +80,6 @@ def tagged_doc():
         doc[i].morph_ = morphs[i]
         if i > 0:
             doc[i].is_sent_start = False
-    doc.is_tagged = True
     return doc
 
 
diff --git a/spacy/tests/test_training.py b/spacy/tests/test_training.py
index 1926aca1f..5fd40a0dc 100644
--- a/spacy/tests/test_training.py
+++ b/spacy/tests/test_training.py
@@ -12,7 +12,7 @@ from thinc.api import compounding
 import pytest
 import srsly
 
-from .util import make_tempdir
+from .util import make_tempdir, get_doc
 
 
 @pytest.fixture
@@ -26,24 +26,16 @@ def doc():
               "NounType=prop|Number=sing", "PunctType=peri"]
     # head of '.' is intentionally nonprojective for testing
     heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
+    heads = [head - i for i, head in enumerate(heads)]
     deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
     lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
-    biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
+    ents = ((0, 2, "PERSON"), (5, 7, "LOC"), (8, 9, "GPE"))
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}
     # fmt: on
     nlp = English()
-    doc = nlp(text)
-    for i in range(len(tags)):
-        doc[i].tag_ = tags[i]
-        doc[i].pos_ = pos[i]
-        doc[i].morph_ = morphs[i]
-        doc[i].lemma_ = lemmas[i]
-        doc[i].dep_ = deps[i]
-        doc[i].head = doc[heads[i]]
-    doc.ents = spans_from_biluo_tags(doc, biluo_tags)
+    words = [t.text for t in nlp.make_doc(text)]
+    doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents)
     doc.cats = cats
-    doc.is_tagged = True
-    doc.is_parsed = True
     return doc
 
 
@@ -194,7 +186,7 @@ def test_json2docs_no_ner(en_vocab):
     docs = json2docs(data)
     assert len(docs) == 1
     for doc in docs:
-        assert not doc.is_nered
+        assert not doc.has_annotation("ENT_IOB")
     for token in doc:
         assert token.ent_iob == 0
     eg = Example(
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index cd8c81939..c9a20f6c0 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -13,7 +13,7 @@ from ..errors import Errors
 from ..util import ensure_path, SimpleFrozenList
 
 # fmt: off
-ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
+ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
 # fmt: on
 
 
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 9b382d687..08f795b1a 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -46,10 +46,6 @@ cdef class Doc:
 
     cdef TokenC* c
 
-    cdef public bint is_tagged
-    cdef public bint is_parsed
-    cdef public bint is_morphed
-
     cdef public float sentiment
 
     cdef public dict user_hooks
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 62a6dd6db..5c5443258 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -24,9 +24,11 @@ from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
 from ..attrs import intify_attr, IDS
 from ..compat import copy_reg, pickle
 from ..errors import Errors, Warnings
+from ..morphology import Morphology
 from .. import util
 from .underscore import Underscore, get_ext_args
 from ._retokenize import Retokenizer
+from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS
 
 
 DEF PADDING = 5
@@ -185,8 +187,6 @@ cdef class Doc:
         self.c = data_start + PADDING
         self.max_length = size
         self.length = 0
-        self.is_tagged = False
-        self.is_parsed = False
         self.sentiment = 0.0
         self.cats = {}
         self.user_hooks = {}
@@ -216,11 +216,6 @@ cdef class Doc:
             else:
                 lexeme = self.vocab.get_by_orth(self.mem, word)
             self.push_back(lexeme, has_space)
-        # Tough to decide on policy for this. Is an empty doc tagged and parsed?
-        # There's no information we'd like to add to it, so I guess so?
-        if self.length == 0:
-            self.is_tagged = True
-            self.is_parsed = True
 
     @property
     def _(self):
@@ -228,37 +223,61 @@ cdef class Doc:
         return Underscore(Underscore.doc_extensions, self)
 
     @property
-    def is_sentenced(self):
-        """Check if the document has sentence boundaries assigned. This is
-        defined as having at least one of the following:
+    def is_tagged(self):
+        warnings.warn(Warnings.W107.format(prop="is_tagged", attr="TAG"), DeprecationWarning)
+        return self.has_annotation("TAG")
 
-        a) An entry "sents" in doc.user_hooks";
-        b) Doc.is_parsed is set to True;
-        c) At least one token other than the first where sent_start is not None.
-        """
-        if "sents" in self.user_hooks:
-            return True
-        if self.is_parsed:
-            return True
-        if len(self) < 2:
-            return True
-        for i in range(1, self.length):
-            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
-                return True
-        return False
+    @property
+    def is_parsed(self):
+        warnings.warn(Warnings.W107.format(prop="is_parsed", attr="DEP"), DeprecationWarning)
+        return self.has_annotation("DEP")
 
     @property
     def is_nered(self):
-        """Check if the document has named entities set. Will return True if
-        *any* of the tokens has a named entity tag set (even if the others are
-        unknown values), or if the document is empty.
+        warnings.warn(Warnings.W107.format(prop="is_nered", attr="ENT_IOB"), DeprecationWarning)
+        return self.has_annotation("ENT_IOB")
+
+    @property
+    def is_sentenced(self):
+        warnings.warn(Warnings.W107.format(prop="is_sentenced", attr="SENT_START"), DeprecationWarning)
+        return self.has_annotation("SENT_START")
+
+    def has_annotation(self, attr, *, require_complete=False):
+        """Check whether the doc contains annotation on a token attribute.
+
+        attr (Union[int, str]): The attribute string name or int ID.
+        require_complete (bool): Whether to check that the attribute is set on
+            every token in the doc.
+        RETURNS (bool): Whether annotation is present.
+
+        DOCS: https://nightly.spacy.io/api/doc#has_annotation
         """
-        if len(self) == 0:
+
+        # empty docs are always annotated
+        if self.length == 0:
             return True
-        for i in range(self.length):
-            if self.c[i].ent_iob != 0:
+        cdef int i
+        cdef int range_start = 0
+        attr = intify_attr(attr)
+        # adjust attributes
+        if attr == HEAD:
+            # HEAD does not have an unset state, so rely on DEP
+            attr = DEP
+        elif attr == self.vocab.strings["IS_SENT_START"]:
+            # as in Matcher, allow IS_SENT_START as an alias of SENT_START
+            attr = SENT_START
+        # special cases for sentence boundaries
+        if attr == SENT_START:
+            if "sents" in self.user_hooks:
                 return True
-        return False
+            # docs of length 1 always have sentence boundaries
+            if self.length == 1:
+                return True
+            range_start = 1
+        if require_complete:
+            return all(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length))
+        else:
+            return any(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length))
 
     def __getitem__(self, object i):
         """Get a `Token` or `Span` object.
@@ -628,7 +647,7 @@ cdef class Doc:
 
         DOCS: https://nightly.spacy.io/api/doc#sents
         """
-        if not self.is_sentenced:
+        if not self.has_annotation("SENT_START"):
             raise ValueError(Errors.E030)
         if "sents" in self.user_hooks:
             yield from self.user_hooks["sents"](self)
@@ -652,10 +671,6 @@ cdef class Doc:
         return self.vocab.lang
 
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
-        if self.length == 0:
-            # Flip these to false when we see the first token.
-            self.is_tagged = False
-            self.is_parsed = False
         if self.length == self.max_length:
             self._realloc(self.length * 2)
         cdef TokenC* t = &self.c[self.length]
@@ -802,8 +817,8 @@ cdef class Doc:
         if array.dtype != numpy.uint64:
             warnings.warn(Warnings.W028.format(type=array.dtype))
 
-        if SENT_START in attrs and HEAD in attrs:
-            raise ValueError(Errors.E032)
+        if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs:
+            warnings.warn(Warnings.W106)
         cdef int i, col
         cdef int32_t abs_head_index
         cdef attr_id_t attr_id
@@ -863,18 +878,17 @@ cdef class Doc:
                     # add morph to morphology table
                     self.vocab.morphology.add(self.vocab.strings[value])
                 Token.set_struct_attr(token, attr_ids[j], value)
-        # Set flags
-        self.is_parsed = bool(self.is_parsed or HEAD in attrs)
-        self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
-        # If document is parsed, set children
-        if self.is_parsed:
-            set_children_from_heads(self.c, 0, length)
+        # If document is parsed, set children and sentence boundaries
+        if HEAD in attrs and DEP in attrs:
+            col = attrs.index(DEP)
+            if array[:, col].any():
+                set_children_from_heads(self.c, 0, length)
         return self
 
     @staticmethod
     def from_docs(docs, ensure_whitespace=True, attrs=None):
-        """Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share
-        the same `Vocab`.
+        """Concatenate multiple Doc objects to form a new one. Raises an error
+        if the `Doc` objects do not all share the same `Vocab`.
 
         docs (list): A list of Doc objects.
         ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace.
@@ -892,16 +906,7 @@ cdef class Doc:
         (vocab,) = vocab
 
         if attrs is None:
-            attrs = [LEMMA, NORM]
-            if all(doc.is_nered for doc in docs):
-                attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE])
-            # TODO: separate for is_morphed?
-            if all(doc.is_tagged for doc in docs):
-                attrs.extend([TAG, POS, MORPH])
-            if all(doc.is_parsed for doc in docs):
-                attrs.extend([HEAD, DEP])
-            else:
-                attrs.append(SENT_START)
+            attrs = Doc._get_array_attrs()
         else:
             if any(isinstance(attr, str) for attr in attrs):     # resolve attribute names
                 attrs = [intify_attr(attr) for attr in attrs]    # intify_attr returns None for invalid attrs
@@ -973,9 +978,6 @@ cdef class Doc:
         other.tensor = copy.deepcopy(self.tensor)
         other.cats = copy.deepcopy(self.cats)
         other.user_data = copy.deepcopy(self.user_data)
-        other.is_tagged = self.is_tagged
-        other.is_parsed = self.is_parsed
-        other.is_morphed = self.is_morphed
         other.sentiment = self.sentiment
         other.has_unknown_spaces = self.has_unknown_spaces
         other.user_hooks = dict(self.user_hooks)
@@ -1049,22 +1051,16 @@ cdef class Doc:
 
         DOCS: https://nightly.spacy.io/api/doc#to_bytes
         """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID]
-        if self.is_tagged:
-            array_head.extend([TAG, POS])
-        # If doc parsed add head and dep attribute
-        if self.is_parsed:
-            array_head.extend([HEAD, DEP])
-        # Otherwise add sent_start
-        else:
-            array_head.append(SENT_START)
+        array_head = Doc._get_array_attrs()
         strings = set()
         for token in self:
             strings.add(token.tag_)
             strings.add(token.lemma_)
+            strings.add(token.morph_)
             strings.add(token.dep_)
             strings.add(token.ent_type_)
             strings.add(token.ent_kb_id_)
+            strings.add(token.ent_id_)
             strings.add(token.norm_)
         # Msgpack doesn't distinguish between lists and tuples, which is
         # vexing for user data. As a best guess, we *know* that within
@@ -1214,22 +1210,29 @@ cdef class Doc:
         DOCS: https://nightly.spacy.io/api/doc#to_json
         """
         data = {"text": self.text}
-        if self.is_nered:
+        if self.has_annotation("ENT_IOB"):
             data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
                             "label": ent.label_} for ent in self.ents]
-        if self.is_sentenced:
+        if self.has_annotation("SENT_START"):
             sents = list(self.sents)
             data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
                              for sent in sents]
         if self.cats:
             data["cats"] = self.cats
         data["tokens"] = []
+        attrs = ["TAG", "MORPH", "POS", "LEMMA", "DEP"]
+        include_annotation = {attr: self.has_annotation(attr) for attr in attrs}
         for token in self:
             token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
-            if self.is_tagged:
-                token_data["pos"] = token.pos_
+            if include_annotation["TAG"]:
                 token_data["tag"] = token.tag_
-            if self.is_parsed:
+            if include_annotation["POS"]:
+                token_data["pos"] = token.pos_
+            if include_annotation["MORPH"]:
+                token_data["morph"] = token.morph_
+            if include_annotation["LEMMA"]:
+                token_data["lemma"] = token.lemma_
+            if include_annotation["DEP"]:
                 token_data["dep"] = token.dep_
                 token_data["head"] = token.head.i
             data["tokens"].append(token_data)
@@ -1275,6 +1278,12 @@ cdef class Doc:
                     j += 1
         return output
 
+    @staticmethod
+    def _get_array_attrs():
+        attrs = [LENGTH, SPACY]
+        attrs.extend(intify_attr(x) for x in DOCBIN_ALL_ATTRS)
+        return tuple(attrs)
+
 
 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
     cdef int i = token_by_char(tokens, length, start_char)
@@ -1335,7 +1344,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
         tokens[i].sent_start = -1
     for i in range(start, end):
         if tokens[i].head == 0:
-            tokens[tokens[i].l_edge].sent_start = True
+            tokens[tokens[i].l_edge].sent_start = 1
 
 
 cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 1f42c84ee..781474d3a 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -201,7 +201,7 @@ cdef class Span:
         return Underscore(Underscore.span_extensions, self,
                           start=self.start_char, end=self.end_char)
 
-    def as_doc(self, bint copy_user_data=False):
+    def as_doc(self, *, bint copy_user_data=False):
         """Create a `Doc` object with a copy of the `Span`'s data.
 
         copy_user_data (bool): Whether or not to copy the original doc's user data.
@@ -209,19 +209,10 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#as_doc
         """
-        # TODO: make copy_user_data a keyword-only argument (Python 3 only)
         words = [t.text for t in self]
         spaces = [bool(t.whitespace_) for t in self]
         cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID]
-        if self.doc.is_tagged:
-            array_head.append(TAG)
-        # If doc parsed add head and dep attribute
-        if self.doc.is_parsed:
-            array_head.extend([HEAD, DEP])
-        # Otherwise add sent_start
-        else:
-            array_head.append(SENT_START)
+        array_head = self.doc._get_array_attrs()
         array = self.doc.to_array(array_head)
         array = array[self.start : self.end]
         self._fix_dep_copy(array_head, array)
@@ -375,7 +366,7 @@ cdef class Span:
         self.doc.sents
         # Use `sent_start` token attribute to find sentence boundaries
         cdef int n = 0
-        if self.doc.is_sentenced:
+        if self.doc.has_annotation("SENT_START"):
             # Find start of the sentence
             start = self.start
             while self.doc.c[start].sent_start != 1 and start > 0:
@@ -507,8 +498,6 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#noun_chunks
         """
-        if not self.doc.is_parsed:
-            raise ValueError(Errors.E029)
         # Accumulate the result before beginning to iterate over it. This
         # prevents the tokenisation from being changed out from under us
         # during the iteration. The tricky thing here is that Span accepts
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 35142c35e..239de4559 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -486,7 +486,7 @@ cdef class Token:
                 return True
 
         def __set__(self, value):
-            if self.doc.is_parsed:
+            if self.doc.has_annotation("DEP"):
                 raise ValueError(Errors.E043)
             if value is None:
                 self.c.sent_start = 0
diff --git a/spacy/training/converters/conllu2docs.py b/spacy/training/converters/conllu2docs.py
index 85afdeef3..ebd123375 100644
--- a/spacy/training/converters/conllu2docs.py
+++ b/spacy/training/converters/conllu2docs.py
@@ -212,8 +212,6 @@ def doc_from_conllu_sentence(
         doc[i]._.merged_spaceafter = spaces[i]
     ents = get_entities(lines, ner_tag_pattern, ner_map)
     doc.ents = spans_from_biluo_tags(doc, ents)
-    doc.is_parsed = True
-    doc.is_tagged = True
 
     if merge_subtokens:
         doc = merge_conllu_subtokens(lines, doc)
@@ -243,8 +241,6 @@ def doc_from_conllu_sentence(
         doc_x[i].dep_ = deps[i]
         doc_x[i].head = doc_x[heads[i]]
     doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
-    doc_x.is_parsed = True
-    doc_x.is_tagged = True
 
     return doc_x
 
diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx
index 5dc39eb31..b58df0d71 100644
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@@ -33,19 +33,25 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
                 link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
                 json_para["links"].append(link_dict)
         biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
+        attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
+        include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
         for j, sent in enumerate(doc.sents):
             json_sent = {"tokens": [], "brackets": []}
             for token in sent:
                 json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
-                if doc.is_tagged:
+                if include_annotation["TAG"]:
                     json_token["tag"] = token.tag_
+                if include_annotation["POS"]:
                     json_token["pos"] = token.pos_
+                if include_annotation["MORPH"]:
                     json_token["morph"] = token.morph_
+                if include_annotation["LEMMA"]:
                     json_token["lemma"] = token.lemma_
-                if doc.is_parsed:
+                if include_annotation["DEP"]:
                     json_token["head"] = token.head.i-token.i
                     json_token["dep"] = token.dep_
-                json_token["ner"] = biluo_tags[token.i]
+                if include_annotation["ENT_IOB"]:
+                    json_token["ner"] = biluo_tags[token.i]
                 json_sent["tokens"].append(json_token)
             json_para["sentences"].append(json_sent)
         json_doc["paragraphs"].append(json_para)
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 88dc62c2a..380f6a172 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -267,6 +267,17 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
 | ----------- | -------------------------------------------------------------------------------------- |
 | **RETURNS** | The lowest common ancestor matrix of the `Doc`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ |
 
+## Doc.has_annotation {#has_annotation tag="method"}
+
+Check whether the doc contains annotation on a token attribute.
+
+| Name               | Description                                                                                         |
+| ------------------ | --------------------------------------------------------------------------------------------------- |
+| `attr`             | The attribute string name or int ID. ~~Union[int, str]~~                                            |
+| _keyword-only_     |                                                                                                     |
+| `require_complete` | Whether to check that the attribute is set on every token in the doc. Defaults to `False`. ~~bool~~ |
+| **RETURNS**        | Whether specified annotation is present in the doc. ~~bool~~                                        |
+
 ## Doc.to_array {#to_array tag="method"}
 
 Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence
@@ -609,26 +620,22 @@ The L2 norm of the document's vector representation.
 
 ## Attributes {#attributes}
 
-| Name                                    | Description                                                                                                                                                                              |
-| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `text`                                  | A string representation of the document text. ~~str~~                                                                                                                                    |
-| `text_with_ws`                          | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~                                                                                            |
-| `mem`                                   | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~                                                                                                                 |
-| `vocab`                                 | The store of lexical types. ~~Vocab~~                                                                                                                                                    |
-| `tensor` <Tag variant="new">2</Tag>     | Container for dense vector representations. ~~numpy.ndarray~~                                                                                                                            |
-| `cats` <Tag variant="new">2</Tag>       | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~                                              |
-| `user_data`                             | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                                                                         |
-| `lang` <Tag variant="new">2.1</Tag>     | Language of the document's vocabulary. ~~int~~                                                                                                                                           |
-| `lang_` <Tag variant="new">2.1</Tag>    | Language of the document's vocabulary. ~~str~~                                                                                                                                           |
-| `is_tagged`                             | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. ~~bool~~                                                                       |
-| `is_parsed`                             | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. ~~bool~~                                                                        |
-| `is_sentenced`                          | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. ~~bool~~                                                             |
-| `is_nered` <Tag variant="new">2.1</Tag> | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. ~~bool~~ |
-| `sentiment`                             | The document's positivity/negativity score, if available. ~~float~~                                                                                                                      |
-| `user_hooks`                            | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                                                                |
-| `user_token_hooks`                      | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                                                                        |
-| `user_span_hooks`                       | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                                                                         |
-| `_`                                     | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                                                            |
+| Name                                 | Description                                                                                                                                 |
+| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `text`                               | A string representation of the document text. ~~str~~                                                                                       |
+| `text_with_ws`                       | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~                                               |
+| `mem`                                | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~                                                                    |
+| `vocab`                              | The store of lexical types. ~~Vocab~~                                                                                                       |
+| `tensor` <Tag variant="new">2</Tag>  | Container for dense vector representations. ~~numpy.ndarray~~                                                                               |
+| `cats` <Tag variant="new">2</Tag>    | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ |
+| `user_data`                          | A generic storage area, for user custom data. ~~Dict[str, Any]~~                                                                            |
+| `lang` <Tag variant="new">2.1</Tag>  | Language of the document's vocabulary. ~~int~~                                                                                              |
+| `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~                                                                                              |
+| `sentiment`                          | The document's positivity/negativity score, if available. ~~float~~                                                                         |
+| `user_hooks`                         | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                   |
+| `user_token_hooks`                   | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                           |
+| `user_span_hooks`                    | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                            |
+| `_`                                  | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~               |
 
 ## Serialization fields {#serialization-fields}
 
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 44810da58..346b44600 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -410,6 +410,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | [`Token.lex`](/api/token#attributes)                                                                                            | Access a token's [`Lexeme`](/api/lexeme).                                                                                                                                                        |
 | [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes)                                                 | Access a token's morphological analysis.                                                                                                                                                         |
+| [`Doc.has_annotation`](/api/doc#has_annotation)                                                                                 | Check whether a doc has annotation on a token attribute.                                                                                                                                         |
 | [`Language.select_pipes`](/api/language#select_pipes)                                                                           | Context manager for enabling or disabling specific pipeline components for a block.                                                                                                              |
 | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe)                      | Disable or enable a loaded pipeline component (but don't remove it).                                                                                                                             |
 | [`Language.analyze_pipes`](/api/language#analyze_pipes)                                                                         | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies.                                                                                                          |
@@ -763,6 +764,25 @@ nlp = spacy.blank("en")
 + ruler.load_from_tag_map(YOUR_TAG_MAP)
 ```
 
+### Migrating Doc flags {#migrating-doc-flags}
+
+The `Doc` flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and
+`Doc.is_sentenced` are deprecated in v3 and replaced by the
+[`Doc.has_annotation`](/api/doc#has_annotation) method, which refers to the
+token attribute symbols (the same symbols used in `Matcher` patterns):
+
+```diff
+doc = nlp(text)
+- doc.is_parsed
++ doc.has_annotation("DEP")
+- doc.is_tagged
++ doc.has_annotation("TAG")
+- doc.is_sentenced
++ doc.has_annotation("SENT_START")
+- doc.is_nered
++ doc.has_annotation("ENT_IOB")
+```
+
 ### Training pipelines and models {#migrating-training}
 
 To train your pipelines, you should now pretty much always use the

From 8303d101a5327e96ecddb28d7dc668d75db56b50 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 17 Sep 2020 00:18:49 +0200
Subject: [PATCH 020/516] Set version to v3.0.0a19

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 4ed3dd327..4fb6dfff1 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a18"
+__version__ = "3.0.0a19"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 21dcf92964c6a2c4218d5ffc44a164dead641c44 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 17 Sep 2020 09:21:36 +0200
Subject: [PATCH 021/516] Update website/docs/api/data-formats.md

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
---
 website/docs/api/data-formats.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 74d612862..cf091e16c 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -130,7 +130,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                                                                |
 | `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                                          |
-| `corpus`              | Dictionary with `train` and `develop` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ |
+| `corpus`              | Dictionary with `train` and `dev` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ |
 | `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                                                                        |
 | `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                             |
 | `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                                       |

From 0c35885751f2ad83098f54103de33b987b4a199e Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 17 Sep 2020 11:38:59 +0200
Subject: [PATCH 022/516] generalize corpora, dot notation for dev and train
 corpus

---
 extra/experiments/onto-joint/defaults.cfg     |  34 +++---
 .../ptb-joint-pos-dep/defaults.cfg            |  32 +++---
 spacy/cli/pretrain.py                         |   3 +-
 spacy/cli/templates/quickstart_training.jinja |  27 ++---
 spacy/cli/train.py                            |   5 +-
 spacy/default_config.cfg                      |  56 +++++----
 spacy/default_config_pretraining.cfg          |  17 +--
 spacy/schemas.py                              |   6 +-
 .../tests/serialize/test_serialize_config.py  |  20 ++--
 spacy/tests/training/test_readers.py          |  63 ++++++++++-
 website/docs/api/cli.md                       |  20 ++--
 website/docs/api/corpus.md                    |   4 +-
 website/docs/api/data-formats.md              | 107 +++++++++++++-----
 website/docs/api/top-level.md                 |   6 +-
 website/docs/usage/projects.md                |   2 +-
 website/docs/usage/training.md                |   2 +-
 16 files changed, 261 insertions(+), 143 deletions(-)

diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg
index 97eebe6b4..90101281c 100644
--- a/extra/experiments/onto-joint/defaults.cfg
+++ b/extra/experiments/onto-joint/defaults.cfg
@@ -8,6 +8,22 @@ init_tok2vec = null
 seed = 0
 use_pytorch_for_gpu_memory = false
 
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths:train}
+gold_preproc = true
+max_length = 0
+limit = 0
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths:dev}
+gold_preproc = ${corpora.train.gold_preproc}
+max_length = 0
+limit = 0
+
 [training]
 seed = ${system:seed}
 dropout = 0.1
@@ -20,22 +36,8 @@ patience = 10000
 eval_frequency = 200
 score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2}
 frozen_components = []
-
-[training.corpus]
-
-[training.corpus.train]
-@readers = "spacy.Corpus.v1"
-path = ${paths:train}
-gold_preproc = true
-max_length = 0
-limit = 0
-
-[training.corpus.dev]
-@readers = "spacy.Corpus.v1"
-path = ${paths:dev}
-gold_preproc = ${training.read_train:gold_preproc}
-max_length = 0
-limit = 0
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
 
 [training.batcher]
 @batchers = "spacy.batch_by_words.v1"
diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg
index 03e2f5bd7..55fb52b99 100644
--- a/extra/experiments/ptb-joint-pos-dep/defaults.cfg
+++ b/extra/experiments/ptb-joint-pos-dep/defaults.cfg
@@ -8,6 +8,22 @@ init_tok2vec = null
 seed = 0
 use_pytorch_for_gpu_memory = false
 
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths:train}
+gold_preproc = true
+max_length = 0
+limit = 0
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths:dev}
+gold_preproc = ${corpora.train.gold_preproc}
+max_length = 0
+limit = 0
+
 [training]
 seed = ${system:seed}
 dropout = 0.2
@@ -20,22 +36,6 @@ patience = 10000
 eval_frequency = 200
 score_weights = {"dep_las": 0.8, "tag_acc": 0.2}
 
-[training.corpus]
-
-[training.corpus.train]
-@readers = "spacy.Corpus.v1"
-path = ${paths:train}
-gold_preproc = true
-max_length = 0
-limit = 0
-
-[training.corpus.dev]
-@readers = "spacy.Corpus.v1"
-path = ${paths:dev}
-gold_preproc = ${training.read_train:gold_preproc}
-max_length = 0
-limit = 0
-
 [training.batcher]
 @batchers = "spacy.batch_by_words.v1"
 discard_oversize = false
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 70858123d..3567e7339 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -20,6 +20,7 @@ from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..tokens import Doc
 from ..attrs import ID
 from .. import util
+from ..util import dot_to_object
 
 
 @app.command(
@@ -106,7 +107,7 @@ def pretrain(
         use_pytorch_for_gpu_memory()
     nlp, config = util.load_model_from_config(config)
     P_cfg = config["pretraining"]
-    corpus = P_cfg["corpus"]
+    corpus = dot_to_object(config, config["pretraining"]["corpus"])
     batcher = P_cfg["batcher"]
     model = create_pretraining_model(nlp, config["pretraining"])
     optimizer = config["pretraining"]["optimizer"]
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 39d4d875d..00b77af4d 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -173,6 +173,18 @@ factory = "{{ pipe }}"
 {% endif %}
 {% endfor %}
 
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = {{ 500 if hardware == "gpu" else 2000 }}
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+
 [training]
 {% if use_transformer or optimize == "efficiency" or not word_vectors -%}
 vectors = null
@@ -182,11 +194,12 @@ vectors = "{{ word_vectors }}"
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
 {% endif %}
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
 
 [training.optimizer]
 @optimizers = "Adam.v1"
 
-
 {% if use_transformer -%}
 [training.optimizer.learn_rate]
 @schedules = "warmup_linear.v1"
@@ -195,18 +208,6 @@ total_steps = 20000
 initial_rate = 5e-5
 {% endif %}
 
-[training.corpus]
-
-[training.corpus.train]
-@readers = "spacy.Corpus.v1"
-path = ${paths.train}
-max_length = {{ 500 if hardware == "gpu" else 2000 }}
-
-[training.corpus.dev]
-@readers = "spacy.Corpus.v1"
-path = ${paths.dev}
-max_length = 0
-
 {% if use_transformer %}
 [training.batcher]
 @batchers = "spacy.batch_by_padded.v1"
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 2c2eeb88b..15c745b69 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -18,6 +18,7 @@ from ..language import Language
 from .. import util
 from ..training.example import Example
 from ..errors import Errors
+from ..util import dot_to_object
 
 
 @app.command(
@@ -92,8 +93,8 @@ def train(
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
     T_cfg = config["training"]
     optimizer = T_cfg["optimizer"]
-    train_corpus = T_cfg["corpus"]["train"]
-    dev_corpus = T_cfg["corpus"]["dev"]
+    train_corpus = dot_to_object(config, config["training"]["train_corpus"])
+    dev_corpus = dot_to_object(config, config["training"]["dev_corpus"])
     batcher = T_cfg["batcher"]
     train_logger = T_cfg["logger"]
     # Components that shouldn't be updated during training
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 61f3dfe25..c7c9593d7 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -22,6 +22,33 @@ after_pipeline_creation = null
 
 [components]
 
+# Readers for corpora like dev and train.
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+# Whether to train on sequences with 'gold standard' sentence boundaries
+# and tokens. If you set this to true, take care to ensure your run-time
+# data is passed in sentence-by-sentence via some prior preprocessing.
+gold_preproc = false
+# Limitations on training document length
+max_length = 0
+# Limitation on number of training examples
+limit = 0
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+# Whether to train on sequences with 'gold standard' sentence boundaries
+# and tokens. If you set this to true, take care to ensure your run-time
+# data is passed in sentence-by-sentence via some prior preprocessing.
+gold_preproc = false
+# Limitations on training document length
+max_length = 0
+# Limitation on number of training examples
+limit = 0
+
 # Training hyper-parameters and additional features.
 [training]
 seed = ${system.seed}
@@ -40,35 +67,14 @@ eval_frequency = 200
 score_weights = {}
 # Names of pipeline components that shouldn't be updated during training
 frozen_components = []
+# Location in the config where the dev corpus is defined
+dev_corpus = "corpora.dev"
+# Location in the config where the train corpus is defined
+train_corpus = "corpora.train"
 
 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"
 
-[training.corpus]
-
-[training.corpus.train]
-@readers = "spacy.Corpus.v1"
-path = ${paths.train}
-# Whether to train on sequences with 'gold standard' sentence boundaries
-# and tokens. If you set this to true, take care to ensure your run-time
-# data is passed in sentence-by-sentence via some prior preprocessing.
-gold_preproc = false
-# Limitations on training document length
-max_length = 0
-# Limitation on number of training examples
-limit = 0
-
-[training.corpus.dev]
-@readers = "spacy.Corpus.v1"
-path = ${paths.dev}
-# Whether to train on sequences with 'gold standard' sentence boundaries
-# and tokens. If you set this to true, take care to ensure your run-time
-# data is passed in sentence-by-sentence via some prior preprocessing.
-gold_preproc = false
-# Limitations on training document length
-max_length = 0
-# Limitation on number of training examples
-limit = 0
 
 [training.batcher]
 @batchers = "spacy.batch_by_words.v1"
diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg
index 9120db338..bbd595308 100644
--- a/spacy/default_config_pretraining.cfg
+++ b/spacy/default_config_pretraining.cfg
@@ -4,6 +4,7 @@ dropout = 0.2
 n_save_every = null
 component = "tok2vec"
 layer = ""
+corpus = "corpora.pretrain"
 
 [pretraining.batcher]
 @batchers = "spacy.batch_by_words.v1"
@@ -12,13 +13,6 @@ discard_oversize = false
 tolerance = 0.2
 get_length = null
 
-[pretraining.corpus]
-@readers = "spacy.JsonlReader.v1"
-path = ${paths.raw}
-min_length = 5
-max_length = 500
-limit = 0
-
 [pretraining.objective]
 type = "characters"
 n_characters = 4
@@ -33,3 +27,12 @@ grad_clip = 1.0
 use_averages = true
 eps = 1e-8
 learn_rate = 0.001
+
+[corpora]
+
+[corpora.pretrain]
+@readers = "spacy.JsonlReader.v1"
+path = ${paths.raw}
+min_length = 5
+max_length = 500
+limit = 0
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 2030048d8..a530db3d0 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -198,7 +198,8 @@ class ModelMetaSchema(BaseModel):
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
-    corpus: Dict[str, Reader] = Field(..., title="Reader for the training and dev data")
+    dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
+    train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
     dropout: StrictFloat = Field(..., title="Dropout rate")
     patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score")
@@ -248,7 +249,7 @@ class ConfigSchemaPretrain(BaseModel):
     dropout: StrictFloat = Field(..., title="Dropout rate")
     n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
     optimizer: Optimizer = Field(..., title="The optimizer to use")
-    corpus: Reader = Field(..., title="Reader for the training data")
+    corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
     component: str = Field(..., title="Component to find the layer to pretrain")
     layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
@@ -267,6 +268,7 @@ class ConfigSchema(BaseModel):
     nlp: ConfigSchemaNlp
     pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
     components: Dict[str, Dict[str, Any]]
+    corpora: Dict[str, Reader]
 
     @root_validator(allow_reuse=True)
     def validate_config(cls, values):
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index d113ac2a5..1e17b3212 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -17,18 +17,18 @@ nlp_config_string = """
 train = ""
 dev = ""
 
-[training]
+[corpora]
 
-[training.corpus]
-
-[training.corpus.train]
+[corpora.train]
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
 
-[training.corpus.dev]
+[corpora.dev]
 @readers = "spacy.Corpus.v1"
 path = ${paths.dev}
 
+[training]
+
 [training.batcher]
 @batchers = "spacy.batch_by_words.v1"
 size = 666
@@ -302,20 +302,20 @@ def test_config_overrides():
 
 def test_config_interpolation():
     config = Config().from_str(nlp_config_string, interpolate=False)
-    assert config["training"]["corpus"]["train"]["path"] == "${paths.train}"
+    assert config["corpora"]["train"]["path"] == "${paths.train}"
     interpolated = config.interpolate()
-    assert interpolated["training"]["corpus"]["train"]["path"] == ""
+    assert interpolated["corpora"]["train"]["path"] == ""
     nlp = English.from_config(config)
-    assert nlp.config["training"]["corpus"]["train"]["path"] == "${paths.train}"
+    assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
     # Ensure that variables are preserved in nlp config
     width = "${components.tok2vec.model.width}"
     assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
     assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
     interpolated2 = nlp.config.interpolate()
-    assert interpolated2["training"]["corpus"]["train"]["path"] == ""
+    assert interpolated2["corpora"]["train"]["path"] == ""
     assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
     nlp2 = English.from_config(interpolated)
-    assert nlp2.config["training"]["corpus"]["train"]["path"] == ""
+    assert nlp2.config["corpora"]["train"]["path"] == ""
     assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
 
 
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index c81ec0897..52a4abecc 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -1,6 +1,57 @@
+from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config
-from spacy.util import load_model_from_config
+
+from spacy import Language
+from spacy.util import load_model_from_config, registry, dot_to_object
+from spacy.training import Example
+
+
+def test_readers():
+    """Check that a reader returning a dict of corpora resolves via dotted paths."""
+    config_string = """
+    [training]
+    
+    [corpora]
+    @readers = "myreader.v1"
+
+    [nlp]
+    lang = "en"
+    pipeline = ["tok2vec", "textcat"]
+    
+    [components]
+    
+    [components.tok2vec]
+    factory = "tok2vec"
+    
+    [components.textcat]
+    factory = "textcat"
+    """
+    @registry.readers.register("myreader.v1")
+    def myreader() -> Dict[str, Callable[[Language], Iterable[Example]]]:
+        annots = {"cats": {"POS": 1.0, "NEG": 0.0}}
+        def reader(nlp: Language):
+            doc = nlp.make_doc("This is an example")
+            return [Example.from_dict(doc, annots)]
+        return {"train": reader, "dev": reader, "extra": reader, "something": reader}
+
+    config = Config().from_str(config_string)
+    nlp, resolved = load_model_from_config(config, auto_fill=True)
+    # [training] is empty above: the corpus paths come from the auto-filled config
+    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
+    assert callable(train_corpus)
+    optimizer = resolved["training"]["optimizer"]
+    # simulate a training loop
+    nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    for example in train_corpus(nlp):
+        nlp.update([example], sgd=optimizer)
+    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
+    scores = nlp.evaluate(list(dev_corpus(nlp)))
+    assert scores["cats_score"]
+    # ensure the pipeline runs
+    doc = nlp("Quick test")
+    assert doc.cats
+    assert callable(resolved["corpora"]["extra"])
 
 
 @pytest.mark.slow
@@ -16,7 +67,7 @@ def test_cat_readers(reader, additional_config):
     nlp_config_string = """
     [training]
     
-    [training.corpus]
+    [corpora]
     @readers = "PLACEHOLDER"
 
     [nlp]
@@ -32,11 +83,11 @@ def test_cat_readers(reader, additional_config):
     factory = "textcat"
     """
     config = Config().from_str(nlp_config_string)
-    config["training"]["corpus"]["@readers"] = reader
-    config["training"]["corpus"].update(additional_config)
+    config["corpora"]["@readers"] = reader
+    config["corpora"].update(additional_config)
     nlp, resolved = load_model_from_config(config, auto_fill=True)
 
-    train_corpus = resolved["training"]["corpus"]["train"]
+    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
     optimizer = resolved["training"]["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
@@ -46,7 +97,7 @@ def test_cat_readers(reader, additional_config):
         assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
         nlp.update([example], sgd=optimizer)
     # simulate performance benchmark on dev corpus
-    dev_corpus = resolved["training"]["corpus"]["dev"]
+    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
     dev_examples = list(dev_corpus(nlp))
     for example in dev_examples:
         # this shouldn't fail if each dev example has at least one positive label
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 7dd6e6184..5c5eb6486 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -355,6 +355,16 @@ Registry   @architectures
 Name       spacy.MaxoutWindowEncoder.v1
 Module     spacy.ml.models.tok2vec
 File       /path/to/spacy/ml/models/tok2vec.py (line 207)
+ℹ [corpora.dev]
+Registry   @readers
+Name       spacy.Corpus.v1
+Module     spacy.training.corpus
+File       /path/to/spacy/training/corpus.py (line 18)
+ℹ [corpora.train]
+Registry   @readers
+Name       spacy.Corpus.v1
+Module     spacy.training.corpus
+File       /path/to/spacy/training/corpus.py (line 18)
 ℹ [training.logger]
 Registry   @loggers
 Name       spacy.ConsoleLogger.v1
@@ -370,16 +380,6 @@ Registry   @schedules
 Name       compounding.v1
 Module     thinc.schedules
 File       /path/to/thinc/thinc/schedules.py (line 43)
-ℹ [training.corpus.dev]
-Registry   @readers
-Name       spacy.Corpus.v1
-Module     spacy.training.corpus
-File       /path/to/spacy/training/corpus.py (line 18)
-ℹ [training.corpus.train]
-Registry   @readers
-Name       spacy.Corpus.v1
-Module     spacy.training.corpus
-File       /path/to/spacy/training/corpus.py (line 18)
 ℹ [training.optimizer]
 Registry   @optimizers
 Name       Adam.v1
diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md
index c25ce1651..2b308d618 100644
--- a/website/docs/api/corpus.md
+++ b/website/docs/api/corpus.md
@@ -26,7 +26,7 @@ streaming.
 > [paths]
 > train = "corpus/train.spacy"
 >
-> [training.corpus.train]
+> [corpora.train]
 > @readers = "spacy.Corpus.v1"
 > path = ${paths.train}
 > gold_preproc = false
@@ -135,7 +135,7 @@ Initialize the reader.
 >
 > ```ini
 > ### Example config
-> [pretraining.corpus]
+> [corpora.pretrain]
 > @readers = "spacy.JsonlReader.v1"
 > path = "corpus/raw_text.jsonl"
 > min_length = 0
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index cf091e16c..f868233c7 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -121,28 +121,78 @@ that you don't want to hard-code in your config file.
 $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
 ```
 
+### corpora {#config-corpora tag="section"}
+
+This section defines a dictionary mapping of string keys to `Callable`
+functions. Each callable takes an `nlp` object and yields
+[`Example`](/api/example) objects. By default, the two keys `train` and `dev`
+are specified and each refers to a [`Corpus`](/api/top-level#Corpus). When
+pretraining, an additional pretrain section is added that defaults to a
+[`JsonlReader`](/api/top-level#JsonlReader).
+
+The `corpora` section can be extended with additional subsections, each
+referring to a callback of type `Callable[[Language], Iterator[Example]]`:
+
+> #### Example
+>
+> ```ini
+> [corpora]
+> [corpora.train]
+> @readers = "spacy.Corpus.v1"
+> path = ${paths.train}
+>
+> [corpora.dev]
+> @readers = "spacy.Corpus.v1"
+> path = ${paths.dev}
+>
+> [corpora.pretrain]
+> @readers = "spacy.JsonlReader.v1"
+> path = ${paths.raw}
+> min_length = 5
+> max_length = 500
+>
+> [corpora.mydata]
+> @readers = "my_reader.v1"
+> shuffle = true
+> ```
+
+Alternatively, the `corpora` block could refer to one function with return type
+`Dict[str, Callable[[Language], Iterator[Example]]]`:
+
+> #### Example
+>
+> ```ini
+> [corpora]
+> @readers = "my_dict_reader.v1"
+> train_path = ${paths.train}
+> dev_path = ${paths.dev}
+> shuffle = true
+>
+> ```
+
 ### training {#config-training tag="section"}
 
 This section defines settings and controls for the training and evaluation
 process that are used when you run [`spacy train`](/api/cli#train).
 
-| Name                  | Description                                                                                                                                                                                                                                           |
-| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                                                                |
-| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                                          |
-| `corpus`              | Dictionary with `train` and `dev` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ |
-| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                                                                        |
-| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                             |
-| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                                       |
-| `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                                                                       |
-| `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                                                                       |
-| `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                                                             |
-| `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                               |
-| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~                                                                                                                                                       |
-| `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~                                                                                   |
-| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                                         |
-| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                                       |
-| `vectors`             | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~                                                                                      |
+| Name                  | Description                                                                                                                                                                                                  |
+| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                       |
+| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
+| `dev_corpus`          | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~                                                                                                              |
+| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                               |
+| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                    |
+| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                              |
+| `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                              |
+| `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                              |
+| `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                    |
+| `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                      |
+| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~                                                                                                              |
+| `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~                                          |
+| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                |
+| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                              |
+| `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                          |
+| `vectors`             | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~                                             |
 
 ### pretraining {#config-pretraining tag="section,optional"}
 
@@ -150,17 +200,18 @@ This section is optional and defines settings and controls for
 [language model pretraining](/usage/embeddings-transformers#pretraining). It's
 used when you run [`spacy pretrain`](/api/cli#pretrain).
 
-| Name           | Description                                                                                                                                                                                  |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `max_epochs`   | Maximum number of epochs. Defaults to `1000`. ~~int~~                                                                                                                                        |
-| `dropout`      | The dropout rate. Defaults to `0.2`. ~~float~~                                                                                                                                               |
-| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~                                                                                                                                      |
-| `objective`    | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~                                                                                       |
-| `optimizer`    | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                                                                |
-| `corpus`       | Callable that takes the current `nlp` object and yields [`Doc`](/api/doc) objects. Defaults to [`JsonlReader`](/api/top-level#JsonlReader). ~~Callable[[Language, str], Iterable[Example]]~~ |
-| `batcher`      | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                                                                                                            |
-| `component`    | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~                                                                                                                    |
-| `layer`        | The layer to pretrain. If empty, the whole component model will be used. ~~str~~                                                                                                             |
+| Name           | Description                                                                                            |
+| -------------- | ------------------------------------------------------------------------------------------------------ |
+| `max_epochs`   | Maximum number of epochs. Defaults to `1000`. ~~int~~                                                  |
+| `dropout`      | The dropout rate. Defaults to `0.2`. ~~float~~                                                         |
+| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~                                                |
+| `objective`    | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
+| `optimizer`    | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~          |
+| `corpus`       | Dot notation of the config location defining the pretraining corpus. Defaults to `corpora.pretrain`. ~~str~~ |
+| `batcher`      | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                      |
+| `component`    | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~                              |
+| `layer`        | The layer to pretrain. If empty, the whole component model will be used. ~~str~~                       |
+|                |
 
 ## Training data {#training}
 
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index be7994d5d..72b79de48 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -448,7 +448,7 @@ remain in the config file stored on your local system.
 > [training.logger]
 > @loggers = "spacy.WandbLogger.v1"
 > project_name = "monitor_spacy_training"
-> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"]
+> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
 > ```
 
 | Name                   | Description                                                                                                                           |
@@ -478,7 +478,7 @@ the [`Corpus`](/api/corpus) class.
 > [paths]
 > train = "corpus/train.spacy"
 >
-> [training.corpus.train]
+> [corpora.train]
 > @readers = "spacy.Corpus.v1"
 > path = ${paths.train}
 > gold_preproc = false
@@ -506,7 +506,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
 > [paths]
 > pretrain = "corpus/raw_text.jsonl"
 >
-> [pretraining.corpus]
+> [corpora.pretrain]
 > @readers = "spacy.JsonlReader.v1"
 > path = ${paths.pretrain}
 > min_length = 0
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 3a6bd4551..665caa15b 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -969,7 +969,7 @@ your results.
 > [training.logger]
 > @loggers = "spacy.WandbLogger.v1"
 > project_name = "monitor_spacy_training"
-> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"]
+> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
 > ```
 
 ![Screenshot: Visualized training results](../images/wandb1.jpg)
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index bba2e2853..c0f4caad7 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -746,7 +746,7 @@ as **config settings** – in this case, `source`.
 > #### config.cfg
 >
 > ```ini
-> [training.corpus.train]
+> [corpora.train]
 > @readers = "corpus_variants.v1"
 > source = "s3://your_bucket/path/data.csv"
 > ```

From 427dbecdd63706f9c6c55875d46ed570f5a6a48b Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 17 Sep 2020 11:48:04 +0200
Subject: [PATCH 023/516] cleanup and formatting

---
 spacy/cli/pretrain.py                | 14 +++++---------
 spacy/cli/train.py                   |  4 ++--
 spacy/schemas.py                     |  2 +-
 spacy/tests/training/test_readers.py |  3 +++
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 3567e7339..aec077eb7 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -71,9 +71,7 @@ def pretrain_cli(
 
     with show_validation_error(config_path):
         config = util.load_config(
-            config_path,
-            overrides=config_overrides,
-            interpolate=True
+            config_path, overrides=config_overrides, interpolate=True
         )
     if not config.get("pretraining"):
         # TODO: What's the solution here? How do we handle optional blocks?
@@ -84,7 +82,7 @@ def pretrain_cli(
 
     config.to_disk(output_dir / "config.cfg")
     msg.good("Saved config file in the output directory")
- 
+
     pretrain(
         config,
         output_dir,
@@ -99,7 +97,7 @@ def pretrain(
     output_dir: Path,
     resume_path: Optional[Path] = None,
     epoch_resume: Optional[int] = None,
-    use_gpu: int=-1
+    use_gpu: int = -1,
 ):
     if config["system"].get("seed") is not None:
         fix_random_seed(config["system"]["seed"])
@@ -107,7 +105,7 @@ def pretrain(
         use_pytorch_for_gpu_memory()
     nlp, config = util.load_model_from_config(config)
     P_cfg = config["pretraining"]
-    corpus = dot_to_object(config, config["pretraining"]["corpus"])
+    corpus = dot_to_object(config, P_cfg["corpus"])
     batcher = P_cfg["batcher"]
     model = create_pretraining_model(nlp, config["pretraining"])
     optimizer = config["pretraining"]["optimizer"]
@@ -148,9 +146,7 @@ def pretrain(
             progress = tracker.update(epoch, loss, docs)
             if progress:
                 msg.row(progress, **row_settings)
-            if P_cfg["n_save_every"] and (
-                batch_id % P_cfg["n_save_every"] == 0
-            ):
+            if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
                 _save_model(epoch, is_temp=True)
         _save_model(epoch)
         tracker.epoch_loss = 0.0
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 15c745b69..50306b350 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -93,8 +93,8 @@ def train(
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
     T_cfg = config["training"]
     optimizer = T_cfg["optimizer"]
-    train_corpus = dot_to_object(config, config["training"]["train_corpus"])
-    dev_corpus = dot_to_object(config, config["training"]["dev_corpus"])
+    train_corpus = dot_to_object(config, T_cfg["train_corpus"])
+    dev_corpus = dot_to_object(config, T_cfg["dev_corpus"])
     batcher = T_cfg["batcher"]
     train_logger = T_cfg["logger"]
     # Components that shouldn't be updated during training
diff --git a/spacy/schemas.py b/spacy/schemas.py
index a530db3d0..06bc4beed 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -104,7 +104,7 @@ class TokenPatternOperator(str, Enum):
 StringValue = Union[TokenPatternString, StrictStr]
 NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
 UnderscoreValue = Union[
-    TokenPatternString, TokenPatternNumber, str, int, float, list, bool,
+    TokenPatternString, TokenPatternNumber, str, int, float, list, bool
 ]
 
 
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index 52a4abecc..898746c2a 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -26,12 +26,15 @@ def test_readers():
     [components.textcat]
     factory = "textcat"
     """
+
     @registry.readers.register("myreader.v1")
     def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]:
         annots = {"cats": {"POS": 1.0, "NEG": 0.0}}
+
         def reader(nlp: Language):
             doc = nlp.make_doc(f"This is an example")
             return [Example.from_dict(doc, annots)]
+
         return {"train": reader, "dev": reader, "extra": reader, "something": reader}
 
     config = Config().from_str(config_string)

From 6761028c6f5b033109e3eed4a4b1b19218f55e40 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 17 Sep 2020 12:34:11 +0200
Subject: [PATCH 024/516] Update docs [ci skip]

---
 website/docs/usage/v3.md | 42 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 44810da58..72971dce2 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -709,6 +709,48 @@ nlp = spacy.blank("en")
 + nlp.add_pipe("ner", source=source_nlp)
 ```
 
+#### Configuring pipeline components with settings {#migrating-configure-pipe}
+
+Because pipeline components are now added using their string names, you won't
+have to instantiate the [component classes](/api/#architecture-pipeline)
+directly anymore. To configure the component, you can now use the `config`
+argument on [`nlp.add_pipe`](/api/language#add_pipe).
+
+> #### config.cfg (excerpt)
+>
+> ```ini
+> [components.sentencizer]
+> factory = "sentencizer"
+> punct_chars = ["!", ".", "?"]
+> ```
+
+```diff
+punct_chars = ["!", ".", "?"]
+- sentencizer = Sentencizer(punct_chars=punct_chars)
++ sentencizer = nlp.add_pipe("sentencizer", config={"punct_chars": punct_chars})
+```
+
+The `config` corresponds to the component settings in the
+[`config.cfg`](/usage/training#config-components) and will overwrite the default
+config defined by the components.
+
+<Infobox variant="warning" title="Important note on config values">
+
+Config values you pass to components **need to be JSON-serializable** and can't
+be arbitrary Python objects. Otherwise, the settings you provide can't be
+represented in the `config.cfg` and spaCy has no way of knowing how to re-create
+your component with the same settings when you load the pipeline back in. If you
+need to pass arbitrary objects to a component, use a
+[registered function](/usage/processing-pipelines#example-stateful-components):
+
+```diff
+- config = {"model": MyTaggerModel()}
++ config = {"model": {"@architectures": "MyTaggerModel"}}
+tagger = nlp.add_pipe("tagger", config=config)
+```
+
+</Infobox>
+
 ### Adding match patterns {#migrating-matcher}
 
 The [`Matcher.add`](/api/matcher#add),

From 30e85b2a42cdd827bc48411371ebe79b092009a2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 17 Sep 2020 13:59:12 +0200
Subject: [PATCH 025/516] Remove outdated configs

---
 extra/experiments/onto-joint/defaults.cfg     | 133 ---------------
 extra/experiments/onto-joint/pretrain.cfg     | 152 ------------------
 extra/experiments/onto-ner.cfg                |  73 ---------
 .../ptb-joint-pos-dep/bilstm_tok2vec.cfg      |  73 ---------
 .../ptb-joint-pos-dep/defaults.cfg            | 110 -------------
 .../tok2vec-ner/charembed_tok2vec.cfg         |  69 --------
 .../tok2vec-ner/multihashembed_tok2vec.cfg    |  51 ------
 7 files changed, 661 deletions(-)
 delete mode 100644 extra/experiments/onto-joint/defaults.cfg
 delete mode 100644 extra/experiments/onto-joint/pretrain.cfg
 delete mode 100644 extra/experiments/onto-ner.cfg
 delete mode 100644 extra/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
 delete mode 100644 extra/experiments/ptb-joint-pos-dep/defaults.cfg
 delete mode 100644 extra/experiments/tok2vec-ner/charembed_tok2vec.cfg
 delete mode 100644 extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg

diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg
deleted file mode 100644
index 7954b57b5..000000000
--- a/extra/experiments/onto-joint/defaults.cfg
+++ /dev/null
@@ -1,133 +0,0 @@
-[paths]
-train = ""
-dev = ""
-raw = null
-init_tok2vec = null
-
-[system]
-seed = 0
-use_pytorch_for_gpu_memory = false
-
-[training]
-seed = ${system:seed}
-dropout = 0.1
-init_tok2vec = ${paths:init_tok2vec}
-vectors = null
-accumulate_gradient = 1
-max_steps = 0
-max_epochs = 0
-patience = 10000
-eval_frequency = 200
-score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2}
-frozen_components = []
-
-[training.train_corpus]
-@readers = "spacy.Corpus.v1"
-path = ${paths:train}
-gold_preproc = true
-max_length = 0
-limit = 0
-
-[training.dev_corpus]
-@readers = "spacy.Corpus.v1"
-path = ${paths:dev}
-gold_preproc = ${training.read_train:gold_preproc}
-max_length = 0
-limit = 0
-
-[training.batcher]
-@batchers = "spacy.batch_by_words.v1"
-discard_oversize = false
-tolerance = 0.2
-
-[training.batcher.size]
-@schedules = "compounding.v1"
-start = 100
-stop = 1000
-compound = 1.001
-
-[training.optimizer]
-@optimizers = "Adam.v1"
-beta1 = 0.9
-beta2 = 0.999
-L2_is_weight_decay = true
-L2 = 0.01
-grad_clip = 1.0
-use_averages = false
-eps = 1e-8
-learn_rate = 0.001
-
-[nlp]
-lang = "en"
-load_vocab_data = false
-pipeline = ["tok2vec", "ner", "tagger", "parser"]
-
-[nlp.tokenizer]
-@tokenizers = "spacy.Tokenizer.v1"
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[components]
-
-[components.tok2vec]
-factory = "tok2vec"
-
-[components.ner]
-factory = "ner"
-learn_tokens = false
-min_action_freq = 1
-
-[components.tagger]
-factory = "tagger"
-
-[components.parser]
-factory = "parser"
-learn_tokens = false
-min_action_freq = 30
-
-[components.tagger.model]
-@architectures = "spacy.Tagger.v1"
-
-[components.tagger.model.tok2vec]
-@architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
-
-[components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
-hidden_width = 128
-maxout_pieces = 2
-use_upper = true
-
-[components.parser.model.tok2vec]
-@architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
-
-[components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
-hidden_width = 128
-maxout_pieces = 2
-use_upper = true
-
-[components.ner.model.tok2vec]
-@architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
-
-[components.tok2vec.model]
-@architectures = "spacy.Tok2Vec.v1"
-
-[components.tok2vec.model.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
-width = ${components.tok2vec.model.encode:width}
-rows = 2000
-also_embed_subwords = true
-also_use_static_vectors = false
-
-[components.tok2vec.model.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v1"
-width = 96
-depth = 4
-window_size = 1
-maxout_pieces = 3
diff --git a/extra/experiments/onto-joint/pretrain.cfg b/extra/experiments/onto-joint/pretrain.cfg
deleted file mode 100644
index 211339603..000000000
--- a/extra/experiments/onto-joint/pretrain.cfg
+++ /dev/null
@@ -1,152 +0,0 @@
-# Training hyper-parameters and additional features.
-[training]
-# Whether to train on sequences with 'gold standard' sentence boundaries
-# and tokens. If you set this to true, take care to ensure your run-time
-# data is passed in sentence-by-sentence via some prior preprocessing.
-gold_preproc = false
-# Limitations on training document length or number of examples.
-max_length = 0
-limit = 0
-# Data augmentation
-orth_variant_level = 0.0
-dropout = 0.1
-# Controls early-stopping. 0 or -1 mean unlimited.
-patience = 1600
-max_epochs = 0
-max_steps = 20000
-eval_frequency = 400
-# Other settings
-seed = 0
-accumulate_gradient = 1
-use_pytorch_for_gpu_memory = false
-# Control how scores are printed and checkpoints are evaluated.
-scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
-score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
-# These settings are invalid for the transformer models.
-init_tok2vec = null
-discard_oversize = false
-omit_extra_lookups = false
-batch_by = "words"
-use_gpu = -1
-raw_text = null
-tag_map = null
-
-[training.batch_size]
-@schedules = "compounding.v1"
-start = 1000
-stop = 1000
-compound = 1.001
-
-[training.optimizer]
-@optimizers = "Adam.v1"
-beta1 = 0.9
-beta2 = 0.999
-L2_is_weight_decay = true
-L2 = 0.01
-grad_clip = 1.0
-use_averages = true
-eps = 1e-8
-learn_rate = 0.001
-
-[pretraining]
-max_epochs = 1000
-min_length = 5
-max_length = 500
-dropout = 0.2
-n_save_every = null
-batch_size = 3000
-seed = ${training:seed}
-use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
-tok2vec_model = "nlp.pipeline.tok2vec.model"
-
-[pretraining.objective]
-type = "characters"
-n_characters = 4
-
-[pretraining.optimizer]
-@optimizers = "Adam.v1"
-beta1 = 0.9
-beta2 = 0.999
-L2_is_weight_decay = true
-L2 = 0.01
-grad_clip = 1.0
-use_averages = true
-eps = 1e-8
-learn_rate = 0.001
-
-[nlp]
-lang = "en"
-vectors = null
-base_model = null
-
-[nlp.pipeline]
-
-[nlp.pipeline.tok2vec]
-factory = "tok2vec"
-
-[nlp.pipeline.senter]
-factory = "senter"
-
-[nlp.pipeline.ner]
-factory = "ner"
-learn_tokens = false
-min_action_freq = 1
-beam_width = 1
-beam_update_prob = 1.0
-
-[nlp.pipeline.tagger]
-factory = "tagger"
-
-[nlp.pipeline.parser]
-factory = "parser"
-learn_tokens = false
-min_action_freq = 1
-beam_width = 1
-beam_update_prob = 1.0
-
-[nlp.pipeline.senter.model]
-@architectures = "spacy.Tagger.v1"
-
-[nlp.pipeline.senter.model.tok2vec]
-@architectures = "spacy.Tok2VecTensors.v1"
-width = ${nlp.pipeline.tok2vec.model:width}
-
-[nlp.pipeline.tagger.model]
-@architectures = "spacy.Tagger.v1"
-
-[nlp.pipeline.tagger.model.tok2vec]
-@architectures = "spacy.Tok2VecTensors.v1"
-width = ${nlp.pipeline.tok2vec.model:width}
-
-[nlp.pipeline.parser.model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
-hidden_width = 128
-maxout_pieces = 3
-use_upper = false
-
-[nlp.pipeline.parser.model.tok2vec]
-@architectures = "spacy.Tok2VecTensors.v1"
-width = ${nlp.pipeline.tok2vec.model:width}
-
-[nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
-hidden_width = 128
-maxout_pieces = 3
-use_upper = false
-
-[nlp.pipeline.ner.model.tok2vec]
-@architectures = "spacy.Tok2VecTensors.v1"
-width = ${nlp.pipeline.tok2vec.model:width}
-
-[nlp.pipeline.tok2vec.model]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = ${nlp:vectors}
-width = 256
-depth = 6
-window_size = 1
-embed_size = 10000
-maxout_pieces = 3
-subword_features = true
-dropout = null
diff --git a/extra/experiments/onto-ner.cfg b/extra/experiments/onto-ner.cfg
deleted file mode 100644
index eab68a27f..000000000
--- a/extra/experiments/onto-ner.cfg
+++ /dev/null
@@ -1,73 +0,0 @@
-# Training hyper-parameters and additional features.
-[training]
-# Whether to train on sequences with 'gold standard' sentence boundaries
-# and tokens. If you set this to true, take care to ensure your run-time
-# data is passed in sentence-by-sentence via some prior preprocessing.
-gold_preproc = false
-# Limitations on training document length or number of examples.
-max_length = 3000
-limit = 0
-# Data augmentation
-orth_variant_level = 0.0
-dropout = 0.1
-# Controls early-stopping. 0 or -1 mean unlimited.
-patience = 100000
-max_epochs = 0
-max_steps = 0
-eval_frequency = 1000
-# Other settings
-seed = 0
-accumulate_gradient = 1
-use_pytorch_for_gpu_memory = false
-# Control how scores are printed and checkpoints are evaluated.
-scores = ["speed", "ents_p", "ents_r", "ents_f"]
-score_weights = {"ents_f": 1.0}
-# These settings are invalid for the transformer models.
-init_tok2vec = null
-discard_oversize = false
-omit_extra_lookups = false
-batch_by = "words"
-
-[training.batch_size]
-@schedules = "compounding.v1"
-start = 100
-stop = 1000
-compound = 1.001
-
-[training.optimizer]
-@optimizers = "Adam.v1"
-beta1 = 0.9
-beta2 = 0.999
-L2_is_weight_decay = true
-L2 = 0.01
-grad_clip = 1.0
-use_averages = true
-eps = 1e-8
-learn_rate = 0.001
-
-[nlp]
-lang = "en"
-vectors = null
-
-[nlp.pipeline.ner]
-factory = "ner"
-learn_tokens = false
-min_action_freq = 1
-
-[nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
-hidden_width = 64
-maxout_pieces = 2
-use_upper = true
-
-[nlp.pipeline.ner.model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = ${nlp:vectors}
-width = 96
-depth = 4
-window_size = 1
-embed_size = 2000
-maxout_pieces = 3
-subword_features = true
-dropout = ${training:dropout}
diff --git a/extra/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/extra/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
deleted file mode 100644
index f1b702a4e..000000000
--- a/extra/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
+++ /dev/null
@@ -1,73 +0,0 @@
-[training]
-patience = 10000
-eval_frequency = 200
-dropout = 0.2
-init_tok2vec = null
-vectors = null
-max_epochs = 100
-orth_variant_level = 0.0
-gold_preproc = true
-max_length = 0
-use_gpu = 0
-scores = ["tags_acc", "uas", "las"]
-score_weights = {"las": 0.8, "tags_acc": 0.2}
-limit = 0
-seed = 0
-accumulate_gradient = 2
-discard_oversize = false
-
-[training.batch_size]
-@schedules = "compounding.v1"
-start = 100
-stop = 1000
-compound = 1.001
-
-[training.optimizer]
-@optimizers = "Adam.v1"
-learn_rate = 0.001
-beta1 = 0.9
-beta2 = 0.999
-
-[nlp]
-lang = "en"
-vectors = ${training:vectors}
-
-[nlp.pipeline.tok2vec]
-factory = "tok2vec"
-
-[nlp.pipeline.tagger]
-factory = "tagger"
-
-[nlp.pipeline.parser]
-factory = "parser"
-learn_tokens = false
-min_action_freq = 1
-beam_width = 1
-beam_update_prob = 1.0
-
-[nlp.pipeline.tagger.model]
-@architectures = "spacy.Tagger.v1"
-
-[nlp.pipeline.tagger.model.tok2vec]
-@architectures = "spacy.Tok2VecTensors.v1"
-width = ${nlp.pipeline.tok2vec.model:width}
-
-[nlp.pipeline.parser.model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
-hidden_width = 64
-maxout_pieces = 3
-
-[nlp.pipeline.parser.model.tok2vec]
-@architectures = "spacy.Tok2VecTensors.v1"
-width = ${nlp.pipeline.tok2vec.model:width}
-
-[nlp.pipeline.tok2vec.model]
-@architectures = "spacy.HashEmbedBiLSTM.v1"
-pretrained_vectors = ${nlp:vectors}
-width = 96
-depth = 4
-embed_size = 2000
-subword_features = true
-maxout_pieces = 3
-dropout = null
diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg
deleted file mode 100644
index 8f9c5666e..000000000
--- a/extra/experiments/ptb-joint-pos-dep/defaults.cfg
+++ /dev/null
@@ -1,110 +0,0 @@
-[paths]
-train = ""
-dev = ""
-raw = null
-init_tok2vec = null
-
-[system]
-seed = 0
-use_pytorch_for_gpu_memory = false
-
-[training]
-seed = ${system:seed}
-dropout = 0.2
-init_tok2vec = ${paths:init_tok2vec}
-vectors = null
-accumulate_gradient = 1
-max_steps = 0
-max_epochs = 0
-patience = 10000
-eval_frequency = 200
-score_weights = {"dep_las": 0.8, "tag_acc": 0.2}
-
-[training.read_train]
-@readers = "spacy.Corpus.v1"
-path = ${paths:train}
-gold_preproc = true
-max_length = 0
-limit = 0
-
-[training.read_dev]
-@readers = "spacy.Corpus.v1"
-path = ${paths:dev}
-gold_preproc = ${training.read_train:gold_preproc}
-max_length = 0
-limit = 0
-
-[training.batcher]
-@batchers = "spacy.batch_by_words.v1"
-discard_oversize = false
-tolerance = 0.2
-
-[training.batcher.size]
-@schedules = "compounding.v1"
-start = 100
-stop = 1000
-compound = 1.001
-
-[training.optimizer]
-@optimizers = "Adam.v1"
-learn_rate = 0.001
-beta1 = 0.9
-beta2 = 0.999
-
-[nlp]
-lang = "en"
-pipeline = ["tok2vec", "tagger", "parser"]
-load_vocab_data = false
-
-[nlp.tokenizer]
-@tokenizers = "spacy.Tokenizer.v1"
-
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
-[components]
-
-[components.tok2vec]
-factory = "tok2vec"
-
-[components.tagger]
-factory = "tagger"
-
-[components.parser]
-factory = "parser"
-learn_tokens = false
-min_action_freq = 1
-
-[components.tagger.model]
-@architectures = "spacy.Tagger.v1"
-
-[components.tagger.model.tok2vec]
-@architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
-
-[components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
-hidden_width = 64
-maxout_pieces = 3
-
-[components.parser.model.tok2vec]
-@architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode:width}
-
-[components.tok2vec.model]
-@architectures = "spacy.Tok2Vec.v1"
-
-[components.tok2vec.model.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
-width = ${components.tok2vec.model.encode:width}
-rows = 2000
-also_embed_subwords = true
-also_use_static_vectors = false
-
-[components.tok2vec.model.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v1"
-width = 96
-depth = 4
-window_size = 1
-maxout_pieces = 3
diff --git a/extra/experiments/tok2vec-ner/charembed_tok2vec.cfg b/extra/experiments/tok2vec-ner/charembed_tok2vec.cfg
deleted file mode 100644
index eca6a22fa..000000000
--- a/extra/experiments/tok2vec-ner/charembed_tok2vec.cfg
+++ /dev/null
@@ -1,69 +0,0 @@
-[training]
-use_gpu = -1
-limit = 0
-dropout = 0.2
-patience = 10000
-eval_frequency = 200
-scores = ["ents_f"]
-score_weights = {"ents_f": 1}
-orth_variant_level = 0.0
-gold_preproc = true
-max_length = 0
-batch_size = 25
-seed = 0
-accumulate_gradient = 2
-discard_oversize = false
-
-[training.optimizer]
-@optimizers = "Adam.v1"
-learn_rate = 0.001
-beta1 = 0.9
-beta2 = 0.999
-
-[nlp]
-lang = "en"
-vectors = null
-
-[nlp.pipeline.tok2vec]
-factory = "tok2vec"
-
-[nlp.pipeline.tok2vec.model]
-@architectures = "spacy.Tok2Vec.v1"
-
-[nlp.pipeline.tok2vec.model.extract]
-@architectures = "spacy.CharacterEmbed.v1"
-width = 96
-nM = 64
-nC = 8
-rows = 2000
-columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
-dropout = null
-
-[nlp.pipeline.tok2vec.model.extract.features]
-@architectures = "spacy.Doc2Feats.v1"
-columns = ${nlp.pipeline.tok2vec.model.extract:columns}
-
-[nlp.pipeline.tok2vec.model.embed]
-@architectures = "spacy.LayerNormalizedMaxout.v1"
-width = ${nlp.pipeline.tok2vec.model.extract:width}
-maxout_pieces = 4
-
-[nlp.pipeline.tok2vec.model.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v1"
-width = ${nlp.pipeline.tok2vec.model.extract:width}
-window_size = 1
-maxout_pieces = 2
-depth = 2
-
-[nlp.pipeline.ner]
-factory = "ner"
-
-[nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
-hidden_width = 64
-maxout_pieces = 2
-
-[nlp.pipeline.ner.model.tok2vec]
-@architectures = "spacy.Tok2VecTensors.v1"
-width = ${nlp.pipeline.tok2vec.model.extract:width}
diff --git a/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
deleted file mode 100644
index e2ab148c6..000000000
--- a/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
+++ /dev/null
@@ -1,51 +0,0 @@
-[training]
-use_gpu = -1
-limit = 0
-dropout = 0.2
-patience = 10000
-eval_frequency = 200
-scores = ["ents_p", "ents_r", "ents_f"]
-score_weights = {"ents_f": 1}
-orth_variant_level = 0.0
-gold_preproc = true
-max_length = 0
-seed = 0
-accumulate_gradient = 2
-discard_oversize = false
-
-[training.batch_size]
-@schedules = "compounding.v1"
-start = 3000
-stop = 3000
-compound = 1.001
-
-
-[training.optimizer]
-@optimizers = "Adam.v1"
-learn_rate = 0.001
-beta1 = 0.9
-beta2 = 0.999
-
-[nlp]
-lang = "en"
-vectors = null
-
-[nlp.pipeline.ner]
-factory = "ner"
-
-[nlp.pipeline.ner.model]
-@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
-hidden_width = 64
-maxout_pieces = 2
-
-[nlp.pipeline.ner.model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-width = 128
-depth = 4
-embed_size = 7000
-maxout_pieces = 3
-window_size = 1
-subword_features = true
-pretrained_vectors = null
-dropout = null

From 130ffa5fbf8751de4eeb4bfd2463f46242ecc50d Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 17 Sep 2020 14:59:41 +0200
Subject: [PATCH 026/516] fix typos in docs

---
 website/docs/api/data-formats.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index f868233c7..b9e185d9c 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -191,7 +191,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~                                          |
 | `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                |
 | `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                              |
-| `corpus`              | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                          |
+| `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                          |
 | `vectors`             | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~                                             |
 
 ### pretraining {#config-pretraining tag="section,optional"}
@@ -207,7 +207,7 @@ used when you run [`spacy pretrain`](/api/cli#pretrain).
 | `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~                                                |
 | `objective`    | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
 | `optimizer`    | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~          |
-| `corpus`       | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~    |
+| `corpus`       | Dot notation of the config location defining the train corpus. Defaults to `corpora.pretrain`. ~~str~~ |
 | `batcher`      | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                      |
 | `component`    | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~                              |
 | `layer`        | The layer to pretrain. If empty, the whole component model will be used. ~~str~~                       |

From 3a3110ef6040e6cd9a745676586954f7508c6a6c Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 17 Sep 2020 15:44:11 +0200
Subject: [PATCH 027/516] remove empty files

---
 extra/experiments/onto-joint/defaults.cfg        | 0
 extra/experiments/ptb-joint-pos-dep/defaults.cfg | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 extra/experiments/onto-joint/defaults.cfg
 delete mode 100644 extra/experiments/ptb-joint-pos-dep/defaults.cfg

diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg
deleted file mode 100644
index e69de29bb..000000000
diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg
deleted file mode 100644
index e69de29bb..000000000

From ddfc1fc146ec35dab19f835602345de91342eeee Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 17 Sep 2020 16:05:40 +0200
Subject: [PATCH 028/516] add pretraining option to init config

---
 spacy/cli/init_config.py | 12 +++++++++---
 website/docs/api/cli.md  | 34 ++++++++++++++++++----------------
 2 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index ec65b0e0a..60ea1b640 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -30,6 +30,7 @@ def init_config_cli(
     pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
     optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
     cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
+    pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
     # fmt: on
 ):
     """
@@ -43,7 +44,7 @@ def init_config_cli(
     if isinstance(optimize, Optimizations):  # instance of enum from the CLI
         optimize = optimize.value
     pipeline = string_to_list(pipeline)
-    init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)
+    init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu,  pretraining=pretraining)
 
 
 @init_cli.command("fill-config")
@@ -109,7 +110,7 @@ def fill_config(
 
 
 def init_config(
-    output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool
+    output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool, pretraining: bool = False,
 ) -> None:
     is_stdout = str(output_file) == "-"
     msg = Printer(no_print=is_stdout)
@@ -156,8 +157,13 @@ def init_config(
     with show_validation_error(hint_fill=False):
         config = util.load_config_from_str(base_template)
         nlp, _ = util.load_model_from_config(config, auto_fill=True)
+        config = nlp.config
+        if pretraining:
+            validate_config_for_pretrain(config, msg)
+            pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+            config = pretrain_config.merge(config)
     msg.good("Auto-filled config with all values")
-    save_config(nlp.config, output_file, is_stdout=is_stdout)
+    save_config(config, output_file, is_stdout=is_stdout)
 
 
 def save_config(
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 8449d23e1..7ba451c2f 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -124,15 +124,16 @@ customize those settings in your config file later.
 $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu]
 ```
 
-| Name               | Description                                                                                                                                                                                                                                                                                                                        |
-| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `output_file`      | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~                                                                                                                        |
-| `--lang`, `-l`     | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~                                                                                                                                                                                                                              |
-| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~                                                                                                                                                                      |
-| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
-| `--cpu`, `-C`      | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~                                                                                                                                                                                |
-| `--help`, `-h`     | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                                                                                         |
-| **CREATES**        | The config file for training.                                                                                                                                                                                                                                                                                                      |
+| Name                  | Description                                                                                                                                                                                                                                                                                                                        |
+| --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `output_file`         | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~                                                                                                                        |
+| `--lang`, `-l`        | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~                                                                                                                                                                                                                              |
+| `--pipeline`, `-p`    | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~                                                                                                                                                                      |
+| `--pretraining`, `-p` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~                                                                                                                                                                                                                                                    |
+| `--optimize`, `-o`    | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
+| `--cpu`, `-C`         | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~                                                                                                                                                                                |
+| `--help`, `-h`        | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                                                                                         |
+| **CREATES**           | The config file for training.                                                                                                                                                                                                                                                                                                      |
 
 ### init fill-config {#init-fill-config new="3"}
 
@@ -160,13 +161,14 @@ validation error with more details.
 $ python -m spacy init fill-config [base_path] [output_file] [--diff]
 ```
 
-| Name           | Description                                                                                                                         |
-| -------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `base_path`    | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~           |
-| `output_file`  | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ |
-| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                       |
-| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~                                                                          |
-| **CREATES**    | Complete and auto-filled config file for training.                                                                                  |
+| Name                  | Description                                                                                                                         |
+| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
+| `base_path`           | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~           |
+| `output_file`         | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ |
+| `--pretraining`, `-p` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~                                                     |
+| `--diff`, `-D`        | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                       |
+| `--help`, `-h`        | Show help message and available arguments. ~~bool (flag)~~                                                                          |
+| **CREATES**           | Complete and auto-filled config file for training.                                                                                  |
 
 ### init vocab {#init-vocab new="3" tag="command"}
 

From 5fade4feb7fbd3d579a6b9a2d696a470456a997f Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 17 Sep 2020 16:15:20 +0200
Subject: [PATCH 029/516] fix cli abbrev

---
 website/docs/api/cli.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 7ba451c2f..8edee6b29 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -124,16 +124,16 @@ customize those settings in your config file later.
 $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu]
 ```
 
-| Name                  | Description                                                                                                                                                                                                                                                                                                                        |
-| --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `output_file`         | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~                                                                                                                        |
-| `--lang`, `-l`        | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~                                                                                                                                                                                                                              |
-| `--pipeline`, `-p`    | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~                                                                                                                                                                      |
-| `--pretraining`, `-p` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~                                                                                                                                                                                                                                                    |
-| `--optimize`, `-o`    | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
-| `--cpu`, `-C`         | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~                                                                                                                                                                                |
-| `--help`, `-h`        | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                                                                                         |
-| **CREATES**           | The config file for training.                                                                                                                                                                                                                                                                                                      |
+| Name                   | Description                                                                                                                                                                                                                                                                                                                        |
+| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `output_file`          | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~                                                                                                                        |
+| `--lang`, `-l`         | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~                                                                                                                                                                                                                              |
+| `--pipeline`, `-p`     | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~                                                                                                                                                                      |
+| `--pretraining`, `-pt` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~                                                                                                                                                                                                                                                    |
+| `--optimize`, `-o`     | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
+| `--cpu`, `-C`          | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~                                                                                                                                                                                |
+| `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                                                                                         |
+| **CREATES**            | The config file for training.                                                                                                                                                                                                                                                                                                      |
 
 ### init fill-config {#init-fill-config new="3"}
 

From 35a393106404d8f69d69e6c12d62e21a7d517065 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 17 Sep 2020 16:36:27 +0200
Subject: [PATCH 030/516] fix typo

---
 spacy/cli/debug_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index a4899a458..58908c5e8 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -54,7 +54,7 @@ def debug_model_cli(
     config_overrides = parse_config_overrides(ctx.args)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=config_overrides)
-        nlp, config = util.load_model_from_config(config_path)
+        nlp, config = util.load_model_from_config(config)
     seed = config["training"]["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")

From ec751068f328e47ae7fa8ca1745a1dd8ac00529d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 17 Sep 2020 16:42:53 +0200
Subject: [PATCH 031/516] Draft text for static vectors intro

---
 website/docs/usage/embeddings-transformers.md | 45 +++++++++++++++----
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 8dd104ead..6a239cb1e 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -30,14 +30,20 @@ to predict. Otherwise, you could try using a "one-shot learning" approach using
 
 <Accordion title="What’s the difference between word vectors and language models?" id="vectors-vs-language-models">
 
-The key difference between [word vectors](#word-vectors) and contextual language
-models such as [transformers](#transformers) is that word vectors model
-**lexical types**, rather than _tokens_. If you have a list of terms with no
-context around them, a transformer model like BERT can't really help you. BERT
-is designed to understand language **in context**, which isn't what you have. A
-word vectors table will be a much better fit for your task. However, if you do
-have words in context — whole sentences or paragraphs of running text — word
-vectors will only provide a very rough approximation of what the text is about.
+[Transformers](#transformers) are large and powerful neural networks that give
+you better accuracy, but are harder to deploy in production, as they require a GPU to run
+effectively. [Word vectors](#word-vectors) are a slightly older technique that
+can give your models a smaller improvement in accuracy, and can also provide
+some additional capabilities. 
+
+The key difference between word-vectors and contextual language
+models such as transformers is that word vectors model **lexical types**, rather
+than _tokens_. If you have a list of terms with no context around them, a transformer
+model like BERT can't really help you. BERT is designed to understand language
+**in context**, which isn't what you have. A word vectors table will be a much
+better fit for your task. However, if you do have words in context — whole sentences
+or paragraphs of running text — word vectors will only provide a very rough
+approximation of what the text is about.
 
 Word vectors are also very computationally efficient, as they map a word to a
 vector with a single indexing operation. Word vectors are therefore useful as a
@@ -478,7 +484,28 @@ training.
 
 ## Static vectors {#static-vectors}
 
-<!-- TODO: write -->
+If your pipeline includes a word vectors table, you'll be able to use the
+`.similarity()` method on the `Doc`, `Span`, `Token` and `Lexeme` objects.
+You'll also be able to access the vectors using the `.vector` attribute, or you
+can look up one or more vectors directly using the `Vocab` object. Pipelines
+with word vectors can also use the vectors as features for the statistical
+models, which can improve the accuracy of your components.
+
+Word vectors in spaCy are "static" in the sense that they are not learned
+parameters of the statistical models, and spaCy itself does not feature any
+algorithms for learning word vector tables. You can train a word vectors table
+using tools such as Gensim, word2vec, FastText or GloVe. There are also many
+word vector tables available for download. Once you have a word vectors table
+you want to use, you can convert it for use with spaCy using the `spacy init vocab`
+command, which will give you a directory you can load or refer to in your training
+configs.
+
+When converting the vectors, there are two ways you can trim them down to make
+your package smaller. You can _truncate_ the vectors with the `--truncate-vectors`
+option, which will remove entries for rarer words from the table. Alternatively,
+you can use the `--prune-vectors` option to remap rarer words to the closest vector
+that remains in the table. This allows the vectors table to return meaningful
+(albeit imperfect) results for more words than you have rows in the table.
 
 ### Using word vectors in your models {#word-vectors-models}
 

From 127ce0c574da23f2e17c824dcebec6f229d4561f Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 17 Sep 2020 16:55:53 +0200
Subject: [PATCH 032/516] Update website/docs/api/cli.md

Co-authored-by: Ines Montani <ines@ines.io>
---
 website/docs/api/cli.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 8edee6b29..5f3a06c36 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -121,7 +121,7 @@ customize those settings in your config file later.
 > ```
 
 ```cli
-$ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu]
+$ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu] [--pretraining]
 ```
 
 | Name                   | Description                                                                                                                                                                                                                                                                                                                        |

From e5ceec5df0cf7d279d6f2bac716a30f4edb71fc8 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 17 Sep 2020 16:56:20 +0200
Subject: [PATCH 033/516] Update website/docs/api/cli.md

Co-authored-by: Ines Montani <ines@ines.io>
---
 website/docs/api/cli.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 5f3a06c36..f5ac943e2 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -129,7 +129,7 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [
 | `output_file`          | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~                                                                                                                        |
 | `--lang`, `-l`         | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~                                                                                                                                                                                                                              |
 | `--pipeline`, `-p`     | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~                                                                                                                                                                      |
-| `--pretraining`, `-pt` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~                                                                                                                                                                                                                                                    |
+| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                                                                                                                                                                                                                                                    |
 | `--optimize`, `-o`     | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
 | `--cpu`, `-C`          | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~                                                                                                                                                                                |
 | `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                                                                                         |

From 38652143434207531c2779ab6905331269f072ca Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 17 Sep 2020 16:57:02 +0200
Subject: [PATCH 034/516] Use consistent shortcut

---
 spacy/cli/init_config.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index 60ea1b640..e70195e15 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -44,7 +44,14 @@ def init_config_cli(
     if isinstance(optimize, Optimizations):  # instance of enum from the CLI
         optimize = optimize.value
     pipeline = string_to_list(pipeline)
-    init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu,  pretraining=pretraining)
+    init_config(
+        output_file,
+        lang=lang,
+        pipeline=pipeline,
+        optimize=optimize,
+        cpu=cpu,
+        pretraining=pretraining,
+    )
 
 
 @init_cli.command("fill-config")
@@ -52,7 +59,7 @@ def init_fill_config_cli(
     # fmt: off
     base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
     output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
-    pretraining: bool = Opt(False, "--pretraining", "-p", help="Include config for pretraining (with 'spacy pretrain')"),
+    pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
     diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
     # fmt: on
 ):
@@ -110,7 +117,13 @@ def fill_config(
 
 
 def init_config(
-    output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool, pretraining: bool = False,
+    output_file: Path,
+    *,
+    lang: str,
+    pipeline: List[str],
+    optimize: str,
+    cpu: bool,
+    pretraining: bool = False,
 ) -> None:
     is_stdout = str(output_file) == "-"
     msg = Printer(no_print=is_stdout)

From c4b414b2825021410c8f8e80304b83eac3847bf1 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 17 Sep 2020 16:58:09 +0200
Subject: [PATCH 035/516] Update website/docs/api/cli.md

---
 website/docs/api/cli.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index f5ac943e2..9d0b872c3 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -165,7 +165,7 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff]
 | --------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
 | `base_path`           | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~           |
 | `output_file`         | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ |
-| `--pretraining`, `-p` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~                                                     |
+| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                                                                                                                                                                                                                                                    |
 | `--diff`, `-D`        | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                       |
 | `--help`, `-h`        | Show help message and available arguments. ~~bool (flag)~~                                                                          |
 | **CREATES**           | Complete and auto-filled config file for training.                                                                                  |

From 3d8e010655e7180eb875fe784f2c8f098a332388 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 17 Sep 2020 16:58:46 +0200
Subject: [PATCH 036/516] Change order

---
 website/docs/api/cli.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index f5ac943e2..f9a192000 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -129,9 +129,9 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [
 | `output_file`          | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~                                                                                                                        |
 | `--lang`, `-l`         | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~                                                                                                                                                                                                                              |
 | `--pipeline`, `-p`     | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~                                                                                                                                                                      |
-| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                                                                                                                                                                                                                                                    |
 | `--optimize`, `-o`     | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
 | `--cpu`, `-C`          | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~                                                                                                                                                                                |
+| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                                                                                                                                                                                                                  |
 | `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                                                                                         |
 | **CREATES**            | The config file for training.                                                                                                                                                                                                                                                                                                      |
 

From a2c8cda26ffbc6ba0e15b0872b8691ee4f366994 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 17 Sep 2020 17:12:51 +0200
Subject: [PATCH 037/516] Update docs [ci skip]

---
 website/docs/usage/embeddings-transformers.md | 60 ++++++++++---------
 1 file changed, 32 insertions(+), 28 deletions(-)

diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 6a239cb1e..9f73661c3 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -31,18 +31,18 @@ to predict. Otherwise, you could try using a "one-shot learning" approach using
 <Accordion title="What’s the difference between word vectors and language models?" id="vectors-vs-language-models">
 
 [Transformers](#transformers) are large and powerful neural networks that give
-you better accuracy, but are harder to deploy in production, as they require a GPU to run
-effectively. [Word vectors](#word-vectors) are a slightly older technique that
-can give your models a smaller improvement in accuracy, and can also provide
-some additional capabilities. 
+you better accuracy, but are harder to deploy in production, as they require a
+GPU to run effectively. [Word vectors](#word-vectors) are a slightly older
+technique that can give your models a smaller improvement in accuracy, and can
+also provide some additional capabilities.
 
-The key difference between word-vectors and contextual language
-models such as transformers is that word vectors model **lexical types**, rather
-than _tokens_. If you have a list of terms with no context around them, a transformer
-model like BERT can't really help you. BERT is designed to understand language
-**in context**, which isn't what you have. A word vectors table will be a much
-better fit for your task. However, if you do have words in context — whole sentences
-or paragraphs of running text — word vectors will only provide a very rough
+The key difference between word vectors and contextual language models such as
+transformers is that word vectors model **lexical types**, rather than _tokens_.
+If you have a list of terms with no context around them, a transformer model
+like BERT can't really help you. BERT is designed to understand language **in
+context**, which isn't what you have. A word vectors table will be a much better
+fit for your task. However, if you do have words in context — whole sentences or
+paragraphs of running text — word vectors will only provide a very rough
 approximation of what the text is about.
 
 Word vectors are also very computationally efficient, as they map a word to a
@@ -484,28 +484,32 @@ training.
 
 ## Static vectors {#static-vectors}
 
-If your pipeline includes a word vectors table, you'll be able to use the
-`.similarity()` method on the `Doc`, `Span`, `Token` and `Lexeme` objects.
-You'll also be able to access the vectors using the `.vector` attribute, or you
-can look up one or more vectors directly using the `Vocab` object. Pipelines
-with word vectors can also use the vectors as features for the statistical
-models, which can improve the accuracy of your components.
+If your pipeline includes a **word vectors table**, you'll be able to use the
+`.similarity()` method on the [`Doc`](/api/doc), [`Span`](/api/span),
+[`Token`](/api/token) and [`Lexeme`](/api/lexeme) objects. You'll also be able
+to access the vectors using the `.vector` attribute, or you can look up one or
+more vectors directly using the [`Vocab`](/api/vocab) object. Pipelines with
+word vectors can also **use the vectors as features** for the statistical
+models, which can **improve the accuracy** of your components.
 
 Word vectors in spaCy are "static" in the sense that they are not learned
 parameters of the statistical models, and spaCy itself does not feature any
 algorithms for learning word vector tables. You can train a word vectors table
-using tools such as Gensim, word2vec, FastText or GloVe. There are also many
-word vector tables available for download. Once you have a word vectors table
-you want to use, you can convert it for use with spaCy using the `spacy init vocab`
-command, which will give you a directory you can load or refer to in your training
-configs.
+using tools such as [Gensim](https://radimrehurek.com/gensim/),
+[FastText](https://fasttext.cc/) or
+[GloVe](https://nlp.stanford.edu/projects/glove/), or download existing
+pretrained vectors. The [`init vocab`](/api/cli#init-vocab) command lets you
+convert vectors for use with spaCy and will give you a directory you can load or
+refer to in your [training configs](/usage/training#config).
 
-When converting the vectors, there are two ways you can trim them down to make
-your package smaller. You can _truncate_ the vectors with the `--truncate-vectors`
-option, which will remove entries for rarer words from the table. Alternatively,
-you can use the `--prune-vectors` option to remap rarer words to the closest vector
-that remains in the table. This allows the vectors table to return meaningful
-(albeit imperfect) results for more words than you have rows in the table.
+<Infobox title="Word vectors and similarity" emoji="📖">
+
+For more details on loading word vectors into spaCy, using them for similarity
+and improving word vector coverage by truncating and pruning the vectors, see
+the usage guide on
+[word vectors and similarity](/usage/linguistic-features#vectors-similarity).
+
+</Infobox>
 
 ### Using word vectors in your models {#word-vectors-models}
 

From ed0fb034cb487a1fcc206e250ca34c8a38b7e0de Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 17 Sep 2020 18:11:10 +0200
Subject: [PATCH 038/516] ml_datasets v0.2.0a0

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 69477c2d3..55fe627b8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.0a33,<8.0.0a40
 blis>=0.4.0,<0.5.0
-ml_datasets>=0.2.0
+ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
 srsly>=2.1.0,<3.0.0

From 6efb7688a65faae489de33073c1c40b11ec4f432 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 17 Sep 2020 18:17:03 +0200
Subject: [PATCH 039/516] Draft pretrain usage

---
 website/docs/usage/embeddings-transformers.md | 86 ++++++++++++++++---
 1 file changed, 76 insertions(+), 10 deletions(-)

diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 9f73661c3..678237dc2 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -610,17 +610,83 @@ def MyCustomVectors(
 
 ## Pretraining {#pretraining}
 
-<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
-</Infobox>
+The `spacy pretrain` command lets you initialize your models with information
+from raw text. Without pretraining, the models for your components will usually
+be initialized randomly. The idea behind pretraining is simple: random probably
+isn't optimal, so if we have some text to learn from, we can probably find
+a way to get the model off to a better start. The impact of `spacy pretrain` varies,
+but it will usually be worth trying if you're not using a transformer model and
+you have relatively little training data (for instance, fewer than 5,000 sentence).
+A good rule of thumb is that pretraining will generally give you a similar accuracy
+improvement to using word vectors in your model. If word vectors have given you
+a 10% error reduction, the `spacy pretrain` command might give you another 10%,
+for a 20% error reduction in total.
 
-<!--
-- explain general concept and idea (short!)
-- present it as a separate lightweight mechanism for pretraining the tok2vec
-  layer
-- advantages (could also be pros/cons table)
-- explain how it generates a separate file (!) and how it depends on the same
-  vectors
--->
+The `spacy pretrain` command will take a specific subnetwork within one of your
+components, and add additional layers to build a network for a temporary task,
+that forces the model to learn something about sentence structure and word
+cooccurrence statistics. Pretraining produces a binary weights file that can be
+loaded back in at the start of training. The weights file specifies an initial
+set of weights. Training then proceeds as normal.
+
+You can only pretrain one subnetwork from your pipeline at a time, and the subnetwork
+must be typed `Model[List[Doc], List[Floats2d]]` (i.e., it has to be a "tok2vec" layer).
+The most common workflow is to use the `Tok2Vec` component to create a shared
+token-to-vector layer for several components of your pipeline, and apply
+pretraining to its whole model. 
+
+The `spacy pretrain` command is configured using the `[pretraining]` section of
+your config file. The `pretraining.component` and `pretraining.layer` settings
+tell spaCy how to find the subnetwork to pretrain. The `pretraining.layer`
+setting should be either the empty string (to use the whole model), or a 
+[node reference](https://thinc.ai/docs/usage-models#model-state). Most of spaCy's
+built-in model architectures have a reference named `"tok2vec"` that will refer
+to the right layer.
+
+```ini
+# Pretrain nlp.get_pipe("tok2vec").model
+[pretraining]
+component = "tok2vec"
+layer = ""
+
+[pretraining]
+# Pretrain nlp.get_pipe("textcat").model.get_ref("tok2vec")
+component = "textcat"
+layer = "tok2vec"
+```
+
+two pretraining objectives are available, both of which are variants of the cloze
+task Devlin et al (2018) introduced for BERT.
+
+* The *characters* objective asks the model to predict some number of leading and
+  trailing UTF-8 bytes for the words. For instance, setting `n_characters=2`, the
+  model will try to predict the first two and last two characters of the word.
+
+* The *vectors* objective asks the model to predict the word's vector, from
+  a static embeddings table. This requires a word vectors model to be trained
+  and loaded. The vectors objective can optimize either a cosine or an L2 loss.
+  We've generally found cosine loss to perform better.
+
+These pretraining objectives use a trick that we term _language modelling with
+approximate outputs (LMAO)_. The motivation for the trick is that predicting
+an exact word ID introduces a lot of incidental complexity. You need a large
+output layer, and even then, the vocabulary is too large, which motivates
+tokenization schemes that do not align to actual word boundaries. At the end of
+training, the output layer will be thrown away regardless: we just want a task
+that forces the network to model something about word cooccurrence statistics.
+Predicting leading and trailing characters does that more than adequately, as
+the exact word sequence could be recovered with high accuracy if the initial
+and trailing characters are predicted accurately. With the vectors objective,
+the pretraining is use the embedding space learned by an algorithm such as
+GloVe or word2vec, allowing the model to focus on the contextual
+modelling we actual care about.
+
+The `[pretraining]` section has several configuration subsections that are
+familiar from the training block: the `[pretraining.batcher]`,
+[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
+expect the same types of objects, although for pretraining your corpus does not
+need to have any annotations, so you will often use a different reader, such as 
+`spacy.training.JsonlReader1`.
 
 > #### Raw text format
 >

From a0b4389a3845a1692b934a6ca79caf54bb29b1a3 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 17 Sep 2020 19:24:48 +0200
Subject: [PATCH 040/516] Update docs [ci skip]

---
 website/docs/usage/embeddings-transformers.md | 200 +++++++++++-------
 1 file changed, 121 insertions(+), 79 deletions(-)

diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 678237dc2..4adcd927c 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -610,99 +610,141 @@ def MyCustomVectors(
 
 ## Pretraining {#pretraining}
 
-The `spacy pretrain` command lets you initialize your models with information
-from raw text. Without pretraining, the models for your components will usually
-be initialized randomly. The idea behind pretraining is simple: random probably
-isn't optimal, so if we have some text to learn from, we can probably find
-a way to get the model off to a better start. The impact of `spacy pretrain` varies,
-but it will usually be worth trying if you're not using a transformer model and
-you have relatively little training data (for instance, fewer than 5,000 sentence).
-A good rule of thumb is that pretraining will generally give you a similar accuracy
-improvement to using word vectors in your model. If word vectors have given you
-a 10% error reduction, the `spacy pretrain` command might give you another 10%,
-for a 20% error reduction in total.
+The [`spacy pretrain`](/api/cli#pretrain) command lets you initialize your
+models with **information from raw text**. Without pretraining, the models for
+your components will usually be initialized randomly. The idea behind
+pretraining is simple: random probably isn't optimal, so if we have some text to
+learn from, we can probably find a way to get the model off to a better start.
 
-The `spacy pretrain` command will take a specific subnetwork within one of your
-components, and add additional layers to build a network for a temporary task,
-that forces the model to learn something about sentence structure and word
-cooccurrence statistics. Pretraining produces a binary weights file that can be
-loaded back in at the start of training. The weights file specifies an initial
-set of weights. Training then proceeds as normal.
-
-You can only pretrain one subnetwork from your pipeline at a time, and the subnetwork
-must be typed `Model[List[Doc], List[Floats2d]]` (i.e., it has to be a "tok2vec" layer).
-The most common workflow is to use the `Tok2Vec` component to create a shared
-token-to-vector layer for several components of your pipeline, and apply
-pretraining to its whole model. 
-
-The `spacy pretrain` command is configured using the `[pretraining]` section of
-your config file. The `pretraining.component` and `pretraining.layer` settings
-tell spaCy how to find the subnetwork to pretrain. The `pretraining.layer`
-setting should be either the empty string (to use the whole model), or a 
-[node reference](https://thinc.ai/docs/usage-models#model-state). Most of spaCy's
-built-in model architectures have a reference named `"tok2vec"` that will refer
-to the right layer.
-
-```ini
-# Pretrain nlp.get_pipe("tok2vec").model
-[pretraining]
-component = "tok2vec"
-layer = ""
-
-[pretraining]
-# Pretrain nlp.get_pipe("textcat").model.get_ref("tok2vec")
-component = "textcat"
-layer = "tok2vec"
-```
-
-two pretraining objectives are available, both of which are variants of the cloze
-task Devlin et al (2018) introduced for BERT.
-
-* The *characters* objective asks the model to predict some number of leading and
-  trailing UTF-8 bytes for the words. For instance, setting `n_characters=2`, the
-  model will try to predict the first two and last two characters of the word.
-
-* The *vectors* objective asks the model to predict the word's vector, from
-  a static embeddings table. This requires a word vectors model to be trained
-  and loaded. The vectors objective can optimize either a cosine or an L2 loss.
-  We've generally found cosine loss to perform better.
-
-These pretraining objectives use a trick that we term _language modelling with
-approximate outputs (LMAO)_. The motivation for the trick is that predicting
-an exact word ID introduces a lot of incidental complexity. You need a large
-output layer, and even then, the vocabulary is too large, which motivates
-tokenization schemes that do not align to actual word boundaries. At the end of
-training, the output layer will be thrown away regardless: we just want a task
-that forces the network to model something about word cooccurrence statistics.
-Predicting leading and trailing characters does that more than adequately, as
-the exact word sequence could be recovered with high accuracy if the initial
-and trailing characters are predicted accurately. With the vectors objective,
-the pretraining is use the embedding space learned by an algorithm such as
-GloVe or word2vec, allowing the model to focus on the contextual
-modelling we actual care about.
-
-The `[pretraining]` section has several configuration subsections that are
-familiar from the training block: the `[pretraining.batcher]`,
-[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
+Pretraining uses the same [`config.cfg`](/usage/training#config) file as the
+regular training, which helps keep the settings and hyperparameters consistent.
+The additional `[pretraining]` section has several configuration subsections
+that are familiar from the training block: the `[pretraining.batcher]`,
+`[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
 expect the same types of objects, although for pretraining your corpus does not
-need to have any annotations, so you will often use a different reader, such as 
-`spacy.training.JsonlReader1`.
+need to have any annotations, so you will often use a different reader, such as
+the [`JsonlReader`](/api/top-level#jsonlreader).
 
 > #### Raw text format
 >
-> The raw text can be provided as JSONL (newline-delimited JSON) with a key
-> `"text"` per entry. This allows the data to be read in line by line, while
-> also allowing you to include newlines in the texts.
+> The raw text can be provided in spaCy's
+> [binary `.spacy` format](/api/data-formats#training) consisting of serialized
+> `Doc` objects or as a JSONL (newline-delimited JSON) with a key `"text"` per
+> entry. This allows the data to be read in line by line, while also allowing
+> you to include newlines in the texts.
 >
 > ```json
 > {"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
 > {"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
 > ```
+>
+> You can also use your own custom corpus loader instead.
+
+You can add a `[pretraining]` block to your config by setting the
+`--pretraining` flag on [`init config`](/api/cli#init-config) or
+[`init fill-config`](/api/cli#init-fill-config):
 
 ```cli
 $ python -m spacy init fill-config config.cfg config_pretrain.cfg --pretraining
 ```
 
+You can then run [`spacy pretrain`](/api/cli#pretrain) with the updated config
+and pass in optional config overrides, like the path to the raw text file:
+
 ```cli
-$ python -m spacy pretrain raw_text.jsonl /output config_pretrain.cfg
+$ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl
 ```
+
+### How pretraining works {#pretraining-details}
+
+The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually
+be worth trying if you're **not using a transformer** model and you have
+**relatively little training data** (for instance, fewer than 5,000 sentences).
+A good rule of thumb is that pretraining will generally give you a similar
+accuracy improvement to using word vectors in your model. If word vectors have
+given you a 10% error reduction, pretraining with spaCy might give you another
+10%, for a 20% error reduction in total.
+
+The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific
+subnetwork** within one of your components, and add additional layers to build a
+network for a temporary task, that forces the model to learn something about
+sentence structure and word cooccurrence statistics. Pretraining produces a
+**binary weights file** that can be loaded back in at the start of training. The
+weights file specifies an initial set of weights. Training then proceeds as
+normal.
+
+You can only pretrain one subnetwork from your pipeline at a time, and the
+subnetwork must be typed ~~Model[List[Doc], List[Floats2d]]~~ (i.e. it has to be
+a "tok2vec" layer). The most common workflow is to use the
+[`Tok2Vec`](/api/tok2vec) component to create a shared token-to-vector layer for
+several components of your pipeline, and apply pretraining to its whole model.
+
+#### Configuring the pretraining {#pretraining-configure}
+
+The [`spacy pretrain`](/api/cli#pretrain) command is configured using the
+`[pretraining]` section of your [config file](/usage/training#config). The
+`component` and `layer` settings tell spaCy how to **find the subnetwork** to
+pretrain. The `layer` setting should be either the empty string (to use the
+whole model), or a
+[node reference](https://thinc.ai/docs/usage-models#model-state). Most of
+spaCy's built-in model architectures have a reference named `"tok2vec"` that
+will refer to the right layer.
+
+```ini
+### config.cfg
+# 1. Use the whole model of the "tok2vec" component
+[pretraining]
+component = "tok2vec"
+layer = ""
+
+# 2. Pretrain the "tok2vec" node of the "textcat" component
+[pretraining]
+component = "textcat"
+layer = "tok2vec"
+```
+
+#### Pretraining objectives {#pretraining-objectives}
+
+Two pretraining objectives are available, both of which are variants of the
+cloze task [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805) introduced
+for BERT. The objective can be defined and configured via the
+`[pretraining.objective]` config block.
+
+> ```ini
+> ### Characters objective
+> [pretraining.objective]
+> type = "characters"
+> n_characters = 4
+> ```
+>
+> ```ini
+> ### Vectors objective
+> [pretraining.objective]
+> type = "vectors"
+> loss = "cosine"
+> ```
+
+- **Characters:** The `"characters"` objective asks the model to predict some
+  number of leading and trailing UTF-8 bytes for the words. For instance,
+  setting `n_characters = 2`, the model will try to predict the first two and
+  last two characters of the word.
+
+- **Vectors:** The `"vectors"` objective asks the model to predict the word's
+  vector, from a static embeddings table. This requires a word vectors model to
+  be trained and loaded. The vectors objective can optimize either a cosine or
+  an L2 loss. We've generally found cosine loss to perform better.
+
+These pretraining objectives use a trick that we term **language modelling with
+approximate outputs (LMAO)**. The motivation for the trick is that predicting an
+exact word ID introduces a lot of incidental complexity. You need a large output
+layer, and even then, the vocabulary is too large, which motivates tokenization
+schemes that do not align to actual word boundaries. At the end of training, the
+output layer will be thrown away regardless: we just want a task that forces the
+network to model something about word cooccurrence statistics. Predicting
+leading and trailing characters does that more than adequately, as the exact
+word sequence could be recovered with high accuracy if the initial and trailing
+characters are predicted accurately. With the vectors objective, the pretraining
+uses the embedding space learned by an algorithm such as
+[GloVe](https://nlp.stanford.edu/projects/glove/) or
+[Word2vec](https://code.google.com/archive/p/word2vec/), allowing the model to
+focus on the contextual modelling we actually care about.

From 8b650f3a786094833cccd8686ab4d6d73330565c Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 17 Sep 2020 21:10:41 +0200
Subject: [PATCH 041/516] Modify setting missing and blocked entity tokens

In order to make it easier to construct `Doc` objects as training data,
modify how missing and blocked entity tokens are set to prioritize
setting `O` and missing entity tokens for training purposes over setting
blocked entity tokens.

* `Doc.ents` setter sets tokens outside entity spans to `O` regardless
of the current state of each token

* For `Doc.ents`, setting a span with a missing label sets the `ent_iob`
to missing instead of blocked

* `Doc.block_ents(spans)` marks spans as hard `O` for use with the
`EntityRecognizer`
---
 spacy/tests/doc/test_doc_api.py | 18 ++++++++++++++++--
 spacy/tests/parser/test_ner.py  |  4 ++--
 spacy/tokens/doc.pyx            | 25 +++++++++++++++++++------
 spacy/training/example.pyx      |  4 +---
 spacy/training/iob_utils.py     | 12 ++++--------
 5 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index ce979d3d1..53c309ba5 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -137,7 +137,7 @@ def test_doc_api_set_ents(en_tokenizer):
     assert len(tokens.ents) == 0
     tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)]
     assert len(list(tokens.ents)) == 1
-    assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
+    assert [t.ent_iob for t in tokens] == [2, 2, 3, 1, 2, 2, 2, 2]
     assert tokens.ents[0].label_ == "PRODUCT"
     assert tokens.ents[0].start == 2
     assert tokens.ents[0].end == 4
@@ -426,7 +426,7 @@ def test_has_annotation(en_vocab):
     doc[0].lemma_ = "a"
     doc[0].dep_ = "dep"
     doc[0].head = doc[1]
-    doc.ents = [Span(doc, 0, 1, label="HELLO")]
+    doc.ents = [Span(doc, 0, 1, label="HELLO"), Span(doc, 1, 2, label="")]
 
     for attr in attrs:
         assert doc.has_annotation(attr)
@@ -454,3 +454,17 @@ def test_is_flags_deprecated(en_tokenizer):
         doc.is_nered
     with pytest.deprecated_call():
         doc.is_sentenced
+
+
+def test_block_ents(en_tokenizer):
+    doc = en_tokenizer("a b c d e")
+    doc.block_ents([doc[1:2], doc[3:5]])
+    assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3]
+    assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0]
+    assert doc.ents == tuple()
+
+    # invalid IOB repaired
+    doc.ents = [Span(doc, 3, 5, "ENT")]
+    assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1]
+    doc.block_ents([doc[3:4]])
+    assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3]
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 548cd2697..b8fdf15f9 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -168,7 +168,7 @@ def test_accept_blocked_token():
     ner2 = nlp2.create_pipe("ner", config=config)
 
     # set "New York" to a blocked entity
-    doc2.ents = [(0, 3, 5)]
+    doc2.block_ents([doc2[3:5]])
     assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
     assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
 
@@ -358,5 +358,5 @@ class BlockerComponent1:
         self.name = name
 
     def __call__(self, doc):
-        doc.ents = [(0, self.start, self.end)]
+        doc.block_ents([doc[self.start:self.end]])
         return doc
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 5c5443258..1bae84508 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -590,17 +590,16 @@ cdef class Doc:
                 entity_type = 0
                 kb_id = 0
 
-                # Set ent_iob to Missing (0) by default unless this token was nered before
-                ent_iob = 0
-                if self.c[i].ent_iob != 0:
-                    ent_iob = 2
+                # Set ent_iob to Outside (2) by default
+                ent_iob = 2
 
                 # overwrite if the token was part of a specified entity
                 if i in tokens_in_ents.keys():
                     ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
                     if entity_type is None or entity_type <= 0:
-                        # Blocking this token from being overwritten by downstream NER
-                        ent_iob = 3
+                        # Empty label: Missing, unset this token
+                        ent_iob = 0
+                        entity_type = 0
                     elif ent_start == i:
                         # Marking the start of an entity
                         ent_iob = 3
@@ -612,6 +611,20 @@ cdef class Doc:
                 self.c[i].ent_kb_id = kb_id
                 self.c[i].ent_iob = ent_iob
 
+    def block_ents(self, spans):
+        """Mark spans as never an entity for the EntityRecognizer.
+
+        spans (List[Span]): The spans to block as never entities.
+        """
+        for span in spans:
+            for i in range(span.start, span.end):
+                self.c[i].ent_iob = 3
+                self.c[i].ent_type = 0
+            # if the following token is I, set to B
+            if span.end < self.length:
+                if self.c[span.end].ent_iob == 1:
+                    self.c[span.end].ent_iob = 3
+
     @property
     def noun_chunks(self):
         """Iterate over the base noun phrases in the document. Yields base
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 3344704bf..d396a2040 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -172,7 +172,7 @@ cdef class Example:
         return output
 
     def get_aligned_ner(self):
-        if not self.y.is_nered:
+        if not self.y.has_annotation("ENT_IOB"):
             return [None] * len(self.x)  # should this be 'missing' instead of 'None' ?
         x_ents = self.get_aligned_spans_y2x(self.y.ents)
         # Default to 'None' for missing values
@@ -303,9 +303,7 @@ def _add_entities_to_doc(doc, ner_data):
             spans_from_biluo_tags(doc, ner_data)
         )
     elif isinstance(ner_data[0], Span):
-        # Ugh, this is super messy. Really hard to set O entities
         doc.ents = ner_data
-        doc.ents = [span for span in ner_data if span.label_]
     else:
         raise ValueError(Errors.E973)
 
diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py
index ceb5e16b8..33a4733ca 100644
--- a/spacy/training/iob_utils.py
+++ b/spacy/training/iob_utils.py
@@ -182,22 +182,18 @@ def tags_to_entities(tags):
     entities = []
     start = None
     for i, tag in enumerate(tags):
-        if tag is None:
-            continue
-        if tag.startswith("O"):
+        if tag is None or tag.startswith("-"):
             # TODO: We shouldn't be getting these malformed inputs. Fix this.
             if start is not None:
                 start = None
             else:
                 entities.append(("", i, i))
-            continue
-        elif tag == "-":
-            continue
+        elif tag.startswith("O"):
+            pass
         elif tag.startswith("I"):
             if start is None:
                 raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1]))
-            continue
-        if tag.startswith("U"):
+        elif tag.startswith("U"):
             entities.append((tag[2:], i, i))
         elif tag.startswith("B"):
             start = i

From e4fc7e0222621c40b6d0aa025d3fc0450a672079 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 17 Sep 2020 22:34:36 +0200
Subject: [PATCH 042/516] fixing output sample to proper 2D array

---
 spacy/cli/debug_model.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 58908c5e8..04a14bdc9 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -60,13 +60,12 @@ def debug_model_cli(
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
     pipe = nlp.get_pipe(component)
-    if hasattr(pipe, "model"):
-        model = pipe.model
-    else:
+    if not hasattr(pipe, "model"):
         msg.fail(
             f"The component '{component}' does not specify an object that holds a Model.",
             exits=1,
         )
+    model = pipe.model
     debug_model(model, print_settings=print_settings)
 
 
@@ -87,7 +86,7 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
 
     # STEP 1: Initializing the model and printing again
     X = _get_docs()
-    Y = _get_output(model.ops.xp)
+    Y = _get_output(model.ops)
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
         model.initialize(X=X, Y=Y)
@@ -113,9 +112,11 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
         msg.divider(f"STEP 3 - prediction")
         msg.info(str(prediction))
 
+    msg.good(f"Succesfully ended analysis - model looks good!")
+
 
 def get_gradient(model, Y):
-    goldY = _get_output(model.ops.xp)
+    goldY = _get_output(model.ops)
     return Y - goldY
 
 
@@ -133,8 +134,14 @@ def _get_docs(lang: str = "en"):
     return list(nlp.pipe(_sentences()))
 
 
-def _get_output(xp):
-    return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")
+def _get_output(ops):
+    docs = len(_get_docs())
+    labels = 6
+    output = ops.alloc2f(d0=docs, d1=labels)
+    for i in range(docs):
+        for j in range(labels):
+            output[i, j] = 1 / (i+j+0.01)
+    return ops.xp.asarray(output)
 
 
 def _print_model(model, print_settings):

From a88106e852b08bcbbe607d5bb83929e5a13120f4 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 18 Sep 2020 03:01:29 +0200
Subject: [PATCH 043/516] Remove W106: HEAD and SENT_START in doc.from_array
 (#6086)

* Remove W106: HEAD and SENT_START in doc.from_array

This warning was hacky and being triggered too often.

* Fix test
---
 spacy/errors.py                 | 3 ---
 spacy/tests/doc/test_doc_api.py | 5 ++---
 spacy/tokens/doc.pyx            | 2 --
 3 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 173aedab9..81e3616be 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -119,9 +119,6 @@ class Warnings:
     W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
             "need to match on a stream of documents, you can use nlp.pipe and "
             "call the {matcher} on each Doc object.")
-    W106 = ("Both HEAD and SENT_START are included as attributes in "
-            "doc.from_array(). The parse trees based on the HEAD attribute "
-            "will override the values in SENT_START.")
     W107 = ("The property Doc.{prop} is deprecated. Use "
             "Doc.has_annotation(\"{attr}\") instead.")
 
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index ce979d3d1..c979931b1 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -274,12 +274,11 @@ def test_doc_from_array_sent_starts(en_vocab):
     # fmt: on
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
 
-    # HEAD overrides SENT_START with warning
+    # HEAD overrides SENT_START without warning
     attrs = [SENT_START, HEAD]
     arr = doc.to_array(attrs)
     new_doc = Doc(en_vocab, words=words)
-    with pytest.warns(UserWarning):
-        new_doc.from_array(attrs, arr)
+    new_doc.from_array(attrs, arr)
 
     # no warning using default attrs
     attrs = doc._get_array_attrs()
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 5c5443258..2d9de278b 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -817,8 +817,6 @@ cdef class Doc:
         if array.dtype != numpy.uint64:
             warnings.warn(Warnings.W028.format(type=array.dtype))
 
-        if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs:
-            warnings.warn(Warnings.W106)
         cdef int i, col
         cdef int32_t abs_head_index
         cdef attr_id_t attr_id

From d32ce121beb38d05e1e926053f1fdf9cce8d2aa6 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 18 Sep 2020 13:41:12 +0200
Subject: [PATCH 044/516] Fix docs [ci skip]

---
 website/docs/api/top-level.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index f52c63f18..a37f24213 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -84,7 +84,7 @@ Create a blank pipeline of a given language class. This function is the twin of
 | _keyword-only_                      |                                                                                                                                                                    |
 | `vocab` <Tag variant="new">3</Tag>  | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~.                             |
 | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
-| `meta` <Tag variant="new">3</tag>   | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~                                                                                   |
+| `meta` <Tag variant="new">3</Tag>   | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~                                                                                   |
 | **RETURNS**                         | An empty `Language` object of the appropriate subclass. ~~Language~~                                                                                               |
 
 ### spacy.info {#spacy.info tag="function"}

From bbdb5f62b70e9e12c6d4a8d9581e064ce846d19c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 18 Sep 2020 14:26:42 +0200
Subject: [PATCH 045/516] Temporary work-around for scoring a subset of
 components (#6090)

* Try hacking the scorer to work around sentence boundaries

* Upd scorer

* Set dev version

* Upd scorer hack

* Fix version

* Improve comment on hack
---
 spacy/scorer.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/spacy/scorer.py b/spacy/scorer.py
index 7f7418237..da22d59d4 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -270,6 +270,18 @@ class Scorer:
         for example in examples:
             pred_doc = example.predicted
             gold_doc = example.reference
+            # TODO
+            # This is a temporary hack to work around the problem that the scorer
+            # fails if you have examples that are not fully annotated for all
+            # the tasks in your pipeline. For instance, you might have a corpus
+            # of NER annotations that does not set sentence boundaries, but the
+            # pipeline includes a parser or senter, and then the score_weights
+            # are used to evaluate that component. When the scorer attempts
+            # to read the sentences from the gold document, it fails.
+            try:
+                list(getter(gold_doc, attr))
+            except ValueError:
+                continue
             # Find all labels in gold and doc
             labels = set(
                 [k.label_ for k in getter(gold_doc, attr)]

From 0406200a1ea1c960cf6d07c11f91f3b4d7f2d551 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 18 Sep 2020 15:13:13 +0200
Subject: [PATCH 046/516] Update docs [ci skip]

---
 website/docs/api/data-formats.md | 43 +++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index b9e185d9c..3ed846b9e 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -123,20 +123,11 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
 
 ### corpora {#config-corpora tag="section"}
 
-This section defines a dictionary mapping of string keys to `Callable`
-functions. Each callable takes an `nlp` object and yields
-[`Example`](/api/example) objects. By default, the two keys `train` and `dev`
-are specified and each refer to a [`Corpus`](/api/top-level#Corpus). When
-pretraining, an additional pretrain section is added that defaults to a
-[`JsonlReader`](/api/top-level#JsonlReader).
-
-These subsections can be expanded with additional subsections, each referring to
-a callback of type `Callable[[Language], Iterator[Example]]`:
-
 > #### Example
 >
 > ```ini
 > [corpora]
+>
 > [corpora.train]
 > @readers = "spacy.Corpus.v1"
 > path = ${paths:train}
@@ -148,28 +139,44 @@ a callback of type `Callable[[Language], Iterator[Example]]`:
 > [corpora.pretrain]
 > @readers = "spacy.JsonlReader.v1"
 > path = ${paths.raw}
-> min_length = 5
-> max_length = 500
 >
-> [corpora.mydata]
-> @readers = "my_reader.v1"
-> shuffle = true
+> [corpora.my_custom_data]
+> @readers = "my_custom_reader.v1"
 > ```
 
-Alternatively, the `corpora` block could refer to one function with return type
-`Dict[str, Callable[[Language], Iterator[Example]]]`:
+This section defines a **dictionary** mapping of string keys to functions. Each
+function takes an `nlp` object and yields [`Example`](/api/example) objects. By
+default, the two keys `train` and `dev` are specified and each refer to a
+[`Corpus`](/api/top-level#Corpus). When pretraining, an additional `pretrain`
+section is added that defaults to a [`JsonlReader`](/api/top-level#JsonlReader).
+You can also register custom functions that return a callable.
+
+| Name       | Description                                                                                                                                                                 |
+| ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `train`    | Training data corpus, typically used in `[training]` block. ~~Callable[[Language], Iterator[Example]]~~                                                                     |
+| `dev`      | Development data corpus, typically used in `[training]` block. ~~Callable[[Language], Iterator[Example]]~~                                                                  |
+| `pretrain` | Raw text for [pretraining](/usage/embeddings-transformers#pretraining), typically used in `[pretraining]` block (if available). ~~Callable[[Language], Iterator[Example]]~~ |
+| ...        | Any custom or alternative corpora. ~~Callable[[Language], Iterator[Example]]~~                                                                                              |
+
+Alternatively, the `[corpora]` block can refer to **one function** that returns
+a dictionary keyed by the corpus names. This can be useful if you want to load a
+single corpus once and then divide it up into `train` and `dev` partitions.
 
 > #### Example
 >
 > ```ini
 > [corpora]
-> @readers = "my_dict_reader.v1"
+> @readers = "my_custom_reader.v1"
 > train_path = ${paths:train}
 > dev_path = ${paths:dev}
 > shuffle = true
 >
 > ```
 
+| Name      | Description                                                                                                                                                                                                              |
+| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `corpora` | A dictionary keyed by string names, mapped to corpus functions that receive the current `nlp` object and return an iterator of [`Example`](/api/example) objects. ~~Dict[str, Callable[[Language], Iterator[Example]]]~~ |
+
 ### training {#config-training tag="section"}
 
 This section defines settings and controls for the training and evaluation

From eed4b785f51fcff2783e06306441f55437fc95fb Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 18 Sep 2020 15:45:55 +0200
Subject: [PATCH 047/516] Load vocab lookups tables at beginning of training

Similar to how vectors are handled, move the vocab lookups to be loaded
at the start of training rather than when the vocab is initialized,
since the vocab doesn't have access to the full config when it's
created.

The option moves from `nlp.load_vocab_data` to `training.lookups`.

Typically these tables will come from `spacy-lookups-data`, but any
`Lookups` object can be provided.

The loading from `spacy-lookups-data` is now strict, so configs for each
language should specify the exact tables required. This also makes it
easier to control whether the larger clusters and probs tables are
included.

To load `lexeme_norm` from `spacy-lookups-data`:

```
[training.lookups]
@misc = "spacy.LoadLookupsData.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
```
---
 spacy/cli/train.py       |  1 +
 spacy/default_config.cfg |  2 +-
 spacy/language.py        |  8 +++++++-
 spacy/schemas.py         |  3 ++-
 spacy/tests/test_util.py |  7 ++-----
 spacy/util.py            |  8 ++++++++
 spacy/vocab.pyx          | 17 +++++++++--------
 7 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 50306b350..c6b39c289 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -88,6 +88,7 @@ def train(
     sourced_components = get_sourced_components(config)
     with show_validation_error(config_path):
         nlp, config = util.load_model_from_config(config)
+    util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
     if config["training"]["vectors"] is not None:
         util.load_vectors_into_model(nlp, config["training"]["vectors"])
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index c7c9593d7..1517421f0 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -12,7 +12,6 @@ use_pytorch_for_gpu_memory = false
 lang = null
 pipeline = []
 disabled = []
-load_vocab_data = true
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@@ -58,6 +57,7 @@ accumulate_gradient = 1
 init_tok2vec = ${paths.init_tok2vec}
 raw_text = ${paths.raw}
 vectors = null
+lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0
diff --git a/spacy/language.py b/spacy/language.py
index d530e6b92..1d0990c55 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -31,6 +31,7 @@ from .schemas import ConfigSchema
 from .git_info import GIT_VERSION
 from . import util
 from . import about
+from .lookups import load_lookups
 
 
 # This is the base config will all settings (training etc.)
@@ -86,6 +87,12 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory
 
 
+@registry.misc("spacy.LoadLookupsData.v1")
+def load_lookups_data(lang, tables):
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
@@ -152,7 +159,6 @@ class Language:
                 self.lang,
                 self.Defaults,
                 vectors_name=vectors_name,
-                load_data=self._config["nlp"]["load_vocab_data"],
             )
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 06bc4beed..c72b5ca8b 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -8,6 +8,7 @@ from collections import defaultdict
 from thinc.api import Optimizer
 
 from .attrs import NAMES
+from .lookups import Lookups
 
 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
@@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel):
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -227,7 +229,6 @@ class ConfigSchemaNlp(BaseModel):
     pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
     disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
     tokenizer: Callable = Field(..., title="The tokenizer to use")
-    load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
     before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
index 1f073ab32..8c931d31e 100644
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@@ -69,7 +69,6 @@ def test_util_dot_section():
     [nlp]
     lang = "en"
     pipeline = ["textcat"]
-    load_vocab_data = false
 
     [components]
 
@@ -95,15 +94,13 @@ def test_util_dot_section():
     # not exclusive_classes
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
-    assert not en_config["nlp"]["load_vocab_data"]
-    assert nl_config["nlp"]["load_vocab_data"]  # default value True
+    assert en_config["nlp"]["pipeline"] == ["textcat"]
+    assert nl_config["nlp"]["pipeline"] == [] # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.unknownattribute")
-    assert not dot_to_object(en_config, "nlp.load_vocab_data")
-    assert dot_to_object(nl_config, "nlp.load_vocab_data")
     assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
 
 
diff --git a/spacy/util.py b/spacy/util.py
index 18b34e4d6..2e285a128 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -253,6 +253,14 @@ def load_vectors_into_model(
                 nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
 
 
+def load_vocab_data_into_model(
+    nlp: "Language", *, lookups: Optional["Lookups"]=None
+) -> None:
+    """Load vocab data."""
+    if lookups:
+        nlp.vocab.load_lookups(lookups)
+
+
 def load_model(
     name: Union[str, Path],
     *,
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ef0847e54..94289036a 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
 
 
-def create_vocab(lang, defaults, vectors_name=None, load_data=True):
+def create_vocab(lang, defaults, vectors_name=None):
     # If the spacy-lookups-data package is installed, we pre-populate the lookups
     # with lexeme data, if available
-    if load_data:
-        tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
-        lookups = load_lookups(lang, tables=tables, strict=False)
-    else:
-        lookups = Lookups()
     lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
     # This is messy, but it's the minimal working fix to Issue #639.
     lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
@@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
     lex_attrs[NORM] = util.add_lookups(
         lex_attrs.get(NORM, LEX_ATTRS[NORM]),
         BASE_NORMS,
-        lookups.get_table("lexeme_norm", {}),
     )
     return Vocab(
         lex_attr_getters=lex_attrs,
-        lookups=lookups,
         writing_system=defaults.writing_system,
         get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
         vectors_name=vectors_name,
@@ -424,6 +417,14 @@ cdef class Vocab:
             orth = self.strings.add(orth)
         return orth in self.vectors
 
+    def load_lookups(self, lookups):
+        self.lookups = lookups
+        if lookups.has_table("lexeme_norm"):
+            self.lex_attr_getters[NORM] = util.add_lookups(
+                self.lex_attr_getters[NORM],
+                lookups.get_table("lexeme_norm"),
+            )
+
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.
 

From 73ff52b9ec9e61ae2d7faeacfef1b7bee53ea10e Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 18 Sep 2020 16:43:15 +0200
Subject: [PATCH 048/516] hack for tok2vec listener

---
 spacy/cli/debug_model.py | 26 +++++++++++++++++---------
 spacy/errors.py          |  3 ++-
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 04a14bdc9..1d8d043fd 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -66,10 +66,12 @@ def debug_model_cli(
             exits=1,
         )
     model = pipe.model
-    debug_model(model, print_settings=print_settings)
+    # call _link_components directly as we won't call nlp.begin_training
+    nlp._link_components()
+    debug_model(nlp, model, print_settings=print_settings)
 
 
-def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
+def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
     if not isinstance(model, Model):
         msg.fail(
             f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
@@ -86,10 +88,10 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
 
     # STEP 1: Initializing the model and printing again
     X = _get_docs()
-    Y = _get_output(model.ops)
+    goldY = _get_output(model.ops)
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
-        model.initialize(X=X, Y=Y)
+        model.initialize(X=X, Y=goldY)
     if print_settings.get("print_after_init"):
         msg.divider(f"STEP 1 - after initialization")
         _print_model(model, print_settings)
@@ -97,9 +99,16 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
     # STEP 2: Updating the model and printing again
     optimizer = Adam(0.001)
     set_dropout_rate(model, 0.2)
+    # ugly hack to deal with Tok2Vec listeners
+    tok2vec = None
+    if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
+        tok2vec = nlp.get_pipe("tok2vec")
+        tok2vec.model.initialize(X=X)
     for e in range(3):
-        Y, get_dX = model.begin_update(_get_docs())
-        dY = get_gradient(model, Y)
+        if tok2vec:
+            tok2vec.predict(X)
+        Y, get_dX = model.begin_update(X)
+        dY = get_gradient(goldY, Y)
         get_dX(dY)
         model.finish_update(optimizer)
     if print_settings.get("print_after_training"):
@@ -107,7 +116,7 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
         _print_model(model, print_settings)
 
     # STEP 3: the final prediction
-    prediction = model.predict(_get_docs())
+    prediction = model.predict(X)
     if print_settings.get("print_prediction"):
         msg.divider(f"STEP 3 - prediction")
         msg.info(str(prediction))
@@ -115,8 +124,7 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
     msg.good(f"Succesfully ended analysis - model looks good!")
 
 
-def get_gradient(model, Y):
-    goldY = _get_output(model.ops)
+def get_gradient(goldY, Y):
     return Y - goldY
 
 
diff --git a/spacy/errors.py b/spacy/errors.py
index 173aedab9..af307e069 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -545,7 +545,8 @@ class Errors:
     E949 = ("Can only create an alignment when the texts are the same.")
     E952 = ("The section '{name}' is not a valid section in the provided config.")
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
-    E954 = ("The Tok2Vec listener did not receive a valid input.")
+    E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
+            "component.")
     E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")

From 47080fba98bf7efd7432a0ac831d5715fad91a59 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 18 Sep 2020 19:43:19 +0200
Subject: [PATCH 049/516] Minor renaming / refactoring

* Rename loader to `spacy.LookupsDataLoader.v1`, add debugging message
* Make `Vocab.lookups` a property
---
 spacy/language.py |  3 ++-
 spacy/util.py     |  2 +-
 spacy/vocab.pxd   |  2 +-
 spacy/vocab.pyx   | 19 ++++++++++++-------
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 1d0990c55..7d463731a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -87,8 +87,9 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory
 
 
-@registry.misc("spacy.LoadLookupsData.v1")
+@registry.misc("spacy.LookupsDataLoader.v1")
 def load_lookups_data(lang, tables):
+    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
     lookups = load_lookups(lang=lang, tables=tables)
     return lookups
 
diff --git a/spacy/util.py b/spacy/util.py
index 2e285a128..88162b23a 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -258,7 +258,7 @@ def load_vocab_data_into_model(
 ) -> None:
     """Load vocab data."""
     if lookups:
-        nlp.vocab.load_lookups(lookups)
+        nlp.vocab.lookups = lookups
 
 
 def load_model(
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 69cec7d3d..7d8dfd5d6 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -28,7 +28,7 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef public Morphology morphology
     cpdef public object vectors
-    cpdef public object lookups
+    cpdef public object _lookups
     cpdef public object writing_system
     cpdef public object get_noun_chunks
     cdef readonly int length
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 94289036a..ce104d9db 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -417,13 +417,18 @@ cdef class Vocab:
             orth = self.strings.add(orth)
         return orth in self.vectors
 
-    def load_lookups(self, lookups):
-        self.lookups = lookups
-        if lookups.has_table("lexeme_norm"):
-            self.lex_attr_getters[NORM] = util.add_lookups(
-                self.lex_attr_getters[NORM],
-                lookups.get_table("lexeme_norm"),
-            )
+    property lookups:
+        def __get__(self):
+            return self._lookups
+
+        def __set__(self, lookups):
+            self._lookups = lookups
+            if lookups.has_table("lexeme_norm"):
+                self.lex_attr_getters[NORM] = util.add_lookups(
+                    self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
+                    self.lookups.get_table("lexeme_norm"),
+                )
+
 
     def to_disk(self, path, *, exclude=tuple()):
         """Save the current state to a directory.

From 39872de1f6e49c4b59ed747a2f15ca448a52f7db Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Sat, 19 Sep 2020 01:17:02 +0200
Subject: [PATCH 050/516] Introducing the gpu_allocator (#6091)

* rename 'use_pytorch_for_gpu_memory' to 'gpu_allocator'

* --code instead of --code-path

* update documentation

* avoid querying the "system" section directly

* add explanation of gpu_allocator to TF/PyTorch section in docs

* fix typo

* fix typo 2

* use set_gpu_allocator from thinc 8.0.0a34

* default null instead of empty string
---
 pyproject.toml                                |  2 +-
 requirements.txt                              |  2 +-
 setup.cfg                                     |  4 ++--
 spacy/cli/debug_model.py                      |  9 +++++++--
 spacy/cli/pretrain.py                         | 17 +++++++++--------
 spacy/cli/templates/quickstart_training.jinja |  2 +-
 spacy/cli/train.py                            | 13 ++++++-------
 spacy/default_config.cfg                      |  4 ++--
 spacy/schemas.py                              |  1 +
 website/docs/api/cli.md                       |  4 +++-
 website/docs/api/data-formats.md              |  1 +
 website/docs/api/top-level.md                 | 14 ++++++++------
 website/docs/usage/layers-architectures.md    | 12 ++++++++++++
 13 files changed, 54 insertions(+), 31 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a413a099c..5290660aa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a33,<8.0.0a40",
+    "thinc>=8.0.0a34,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index 55fe627b8..4d6c1dfd0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a33,<8.0.0a40
+thinc>=8.0.0a34,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 359e63172..dd0975800 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a33,<8.0.0a40
+    thinc>=8.0.0a34,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a33,<8.0.0a40
+    thinc>=8.0.0a34,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index a4899a458..349849f58 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -2,7 +2,7 @@ from typing import Dict, Any, Optional
 from pathlib import Path
 from wasabi import msg
 from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
-from thinc.api import Model, data_validation
+from thinc.api import Model, data_validation, set_gpu_allocator
 import typer
 
 from ._util import Arg, Opt, debug_cli, show_validation_error
@@ -53,7 +53,12 @@ def debug_model_cli(
     }
     config_overrides = parse_config_overrides(ctx.args)
     with show_validation_error(config_path):
-        config = util.load_config(config_path, overrides=config_overrides)
+        config = util.load_config(
+            config_path, overrides=config_overrides, interpolate=True
+        )
+        allocator = config["training"]["gpu_allocator"]
+        if use_gpu >= 0 and allocator:
+            set_gpu_allocator(allocator)
         nlp, config = util.load_model_from_config(config_path)
     seed = config["training"]["seed"]
     if seed is not None:
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index aec077eb7..9e913396e 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -4,10 +4,9 @@ import time
 import re
 from collections import Counter
 from pathlib import Path
-from thinc.api import Config
-from thinc.api import use_pytorch_for_gpu_memory, require_gpu
+from thinc.api import require_gpu, set_gpu_allocator
 from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
-from thinc.api import CosineDistance, L2Distance
+from thinc.api import Config, CosineDistance, L2Distance
 from wasabi import msg
 import srsly
 from functools import partial
@@ -32,7 +31,7 @@ def pretrain_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
     output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
-    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
@@ -99,10 +98,12 @@ def pretrain(
     epoch_resume: Optional[int] = None,
     use_gpu: int = -1,
 ):
-    if config["system"].get("seed") is not None:
-        fix_random_seed(config["system"]["seed"])
-    if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"):
-        use_pytorch_for_gpu_memory()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+
     nlp, config = util.load_model_from_config(config)
     P_cfg = config["pretraining"]
     corpus = dot_to_object(config, P_cfg["corpus"])
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 00b77af4d..ef608e5e8 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -8,7 +8,7 @@ train = ""
 dev = ""
 
 [system]
-use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }}
+gpu_allocator = {{ "pytorch" if use_transformer else "" }}
 
 [nlp]
 lang = "{{ lang }}"
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 50306b350..debecd0b1 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -6,8 +6,7 @@ from pathlib import Path
 from wasabi import msg
 import thinc
 import thinc.schedules
-from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
-from thinc.api import Config, Optimizer
+from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
 import random
 import typer
 import logging
@@ -29,7 +28,7 @@ def train_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
@@ -79,11 +78,11 @@ def train(
         config = util.load_config(
             config_path, overrides=config_overrides, interpolate=True
         )
-    if config.get("training", {}).get("seed") is not None:
+    if config["training"]["seed"] is not None:
         fix_random_seed(config["training"]["seed"])
-    if config.get("system", {}).get("use_pytorch_for_gpu_memory"):
-        # It feels kind of weird to not have a default for this.
-        use_pytorch_for_gpu_memory()
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     with show_validation_error(config_path):
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index c7c9593d7..f4a453f2a 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -6,7 +6,7 @@ init_tok2vec = null
 
 [system]
 seed = 0
-use_pytorch_for_gpu_memory = false
+gpu_allocator = null
 
 [nlp]
 lang = null
@@ -52,6 +52,7 @@ limit = 0
 # Training hyper-parameters and additional features.
 [training]
 seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
 # Extra resources for transfer-learning or pseudo-rehearsal
@@ -75,7 +76,6 @@ train_corpus = "corpora.train"
 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"
 
-
 [training.batcher]
 @batchers = "spacy.batch_by_words.v1"
 discard_oversize = false
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 06bc4beed..db71af9ca 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -207,6 +207,7 @@ class ConfigSchemaTraining(BaseModel):
     max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for")
     eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)")
     seed: Optional[StrictInt] = Field(..., title="Random seed")
+    gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
     score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index bd65a1516..7374e1e3f 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -763,6 +763,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides
 | `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~                                                                              |
 | `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~       |
 | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                               |
+| `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                 |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                 |
 | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
 | **CREATES**       | The final trained pipeline and the best trained pipeline.                                                                                                                                  |
@@ -798,11 +799,12 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
 
 | Name                    | Description                                                                                                                                                                           |
 | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `output_dir`            | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                              |
 | `config_path`           | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~                                                           |
+| `output_dir`            | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                              |
 | `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~  |
 | `--resume-path`, `-r`   | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~                                                                                             |
 | `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~                                   |
+| `--gpu-id`, `-g`        | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                            |
 | `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                            |
 | overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
 | **CREATES**             | The pretrained weights that can be used to initialize `spacy train`.                                                                                                                  |
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 3ed846b9e..6e80bb409 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -189,6 +189,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `dev_corpus`          | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~                                                                                                              |
 | `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                               |
 | `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                    |
+| `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be "pytorch" or "tensorflow". Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                |
 | `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                              |
 | `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                              |
 | `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                              |
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 5d850be01..3f51d21aa 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -145,9 +145,10 @@ pipelines.
 > nlp = spacy.load("en_core_web_sm")
 > ```
 
-| Name        | Description                             |
-| ----------- | --------------------------------------- |
-| **RETURNS** | Whether the GPU was activated. ~~bool~~ |
+| Name        | Description                                      |
+| ----------- | ------------------------------------------------ |
+| `gpu_id`    | Device index to select. Defaults to `0`. ~~int~~ |
+| **RETURNS** | Whether the GPU was activated. ~~bool~~          |
 
 ### spacy.require_gpu {#spacy.require_gpu tag="function" new="2.0.14"}
 
@@ -164,9 +165,10 @@ and _before_ loading any pipelines.
 > nlp = spacy.load("en_core_web_sm")
 > ```
 
-| Name        | Description     |
-| ----------- | --------------- |
-| **RETURNS** | `True` ~~bool~~ |
+| Name        | Description                                      |
+| ----------- | ------------------------------------------------ |
+| `gpu_id`    | Device index to select. Defaults to `0`. ~~int~~ |
+| **RETURNS** | `True` ~~bool~~                                  |
 
 ## displaCy {#displacy source="spacy/displacy"}
 
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index aefc64ece..f9787d815 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -356,6 +356,18 @@ that training configs are complete and experiments fully reproducible.
 
 </Infobox>
 
+Note that when using a PyTorch or Tensorflow model, it is recommended to set the GPU
+memory allocator accordingly. When `gpu_allocator` is set to "pytorch" or
+"tensorflow" in the training config, cupy will allocate memory via those respective libraries,
+preventing OOM errors when there's available memory sitting in the other
+library's pool.
+
+```ini
+### config.cfg (excerpt)
+[training]
+gpu_allocator = "pytorch"
+```
+
 ## Custom models with Thinc {#thinc}
 
 Of course it's also possible to define the `Model` from the previous section

From 6db1d5dc0dff848dded3d2990543f749707afc45 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sat, 19 Sep 2020 19:11:30 +0200
Subject: [PATCH 051/516] trying some stuff

---
 spacy/cli/debug_model.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 1d8d043fd..09feaf671 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -89,6 +89,7 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] =
     # STEP 1: Initializing the model and printing again
     X = _get_docs()
     goldY = _get_output(model.ops)
+    # _set_output_dim(nO=goldY.shape[-1], model=model)
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
         model.initialize(X=X, Y=goldY)
@@ -108,6 +109,7 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] =
         if tok2vec:
             tok2vec.predict(X)
         Y, get_dX = model.begin_update(X)
+        print("get_dX", get_dX)
         dY = get_gradient(goldY, Y)
         get_dX(dY)
         model.finish_update(optimizer)
@@ -152,6 +154,10 @@ def _get_output(ops):
     return ops.xp.asarray(output)
 
 
+def _get_output_old(xp):
+    return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")
+
+
 def _print_model(model, print_settings):
     layers = print_settings.get("layers", "")
     parameters = print_settings.get("parameters", False)
@@ -200,3 +206,12 @@ def _print_matrix(value):
     sample_matrix = sample_matrix[0:5]
     result = result + str(sample_matrix)
     return result
+
+
+def _set_output_dim(model, nO):
+    # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx
+    if model.has_dim("nO") is None:
+        model.set_dim("nO", nO)
+    if model.has_ref("output_layer"):
+        if model.get_ref("output_layer").has_dim("nO") is None:
+            model.get_ref("output_layer").set_dim("nO", nO)
\ No newline at end of file

From 554c9a24978d968113da02783c7257b5133ec5e6 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 20 Sep 2020 12:30:53 +0200
Subject: [PATCH 052/516] Update docs [ci skip]

---
 spacy/cli/templates/quickstart_training.jinja |  6 +++++-
 website/docs/api/data-formats.md              |  7 +++----
 website/docs/api/top-level.md                 | 10 ++++++++++
 website/docs/usage/embeddings-transformers.md | 10 ++++++++++
 website/docs/usage/projects.md                | 10 ++++------
 5 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index ef608e5e8..0db4c8a59 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -8,7 +8,11 @@ train = ""
 dev = ""
 
 [system]
-gpu_allocator = {{ "pytorch" if use_transformer else "" }}
+{% if use_transformer -%}
+gpu_allocator = "pytorch"
+{% else -%}
+gpu_allocator = null
+{% endif %}
 
 [nlp]
 lang = "{{ lang }}"
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 6e80bb409..3a214428b 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -60,7 +60,6 @@ your config and check that it's valid, you can run the
 > [nlp]
 > lang = "en"
 > pipeline = ["tagger", "parser", "ner"]
-> load_vocab_data = true
 > before_creation = null
 > after_creation = null
 > after_pipeline_creation = null
@@ -77,7 +76,6 @@ Defines the `nlp` object, its tokenizer and
 | `lang`                    | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~                                                                                                                                                                                        |
 | `pipeline`                | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~                                                                        |
 | `disabled`                | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a pipeline is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
-| `load_vocab_data`         | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~                                                                                                                                |
 | `before_creation`         | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~                                                                                                      |
 | `after_creation`          | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~                                                                                                                    |
 | `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~                                                                                                   |
@@ -189,9 +187,10 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `dev_corpus`          | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~                                                                                                              |
 | `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                               |
 | `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                    |
-| `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be "pytorch" or "tensorflow". Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                |
 | `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                              |
+| `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                            |
 | `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                              |
+| `lookups`             | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~                                                     |
 | `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                              |
 | `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                    |
 | `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                      |
@@ -476,7 +475,7 @@ lexical data.
 Here's an example of the 20 most frequent lexemes in the English training data:
 
 ```json
-%%GITHUB_SPACY / extra / example_data / vocab - data.jsonl
+%%GITHUB_SPACY/extra/example_data/vocab-data.jsonl
 ```
 
 ## Pipeline meta {#meta}
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 3f51d21aa..7afe02403 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -458,6 +458,16 @@ remain in the config file stored on your local system.
 | `project_name`         | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
 | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~                              |
 
+<Project id="integrations/wandb">
+
+Get started with tracking your spaCy training runs in Weights & Biases using our
+project template. It trains on the IMDB Movie Review Dataset and includes a
+simple config with the built-in `WandbLogger`, as well as a custom example of
+creating variants of the config for a simple hyperparameter grid search and
+logging the results.
+
+</Project>
+
 ## Readers {#readers source="spacy/training/corpus.py" new="3"}
 
 Corpus readers are registered functions that load data and return a function
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 4adcd927c..c6c703842 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -655,6 +655,16 @@ and pass in optional config overrides, like the path to the raw text file:
 $ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl
 ```
 
+The following defaults are used for the `[pretraining]` block and merged into
+your existing config when you run [`init config`](/api/cli#init-config) or
+[`init fill-config`](/api/cli#init-fill-config) with `--pretraining`. If needed,
+you can [configure](#pretraining-configure) the settings and hyperparameters or
+change the [objective](#pretraining-details).
+
+```ini
+%%GITHUB_SPACY/spacy/default_config_pretraining.cfg
+```
+
 ### How pretraining works {#pretraining-details}
 
 The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 665caa15b..08bfb9da2 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -976,14 +976,12 @@ your results.
 
 ![Screenshot: Parameter importance using config values](../images/wandb2.jpg 'Parameter importance using config values')
 
-<!-- TODO:
-
 <Project id="integrations/wandb">
 
 Get started with tracking your spaCy training runs in Weights & Biases using our
-project template. It includes a simple config using the `WandbLogger`, as well
-as a custom logger implementation you can adjust for your specific use case.
+project template. It trains on the IMDB Movie Review Dataset and includes a
+simple config with the built-in `WandbLogger`, as well as a custom example of
+creating variants of the config for a simple hyperparameter grid search and
+logging the results.
 
 </Project>
-
--->

From 889128e5c586f39eb6f18ae6a6b6fbe1505f4080 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 20 Sep 2020 16:20:57 +0200
Subject: [PATCH 053/516] Improve error handling in run_command

---
 spacy/util.py | 43 ++++++++++++++++++++++++++++---------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/spacy/util.py b/spacy/util.py
index 88162b23a..6e7b28fec 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -659,8 +659,8 @@ def join_command(command: List[str]) -> str:
 def run_command(
     command: Union[str, List[str]],
     *,
-    capture: bool = False,
     stdin: Optional[Any] = None,
+    capture: bool=False,
 ) -> Optional[subprocess.CompletedProcess]:
     """Run a command on the command line as a subprocess. If the subprocess
     returns a non-zero exit code, a system exit is performed.
@@ -668,33 +668,46 @@ def run_command(
     command (str / List[str]): The command. If provided as a string, the
         string will be split using shlex.split.
     stdin (Optional[Any]): stdin to read from or None.
-    capture (bool): Whether to capture the output.
+    capture (bool): Whether to capture the output and errors. If False,
+        the stdout and stderr will not be redirected, and if there's an error,
+        sys.exit will be called with the returncode. You should use capture=False
+        when you want to turn over execution to the command, and capture=True
+        when you want to run the command more like a function.
     RETURNS (Optional[CompletedProcess]): The process object.
     """
     if isinstance(command, str):
-        command = split_command(command)
+        cmd_list = split_command(command)
+        cmd_str = command
+    else:
+        cmd_list = command
+        cmd_str = " ".join(command)
     try:
         ret = subprocess.run(
-            command,
+            cmd_list,
             env=os.environ.copy(),
             input=stdin,
             encoding="utf8",
-            check=True,
+            check=False,
             stdout=subprocess.PIPE if capture else None,
-            stderr=subprocess.PIPE if capture else None,
+            stderr=subprocess.STDOUT if capture else None,
         )
     except FileNotFoundError:
+        # Indicates the *command* wasn't found, it's an error before the command
+        # is run.
         raise FileNotFoundError(
-            Errors.E970.format(str_command=" ".join(command), tool=command[0])
+            Errors.E970.format(str_command=cmd_str, tool=cmd_list[0])
         ) from None
-    except subprocess.CalledProcessError as e:
-        # We don't want a duplicate traceback here so we're making sure the
-        # CalledProcessError isn't re-raised. We also print both the string
-        # message and the stderr, in case the error only has one of them.
-        print(e.stderr)
-        print(e)
-        sys.exit(1)
-    if ret.returncode != 0:
+    if ret.returncode != 0 and capture:
+        message = f"Error running command:\n\n{cmd_str}\n\n"
+        message += f"Subprocess exited with status {ret.returncode}"
+        if ret.stdout is not None:
+            message += f"\n\nProcess log (stdout and stderr):\n\n"
+            message += ret.stdout
+        error = subprocess.SubprocessError(message)
+        error.ret = ret
+        error.command = cmd_str
+        raise error
+    elif ret.returncode != 0:
         sys.exit(ret.returncode)
     return ret
 

From 2c24d633d0f81e17dca2158b5185f316ae910130 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 20 Sep 2020 16:21:43 +0200
Subject: [PATCH 054/516] Use updated run_command

---
 spacy/cli/package.py     | 2 +-
 spacy/cli/project/run.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 8d6cd84c1..49a0ab75d 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -110,7 +110,7 @@ def package(
     msg.good(f"Successfully created package '{model_name_v}'", main_path)
     if create_sdist:
         with util.working_dir(main_path):
-            util.run_command([sys.executable, "setup.py", "sdist"])
+            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
         zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
         msg.good(f"Successfully created zipped Python package", zip_file)
 
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index eb7b8cc5b..13c28f1da 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -144,7 +144,7 @@ def run_commands(
         if not silent:
             print(f"Running command: {join_command(command)}")
         if not dry:
-            run_command(command)
+            run_command(command, capture=False)
 
 
 def validate_subcommand(

From a0fb5e50dbb1e24901f7b1470ee53cc6bce7a4d6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 20 Sep 2020 16:22:04 +0200
Subject: [PATCH 055/516] Use simple git clone call if not sparse

---
 spacy/cli/_util.py | 77 ++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 44 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index e8f3be995..6675f4d50 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -308,6 +308,31 @@ def git_checkout(
         msg.fail("Destination of checkout must not exist", exits=1)
     if not dest.parent.exists():
         raise IOError("Parent of destination of checkout must exist")
+
+    if sparse and git_version >= (2, 22):
+        return git_sparse_checkout(repo, subpath, dest, branch)
+    elif sparse:
+        # Only show warnings if the user explicitly wants sparse checkout but
+        # the Git version doesn't support it
+        err_old = (
+            f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
+            f"that doesn't fully support sparse checkout yet."
+        )
+        err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
+        msg.warn(
+            f"{err_unk if git_version == (0, 0) else err_old} "
+            f"This means that more files than necessary may be downloaded "
+            f"temporarily. To only download the files needed, make sure "
+            f"you're using Git v2.22 or above."
+        )
+    with make_tempdir() as tmp_dir:
+        cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
+        ret = run_command(cmd, capture=True)
+        # We need Path(name) to make sure we also support subdirectories
+        shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
+
+
+def git_sparse_checkout(repo, subpath, dest, branch):
     # We're using Git, partial clone and sparse checkout to
     # only clone the files we need
     # This ends up being RIDICULOUS. omg.
@@ -324,47 +349,28 @@ def git_checkout(
     # *that* we can do by path.
     # We're using Git and sparse checkout to only clone the files we need
     with make_tempdir() as tmp_dir:
-        supports_sparse = git_version >= (2, 22)
-        use_sparse = supports_sparse and sparse
         # This is the "clone, but don't download anything" part.
-        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
-        if use_sparse:
-            cmd += f"--filter=blob:none"  # <-- The key bit
-        # Only show warnings if the user explicitly wants sparse checkout but
-        # the Git version doesn't support it
-        elif sparse:
-            err_old = (
-                f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
-                f"that doesn't fully support sparse checkout yet."
-            )
-            err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
-            msg.warn(
-                f"{err_unk if git_version == (0, 0) else err_old} "
-                f"This means that more files than necessary may be downloaded "
-                f"temporarily. To only download the files needed, make sure "
-                f"you're using Git v2.22 or above."
-            )
-        try_run_command(cmd)
+        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} --filter=blob:none"
+        run_command(cmd)
         # Now we need to find the missing filenames for the subpath we want.
         # Looking for this 'rev-list' command in the git --help? Hah.
         cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}"
-        ret = try_run_command(cmd)
+        ret = run_command(cmd, capture=True)
         git_repo = _from_http_to_git(repo)
         # Now pass those missings into another bit of git internals
         missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
-        if use_sparse and not missings:
+        if not missings:
             err = (
                 f"Could not find any relevant files for '{subpath}'. "
                 f"Did you specify a correct and complete path within repo '{repo}' "
                 f"and branch {branch}?"
             )
             msg.fail(err, exits=1)
-        if use_sparse:
-            cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
-            try_run_command(cmd)
+        cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
+        run_command(cmd, capture=True)
         # And finally, we can checkout our subpath
         cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
-        try_run_command(cmd)
+        run_command(cmd, capture=True)
         # We need Path(name) to make sure we also support subdirectories
         shutil.move(str(tmp_dir / Path(subpath)), str(dest))
 
@@ -378,7 +384,7 @@ def get_git_version(
     RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
         (0, 0) if the version couldn't be determined.
     """
-    ret = try_run_command(["git", "--version"], error=error)
+    ret = run_command("git --version", capture=True)
     stdout = ret.stdout.strip()
     if not stdout or not stdout.startswith("git version"):
         return (0, 0)
@@ -386,23 +392,6 @@ def get_git_version(
     return (int(version[0]), int(version[1]))
 
 
-def try_run_command(
-    cmd: Union[str, List[str]], error: str = "Could not run command"
-) -> subprocess.CompletedProcess:
-    """Try running a command and raise an error if it fails.
-
-    cmd (Union[str, List[str]]): The command to run.
-    error (str): The error message.
-    RETURNS (CompletedProcess): The completed process if the command ran.
-    """
-    try:
-        return run_command(cmd, capture=True)
-    except subprocess.CalledProcessError as e:
-        msg.fail(error)
-        print(cmd)
-        sys.exit(1)
-
-
 def _from_http_to_git(repo: str) -> str:
     if repo.startswith("http://"):
         repo = repo.replace(r"http://", r"https://")

From dc22771f879455a81d8338588aa726a58b08bf50 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 20 Sep 2020 16:30:05 +0200
Subject: [PATCH 056/516] Fix sparse checkout

---
 spacy/cli/_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 6675f4d50..cc7be1144 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -354,7 +354,7 @@ def git_sparse_checkout(repo, subpath, dest, branch):
         run_command(cmd)
         # Now we need to find the missing filenames for the subpath we want.
         # Looking for this 'rev-list' command in the git --help? Hah.
-        cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}"
+        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
         ret = run_command(cmd, capture=True)
         git_repo = _from_http_to_git(repo)
         # Now pass those missings into another bit of git internals

From 8fb59d958c9676f32d84227c0b042a26b088da35 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 20 Sep 2020 16:31:48 +0200
Subject: [PATCH 057/516] Format

---
 spacy/cli/_util.py | 5 ++++-
 spacy/util.py      | 4 ++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index cc7be1144..c67863ef1 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -350,7 +350,10 @@ def git_sparse_checkout(repo, subpath, dest, branch):
     # We're using Git and sparse checkout to only clone the files we need
     with make_tempdir() as tmp_dir:
         # This is the "clone, but don't download anything" part.
-        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} --filter=blob:none"
+        cmd = (
+            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
+            f"-b {branch} --filter=blob:none"
+        )
         run_command(cmd)
         # Now we need to find the missing filenames for the subpath we want.
         # Looking for this 'rev-list' command in the git --help? Hah.
diff --git a/spacy/util.py b/spacy/util.py
index 6e7b28fec..93000ea27 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -254,7 +254,7 @@ def load_vectors_into_model(
 
 
 def load_vocab_data_into_model(
-    nlp: "Language", *, lookups: Optional["Lookups"]=None
+    nlp: "Language", *, lookups: Optional["Lookups"] = None
 ) -> None:
     """Load vocab data."""
     if lookups:
@@ -660,7 +660,7 @@ def run_command(
     command: Union[str, List[str]],
     *,
     stdin: Optional[Any] = None,
-    capture: bool=False,
+    capture: bool = False,
 ) -> Optional[subprocess.CompletedProcess]:
     """Run a command on the command line as a subprocess. If the subprocess
     returns a non-zero exit code, a system exit is performed.

From 744f259b9c93858d97937157414cb67641d4c846 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 20 Sep 2020 16:37:23 +0200
Subject: [PATCH 058/516] Update landing [ci skip]

---
 website/src/widgets/landing.js | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index 77fcdfd81..41b009010 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -30,7 +30,6 @@ import Benchmarks from 'usage/_benchmarks-models.md'
 
 const CODE_EXAMPLE = `# pip install spacy
 # python -m spacy download en_core_web_sm
-
 import spacy
 
 # Load English tokenizer, tagger, parser and NER
@@ -120,7 +119,7 @@ const Landing = ({ data }) => {
                         </Li>
                         <Li>
                             ✅ Components for <strong>named entity</strong> recognition,
-                            part-of-speech-tagging, dependency parsing, sentence segmentation,{' '}
+                            part-of-speech tagging, dependency parsing, sentence segmentation,{' '}
                             <strong>text classification</strong>, lemmatization, morphological
                             analysis, entity linking and more
                         </Li>

From b2302c0a1ce7bacafdde22039cbd8da9782a3f27 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 20 Sep 2020 17:44:51 +0200
Subject: [PATCH 059/516] Improve error for missing dependency

---
 spacy/cli/project/run.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index 13c28f1da..d7e1075f3 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -59,8 +59,9 @@ def project_run(
         for dep in cmd.get("deps", []):
             if not (project_dir / dep).exists():
                 err = f"Missing dependency specified by command '{subcommand}': {dep}"
+                err_help = "Maybe you forgot to run the 'project assets' command?"
                 err_kwargs = {"exits": 1} if not dry else {}
-                msg.fail(err, **err_kwargs)
+                msg.fail(err, err_help, **err_kwargs)
         with working_dir(project_dir) as current_dir:
             rerun = check_rerun(current_dir, cmd)
             if not rerun and not force:

From 012b3a709636224534e44720bca00cb0cc6e3f92 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 20 Sep 2020 17:44:58 +0200
Subject: [PATCH 060/516] Update docs [ci skip]

---
 website/docs/api/cli.md                       |  4 +--
 website/docs/usage/embeddings-transformers.md |  5 ++-
 website/docs/usage/facts-figures.md           |  6 ++--
 website/docs/usage/layers-architectures.md    | 12 +++----
 website/docs/usage/models.md                  |  2 --
 website/docs/usage/projects.md                | 18 ++++------
 website/docs/usage/saving-loading.md          | 13 +++++--
 website/docs/usage/training.md                | 11 +++++-
 website/docs/usage/v3.md                      | 34 +++++++++++--------
 website/meta/site.json                        |  1 +
 website/src/components/tag.js                 |  2 +-
 website/src/components/util.js                |  1 +
 website/src/widgets/landing.js                |  9 ++---
 website/src/widgets/project.js                | 18 ++++++----
 14 files changed, 77 insertions(+), 59 deletions(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 7374e1e3f..53cd954be 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -895,8 +895,6 @@ what you need. By default, spaCy's
 can provide any other repo (public or private) that you have access to using the
 `--repo` option.
 
-<!-- TODO: update example once we've decided on repo structure -->
-
 ```cli
 $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse]
 ```
@@ -904,7 +902,7 @@ $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse]
 > #### Example
 >
 > ```cli
-> $ python -m spacy project clone some_example
+> $ python -m spacy project clone pipelines/ner_wikiner
 > ```
 >
 > Clone from custom repo:
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index c6c703842..a855d703c 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -289,8 +289,7 @@ of objects by referring to creation functions, including functions you register
 yourself. For details on how to get started with training your own model, check
 out the [training quickstart](/usage/training#quickstart).
 
-<!-- TODO:
-<Project id="en_core_trf_lg">
+<!-- TODO: <Project id="en_core_trf_lg">
 
 The easiest way to get started is to clone a transformers-based project
 template. Swap in your data, edit the settings and hyperparameters and train,
@@ -623,7 +622,7 @@ that are familiar from the training block: the `[pretraining.batcher]`,
 `[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
 expect the same types of objects, although for pretraining your corpus does not
 need to have any annotations, so you will often use a different reader, such as
-the [`JsonlReader`](/api/toplevel#jsonlreader).
+the [`JsonlReader`](/api/top-level#jsonlreader).
 
 > #### Raw text format
 >
diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md
index bff31d0d6..75f92070a 100644
--- a/website/docs/usage/facts-figures.md
+++ b/website/docs/usage/facts-figures.md
@@ -45,7 +45,7 @@ spaCy v3.0 introduces transformer-based pipelines that bring spaCy's accuracy
 right up to **current state-of-the-art**. You can also use a CPU-optimized
 pipeline, which is less accurate but much cheaper to run.
 
-<!-- TODO: -->
+<!-- TODO: update benchmarks and intro -->
 
 > #### Evaluation details
 >
@@ -68,6 +68,6 @@ our project template.
 
 </Project>
 
-<!-- ## Citing spaCy {#citation}
+<!-- TODO: ## Citing spaCy {#citation}
 
-<!-- TODO: update -->
+-->
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index f9787d815..a58ba2ba9 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -356,11 +356,11 @@ that training configs are complete and experiments fully reproducible.
 
 </Infobox>
 
-Note that when using a PyTorch or Tensorflow model, it is recommended to set the GPU
-memory allocator accordingly. When `gpu_allocator` is set to "pytorch" or
-"tensorflow" in the training config, cupy will allocate memory via those respective libraries,
-preventing OOM errors when there's available memory sitting in the other
-library's pool.
+Note that when using a PyTorch or TensorFlow model, it is recommended to set the
+GPU memory allocator accordingly. When `gpu_allocator` is set to "pytorch" or
+"tensorflow" in the training config, CuPy will allocate memory via those
+respective libraries, preventing OOM errors when there's available memory
+sitting in the other library's pool.
 
 ```ini
 ### config.cfg (excerpt)
@@ -489,7 +489,7 @@ with Model.define_operators({">>": chain}):
 <Infobox title="This section is still under construction" emoji="🚧" variant="warning">
 </Infobox>
 
-<!-- TODO:
+<!-- TODO: write trainable component section
 - Interaction with `predict`, `get_loss` and `set_annotations`
 - Initialization life-cycle with `begin_training`, correlation with add_label
 Example: relation extraction component (implemented as project template)
diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md
index e94cdfe9e..9b686c947 100644
--- a/website/docs/usage/models.md
+++ b/website/docs/usage/models.md
@@ -381,8 +381,6 @@ and loading pipeline packages, the underlying functionality is entirely based on
 native Python packaging. This allows your application to handle a spaCy pipeline
 like any other package dependency.
 
-<!-- TODO: reference relevant spaCy project -->
-
 ### Downloading and requiring package dependencies {#models-download}
 
 spaCy's built-in [`download`](/api/cli#download) command is mostly intended as a
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 08bfb9da2..f8d5a3761 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -29,15 +29,13 @@ and share your results with your team. spaCy projects can be used via the new
 
 ![Illustration of project workflow and commands](../images/projects.svg)
 
-<!-- TODO:
-<Project id="some_example_project">
+<Project id="pipelines/tagger_parser_ud">
 
-Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
-sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
-mattis pretium.
+The easiest way to get started is to clone a project template and run it – for
+example, this end-to-end template that lets you train a **part-of-speech
+tagger** and **dependency parser** on a Universal Dependencies treebank.
 
 </Project>
--->
 
 spaCy projects make it easy to integrate with many other **awesome tools** in
 the data science and machine learning ecosystem to track and manage your data
@@ -65,10 +63,8 @@ project template and copies the files to a local directory. You can then run the
 project, e.g. to train a pipeline and edit the commands and scripts to build
 fully custom workflows.
 
-<!-- TODO: update with real example project -->
-
 ```cli
-python -m spacy project clone some_example_project
+python -m spacy project clone pipelines/tagger_parser_ud
 ```
 
 By default, the project will be cloned into the current working directory. You
@@ -216,10 +212,8 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up
 a quick web demo. It looks pretty similar to a config file used to define CI
 pipelines.
 
-<!-- TODO: update with better (final) example -->
-
 ```yaml
-https://github.com/explosion/projects/tree/v3/tutorials/ner_fashion_brands/project.yml
+https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml
 ```
 
 | Section       | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index c0fe1323c..3a95bf6aa 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -574,7 +574,7 @@ The directory will be created if it doesn't exist, and the whole pipeline data,
 meta and configuration will be written out. To make the pipeline more convenient
 to deploy, we recommend wrapping it as a [Python package](/api/cli#package).
 
-<Accordion title="What’s the difference between the config.cfg and meta.json?" spaced id="models-meta-vs-config">
+<Accordion title="What’s the difference between the config.cfg and meta.json?" id="models-meta-vs-config" spaced>
 
 When you save a pipeline in spaCy v3.0+, two files will be exported: a
 [`config.cfg`](/api/data-formats#config) based on
@@ -596,6 +596,15 @@ based on [`nlp.meta`](/api/language#meta).
 
 </Accordion>
 
+<Project id="pipelines/tagger_parser_ud">
+
+The easiest way to get started with an end-to-end workflow is to clone a
+[project template](/usage/projects) and run it – for example, this template that
+lets you train a **part-of-speech tagger** and **dependency parser** on a
+Universal Dependencies treebank and generates an installable Python package.
+
+</Project>
+
 ### Generating a pipeline package {#models-generating}
 
 <Infobox title="Important note" variant="warning">
@@ -699,5 +708,3 @@ class and call [`from_disk`](/api/language#from_disk) instead.
 ```python
 nlp = spacy.blank("en").from_disk("/path/to/data")
 ```
-
-<!-- TODO: point to spaCy projects? -->
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index c0f4caad7..6e9de62c5 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -92,7 +92,7 @@ spaCy's binary `.spacy` format. You can either include the data paths in the
 $ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
 ```
 
-<Accordion title="How are the config recommendations generated?" id="quickstart-source">
+<Accordion title="How are the config recommendations generated?" id="quickstart-source" spaced>
 
 The recommended config settings generated by the quickstart widget and the
 [`init config`](/api/cli#init-config) command are based on some general **best
@@ -112,6 +112,15 @@ as we run more experiments.
 
 </Accordion>
 
+<Project id="pipelines/tagger_parser_ud">
+
+The easiest way to get started is to clone a [project template](/usage/projects)
+and run it – for example, this end-to-end template that lets you train a
+**part-of-speech tagger** and **dependency parser** on a Universal Dependencies
+treebank.
+
+</Project>
+
 ## Training config {#config}
 
 Training config files include all **settings and hyperparameters** for training
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 24babc9bd..5abeb5707 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -176,18 +176,16 @@ freely combine implementations from different frameworks into a single model.
 
 ### Manage end-to-end workflows with projects {#features-projects}
 
-<!-- TODO: update example -->
-
 > #### Example
 >
 > ```cli
 > # Clone a project template
-> $ python -m spacy project clone example
-> $ cd example
+> $ python -m spacy project clone pipelines/tagger_parser_ud
+> $ cd tagger_parser_ud
 > # Download data assets
 > $ python -m spacy project assets
 > # Run a workflow
-> $ python -m spacy project run train
+> $ python -m spacy project run all
 > ```
 
 spaCy projects let you manage and share **end-to-end spaCy workflows** for
@@ -207,14 +205,6 @@ data, [Streamlit](/usage/projects#streamlit) for building interactive apps,
 [Ray](/usage/projects#ray) for parallel training,
 [Weights & Biases](/usage/projects#wandb) for experiment tracking, and more!
 
-<!-- <Project id="some_example_project">
-
-The easiest way to get started with an end-to-end training process is to clone a
-[project](/usage/projects) template. Projects let you manage multi-step
-workflows, from data preprocessing to training and packaging your pipeline.
-
-</Project>-->
-
 <Infobox title="Details & Documentation" emoji="📖" list>
 
 - **Usage:** [spaCy projects](/usage/projects),
@@ -224,6 +214,15 @@ workflows, from data preprocessing to training and packaging your pipeline.
 
 </Infobox>
 
+<Project id="pipelines/tagger_parser_ud">
+
+The easiest way to get started is to clone a [project template](/usage/projects)
+and run it – for example, this end-to-end template that lets you train a
+**part-of-speech tagger** and **dependency parser** on a Universal Dependencies
+treebank.
+
+</Project>
+
 ### Parallel and distributed training with Ray {#features-parallel-training}
 
 > #### Example
@@ -875,7 +874,14 @@ values. You can then use the auto-generated `config.cfg` for training:
 + python -m spacy train ./config.cfg --output ./output
 ```
 
-<!-- TODO: project template -->
+<Project id="pipelines/tagger_parser_ud">
+
+The easiest way to get started is to clone a [project template](/usage/projects)
+and run it – for example, this end-to-end template that lets you train a
+**part-of-speech tagger** and **dependency parser** on a Universal Dependencies
+treebank.
+
+</Project>
 
 #### Training via the Python API {#migrating-training-python}
 
diff --git a/website/meta/site.json b/website/meta/site.json
index 1955932b9..1a96ca660 100644
--- a/website/meta/site.json
+++ b/website/meta/site.json
@@ -12,6 +12,7 @@
     "companyUrl": "https://explosion.ai",
     "repo": "explosion/spaCy",
     "modelsRepo": "explosion/spacy-models",
+    "projectsRepo": "explosion/projects/tree/v3",
     "social": {
         "twitter": "spacy_io",
         "github": "explosion"
diff --git a/website/src/components/tag.js b/website/src/components/tag.js
index 3f2b4e994..b406e771e 100644
--- a/website/src/components/tag.js
+++ b/website/src/components/tag.js
@@ -13,7 +13,7 @@ export default function Tag({ spaced = false, variant, tooltip, children }) {
         const isValid = isString(children) && !isNaN(children)
         const version = isValid ? Number(children).toFixed(1) : children
         const tooltipText = `This feature is new and was introduced in spaCy v${version}`
-        // TODO: we probably want to handle this more elegantly, but the idea is
+        // We probably want to handle this more elegantly, but the idea is
         // that we can hide tags referring to old versions
         const major = isString(version) ? Number(version.split('.')[0]) : version
         return major < MIN_VERSION ? null : (
diff --git a/website/src/components/util.js b/website/src/components/util.js
index 3d86cf37e..be55f0bb3 100644
--- a/website/src/components/util.js
+++ b/website/src/components/util.js
@@ -10,6 +10,7 @@ const htmlToReactParser = new HtmlToReactParser()
 const DEFAULT_BRANCH = 'develop'
 export const repo = siteMetadata.repo
 export const modelsRepo = siteMetadata.modelsRepo
+export const projectsRepo = siteMetadata.projectsRepo
 
 /**
  * This is used to provide selectors for headings so they can be crawled by
diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index 41b009010..2e75c893a 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -222,10 +222,11 @@ const Landing = ({ data }) => {
                     <br />
                     <br />
                     <br />
-                    {/** TODO: update with actual example */}
-                    <Project id="some_example">
-                        Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
-                        sodales lectus.
+                    <Project id="pipelines/tagger_parser_ud" title="Get started">
+                        The easiest way to get started is to clone a project template and run it
+                        – for example, this template for training a{' '}
+                        <strong>part-of-speech tagger</strong> and{' '}
+                        <strong>dependency parser</strong> on a Universal Dependencies treebank.
                     </Project>
                 </LandingCol>
                 <LandingCol>
diff --git a/website/src/widgets/project.js b/website/src/widgets/project.js
index 0bd74bc90..8d309394d 100644
--- a/website/src/widgets/project.js
+++ b/website/src/widgets/project.js
@@ -4,25 +4,29 @@ import CopyInput from '../components/copy'
 import Infobox from '../components/infobox'
 import Link from '../components/link'
 import { InlineCode } from '../components/code'
+import { projectsRepo } from '../components/util'
 
-// TODO: move to meta?
-const DEFAULT_REPO = 'https://github.com/explosion/projects/tree/v3'
 const COMMAND = 'python -m spacy project clone'
 
-export default function Project({ id, repo, children }) {
+export default function Project({
+    title = 'Get started with a project template',
+    id,
+    repo,
+    children,
+}) {
     const repoArg = repo ? ` --repo ${repo}` : ''
     const text = `${COMMAND} ${id}${repoArg}`
-    const url = `${repo || DEFAULT_REPO}/${id}`
-    const title = (
+    const url = `${repo || projectsRepo}/${id}`
+    const header = (
         <>
-            Get started with a project template:{' '}
+            {title}:{' '}
             <Link to={url}>
                 <InlineCode>{id}</InlineCode>
             </Link>
         </>
     )
     return (
-        <Infobox title={title} emoji="🪐">
+        <Infobox title={header} emoji="🪐">
             {children}
             <CopyInput text={text} prefix="$" />
         </Infobox>

From b9d2b29684c051f956ec808705a2e7288ccf27dd Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 20 Sep 2020 17:49:09 +0200
Subject: [PATCH 061/516] Update docs [ci skip]

---
 website/src/styles/copy.module.sass | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/website/src/styles/copy.module.sass b/website/src/styles/copy.module.sass
index c6d2f68cb..3a942552d 100644
--- a/website/src/styles/copy.module.sass
+++ b/website/src/styles/copy.module.sass
@@ -15,6 +15,10 @@
     background: transparent
     resize: none
     font: inherit
+    overflow: hidden
+    white-space: nowrap
+    text-overflow: ellipsis
+    margin-right: 1rem
 
 .prefix
     margin-right: 0.75em

From 3aa57ce6c9ab162715cad72563b25f5aecb28966 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 21 Sep 2020 09:07:20 +0200
Subject: [PATCH 062/516] Update alignment mode in Doc.char_span docs

---
 website/docs/api/doc.md | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 380f6a172..44316ea1e 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -187,8 +187,8 @@ Remove a previously registered extension.
 ## Doc.char_span {#char_span tag="method" new="2"}
 
 Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
-`None` if the character indices don't map to a valid span using the default mode
-`"strict".
+`None` if the character indices don't map to a valid span using the default
+alignment mode `"strict"`.
 
 > #### Example
 >
@@ -198,15 +198,15 @@ Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
 > assert span.text == "New York"
 > ```
 
-| Name                                 | Description                                                                                                                                                                                                                                                                 |
-| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `start`                              | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                       |
-| `end`                                | The index of the last character after the span. ~int~~                                                                                                                                                                                                                      |
-| `label`                              | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                 |
-| `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                   |
-| `vector`                             | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                              |
-| `mode`                               | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"inside"` (span of all tokens completely within the character span), `"outside"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
-| **RETURNS**                          | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                  |
+| Name                                 | Description                                                                                                                                                                                                                                                                  |
+| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `start`                              | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        |
+| `end`                                | The index of the last character after the span. ~int~~                                                                                                                                                                                                                       |
+| `label`                              | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  |
+| `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    |
+| `vector`                             | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               |
+| `alignment_mode`                     | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
+| **RETURNS**                          | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |
 
 ## Doc.similarity {#similarity tag="method" model="vectors"}
 

From cc71ec901f26ae1c3bfb62b6bd776295200f418e Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 21 Sep 2020 09:08:55 +0200
Subject: [PATCH 063/516] Fix typo in saving and loading usage docs

---
 website/docs/usage/saving-loading.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index 3a95bf6aa..06fb18591 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -299,9 +299,10 @@ installed in the same environment – that's it.
 
 When you load a pipeline, spaCy will generally use its `config.cfg` to set up
 the language class and construct the pipeline. The pipeline is specified as a
-list of strings, e.g. `pipeline = ["tagger", "paser", "ner"]`. For each of those
-strings, spaCy will call `nlp.add_pipe` and look up the name in all factories
-defined by the decorators [`@Language.component`](/api/language#component) and
+list of strings, e.g. `pipeline = ["tagger", "parser", "ner"]`. For each of
+those strings, spaCy will call `nlp.add_pipe` and look up the name in all
+factories defined by the decorators
+[`@Language.component`](/api/language#component) and
 [`@Language.factory`](/api/language#factory). This means that you have to import
 your custom components _before_ loading the pipeline.
 

From 9d32cac736da47351e3f38f961aae2fc9e591401 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 21 Sep 2020 10:55:36 +0200
Subject: [PATCH 064/516] Update docs [ci skip]

---
 website/docs/usage/projects.md | 12 ++++++++----
 website/docs/usage/training.md |  8 ++++++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index f8d5a3761..95e20525a 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -921,6 +921,14 @@ package is installed in the same environment as spaCy, it will automatically add
 [parallel training](/usage/training#parallel-training) for more details on how
 it works under the hood.
 
+<Project id="integrations/ray">
+
+Get started with parallel training using our project template. It trains a
+simple model on a Universal Dependencies Treebank and lets you parallelize the
+training with Ray.
+
+</Project>
+
 You can integrate [`spacy ray train`](/api/cli#ray-train) into your
 `project.yml` just like the regular training command and pass it the config, and
 optional output directory or remote storage URL and config overrides if needed.
@@ -940,10 +948,6 @@ commands:
       - "training/model-best"
 ```
 
-<!-- TODO: <Project id="integrations/ray">
-
-</Project> -->
-
 ---
 
 ### Weights & Biases {#wandb} <IntegrationLogo name="wandb" width={175} height="auto" align="right" />
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 6e9de62c5..071434162 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -895,9 +895,13 @@ cluster. If it's not set, Ray will run locally.
 python -m spacy ray train config.cfg --n-workers 2
 ```
 
-<!-- TODO: <Project id="integrations/ray">
+<Project id="integrations/ray">
 
-</Project> -->
+Get started with parallel training using our project template. It trains a
+simple model on a Universal Dependencies Treebank and lets you parallelize the
+training with Ray.
+
+</Project>
 
 ### How parallel training works {#parallel-training-details}
 

From 1114219ae3034a9bec070967cdbf03001ea747d8 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 21 Sep 2020 10:59:07 +0200
Subject: [PATCH 065/516] Tidy up and auto-format

---
 spacy/cli/_util.py                              |  3 +--
 spacy/language.py                               |  8 ++------
 spacy/ml/models/tok2vec.py                      | 16 +++++++++++-----
 spacy/schemas.py                                |  4 ++--
 spacy/tests/doc/test_span.py                    |  7 ++++++-
 spacy/tests/parser/test_parse_navigate.py       |  7 ++++++-
 spacy/tests/pipeline/test_pipe_factories.py     | 15 +++------------
 spacy/tests/regression/test_issue1501-2000.py   | 12 ++++++++++--
 .../tests/serialize/test_serialize_pipeline.py  |  8 +++++++-
 spacy/tests/test_cli.py                         |  1 -
 spacy/tests/test_language.py                    |  3 +--
 spacy/tests/test_util.py                        |  2 +-
 spacy/tests/training/test_readers.py            | 17 ++++++++---------
 spacy/tests/training/test_training.py           | 12 +++++++++++-
 14 files changed, 69 insertions(+), 46 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index c67863ef1..040434c05 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -6,7 +6,6 @@ from wasabi import msg
 import srsly
 import hashlib
 import typer
-import subprocess
 from click import NoSuchOption
 from typer.main import get_command
 from contextlib import contextmanager
@@ -327,7 +326,7 @@ def git_checkout(
         )
     with make_tempdir() as tmp_dir:
         cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
-        ret = run_command(cmd, capture=True)
+        run_command(cmd, capture=True)
         # We need Path(name) to make sure we also support subdirectories
         shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
 
diff --git a/spacy/language.py b/spacy/language.py
index 7d463731a..4dffd9679 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -156,11 +156,7 @@ class Language:
             raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
-            vocab = create_vocab(
-                self.lang,
-                self.Defaults,
-                vectors_name=vectors_name,
-            )
+            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -1462,7 +1458,7 @@ class Language:
         # here :(
         for i, (name1, proc1) in enumerate(self.pipeline):
             if hasattr(proc1, "find_listeners"):
-                for name2, proc2 in self.pipeline[i+1:]:
+                for name2, proc2 in self.pipeline[i + 1 :]:
                     if isinstance(getattr(proc2, "model", None), Model):
                         proc1.find_listeners(proc2.model)
 
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 7ced4bd04..fec478e21 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -164,7 +164,9 @@ def MultiHashEmbed(
 
 
 @registry.architectures.register("spacy.CharacterEmbed.v1")
-def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool):
+def CharacterEmbed(
+    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
+):
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
     each word, taken from the beginning and end of the word equally. Padding is
@@ -202,9 +204,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
                 ),
                 StaticVectors(width, dropout=0.0),
             ),
-            with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)),
+            with_array(
+                Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)
+            ),
             ragged2list(),
-    )
+        )
     else:
         model = chain(
             concatenate(
@@ -215,9 +219,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),
             ),
-            with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
+            with_array(
+                Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)
+            ),
             ragged2list(),
-    )
+        )
     return model
 
 
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 60655da8c..b0f26dcd7 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type, Tuple
+from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
 from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
@@ -255,7 +255,7 @@ class ConfigSchemaPretrain(BaseModel):
     batcher: Batcher = Field(..., title="Batcher for the training data")
     component: str = Field(..., title="Component to find the layer to pretrain")
     layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
- 
+
     # TODO: use a more detailed schema for this?
     objective: Dict[str, Any] = Field(..., title="Pretraining objective")
     # fmt: on
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index ad4f49042..0c538a0eb 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -92,7 +92,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
 def test_spans_lca_matrix(en_tokenizer):
     """Test span's lca matrix generation"""
     tokens = en_tokenizer("the lazy dog slept")
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4)
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=[2, 1, 1, 0],
+        deps=["dep"] * 4,
+    )
     lca = doc[:2].get_lca_matrix()
     assert lca.shape == (2, 2)
     assert lca[0, 0] == 0  # the & the -> the
diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py
index db1e98ba0..f181a799a 100644
--- a/spacy/tests/parser/test_parse_navigate.py
+++ b/spacy/tests/parser/test_parse_navigate.py
@@ -63,7 +63,12 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
 
 def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads))
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=heads,
+        deps=["dep"] * len(heads),
+    )
 
     lefts = {}
     rights = {}
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index 1cf06d97f..881460704 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -345,10 +345,7 @@ def test_language_factories_invalid():
             [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
             {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
         ),
-        (
-            [{"a": 0.5, "b": 0.5}, {"b": 1.0}],
-            {"a": 0.25, "b": 0.75},
-        ),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
     ],
 )
 def test_language_factories_combine_score_weights(weights, expected):
@@ -363,16 +360,10 @@ def test_language_factories_scores():
     weights1 = {"a1": 0.5, "a2": 0.5}
     weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
     Language.factory(
-        f"{name}1",
-        scores=list(weights1),
-        default_score_weights=weights1,
-        func=func,
+        f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
     )
     Language.factory(
-        f"{name}2",
-        scores=list(weights2),
-        default_score_weights=weights2,
-        func=func,
+        f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
     )
     meta1 = Language.get_factory_meta(f"{name}1")
     assert meta1.default_score_weights == weights1
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index e226c8524..71ed2ea03 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -212,9 +212,17 @@ def test_issue1834():
         heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
         deps=["dep"] * len(words),
     )
-    print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc])
+    print(
+        doc.has_annotation("DEP"),
+        [t.head.i for t in doc],
+        [t.is_sent_start for t in doc],
+    )
     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
-    print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc])
+    print(
+        new_doc.has_annotation("DEP"),
+        [t.head.i for t in new_doc],
+        [t.is_sent_start for t in new_doc],
+    )
     assert new_doc[6].sent_start
     assert new_doc.has_annotation("DEP")
     assert new_doc.has_annotation("TAG")
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index eedad31e0..d1c4553be 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -136,7 +136,13 @@ def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     cfg = {"model": DEFAULT_TEXTCAT_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
-    textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5, positive_label=None)
+    textcat = TextCategorizer(
+        en_vocab,
+        model,
+        labels=["ENTITY", "ACTION", "MODIFIER"],
+        threshold=0.5,
+        positive_label=None,
+    )
     textcat.to_bytes(exclude=["vocab"])
 
 
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 0a2300455..422ae74b4 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -3,7 +3,6 @@ from click import NoSuchOption
 
 from spacy.training import docs_to_json, biluo_tags_from_offsets
 from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
-from spacy.lang.en import English
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 2a24d368a..da46ad424 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -291,8 +291,7 @@ def test_spacy_blank():
 
 
 @pytest.mark.parametrize(
-    "value",
-    [False, None, ["x", "y"], Language, Vocab],
+    "value", [False, None, ["x", "y"], Language, Vocab],
 )
 def test_language_init_invalid_vocab(value):
     err_fragment = "invalid value"
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
index 8c931d31e..1668991cd 100644
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@@ -95,7 +95,7 @@ def test_util_dot_section():
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
     assert en_config["nlp"]["pipeline"] == ["textcat"]
-    assert nl_config["nlp"]["pipeline"] == [] # default value []
+    assert nl_config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
         dot_to_object(en_config, "nlp.pipeline.tagger")
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index 898746c2a..d20a032e8 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -1,7 +1,6 @@
 from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config
-
 from spacy import Language
 from spacy.util import load_model_from_config, registry, dot_to_object
 from spacy.training import Example
@@ -10,19 +9,19 @@ from spacy.training import Example
 def test_readers():
     config_string = """
     [training]
-    
+
     [corpora]
     @readers = "myreader.v1"
 
     [nlp]
     lang = "en"
     pipeline = ["tok2vec", "textcat"]
-    
+
     [components]
-    
+
     [components.tok2vec]
     factory = "tok2vec"
-    
+
     [components.textcat]
     factory = "textcat"
     """
@@ -69,19 +68,19 @@ def test_readers():
 def test_cat_readers(reader, additional_config):
     nlp_config_string = """
     [training]
-    
+
     [corpora]
     @readers = "PLACEHOLDER"
 
     [nlp]
     lang = "en"
     pipeline = ["tok2vec", "textcat"]
-    
+
     [components]
-    
+
     [components.tok2vec]
     factory = "tok2vec"
-    
+
     [components.textcat]
     factory = "textcat"
     """
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 1d3c72a8b..b09487965 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -34,7 +34,17 @@ def doc():
     # fmt: on
     nlp = English()
     words = [t.text for t in nlp.make_doc(text)]
-    doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents)
+    doc = get_doc(
+        nlp.vocab,
+        words=words,
+        tags=tags,
+        pos=pos,
+        morphs=morphs,
+        heads=heads,
+        deps=deps,
+        lemmas=lemmas,
+        ents=ents,
+    )
     doc.cats = cats
     return doc
 

From 5497acf49aef93a1d6d451da11cc9f3d2841b345 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 21 Sep 2020 11:25:10 +0200
Subject: [PATCH 066/516] Support config overrides via environment variables

---
 spacy/cli/_util.py      | 58 ++++++++++++++++++++++++++++++++---------
 spacy/tests/test_cli.py | 16 ++++++++++--
 2 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 040434c05..0159dd473 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -11,9 +11,10 @@ from typer.main import get_command
 from contextlib import contextmanager
 from thinc.config import Config, ConfigValidationError
 from configparser import InterpolationError
+import os
 
 from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry
+from ..util import import_file, run_command, make_tempdir, registry, logger
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -61,16 +62,38 @@ def setup_cli() -> None:
     command(prog_name=COMMAND)
 
 
-def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
+def parse_config_env_overrides(
+    *, prefix: str = "SPACY_CONFIG_", dot: str = "__"
+) -> Dict[str, Any]:
+    """Generate a dictionary of config overrides based on environment variables,
+    e.g. SPACY_CONFIG_TRAINING__BATCH_SIZE=123 overrides the training.batch_size
+    setting.
+
+    prefix (str): The env variable prefix for config overrides.
+    dot (str): String used to represent the "dot", e.g. in training.batch_size.
+    RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
+    """
+    result = {}
+    for env_key, value in os.environ.items():
+        if env_key.startswith(prefix):
+            opt = env_key[len(prefix) :].lower().replace(dot, ".")
+            if "." in opt:
+                result[opt] = try_json_loads(value)
+    return result
+
+
+def parse_config_overrides(args: List[str], env_vars: bool = True) -> Dict[str, Any]:
     """Generate a dictionary of config overrides based on the extra arguments
     provided on the CLI, e.g. --training.batch_size to override
     "training.batch_size". Arguments without a "." are considered invalid,
     since the config only allows top-level sections to exist.
 
     args (List[str]): The extra arguments from the command line.
+    env_vars (bool): Include environment variables.
     RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
     """
-    result = {}
+    env_overrides = parse_config_env_overrides() if env_vars else {}
+    cli_overrides = {}
     while args:
         opt = args.pop(0)
         err = f"Invalid CLI argument '{opt}'"
@@ -87,18 +110,27 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
                     value = "true"
                 else:
                     value = args.pop(0)
-            # Just like we do in the config, we're calling json.loads on the
-            # values. But since they come from the CLI, it'd be unintuitive to
-            # explicitly mark strings with escaped quotes. So we're working
-            # around that here by falling back to a string if parsing fails.
-            # TODO: improve logic to handle simple types like list of strings?
-            try:
-                result[opt] = srsly.json_loads(value)
-            except ValueError:
-                result[opt] = str(value)
+            if opt not in env_overrides:
+                cli_overrides[opt] = try_json_loads(value)
         else:
             msg.fail(f"{err}: override option should start with --", exits=1)
-    return result
+    if cli_overrides:
+        logger.debug(f"Config overrides from CLI: {list(cli_overrides)}")
+    if env_overrides:
+        logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
+    return {**cli_overrides, **env_overrides}
+
+
+def try_json_loads(value: Any) -> Any:
+    # Just like we do in the config, we're calling json.loads on the
+    # values. But since they come from the CLI, it'd be unintuitive to
+    # explicitly mark strings with escaped quotes. So we're working
+    # around that here by falling back to a string if parsing fails.
+    # TODO: improve logic to handle simple types like list of strings?
+    try:
+        return srsly.json_loads(value)
+    except ValueError:
+        return str(value)
 
 
 def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 422ae74b4..d81437f18 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,15 +1,15 @@
 import pytest
 from click import NoSuchOption
-
 from spacy.training import docs_to_json, biluo_tags_from_offsets
 from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
-from spacy.cli._util import string_to_list
+from spacy.cli._util import string_to_list, parse_config_env_overrides
 from thinc.config import ConfigValidationError
 import srsly
+import os
 
 from .util import make_tempdir
 
@@ -341,6 +341,18 @@ def test_parse_config_overrides_invalid_2(args):
         parse_config_overrides(args)
 
 
+def test_parse_cli_overrides():
+    prefix = "SPACY_CONFIG_"
+    dot = "__"
+    os.environ[f"{prefix}TRAINING{dot}BATCH_SIZE"] = "123"
+    os.environ[f"{prefix}FOO{dot}BAR{dot}BAZ"] = "hello"
+    os.environ[prefix] = "bad"
+    result = parse_config_env_overrides(prefix=prefix, dot=dot)
+    assert len(result) == 2
+    assert result["training.batch_size"] == 123
+    assert result["foo.bar.baz"] == "hello"
+
+
 @pytest.mark.parametrize("lang", ["en", "nl"])
 @pytest.mark.parametrize(
     "pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]

From 758ead8a476fa5f5e55c64c3c4bd242c7cb83d1e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 21 Sep 2020 12:50:13 +0200
Subject: [PATCH 067/516] Sync overrides with CLI overrides

---
 spacy/cli/_util.py      | 80 ++++++++++++++++++-----------------------
 spacy/tests/test_cli.py | 26 ++++++++------
 2 files changed, 50 insertions(+), 56 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 0159dd473..0dd2ee380 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -7,6 +7,7 @@ import srsly
 import hashlib
 import typer
 from click import NoSuchOption
+from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
 from thinc.config import Config, ConfigValidationError
@@ -38,6 +39,7 @@ commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
 INIT_HELP = """Commands for initializing configs and pipeline packages."""
+OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"
 
 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
@@ -62,46 +64,41 @@ def setup_cli() -> None:
     command(prog_name=COMMAND)
 
 
-def parse_config_env_overrides(
-    *, prefix: str = "SPACY_CONFIG_", dot: str = "__"
+def parse_config_overrides(
+    args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
 ) -> Dict[str, Any]:
-    """Generate a dictionary of config overrides based on environment variables,
-    e.g. SPACY_CONFIG_TRAINING__BATCH_SIZE=123 overrides the training.batch_size
-    setting.
-
-    prefix (str): The env variable prefix for config overrides.
-    dot (str): String used to represent the "dot", e.g. in training.batch_size.
-    RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
-    """
-    result = {}
-    for env_key, value in os.environ.items():
-        if env_key.startswith(prefix):
-            opt = env_key[len(prefix) :].lower().replace(dot, ".")
-            if "." in opt:
-                result[opt] = try_json_loads(value)
-    return result
-
-
-def parse_config_overrides(args: List[str], env_vars: bool = True) -> Dict[str, Any]:
     """Generate a dictionary of config overrides based on the extra arguments
     provided on the CLI, e.g. --training.batch_size to override
     "training.batch_size". Arguments without a "." are considered invalid,
     since the config only allows top-level sections to exist.
 
-    args (List[str]): The extra arguments from the command line.
-    env_vars (bool): Include environment variables.
+    env_var (Optional[str]): Optional environment variable to read from.
     RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
     """
-    env_overrides = parse_config_env_overrides() if env_vars else {}
-    cli_overrides = {}
+    env_string = os.environ.get(env_var, "") if env_var else ""
+    env_overrides = _parse_overrides(split_arg_string(env_string))
+    cli_overrides = _parse_overrides(args, is_cli=True)
+    if cli_overrides:
+        keys = [k for k in cli_overrides if k not in env_overrides]
+        logger.debug(f"Config overrides from CLI: {keys}")
+    if env_overrides:
+        logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
+    return {**cli_overrides, **env_overrides}
+
+
+def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
+    result = {}
     while args:
         opt = args.pop(0)
-        err = f"Invalid CLI argument '{opt}'"
+        err = f"Invalid config override '{opt}'"
         if opt.startswith("--"):  # new argument
             orig_opt = opt
             opt = opt.replace("--", "")
             if "." not in opt:
-                raise NoSuchOption(orig_opt)
+                if is_cli:
+                    raise NoSuchOption(orig_opt)
+                else:
+                    msg.fail(f"{err}: can't override top-level sections", exits=1)
             if "=" in opt:  # we have --opt=value
                 opt, value = opt.split("=", 1)
                 opt = opt.replace("-", "_")
@@ -110,27 +107,18 @@ def parse_config_overrides(args: List[str], env_vars: bool = True) -> Dict[str,
                     value = "true"
                 else:
                     value = args.pop(0)
-            if opt not in env_overrides:
-                cli_overrides[opt] = try_json_loads(value)
+            # Just like we do in the config, we're calling json.loads on the
+            # values. But since they come from the CLI, it'd be unintuitive to
+            # explicitly mark strings with escaped quotes. So we're working
+            # around that here by falling back to a string if parsing fails.
+            # TODO: improve logic to handle simple types like list of strings?
+            try:
+                result[opt] = srsly.json_loads(value)
+            except ValueError:
+                result[opt] = str(value)
         else:
-            msg.fail(f"{err}: override option should start with --", exits=1)
-    if cli_overrides:
-        logger.debug(f"Config overrides from CLI: {list(cli_overrides)}")
-    if env_overrides:
-        logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
-    return {**cli_overrides, **env_overrides}
-
-
-def try_json_loads(value: Any) -> Any:
-    # Just like we do in the config, we're calling json.loads on the
-    # values. But since they come from the CLI, it'd be unintuitive to
-    # explicitly mark strings with escaped quotes. So we're working
-    # around that here by falling back to a string if parsing fails.
-    # TODO: improve logic to handle simple types like list of strings?
-    try:
-        return srsly.json_loads(value)
-    except ValueError:
-        return str(value)
+            msg.fail(f"{err}: name should start with --", exits=1)
+    return result
 
 
 def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index d81437f18..a9c9d8ca5 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -6,7 +6,7 @@ from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
-from spacy.cli._util import string_to_list, parse_config_env_overrides
+from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
 from thinc.config import ConfigValidationError
 import srsly
 import os
@@ -342,15 +342,21 @@ def test_parse_config_overrides_invalid_2(args):
 
 
 def test_parse_cli_overrides():
-    prefix = "SPACY_CONFIG_"
-    dot = "__"
-    os.environ[f"{prefix}TRAINING{dot}BATCH_SIZE"] = "123"
-    os.environ[f"{prefix}FOO{dot}BAR{dot}BAZ"] = "hello"
-    os.environ[prefix] = "bad"
-    result = parse_config_env_overrides(prefix=prefix, dot=dot)
-    assert len(result) == 2
-    assert result["training.batch_size"] == 123
-    assert result["foo.bar.baz"] == "hello"
+    os.environ[OVERRIDES_ENV_VAR] = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
+    result = parse_config_overrides([])
+    assert len(result) == 4
+    assert result["x.foo"] == "bar"
+    assert result["x.bar"] == 12
+    assert result["x.baz"] is False
+    assert result["y.foo"] == "hello"
+    os.environ[OVERRIDES_ENV_VAR] = "--x"
+    assert parse_config_overrides([], env_var=None) == {}
+    with pytest.raises(SystemExit):
+        parse_config_overrides([])
+    os.environ[OVERRIDES_ENV_VAR] = "hello world"
+    with pytest.raises(SystemExit):
+        parse_config_overrides([])
+    del os.environ[OVERRIDES_ENV_VAR]
 
 
 @pytest.mark.parametrize("lang", ["en", "nl"])

From bc02e864943a790cfc7ec991c67d20cc774417df Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 21 Sep 2020 13:01:26 +0200
Subject: [PATCH 068/516] Extend Doc.__init__ with additional annotation

Mostly copying from `spacy.tests.util.get_doc`, add additional kwargs to
`Doc.__init__` to initialize the most common doc/token values.
---
 spacy/errors.py                          |  5 +-
 spacy/tests/util.py                      | 60 ++----------------
 spacy/tokens/doc.pyx                     | 77 ++++++++++++++++++++++--
 spacy/training/converters/conllu2docs.py | 35 ++++++-----
 website/docs/api/doc.md                  | 19 ++++--
 5 files changed, 118 insertions(+), 78 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 81e3616be..f219496a5 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -57,7 +57,10 @@ class Warnings:
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
-    W026 = ("Unable to set all sentence boundaries from dependency parses.")
+    W026 = ("Unable to set all sentence boundaries from dependency parses. If "
+            "you are constructing a parse tree incrementally by setting "
+            "token.head values, you can probably ignore this warning. Consider "
+            "using Doc(words, ..., heads=heads, deps=deps) instead.")
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
             "smaller JSON files instead.")
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index 741753c89..7bc32bf34 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -30,60 +30,12 @@ def get_doc(
     morphs=None,
 ):
     """Create Doc object from given vocab, words and annotations."""
-    if deps and not heads:
-        heads = [0] * len(deps)
-    headings = []
-    values = []
-    annotations = [pos, heads, deps, lemmas, tags, morphs]
-    possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
-    for a, annot in enumerate(annotations):
-        if annot is not None:
-            if len(annot) != len(words):
-                raise ValueError(Errors.E189)
-            headings.append(possible_headings[a])
-            if annot is not heads:
-                values.extend(annot)
-    for value in values:
-        vocab.strings.add(value)
-
-    doc = Doc(vocab, words=words)
-
-    # if there are any other annotations, set them
-    if headings:
-        attrs = doc.to_array(headings)
-
-        j = 0
-        for annot in annotations:
-            if annot:
-                if annot is heads:
-                    for i in range(len(words)):
-                        if attrs.ndim == 1:
-                            attrs[i] = heads[i]
-                        else:
-                            attrs[i, j] = heads[i]
-                elif annot is morphs:
-                    for i in range(len(words)):
-                        morph_key = vocab.morphology.add(morphs[i])
-                        if attrs.ndim == 1:
-                            attrs[i] = morph_key
-                        else:
-                            attrs[i, j] = morph_key
-                else:
-                    for i in range(len(words)):
-                        if attrs.ndim == 1:
-                            attrs[i] = doc.vocab.strings[annot[i]]
-                        else:
-                            attrs[i, j] = doc.vocab.strings[annot[i]]
-                j += 1
-        doc.from_array(headings, attrs)
-
-    # finally, set the entities
-    if ents:
-        doc.ents = [
-            Span(doc, start, end, label=doc.vocab.strings[label])
-            for start, end, label in ents
-        ]
-    return doc
+    if heads is not None:
+        heads = [i + head for i, head in enumerate(heads)]
+    if ents is not None:
+        ents = [(vocab.strings[ent_type], start, end) for start, end, ent_type in ents]
+    return Doc(vocab, words=words, pos=pos, heads=heads, deps=deps, tags=tags,
+        ents=ents, lemmas=lemmas, morphs=morphs)
 
 
 def get_batch(batch_size):
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 2d9de278b..de7e0f862 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -158,17 +158,38 @@ cdef class Doc:
             raise ValueError(Errors.E046.format(name=name))
         return Underscore.doc_extensions.pop(name)
 
-    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
+    def __init__(
+        self,
+        Vocab vocab,
+        words=None,
+        spaces=None,
+        user_data=None,
+        *,
+        tags=None,
+        pos=None,
+        morphs=None,
+        lemmas=None,
+        heads=None,
+        deps=None,
+        ents=None,
+    ):
         """Create a Doc object.
 
         vocab (Vocab): A vocabulary object, which must match any models you
             want to use (e.g. tokenizer, parser, entity recognizer).
-        words (list or None): A list of unicode strings to add to the document
+        words (Optional[List[str]]): A list of unicode strings to add to the document
             as words. If `None`, defaults to empty list.
-        spaces (list or None): A list of boolean values, of the same length as
+        spaces (Optional[List[bool]]): A list of boolean values, of the same length as
             words. True means that the word is followed by a space, False means
             it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
+        tags (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.tag. Defaults to None.
+        pos (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.pos. Defaults to None.
+        morphs (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.morph. Defaults to None.
+        lemmas (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.lemma. Defaults to None.
+        heads (Optional[List[int]]): A list of values, of the same length as words, to assign as heads. Head indices are the position of the head in the doc. Defaults to None.
+        deps (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.dep. Defaults to None.
+        ents (Optional[List[Span]]): A list of spans to assign as doc.ents. Defaults to None.
 
         DOCS: https://nightly.spacy.io/api/doc#init
         """
@@ -217,6 +238,55 @@ cdef class Doc:
                 lexeme = self.vocab.get_by_orth(self.mem, word)
             self.push_back(lexeme, has_space)
 
+        if heads is not None:
+            heads = [head - i for i, head in enumerate(heads)]
+        if deps and not heads:
+            heads = [0] * len(deps)
+        headings = []
+        values = []
+        annotations = [pos, heads, deps, lemmas, tags, morphs]
+        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
+        for a, annot in enumerate(annotations):
+            if annot is not None:
+                if len(annot) != len(words):
+                    raise ValueError(Errors.E189)
+                headings.append(possible_headings[a])
+                if annot is not heads:
+                    values.extend(annot)
+        for value in values:
+            self.vocab.strings.add(value)
+
+        # if there are any other annotations, set them
+        if headings:
+            attrs = self.to_array(headings)
+
+            j = 0
+            for annot in annotations:
+                if annot:
+                    if annot is heads:
+                        for i in range(len(words)):
+                            if attrs.ndim == 1:
+                                attrs[i] = heads[i]
+                            else:
+                                attrs[i, j] = heads[i]
+                    elif annot is morphs:
+                        for i in range(len(words)):
+                            morph_key = vocab.morphology.add(morphs[i])
+                            if attrs.ndim == 1:
+                                attrs[i] = morph_key
+                            else:
+                                attrs[i, j] = morph_key
+                    else:
+                        for i in range(len(words)):
+                            if attrs.ndim == 1:
+                                attrs[i] = self.vocab.strings[annot[i]]
+                            else:
+                                attrs[i, j] = self.vocab.strings[annot[i]]
+                    j += 1
+            self.from_array(headings, attrs)
+        if ents is not None:
+            self.ents = ents
+
     @property
     def _(self):
         """Custom extension attributes registered via `set_extension`."""
@@ -1344,7 +1414,6 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
         if tokens[i].head == 0:
             tokens[tokens[i].l_edge].sent_start = 1
 
-
 cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1:
     # May be called multiple times due to non-projectivity. See issues #3170
     # and #4688.
diff --git a/spacy/training/converters/conllu2docs.py b/spacy/training/converters/conllu2docs.py
index ebd123375..b4d8b3ac4 100644
--- a/spacy/training/converters/conllu2docs.py
+++ b/spacy/training/converters/conllu2docs.py
@@ -199,13 +199,17 @@ def doc_from_conllu_sentence(
         heads.append(head)
         deps.append(dep)
 
-    doc = Doc(vocab, words=words, spaces=spaces)
+    doc = Doc(
+        vocab,
+        words=words,
+        spaces=spaces,
+        tags=tags,
+        pos=poses,
+        deps=deps,
+        lemmas=lemmas,
+        heads=heads,
+    )
     for i in range(len(doc)):
-        doc[i].tag_ = tags[i]
-        doc[i].pos_ = poses[i]
-        doc[i].dep_ = deps[i]
-        doc[i].lemma_ = lemmas[i]
-        doc[i].head = doc[heads[i]]
         doc[i]._.merged_orth = words[i]
         doc[i]._.merged_morph = morphs[i]
         doc[i]._.merged_lemma = lemmas[i]
@@ -232,14 +236,17 @@ def doc_from_conllu_sentence(
         heads.append(t.head.i)
         deps.append(t.dep_)
 
-    doc_x = Doc(vocab, words=words, spaces=spaces)
-    for i in range(len(doc)):
-        doc_x[i].tag_ = tags[i]
-        doc_x[i].morph_ = morphs[i]
-        doc_x[i].lemma_ = lemmas[i]
-        doc_x[i].pos_ = poses[i]
-        doc_x[i].dep_ = deps[i]
-        doc_x[i].head = doc_x[heads[i]]
+    doc_x = Doc(
+        vocab,
+        words=words,
+        spaces=spaces,
+        tags=tags,
+        morphs=morphs,
+        lemmas=lemmas,
+        pos=poses,
+        deps=deps,
+        heads=heads,
+    )
     doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
 
     return doc_x
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 380f6a172..680523c60 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -30,11 +30,20 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 > doc = Doc(nlp.vocab, words=words, spaces=spaces)
 > ```
 
-| Name     | Description                                                                                                                                                                                  |
-| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`  | A storage container for lexical types. ~~Vocab~~                                                                                                                                             |
-| `words`  | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                           |
-| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
+| Name           | Description                                                                                                                                                                                    |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`        | A storage container for lexical types. ~~Vocab~~                                                                                                                                               |
+| `words`        | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                             |
+| `spaces`       | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~   |
+| `user_data`    | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                             |
+| _keyword-only_ |                                                                                                                                                                                                |
+| `tags`         | A list of strings, of the same length as words, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
+| `pos`          | A list of strings, of the same length as words, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
+| `morphs`       | A list of strings, of the same length as words, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                          |
+| `lemmas`       | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                          |
+| `heads`        | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ |
+| `deps`         | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
+| `ents`         | A list of spans to assign as `doc.ents`. Defaults to `None`. ~~Optional[List[Span]]~~                                                                                                          |
 
 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
 

From 9b8d0b7f904f8751a804f112825a38cebe102ce9 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 21 Sep 2020 13:46:21 +0200
Subject: [PATCH 069/516] Alphabetize API sidebars

---
 website/meta/sidebars.json | 50 +++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 76d5e63d6..e27817c92 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -75,63 +75,63 @@
             {
                 "label": "Containers",
                 "items": [
-                    { "text": "Language", "url": "/api/language" },
                     { "text": "Doc", "url": "/api/doc" },
-                    { "text": "Token", "url": "/api/token" },
-                    { "text": "Span", "url": "/api/span" },
-                    { "text": "Lexeme", "url": "/api/lexeme" },
+                    { "text": "DocBin", "url": "/api/docbin" },
                     { "text": "Example", "url": "/api/example" },
-                    { "text": "DocBin", "url": "/api/docbin" }
+                    { "text": "Language", "url": "/api/language" },
+                    { "text": "Lexeme", "url": "/api/lexeme" },
+                    { "text": "Span", "url": "/api/span" },
+                    { "text": "Token", "url": "/api/token" }
                 ]
             },
             {
                 "label": "Pipeline",
                 "items": [
-                    { "text": "Tokenizer", "url": "/api/tokenizer" },
-                    { "text": "Tok2Vec", "url": "/api/tok2vec" },
-                    { "text": "Transformer", "url": "/api/transformer" },
-                    { "text": "Lemmatizer", "url": "/api/lemmatizer" },
-                    { "text": "Morphologizer", "url": "/api/morphologizer" },
-                    { "text": "Tagger", "url": "/api/tagger" },
                     { "text": "AttributeRuler", "url": "/api/attributeruler" },
                     { "text": "DependencyParser", "url": "/api/dependencyparser" },
+                    { "text": "EntityLinker", "url": "/api/entitylinker" },
                     { "text": "EntityRecognizer", "url": "/api/entityrecognizer" },
                     { "text": "EntityRuler", "url": "/api/entityruler" },
-                    { "text": "EntityLinker", "url": "/api/entitylinker" },
-                    { "text": "TextCategorizer", "url": "/api/textcategorizer" },
-                    { "text": "Sentencizer", "url": "/api/sentencizer" },
+                    { "text": "Lemmatizer", "url": "/api/lemmatizer" },
+                    { "text": "Morphologizer", "url": "/api/morphologizer" },
+                    { "text": "Pipe", "url": "/api/pipe" },
                     { "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" },
-                    { "text": "Other Functions", "url": "/api/pipeline-functions" },
-                    { "text": "Pipe", "url": "/api/pipe" }
+                    { "text": "Sentencizer", "url": "/api/sentencizer" },
+                    { "text": "Tagger", "url": "/api/tagger" },
+                    { "text": "TextCategorizer", "url": "/api/textcategorizer" },
+                    { "text": "Tok2Vec", "url": "/api/tok2vec" },
+                    { "text": "Tokenizer", "url": "/api/tokenizer" },
+                    { "text": "Transformer", "url": "/api/transformer" },
+                    { "text": "Other Functions", "url": "/api/pipeline-functions" }
                 ]
             },
             {
                 "label": "Matchers",
                 "items": [
+                    { "text": "DependencyMatcher", "url": "/api/dependencymatcher" },
                     { "text": "Matcher", "url": "/api/matcher" },
-                    { "text": "PhraseMatcher", "url": "/api/phrasematcher" },
-                    { "text": "DependencyMatcher", "url": "/api/dependencymatcher" }
+                    { "text": "PhraseMatcher", "url": "/api/phrasematcher" }
                 ]
             },
             {
                 "label": "Other",
                 "items": [
-                    { "text": "Vocab", "url": "/api/vocab" },
-                    { "text": "StringStore", "url": "/api/stringstore" },
-                    { "text": "Vectors", "url": "/api/vectors" },
+                    { "text": "Corpus", "url": "/api/corpus" },
+                    { "text": "KnowledgeBase", "url": "/api/kb" },
                     { "text": "Lookups", "url": "/api/lookups" },
                     { "text": "Morphology", "url": "/api/morphology" },
-                    { "text": "KnowledgeBase", "url": "/api/kb" },
                     { "text": "Scorer", "url": "/api/scorer" },
-                    { "text": "Corpus", "url": "/api/corpus" }
+                    { "text": "StringStore", "url": "/api/stringstore" },
+                    { "text": "Vectors", "url": "/api/vectors" },
+                    { "text": "Vocab", "url": "/api/vocab" }
                 ]
             },
             {
                 "label": "Cython",
                 "items": [
                     { "text": "Architecture", "url": "/api/cython" },
-                    { "text": "Structs", "url": "/api/cython-structs" },
-                    { "text": "Classes", "url": "/api/cython-classes" }
+                    { "text": "Classes", "url": "/api/cython-classes" },
+                    { "text": "Structs", "url": "/api/cython-structs" }
                 ]
             }
         ]

From ce455f30ca847fc8038d034f39977cb6f3ed53c3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 21 Sep 2020 13:52:46 +0200
Subject: [PATCH 070/516] Fix formatting

---
 spacy/tests/util.py  | 13 +++++++++++--
 spacy/tokens/doc.pyx |  1 +
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index 7bc32bf34..6c67d2ee1 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -34,8 +34,17 @@ def get_doc(
         heads = [i + head for i, head in enumerate(heads)]
     if ents is not None:
         ents = [(vocab.strings[ent_type], start, end) for start, end, ent_type in ents]
-    return Doc(vocab, words=words, pos=pos, heads=heads, deps=deps, tags=tags,
-        ents=ents, lemmas=lemmas, morphs=morphs)
+    return Doc(
+        vocab,
+        words=words,
+        pos=pos,
+        heads=heads,
+        deps=deps,
+        tags=tags,
+        ents=ents,
+        lemmas=lemmas,
+        morphs=morphs,
+    )
 
 
 def get_batch(batch_size):
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index de7e0f862..13167c2d4 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1414,6 +1414,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
         if tokens[i].head == 0:
             tokens[tokens[i].l_edge].sent_start = 1
 
+
 cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1:
     # May be called multiple times due to non-projectivity. See issues #3170
     # and #4688.

From e548654aca291621ddcbd8739f620b74c9932166 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 21 Sep 2020 14:46:55 +0200
Subject: [PATCH 071/516] Update docs [ci skip]

---
 website/docs/usage/training.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 071434162..b63145636 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -214,6 +214,24 @@ overrides. Overrides are added before [variables](#config-interpolation) are
 resolved, by the way – so if you need to use a value in multiple places,
 reference it across your config and override it on the CLI once.
 
+> #### 💡 Tip: Verbose logging
+>
+> If you're using config overrides, you can set the `--verbose` flag on
+> [`spacy train`](/api/cli#train) to make spaCy log more info, including which
+> overrides were set via the CLI and environment variables.
+
+#### Adding overrides via environment variables {#config-overrides-env}
+
+Instead of defining the overrides as CLI arguments, you can also use the
+`SPACY_CONFIG_OVERRIDES` environment variable using the same argument syntax.
+This is especially useful if you're training models as part of an automated
+process. Environment variables **take precedence** over CLI overrides and values
+defined in the config file.
+
+```cli
+$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
+```
+
 ### Defining pipeline components {#config-components}
 
 You typically train a [pipeline](/usage/processing-pipelines) of **one or more

From 177df15d89da7eccc1603c33b847a12c43a56e0c Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 21 Sep 2020 15:54:05 +0200
Subject: [PATCH 072/516] Implement Doc.set_ents

---
 spacy/errors.py                 |   9 +++
 spacy/tests/doc/test_doc_api.py |  63 +++++++++++++++--
 spacy/tests/parser/test_ner.py  |   4 +-
 spacy/tokens/doc.pyx            | 122 +++++++++++++++++++++++++++++---
 spacy/training/example.pyx      |  10 ++-
 spacy/training/iob_utils.py     |   5 +-
 6 files changed, 192 insertions(+), 21 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 81e3616be..a21ff5476 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -682,6 +682,15 @@ class Errors:
     E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
              "through token.morph_ instead or add the string to the "
              "StringStore with `nlp.vocab.strings.add(string)`.")
+    E1010 = ("Unable to set entity information for token {i} which is included "
+             "in more than one span in entities, blocked, missing or outside.")
+    E1011 = ("Unsupported default '{default}' in doc.set_ents. Available "
+             "options: {modes}")
+    E1012 = ("Spans provided to doc.set_ents must be provided as a list of "
+             "`Span` objects.")
+    E1013 = ("Unable to set entity for span with empty label. Entity spans are "
+             "required to have a label. To set entity information as missing "
+             "or blocked, use the keyword arguments with doc.set_ents.")
 
 
 @add_codes
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index a08efe9d7..7339a9aef 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -425,7 +425,7 @@ def test_has_annotation(en_vocab):
     doc[0].lemma_ = "a"
     doc[0].dep_ = "dep"
     doc[0].head = doc[1]
-    doc.ents = [Span(doc, 0, 1, label="HELLO"), Span(doc, 1, 2, label="")]
+    doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing")
 
     for attr in attrs:
         assert doc.has_annotation(attr)
@@ -455,15 +455,68 @@ def test_is_flags_deprecated(en_tokenizer):
         doc.is_sentenced
 
 
-def test_block_ents(en_tokenizer):
+def test_set_ents(en_tokenizer):
+    # set ents
     doc = en_tokenizer("a b c d e")
-    doc.block_ents([doc[1:2], doc[3:5]])
+    doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
+    assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 2]
+    assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
+
+    # add ents, invalid IOB repaired
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
+    doc.set_ents([Span(doc, 0, 2, 12)], default="unmodified")
+    assert [t.ent_iob for t in doc] == [3, 1, 3, 2, 2]
+    assert [t.ent_type for t in doc] == [12, 12, 11, 0, 0]
+
+    # missing ents
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], missing=[doc[4:5]])
+    assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 0]
+    assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
+
+    # outside ents
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents(
+        [Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)],
+        outside=[doc[4:5]],
+        default="missing",
+    )
+    assert [t.ent_iob for t in doc] == [3, 3, 1, 0, 2]
+    assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
+
+    # blocked ents
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents([], blocked=[doc[1:2], doc[3:5]], default="unmodified")
     assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3]
     assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0]
     assert doc.ents == tuple()
 
-    # invalid IOB repaired
+    # invalid IOB repaired after blocked
     doc.ents = [Span(doc, 3, 5, "ENT")]
     assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1]
-    doc.block_ents([doc[3:4]])
+    doc.set_ents([], blocked=[doc[3:4]], default="unmodified")
     assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3]
+
+    # all types
+    doc = en_tokenizer("a b c d e")
+    doc.set_ents(
+        [Span(doc, 0, 1, 10)],
+        blocked=[doc[1:2]],
+        missing=[doc[2:3]],
+        outside=[doc[3:4]],
+        default="unmodified",
+    )
+    assert [t.ent_iob for t in doc] == [3, 3, 0, 2, 0]
+    assert [t.ent_type for t in doc] == [10, 0, 0, 0, 0]
+
+    doc = en_tokenizer("a b c d e")
+    # single span instead of a list
+    with pytest.raises(ValueError):
+        doc.set_ents([], missing=doc[1:2])
+    # invalid default mode
+    with pytest.raises(ValueError):
+        doc.set_ents([], missing=[doc[1:2]], default="none")
+    # conflicting/overlapping specifications
+    with pytest.raises(ValueError):
+        doc.set_ents([], missing=[doc[1:2]], outside=[doc[1:2]])
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b8fdf15f9..cd5581769 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -168,7 +168,7 @@ def test_accept_blocked_token():
     ner2 = nlp2.create_pipe("ner", config=config)
 
     # set "New York" to a blocked entity
-    doc2.block_ents([doc2[3:5]])
+    doc2.set_ents([], blocked=[doc2[3:5]], default="unmodified")
     assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
     assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
 
@@ -358,5 +358,5 @@ class BlockerComponent1:
         self.name = name
 
     def __call__(self, doc):
-        doc.block_ents([doc[self.start:self.end]])
+        doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
         return doc
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index cc621b443..be99bacf3 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -7,6 +7,7 @@ from libc.stdint cimport int32_t, uint64_t
 
 import copy
 from collections import Counter
+from enum import Enum
 import numpy
 import srsly
 from thinc.api import get_array_module
@@ -86,6 +87,17 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name)
         return get_token_attr(token, feat_name)
 
 
+class SetEntsDefault(str, Enum):
+    blocked = "blocked"
+    missing = "missing"
+    outside = "outside"
+    unmodified = "unmodified"
+
+    @classmethod
+    def values(cls):
+        return list(cls.__members__.keys())
+
+
 cdef class Doc:
     """A sequence of Token objects. Access sentences and named entities, export
     annotations to numpy arrays, losslessly serialize to compressed binary
@@ -597,9 +609,9 @@ cdef class Doc:
                 if i in tokens_in_ents.keys():
                     ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
                     if entity_type is None or entity_type <= 0:
-                        # Empty label: Missing, unset this token
-                        ent_iob = 0
-                        entity_type = 0
+                        # Only allow labelled spans
+                        print(i, ent_start, ent_end, entity_type)
+                        raise ValueError(Errors.E1013)
                     elif ent_start == i:
                         # Marking the start of an entity
                         ent_iob = 3
@@ -611,19 +623,107 @@ cdef class Doc:
                 self.c[i].ent_kb_id = kb_id
                 self.c[i].ent_iob = ent_iob
 
-    def block_ents(self, spans):
-        """Mark spans as never an entity for the EntityRecognizer.
+    def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside):
+        """Set entity annotation.
 
-        spans (List[Span]): The spans to block as never entities.
+        entities (List[Span]): Spans with labels to set as entities.
+        blocked (Optional[List[Span]]): Spans to set as 'blocked' (never an
+            entity) for spacy's built-in NER component. Other components may
+            ignore this setting.
+        missing (Optional[List[Span]]): Spans with missing/unknown entity
+            information.
+        outside (Optional[List[Span]]): Spans outside of entities (O in IOB).
+        default (str): How to set entity annotation for tokens outside of any
+            provided spans. Options: "blocked", "missing", "outside" and
+            "unmodified" (preserve current state). Defaults to "outside".
         """
-        for span in spans:
+        if default not in SetEntsDefault.values():
+            raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault)))
+
+        if blocked is None:
+            blocked = tuple()
+        if missing is None:
+            missing = tuple()
+        if outside is None:
+            outside = tuple()
+
+        # Find all tokens covered by spans and check that none are overlapping
+        seen_tokens = set()
+        for span in entities:
+            if not isinstance(span, Span):
+                raise ValueError(Errors.E1012.format(span=span))
+            for i in range(span.start, span.end):
+                if i in seen_tokens:
+                    raise ValueError(Errors.E1010.format(i=i))
+                seen_tokens.add(i)
+        for span in blocked:
+            if not isinstance(span, Span):
+                raise ValueError(Errors.E1012.format(span=span))
+            for i in range(span.start, span.end):
+                if i in seen_tokens:
+                    raise ValueError(Errors.E1010.format(i=i))
+                seen_tokens.add(i)
+        for span in missing:
+            if not isinstance(span, Span):
+                raise ValueError(Errors.E1012.format(span=span))
+            for i in range(span.start, span.end):
+                if i in seen_tokens:
+                    raise ValueError(Errors.E1010.format(i=i))
+                seen_tokens.add(i)
+        for span in outside:
+            if not isinstance(span, Span):
+                raise ValueError(Errors.E1012.format(span=span))
+            for i in range(span.start, span.end):
+                if i in seen_tokens:
+                    raise ValueError(Errors.E1010.format(i=i))
+                seen_tokens.add(i)
+
+        # Set all specified entity information
+        for span in entities:
+            for i in range(span.start, span.end):
+                if not span.label:
+                    raise ValueError(Errors.E1013)
+                if i == span.start:
+                    self.c[i].ent_iob = 3
+                else:
+                    self.c[i].ent_iob = 1
+                self.c[i].ent_type = span.label
+        for span in blocked:
             for i in range(span.start, span.end):
                 self.c[i].ent_iob = 3
                 self.c[i].ent_type = 0
-            # if the following token is I, set to B
-            if span.end < self.length:
-                if self.c[span.end].ent_iob == 1:
-                    self.c[span.end].ent_iob = 3
+        for span in missing:
+            for i in range(span.start, span.end):
+                self.c[i].ent_iob = 0
+                self.c[i].ent_type = 0
+        for span in outside:
+            for i in range(span.start, span.end):
+                self.c[i].ent_iob = 2
+                self.c[i].ent_type = 0
+
+        # Set tokens outside of all provided spans
+        if default != SetEntsDefault.unmodified:
+            for i in range(self.length):
+                if i not in seen_tokens:
+                    self.c[i].ent_type = 0
+                    if default == SetEntsDefault.outside:
+                        self.c[i].ent_iob = 2
+                    elif default == SetEntsDefault.missing:
+                        self.c[i].ent_iob = 0
+                    elif default == SetEntsDefault.blocked:
+                        self.c[i].ent_iob = 3
+
+        # Fix any resulting inconsistent annotation
+        for i in range(self.length - 1):
+            # I must follow B or I: convert I to B
+            if (self.c[i].ent_iob == 0 or self.c[i].ent_iob == 2) and \
+                    self.c[i+1].ent_iob == 1:
+                self.c[i+1].ent_iob = 3
+            # Change of type with BI or II: convert second I to B
+            if self.c[i].ent_type != self.c[i+1].ent_type and \
+                    (self.c[i].ent_iob == 3 or self.c[i].ent_iob == 1) and \
+                    self.c[i+1].ent_iob == 1:
+                self.c[i+1].ent_iob = 3
 
     @property
     def noun_chunks(self):
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index d396a2040..82d8b6fce 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -288,6 +288,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
 
 
 def _add_entities_to_doc(doc, ner_data):
+    print(ner_data)
     if ner_data is None:
         return
     elif ner_data == []:
@@ -303,7 +304,14 @@ def _add_entities_to_doc(doc, ner_data):
             spans_from_biluo_tags(doc, ner_data)
         )
     elif isinstance(ner_data[0], Span):
-        doc.ents = ner_data
+        entities = []
+        missing = []
+        for span in ner_data:
+            if span.label:
+                entities.append(span)
+            else:
+                missing.append(span)
+        doc.set_ents(entities, missing=missing)
     else:
         raise ValueError(Errors.E973)
 
diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py
index 33a4733ca..b435c8ecb 100644
--- a/spacy/training/iob_utils.py
+++ b/spacy/training/iob_utils.py
@@ -149,9 +149,10 @@ def spans_from_biluo_tags(doc, tags):
 
     doc (Doc): The document that the BILUO tags refer to.
     entities (iterable): A sequence of BILUO tags with each tag describing one
-        token. Each tags string will be of the form of either "", "O" or
+        token. Each tag string will be of the form of either "", "O" or
         "{action}-{label}", where action is one of "B", "I", "L", "U".
-    RETURNS (list): A sequence of Span objects.
+    RETURNS (list): A sequence of Span objects. Each token with a missing IOB
+        tag is returned as a Span with an empty label.
     """
     token_offsets = tags_to_entities(tags)
     spans = []

From 6aa91c7ca02acd0df8d5dfba236faf09c3a5a477 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 21 Sep 2020 16:00:06 +0200
Subject: [PATCH 073/516] Make user_data keyword-only

---
 spacy/tokens/doc.pyx    | 2 +-
 website/docs/api/doc.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 13167c2d4..27efa6cef 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -163,8 +163,8 @@ cdef class Doc:
         Vocab vocab,
         words=None,
         spaces=None,
-        user_data=None,
         *,
+        user_data=None,
         tags=None,
         pos=None,
         morphs=None,
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 680523c60..baf264b80 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -35,8 +35,8 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | `vocab`        | A storage container for lexical types. ~~Vocab~~                                                                                                                                               |
 | `words`        | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                             |
 | `spaces`       | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~   |
-| `user_data`    | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                             |
 | _keyword-only_ |                                                                                                                                                                                                |
+| `user_data`    | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                             |
 | tags           | A list of strings, of the same length as words, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
 | pos            | A list of strings, of the same length as words, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
 | morphs         | A list of strings, of the same length as words, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                          |

From e8bcaa44f17be63302feca946997a6fe20761cd7 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 21 Sep 2020 16:01:46 +0200
Subject: [PATCH 074/516] Don't auto-decompress archives with smart_open [ci
 skip]

---
 spacy/cli/_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 0dd2ee380..797a701b9 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -306,7 +306,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     if dest.exists() and not force:
         return None
     src = str(src)
-    with smart_open.open(src, mode="rb") as input_file:
+    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
         with dest.open(mode="wb") as output_file:
             output_file.write(input_file.read())
 

From b3327c1e45d14c6ef03c70455e09f449ed8ad6f0 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 21 Sep 2020 16:04:30 +0200
Subject: [PATCH 075/516] Increment version [ci skip]

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 4fb6dfff1..ec3c168a5 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a19"
+__version__ = "3.0.0a20"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 447b3e5787dec59f2ed4b8a96c4b2ceb808d182f Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 21 Sep 2020 16:58:40 +0200
Subject: [PATCH 076/516] Merge remote-tracking branch 'upstream/develop' into
 fix/debug_model

# Conflicts:
#	spacy/cli/debug_model.py
---
 spacy/cli/debug_model.py | 48 ++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index af961d033..3d76cdbde 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -1,4 +1,4 @@
-from typing import Dict, Any, Optional
+from typing import Dict, Any, Optional, Iterable
 from pathlib import Path
 from wasabi import msg
 from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
@@ -93,11 +93,10 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] =
 
     # STEP 1: Initializing the model and printing again
     X = _get_docs()
-    goldY = _get_output(model.ops)
-    # _set_output_dim(nO=goldY.shape[-1], model=model)
+    _set_output_dim(nO=7, model=model)
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
-        model.initialize(X=X, Y=goldY)
+        model.initialize(X=X)
     if print_settings.get("print_after_init"):
         msg.divider(f"STEP 1 - after initialization")
         _print_model(model, print_settings)
@@ -110,12 +109,15 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] =
     if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
         tok2vec = nlp.get_pipe("tok2vec")
         tok2vec.model.initialize(X=X)
+    goldY = None
     for e in range(3):
         if tok2vec:
             tok2vec.predict(X)
         Y, get_dX = model.begin_update(X)
-        print("get_dX", get_dX)
-        dY = get_gradient(goldY, Y)
+        # simulate a goldY value
+        if not goldY:
+            goldY = _simulate_gold(Y)
+        dY = get_gradient(goldY, Y, model.ops)
         get_dX(dY)
         model.finish_update(optimizer)
     if print_settings.get("print_after_training"):
@@ -128,11 +130,20 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] =
         msg.divider(f"STEP 3 - prediction")
         msg.info(str(prediction))
 
-    msg.good(f"Succesfully ended analysis - model looks good!")
+    msg.good(f"Succesfully ended analysis - model looks good.")
 
 
-def get_gradient(goldY, Y):
-    return Y - goldY
+def _simulate_gold(element, counter=1):
+    if isinstance(element, Iterable):
+        for i in range(len(element)):
+            element[i] = _simulate_gold(element[i], counter+i)
+        return element
+    else:
+        return 1/counter
+
+
+def get_gradient(goldY, Y, ops):
+    return ops.asarray(Y) - ops.asarray(goldY)
 
 
 def _sentences():
@@ -149,18 +160,13 @@ def _get_docs(lang: str = "en"):
     return list(nlp.pipe(_sentences()))
 
 
-def _get_output(ops):
-    docs = len(_get_docs())
-    labels = 6
-    output = ops.alloc2f(d0=docs, d1=labels)
-    for i in range(docs):
-        for j in range(labels):
-            output[i, j] = 1 / (i+j+0.01)
-    return ops.xp.asarray(output)
-
-
-def _get_output_old(xp):
-    return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")
+def _set_output_dim(model, nO):
+    # simulating dim inference by directly setting the nO argument of the model
+    if model.has_dim("nO") is None:
+        model.set_dim("nO", nO)
+    if model.has_ref("output_layer"):
+        if model.get_ref("output_layer").has_dim("nO") is None:
+            model.get_ref("output_layer").set_dim("nO", nO)
 
 
 def _print_model(model, print_settings):

From f212303729cb0775bb00eebb6eef0a6c646f92da Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 21 Sep 2020 17:59:09 +0200
Subject: [PATCH 077/516] Add sent_starts to Doc.__init__

Add sent_starts to `Doc.__init__`. Officially specify `is_sent_start`
values but also convert to and accept `sent_start` internally.
---
 spacy/tests/doc/test_doc_api.py | 20 ++++++++++++++
 spacy/tokens/doc.pyx            | 46 +++++++++++++++++++++++----------
 website/docs/api/doc.md         |  1 +
 3 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index c979931b1..0579642c4 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -9,6 +9,26 @@ from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
 from ..util import get_doc
 
 
+def test_doc_api_init(en_vocab):
+    # set sent_start by sent_starts
+    doc = Doc(
+        en_vocab, words=["a", "b", "c", "d"], sent_starts=[True, False, True, False]
+    )
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+
+    # set sent_start by heads
+    doc = Doc(
+        en_vocab, words=["a", "b", "c", "d"], heads=[0, 0, 2, 2], deps=["dep"] * 4
+    )
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+
+    # heads override sent_starts
+    doc = Doc(
+        en_vocab, words=["a", "b", "c", "d"], sent_starts=[True] * 4, heads=[0, 0, 2, 2], deps=["dep"] * 4
+    )
+    assert [t.is_sent_start for t in doc] == [True, False, True, False]
+
+
 @pytest.mark.parametrize("text", [["one", "two", "three"]])
 def test_doc_api_compare_by_string_position(en_vocab, text):
     doc = Doc(en_vocab, words=text)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 27efa6cef..c5f1f6801 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -171,6 +171,7 @@ cdef class Doc:
         lemmas=None,
         heads=None,
         deps=None,
+        sent_starts=None,
         ents=None,
     ):
         """Create a Doc object.
@@ -183,13 +184,24 @@ cdef class Doc:
             words. True means that the word is followed by a space, False means
             it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
-        tags (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.tag. Defaults to None.
-        pos (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.pos. Defaults to None.
-        morphs (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.morph. Defaults to None.
-        lemmas (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.lemma. Defaults to None.
-        heads (Optional[List[int]]): A list of values, of the same length as words, to assign as heads. Head indices are the position of the head in the doc. Defaults to None.
-        deps (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.dep. Defaults to None.
-        ents (Optional[List[Span]]): A list of spans to assign as doc.ents. Defaults to None.
+        tags (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.tag. Defaults to None.
+        pos (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.pos. Defaults to None.
+        morphs (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.morph. Defaults to None.
+        lemmas (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.lemma. Defaults to None.
+        heads (Optional[List[int]]): A list of values, of the same length as
+            words, to assign as heads. Head indices are the position of the
+            head in the doc. Defaults to None.
+        deps (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, to assign as token.dep. Defaults to None.
+        sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
+            the same length as words, to assign as token.is_sent_start. Will be
+            overridden by heads if heads is provided. Defaults to None.
+        ents (Optional[List[Span]]): A list of spans to assign as doc.ents.
+            Defaults to None.
 
         DOCS: https://nightly.spacy.io/api/doc#init
         """
@@ -242,16 +254,24 @@ cdef class Doc:
             heads = [head - i for i, head in enumerate(heads)]
         if deps and not heads:
             heads = [0] * len(deps)
+        if sent_starts is not None:
+            for i in range(len(sent_starts)):
+                if sent_starts[i] is True:
+                    sent_starts[i] = 1
+                elif sent_starts[i] is False:
+                    sent_starts[i] = -1
+                elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
+                    sent_starts[i] = 0
         headings = []
         values = []
-        annotations = [pos, heads, deps, lemmas, tags, morphs]
-        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
+        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
+        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
         for a, annot in enumerate(annotations):
             if annot is not None:
                 if len(annot) != len(words):
                     raise ValueError(Errors.E189)
                 headings.append(possible_headings[a])
-                if annot is not heads:
+                if annot is not heads and annot is not sent_starts:
                     values.extend(annot)
         for value in values:
             self.vocab.strings.add(value)
@@ -263,12 +283,12 @@ cdef class Doc:
             j = 0
             for annot in annotations:
                 if annot:
-                    if annot is heads:
+                    if annot is heads or annot is sent_starts:
                         for i in range(len(words)):
                             if attrs.ndim == 1:
-                                attrs[i] = heads[i]
+                                attrs[i] = annot[i]
                             else:
-                                attrs[i, j] = heads[i]
+                                attrs[i, j] = annot[i]
                     elif annot is morphs:
                         for i in range(len(words)):
                             morph_key = vocab.morphology.add(morphs[i])
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index baf264b80..52f94a83d 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -43,6 +43,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | lemmas         | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                          |
 | heads          | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ |
 | deps           | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
+| sent_starts    | A list of values, of the same length as words, to assign as token.is_sent_start. Will be overridden by heads if heads is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~   |
 | ents           | A list of spans to assign as doc.ents. Defaults to `None`. ~~Optional[List[Span]]~~                                                                                                            |
 
 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}

From 67fbcb3da57c9830be34bf56518d8ec659ed65b6 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 21 Sep 2020 20:43:54 +0200
Subject: [PATCH 078/516] Tidy up tests and docs

---
 CONTRIBUTING.md                               |   4 +-
 spacy/errors.py                               |   2 +-
 spacy/tests/README.md                         |  84 ++++-------
 spacy/tests/conftest.py                       |   5 +
 spacy/tests/doc/test_add_entities.py          |  10 +-
 spacy/tests/doc/test_array.py                 |   8 +-
 spacy/tests/doc/test_doc_api.py               |  74 ++++------
 spacy/tests/doc/test_retokenize_merge.py      |  78 ++++------
 spacy/tests/doc/test_retokenize_split.py      |  10 +-
 spacy/tests/doc/test_span.py                  |  14 +-
 spacy/tests/doc/test_to_json.py               |   7 +-
 spacy/tests/doc/test_token_api.py             |  89 ++++-------
 spacy/tests/lang/de/test_parser.py            |  26 ++--
 spacy/tests/lang/en/test_noun_chunks.py       |   9 +-
 spacy/tests/lang/en/test_parser.py            |  57 +++-----
 spacy/tests/lang/en/test_sbd.py               |  22 +--
 spacy/tests/lang/ru/test_lemmatizer.py        |  15 +-
 spacy/tests/lang/sv/test_noun_chunks.py       |  16 +-
 .../tests/matcher/test_dependency_matcher.py  |  13 +-
 spacy/tests/matcher/test_phrase_matcher.py    |   9 +-
 spacy/tests/parser/test_nonproj.py            |  25 +---
 spacy/tests/parser/test_parse.py              |  94 ++++++------
 spacy/tests/parser/test_parse_navigate.py     | 120 ++++++++-------
 spacy/tests/parser/test_space_attachment.py   |  46 +++---
 spacy/tests/pipeline/test_attributeruler.py   |  19 +--
 spacy/tests/pipeline/test_functions.py        |  47 ++----
 spacy/tests/regression/test_issue1-1000.py    |   9 +-
 spacy/tests/regression/test_issue1501-2000.py |  21 +--
 spacy/tests/regression/test_issue2001-2500.py |  11 +-
 spacy/tests/regression/test_issue2501-3000.py |  10 +-
 spacy/tests/regression/test_issue3001-3500.py |  26 +---
 spacy/tests/regression/test_issue3501-4000.py |  12 +-
 spacy/tests/regression/test_issue5001-5500.py | 138 ++++++++++++++++++
 spacy/tests/regression/test_issue5048.py      |  32 ----
 spacy/tests/regression/test_issue5082.py      |  37 -----
 spacy/tests/regression/test_issue5137.py      |  32 ----
 spacy/tests/regression/test_issue5141.py      |  11 --
 spacy/tests/regression/test_issue5152.py      |  20 ---
 spacy/tests/regression/test_issue5458.py      |  23 ---
 spacy/tests/regression/test_issue5918.py      |   4 +-
 spacy/tests/test_displacy.py                  |  18 +--
 spacy/tests/test_scorer.py                    |  23 +--
 spacy/tests/training/test_training.py         |  55 ++-----
 spacy/tests/util.py                           |  35 +----
 spacy/tokens/doc.pyx                          |  10 +-
 spacy/training/example.pyx                    |   4 +-
 website/docs/api/doc.md                       |  44 ++++--
 website/docs/usage/v3.md                      |   9 +-
 48 files changed, 612 insertions(+), 875 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue5001-5500.py
 delete mode 100644 spacy/tests/regression/test_issue5048.py
 delete mode 100644 spacy/tests/regression/test_issue5082.py
 delete mode 100644 spacy/tests/regression/test_issue5137.py
 delete mode 100644 spacy/tests/regression/test_issue5141.py
 delete mode 100644 spacy/tests/regression/test_issue5152.py
 delete mode 100644 spacy/tests/regression/test_issue5458.py

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 0abde2abf..70324d8fd 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -224,7 +224,7 @@ for that particular code. Here's an example:
 ```python
 # fmt: off
 text = "I look forward to using Thingamajig.  I've been told it will make my life easier..."
-heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
+heads = [1, 1, 1, 1, 3, 4, 1, 6, 11, 11, 11, 11, 14, 14, 11, 16, 17, 14, 11]
 deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
         "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
         "poss", "nsubj", "ccomp", "punct"]
@@ -421,7 +421,7 @@ Tests that require the model to be loaded should be marked with
 `@pytest.mark.models`. Loading the models is expensive and not necessary if
 you're not actually testing the model performance. If all you need is a `Doc`
 object with annotations like heads, POS tags or the dependency parse, you can
-use the `get_doc()` utility function to construct it manually.
+use the `Doc` constructor to construct it manually.
 
 📖 **For more guidelines and information on how to add tests, check out the [tests README](spacy/tests/README.md).**
 
diff --git a/spacy/errors.py b/spacy/errors.py
index f219496a5..406ea603b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -455,7 +455,7 @@ class Errors:
             "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
     E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
     E187 = ("Only unicode strings are supported as labels.")
-    E189 = ("Each argument to `get_doc` should be of equal length.")
+    E189 = ("Each argument to Doc.__init__ should be of equal length.")
     E190 = ("Token head out of range in `Doc.from_array()` for token index "
             "'{index}' with value '{value}' (equivalent to relative head "
             "index: '{rel_head_index}'). The head indices should be relative "
diff --git a/spacy/tests/README.md b/spacy/tests/README.md
index 7aa7f6166..86bbd52da 100644
--- a/spacy/tests/README.md
+++ b/spacy/tests/README.md
@@ -17,7 +17,6 @@ Tests for spaCy modules and classes live in their own directories of the same na
 5. [Helpers and utilities](#helpers-and-utilities)
 6. [Contributing to the tests](#contributing-to-the-tests)
 
-
 ## Running the tests
 
 To show print statements, run the tests with `py.test -s`. To abort after the
@@ -41,17 +40,16 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji #
 
 To keep the behaviour of the tests consistent and predictable, we try to follow a few basic conventions:
 
-* **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
-* If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
-* Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behaviour, use `assert not` in your test.
-* Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behaviour, consider adding an additional simpler version.
-* If tests require **loading the models**, they should be added to the [`spacy-models`](https://github.com/explosion/spacy-models) tests.
-* Before requiring the models, always make sure there is no other way to test the particular behaviour. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this.
-* **Avoid unnecessary imports.** There should never be a need to explicitly import spaCy at the top of a file, and many components are  available as [fixtures](#fixtures). You should also avoid wildcard imports (`from module import *`).
-* If you're importing from spaCy, **always use absolute imports**. For example: `from spacy.language import Language`.
-* Don't forget the **unicode declarations** at the top of each file. This way, unicode strings won't have to be prefixed with `u`.
-* Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behaviour at a time.
-
+- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
+- If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
+- Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behaviour, use `assert not` in your test.
+- Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behaviour, consider adding an additional simpler version.
+- If tests require **loading the models**, they should be added to the [`spacy-models`](https://github.com/explosion/spacy-models) tests.
+- Before requiring the models, always make sure there is no other way to test the particular behaviour. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this.
+- **Avoid unnecessary imports.** There should never be a need to explicitly import spaCy at the top of a file, and many components are available as [fixtures](#fixtures). You should also avoid wildcard imports (`from module import *`).
+- If you're importing from spaCy, **always use absolute imports**. For example: `from spacy.language import Language`.
+- Don't forget the **unicode declarations** at the top of each file. This way, unicode strings won't have to be prefixed with `u`.
+- Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behaviour at a time.
 
 ## Parameters
 
@@ -64,7 +62,7 @@ def test_tokenizer_keep_urls(tokenizer, text):
     assert len(tokens) == 1
 ```
 
-This will run the test once for each `text` value. Even if you're only testing  one example, it's usually best to specify it as a parameter. This will later make it easier for others to quickly add additional test cases without having to modify the test.
+This will run the test once for each `text` value. Even if you're only testing one example, it's usually best to specify it as a parameter. This will later make it easier for others to quickly add additional test cases without having to modify the test.
 
 You can also specify parameters as tuples to test with multiple values per test:
 
@@ -81,18 +79,17 @@ To test for combinations of parameters, you can add several `parametrize` marker
 
 This will run the test with all combinations of the two parameters `text` and `punct`. **Use this feature sparingly**, though, as it can easily cause unneccessary or undesired test bloat.
 
-
 ## Fixtures
 
 Fixtures to create instances of spaCy objects and other components should only be defined once in the global [`conftest.py`](conftest.py). We avoid having per-directory conftest files, as this can easily lead to confusion.
 
 These are the main fixtures that are currently available:
 
-| Fixture | Description |
-| --- | --- |
-| `tokenizer` | Basic, language-independent tokenizer. Identical to the `xx` language class. |
-| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. |
-| `en_vocab` | Creates an instance of the English `Vocab`. |
+| Fixture                             | Description                                                                  |
+| ----------------------------------- | ---------------------------------------------------------------------------- |
+| `tokenizer`                         | Basic, language-independent tokenizer. Identical to the `xx` language class. |
+| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer.                                   |
+| `en_vocab`                          | Creates an instance of the English `Vocab`.                                  |
 
 The fixtures can be used in all tests by simply setting them as an argument, like this:
 
@@ -107,59 +104,32 @@ If all tests in a file require a specific configuration, or use the same complex
 
 Our new test setup comes with a few handy utility functions that can be imported from [`util.py`](util.py).
 
+### Constructing a `Doc` object manually
 
-### Constructing a `Doc` object manually with `get_doc()`
-
-Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need ia a `Doc` object with annotations like heads, POS tags or the dependency parse, you can use `get_doc()` to construct it manually.
+Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need is a `Doc` object with annotations like heads, POS tags or the dependency parse, you can construct it manually.
 
 ```python
-def test_doc_token_api_strings(en_tokenizer):
+def test_doc_token_api_strings(en_vocab):
     text = "Give it back! He pleaded."
     pos = ['VERB', 'PRON', 'PART', 'PUNCT', 'PRON', 'VERB', 'PUNCT']
-    heads = [0, -1, -2, -3, 1, 0, -1]
+    heads = [0, 0, 0, 0, 5, 5, 5]
     deps = ['ROOT', 'dobj', 'prt', 'punct', 'nsubj', 'ROOT', 'punct']
 
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], pos=pos, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=["Give", "it", "back", "!", "He", "pleaded", "."], pos=pos, heads=heads, deps=deps)
     assert doc[0].text == 'Give'
     assert doc[0].lower_ == 'give'
     assert doc[0].pos_ == 'VERB'
     assert doc[0].dep_ == 'ROOT'
 ```
 
-You can construct a `Doc` with the following arguments:
-
-| Argument | Description |
-| --- | --- |
-| `vocab` | `Vocab` instance to use. If you're tokenizing before creating a `Doc`, make sure to use the tokenizer's vocab. Otherwise, you can also use the `en_vocab` fixture. **(required)** |
-| `words` | List of words, for example `[t.text for t in tokens]`. **(required)** |
-| `heads` | List of heads as integers. |
-| `pos` | List of POS tags as text values. |
-| `tag` | List of tag names as text values. |
-| `dep` | List of dependencies as text values. |
-| `ents` | List of entity tuples with `start`, `end`, `label` (for example `(0, 2, 'PERSON')`). The `label` will be looked up in `vocab.strings[label]`. |
-
-Here's how to quickly get these values from within spaCy:
-
-```python
-doc = nlp(u'Some text here')
-print([token.head.i-token.i for token in doc])
-print([token.tag_ for token in doc])
-print([token.pos_ for token in doc])
-print([token.dep_ for token in doc])
-print([(ent.start, ent.end, ent.label_) for ent in doc.ents])
-```
-
-**Note:** There's currently no way of setting the serializer data for the parser without loading the models. If this is relevant to your test, constructing the `Doc` via `get_doc()` won't work.
-
 ### Other utilities
 
-| Name | Description |
-| --- | --- |
-| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state. |
-| `add_vecs_to_vocab(vocab, vectors)` | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. |
-| `get_cosine(vec1, vec2)` | Get cosine for two given vectors. |
-| `assert_docs_equal(doc1, doc2)` | Compare two `Doc` objects and `assert` that they're equal. Tests for tokens, tags, dependencies and entities. |
+| Name                                               | Description                                                                                                   |
+| -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- |
+| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state.                          |
+| `add_vecs_to_vocab(vocab, vectors)`                | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. |
+| `get_cosine(vec1, vec2)`                           | Get cosine for two given vectors.                                                                             |
+| `assert_docs_equal(doc1, doc2)`                    | Compare two `Doc` objects and `assert` that they're equal. Tests for tokens, tags, dependencies and entities. |
 
 ## Contributing to the tests
 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index e17199a08..3a9a1f26b 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -59,6 +59,11 @@ def de_tokenizer():
     return get_lang_class("de")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def de_vocab():
+    return get_lang_class("de")().vocab
+
+
 @pytest.fixture(scope="session")
 def el_tokenizer():
     return get_lang_class("el")().tokenizer
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 0c2a2a40b..40aff8e31 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -1,12 +1,10 @@
+from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.training import Example
 from spacy.pipeline import EntityRecognizer
 from spacy.tokens import Span, Doc
 from spacy import registry
 import pytest
 
-from ..util import get_doc
-from spacy.pipeline.ner import DEFAULT_NER_MODEL
-
 
 def _ner_example(ner):
     doc = Doc(
@@ -19,7 +17,7 @@ def _ner_example(ner):
 
 def test_doc_add_entities_set_ents_iob(en_vocab):
     text = ["This", "is", "a", "lion"]
-    doc = get_doc(en_vocab, text)
+    doc = Doc(en_vocab, words=text)
     config = {
         "learn_tokens": False,
         "min_action_freq": 30,
@@ -41,7 +39,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
 def test_ents_reset(en_vocab):
     """Ensure that resetting doc.ents does not change anything"""
     text = ["This", "is", "a", "lion"]
-    doc = get_doc(en_vocab, text)
+    doc = Doc(en_vocab, words=text)
     config = {
         "learn_tokens": False,
         "min_action_freq": 30,
@@ -59,7 +57,7 @@ def test_ents_reset(en_vocab):
 
 def test_add_overlapping_entities(en_vocab):
     text = ["Louisiana", "Office", "of", "Conservation"]
-    doc = get_doc(en_vocab, text)
+    doc = Doc(en_vocab, words=text)
     entity = Span(doc, 0, 4, label=391)
     doc.ents = [entity]
 
diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py
index e721b3f09..9c050f740 100644
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@@ -2,8 +2,6 @@ import pytest
 from spacy.tokens import Doc
 from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH
 
-from ..util import get_doc
-
 
 def test_doc_array_attr_of_token(en_vocab):
     doc = Doc(en_vocab, words=["An", "example", "sentence"])
@@ -35,7 +33,7 @@ def test_doc_scalar_attr_of_token(en_vocab):
 def test_doc_array_tag(en_vocab):
     words = ["A", "nice", "sentence", "."]
     pos = ["DET", "ADJ", "NOUN", "PUNCT"]
-    doc = get_doc(en_vocab, words=words, pos=pos)
+    doc = Doc(en_vocab, words=words, pos=pos)
     assert doc[0].pos != doc[1].pos != doc[2].pos != doc[3].pos
     feats_array = doc.to_array((ORTH, POS))
     assert feats_array[0][1] == doc[0].pos
@@ -47,7 +45,7 @@ def test_doc_array_tag(en_vocab):
 def test_doc_array_morph(en_vocab):
     words = ["Eat", "blue", "ham"]
     morph = ["Feat=V", "Feat=J", "Feat=N"]
-    doc = get_doc(en_vocab, words=words, morphs=morph)
+    doc = Doc(en_vocab, words=words, morphs=morph)
     assert morph[0] == doc[0].morph_
     assert morph[1] == doc[1].morph_
     assert morph[2] == doc[2].morph_
@@ -61,7 +59,7 @@ def test_doc_array_morph(en_vocab):
 def test_doc_array_dep(en_vocab):
     words = ["A", "nice", "sentence", "."]
     deps = ["det", "amod", "ROOT", "punct"]
-    doc = get_doc(en_vocab, words=words, deps=deps)
+    doc = Doc(en_vocab, words=words, deps=deps)
     feats_array = doc.to_array((ORTH, DEP))
     assert feats_array[0][1] == doc[0].dep
     assert feats_array[1][1] == doc[1].dep
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 0579642c4..2c22926e9 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -6,25 +6,20 @@ from spacy.lexeme import Lexeme
 from spacy.lang.en import English
 from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
 
-from ..util import get_doc
-
 
 def test_doc_api_init(en_vocab):
+    words = ["a", "b", "c", "d"]
+    heads = [0, 0, 2, 2]
     # set sent_start by sent_starts
-    doc = Doc(
-        en_vocab, words=["a", "b", "c", "d"], sent_starts=[True, False, True, False]
-    )
+    doc = Doc(en_vocab, words=words, sent_starts=[True, False, True, False])
     assert [t.is_sent_start for t in doc] == [True, False, True, False]
 
     # set sent_start by heads
-    doc = Doc(
-        en_vocab, words=["a", "b", "c", "d"], heads=[0, 0, 2, 2], deps=["dep"] * 4
-    )
+    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * 4)
     assert [t.is_sent_start for t in doc] == [True, False, True, False]
-
     # heads override sent_starts
     doc = Doc(
-        en_vocab, words=["a", "b", "c", "d"], sent_starts=[True] * 4, heads=[0, 0, 2, 2], deps=["dep"] * 4
+        en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4,
     )
     assert [t.is_sent_start for t in doc] == [True, False, True, False]
 
@@ -178,7 +173,7 @@ def test_doc_api_runtime_error(en_tokenizer):
             "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"]
     # fmt: on
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
     nps = []
     for np in doc.noun_chunks:
         while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):
@@ -195,17 +190,19 @@ def test_doc_api_runtime_error(en_tokenizer):
             retokenizer.merge(np, attrs=attrs)
 
 
-def test_doc_api_right_edge(en_tokenizer):
+def test_doc_api_right_edge(en_vocab):
     """Test for bug occurring from Unshift action, causing incorrect right edge"""
     # fmt: off
-    text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
-    heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
-             -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
+    words = [
+        "I", "have", "proposed", "to", "myself", ",", "for", "the", "sake",
+        "of", "such", "as", "live", "under", "the", "government", "of", "the",
+        "Romans", ",", "to", "translate", "those", "books", "into", "the",
+        "Greek", "tongue", "."
+    ]
+    heads = [2, 2, 2, 2, 3, 2, 21, 8, 6, 8, 11, 8, 11, 12, 15, 13, 15, 18, 16, 12, 21, 2, 23, 21, 21, 27, 27, 24, 2]
     deps = ["dep"] * len(heads)
     # fmt: on
-
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert doc[6].text == "for"
     subtree = [w.text for w in doc[6].subtree]
     # fmt: off
@@ -233,16 +230,16 @@ def test_doc_api_similarity_match():
 
 
 @pytest.mark.parametrize(
-    "sentence,heads,lca_matrix",
+    "words,heads,lca_matrix",
     [
         (
-            "the lazy dog slept",
-            [2, 1, 1, 0],
+            ["the", "lazy", "dog", "slept"],
+            [2, 2, 3, 3],
             numpy.array([[0, 2, 2, 3], [2, 1, 2, 3], [2, 2, 2, 3], [3, 3, 3, 3]]),
         ),
         (
-            "The lazy dog slept. The quick fox jumped",
-            [2, 1, 1, 0, -1, 2, 1, 1, 0],
+            ["The", "lazy", "dog", "slept", ".", "The", "quick", "fox", "jumped"],
+            [2, 2, 3, 3, 3, 7, 7, 8, 8],
             numpy.array(
                 [
                     [0, 2, 2, 3, 3, -1, -1, -1, -1],
@@ -259,11 +256,8 @@ def test_doc_api_similarity_match():
         ),
     ],
 )
-def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
-    tokens = en_tokenizer(sentence)
-    doc = get_doc(
-        tokens.vocab, [t.text for t in tokens], heads=heads, deps=["dep"] * len(heads)
-    )
+def test_lowest_common_ancestor(en_vocab, words, heads, lca_matrix):
+    doc = Doc(en_vocab, words, heads=heads, deps=["dep"] * len(heads))
     lca = doc.get_lca_matrix()
     assert (lca == lca_matrix).all()
     assert lca[1, 1] == 1
@@ -287,26 +281,23 @@ def test_doc_is_nered(en_vocab):
 
 
 def test_doc_from_array_sent_starts(en_vocab):
-    words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
-    heads = [0, -1, -2, -3, -4, -5, 0, -1, -2, -3]
     # fmt: off
+    words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
+    heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
     deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"]
     # fmt: on
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
-
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     # HEAD overrides SENT_START without warning
     attrs = [SENT_START, HEAD]
     arr = doc.to_array(attrs)
     new_doc = Doc(en_vocab, words=words)
     new_doc.from_array(attrs, arr)
-
     # no warning using default attrs
     attrs = doc._get_array_attrs()
     arr = doc.to_array(attrs)
     with pytest.warns(None) as record:
         new_doc.from_array(attrs, arr)
         assert len(record) == 0
-
     # only SENT_START uses SENT_START
     attrs = [SENT_START]
     arr = doc.to_array(attrs)
@@ -314,7 +305,6 @@ def test_doc_from_array_sent_starts(en_vocab):
     new_doc.from_array(attrs, arr)
     assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
     assert not new_doc.has_annotation("DEP")
-
     # only HEAD uses HEAD
     attrs = [HEAD, DEP]
     arr = doc.to_array(attrs)
@@ -325,19 +315,17 @@ def test_doc_from_array_sent_starts(en_vocab):
 
 
 def test_doc_from_array_morph(en_vocab):
-    words = ["I", "live", "in", "New", "York", "."]
     # fmt: off
+    words = ["I", "live", "in", "New", "York", "."]
     morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
     # fmt: on
     doc = Doc(en_vocab, words=words)
     for i, morph in enumerate(morphs):
         doc[i].morph_ = morph
-
     attrs = [MORPH]
     arr = doc.to_array(attrs)
     new_doc = Doc(en_vocab, words=words)
     new_doc.from_array(attrs, arr)
-
     assert [t.morph_ for t in new_doc] == morphs
     assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc]
 
@@ -349,15 +337,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     en_docs = [en_tokenizer(text) for text in en_texts]
     docs_idx = en_texts[0].index("docs")
     de_doc = de_tokenizer(de_text)
-    en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (
-        True,
-        None,
-        None,
-        None,
-    )
-
+    expected = (True, None, None, None)
+    en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = expected
     assert Doc.from_docs([]) is None
-
     assert de_doc is not Doc.from_docs([de_doc])
     assert str(de_doc) == str(Doc.from_docs([de_doc]))
 
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index bc9567b2a..806c4b46f 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -3,8 +3,6 @@ from spacy.attrs import LEMMA
 from spacy.vocab import Vocab
 from spacy.tokens import Doc, Token
 
-from ..util import get_doc
-
 
 def test_doc_retokenize_merge(en_tokenizer):
     text = "WKRO played songs by the beach boys all night"
@@ -88,9 +86,9 @@ def test_doc_retokenize_lex_attrs(en_tokenizer):
 
 def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
     text = "Los Angeles start."
-    heads = [1, 1, 0, -1]
+    heads = [1, 2, 2, 2]
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
     assert len(doc) == 4
     assert doc[0].head.text == "Angeles"
     assert doc[1].head.text == "start"
@@ -103,17 +101,12 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
     assert doc[0].ent_type_ == "GPE"
 
 
-def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
-    text = "The players start."
-    heads = [1, 1, 0, -1]
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab,
-        words=[t.text for t in tokens],
-        tags=["DT", "NN", "VBZ", "."],
-        pos=["DET", "NOUN", "VERB", "PUNCT"],
-        heads=heads,
-    )
+def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
+    words = ["The", "players", "start", "."]
+    heads = [1, 2, 2, 2]
+    tags = ["DT", "NN", "VBZ", "."]
+    pos = ["DET", "NOUN", "VERB", "PUNCT"]
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
@@ -124,13 +117,7 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
     assert doc[0].text == "The players"
     assert doc[0].tag_ == "NN"
     assert doc[0].pos_ == "NOUN"
-    doc = get_doc(
-        tokens.vocab,
-        words=[t.text for t in tokens],
-        tags=["DT", "NN", "VBZ", "."],
-        pos=["DET", "NOUN", "VERB", "PUNCT"],
-        heads=heads,
-    )
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
@@ -147,11 +134,10 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
     assert doc[1].pos_ == "VERB"
 
 
-def test_doc_retokenize_spans_merge_heads(en_tokenizer):
-    text = "I found a pilates class near work."
-    heads = [1, 0, 2, 1, -3, -1, -1, -6]
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+def test_doc_retokenize_spans_merge_heads(en_vocab):
+    words = ["I", "found", "a", "pilates", "class", "near", "work", "."]
+    heads = [1, 1, 4, 6, 1, 4, 5, 1]
+    doc = Doc(en_vocab, words=words, heads=heads)
     assert len(doc) == 8
     with doc.retokenize() as retokenizer:
         attrs = {"tag": doc[4].tag_, "lemma": "pilates class", "ent_type": "O"}
@@ -182,9 +168,9 @@ def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer):
 
 def test_doc_retokenize_span_np_merges(en_tokenizer):
     text = "displaCy is a parse tool built with Javascript"
-    heads = [1, 0, 2, 1, -3, -1, -1, -1]
+    heads = [1, 1, 4, 4, 1, 4, 5, 6]
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
     assert doc[4].head.i == 1
     with doc.retokenize() as retokenizer:
         attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"}
@@ -192,18 +178,18 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
     assert doc[2].head.i == 1
 
     text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
-    heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15]
+    heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1]
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
     with doc.retokenize() as retokenizer:
         for ent in doc.ents:
             attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
             retokenizer.merge(ent, attrs=attrs)
 
     text = "One test with entities like New York City so the ents list is not void"
-    heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
+    heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12]
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
     with doc.retokenize() as retokenizer:
         for ent in doc.ents:
             retokenizer.merge(ent)
@@ -212,12 +198,12 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
 def test_doc_retokenize_spans_entity_merge(en_tokenizer):
     # fmt: off
     text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
-    heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1]
+    heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
     tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
-    ents = [(0, 2, "PERSON"), (10, 11, "GPE"), (13, 15, "PERSON")]
+    ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
     # fmt: on
     tokens = en_tokenizer(text)
-    doc = get_doc(
+    doc = Doc(
         tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
     )
     assert len(doc) == 17
@@ -282,13 +268,13 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
 
     # if there is a parse, span.root provides default values
     words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
-    heads = [0, -1, 1, -3, -4, -5, -1, -7, -8]
-    ents = [(3, 5, "ent-de"), (5, 7, "ent-fg")]
+    heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
+    ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
     deps = ["dep"] * len(words)
     en_vocab.strings.add("ent-de")
     en_vocab.strings.add("ent-fg")
     en_vocab.strings.add("dep")
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
     assert doc[2:4].root == doc[3]  # root of 'c d' is d
     assert doc[4:6].root == doc[4]  # root is 'e f' is e
     with doc.retokenize() as retokenizer:
@@ -305,10 +291,10 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
 
     # check that B is preserved if span[start] is B
     words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
-    heads = [0, -1, 1, 1, -4, -5, -1, -7, -8]
-    ents = [(3, 5, "ent-de"), (5, 7, "ent-de")]
+    heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
+    ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
     deps = ["dep"] * len(words)
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
     with doc.retokenize() as retokenizer:
         retokenizer.merge(doc[3:5])
         retokenizer.merge(doc[5:7])
@@ -322,13 +308,13 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
 def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
     # fmt: off
     text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
-    heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7]
+    heads = [1, 2, 2, 4, 2, 4, 4, 2, 9, 9, 9, 10, 9, 9, 15, 13, 9]
     deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
             'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
             'compound', 'dobj', 'punct']
     # fmt: on
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     sent1, sent2 = list(doc.sents)
     init_len = len(sent1)
     init_len2 = len(sent2)
@@ -343,13 +329,13 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
 def test_doc_retokenize_spans_subtree_size_check(en_tokenizer):
     # fmt: off
     text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale"
-    heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2]
+    heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12]
     deps = ["compound", "nsubj", "ROOT", "det", "amod", "prt", "attr",
             "nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound",
             "dobj"]
     # fmt: on
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     sent1 = list(doc.sents)[0]
     init_len = len(list(sent1.root.subtree))
     with doc.retokenize() as retokenizer:
diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index 5f40da425..4d4b170f9 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -2,13 +2,11 @@ import pytest
 from spacy.vocab import Vocab
 from spacy.tokens import Doc, Token
 
-from ..util import get_doc
-
 
 def test_doc_retokenize_split(en_vocab):
     words = ["LosAngeles", "start", "."]
-    heads = [1, 1, 0]
-    doc = get_doc(en_vocab, words=words, heads=heads)
+    heads = [1, 2, 2]
+    doc = Doc(en_vocab, words=words, heads=heads)
     assert len(doc) == 3
     assert len(str(doc)) == 19
     assert doc[0].head.text == "start"
@@ -88,11 +86,11 @@ def test_doc_retokenize_spans_sentence_update_after_split(en_vocab):
     # fmt: off
     words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He",
              "lives", "in", "England", "and", "loves", "JoePasquale", "."]
-    heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
+    heads = [1, 1, 3, 5, 3, 1, 1, 8, 8, 8, 9, 8, 8, 14, 12]
     deps = ["nsubj", "ROOT", "det", "amod", "prt", "attr", "punct", "nsubj",
             "ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"]
     # fmt: on
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     sent1, sent2 = list(doc.sents)
     init_len = len(sent1)
     init_len2 = len(sent2)
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 0c538a0eb..2f562deb7 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -4,19 +4,17 @@ from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 from spacy.util import filter_spans
 
-from ..util import get_doc
-
 
 @pytest.fixture
 def doc(en_tokenizer):
     # fmt: off
     text = "This is a sentence. This is another sentence. And a third."
-    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1]
+    heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12]
     deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
             "attr", "punct", "ROOT", "det", "npadvmod", "punct"]
     # fmt: on
     tokens = en_tokenizer(text)
-    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
 
 
 @pytest.fixture
@@ -69,10 +67,10 @@ def test_spans_string_fn(doc):
 
 def test_spans_root2(en_tokenizer):
     text = "through North and South Carolina"
-    heads = [0, 3, -1, -2, -4]
+    heads = [0, 4, 1, 1, 0]
     deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     assert doc[-2:].root.text == "Carolina"
 
 
@@ -92,10 +90,10 @@ def test_spans_span_sent(doc, doc_not_parsed):
 def test_spans_lca_matrix(en_tokenizer):
     """Test span's lca matrix generation"""
     tokens = en_tokenizer("the lazy dog slept")
-    doc = get_doc(
+    doc = Doc(
         tokens.vocab,
         words=[t.text for t in tokens],
-        heads=[2, 1, 1, 0],
+        heads=[2, 2, 3, 3],
         deps=["dep"] * 4,
     )
     lca = doc[:2].get_lca_matrix()
diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py
index da3bc7dbb..c9bcafcfa 100644
--- a/spacy/tests/doc/test_to_json.py
+++ b/spacy/tests/doc/test_to_json.py
@@ -1,6 +1,5 @@
 import pytest
 from spacy.tokens import Doc
-from ..util import get_doc
 
 
 @pytest.fixture()
@@ -8,10 +7,10 @@ def doc(en_vocab):
     words = ["c", "d", "e"]
     pos = ["VERB", "NOUN", "NOUN"]
     tags = ["VBP", "NN", "NN"]
-    heads = [0, -1, -2]
+    heads = [0, 0, 0]
     deps = ["ROOT", "dobj", "dobj"]
-    ents = [(1, 2, "ORG")]
-    return get_doc(
+    ents = [("ORG", 1, 2)]
+    return Doc(
         en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
     )
 
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index 1308df67b..3c5c063bd 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -5,31 +5,24 @@ from spacy.symbols import VERB
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
 
-from ..util import get_doc
-
 
 @pytest.fixture
-def doc(en_tokenizer):
+def doc(en_vocab):
     # fmt: off
-    text = "This is a sentence. This is another sentence. And a third."
-    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1]
+    words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "sentence", ".", "And", "a", "third", "."]
+    heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 10, 12, 10, 12]
     deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
             "attr", "punct", "ROOT", "det", "npadvmod", "punct"]
     # fmt: on
-    tokens = en_tokenizer(text)
-    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    return Doc(en_vocab, words=words, heads=heads, deps=deps)
 
 
-def test_doc_token_api_strings(en_tokenizer):
-    text = "Give it back! He pleaded."
+def test_doc_token_api_strings(en_vocab):
+    words = ["Give", "it", "back", "!", "He", "pleaded", "."]
     pos = ["VERB", "PRON", "PART", "PUNCT", "PRON", "VERB", "PUNCT"]
-    heads = [0, -1, -2, -3, 1, 0, -1]
+    heads = [0, 0, 0, 0, 5, 5, 5]
     deps = ["ROOT", "dobj", "prt", "punct", "nsubj", "ROOT", "punct"]
-
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
-    )
+    doc = Doc(en_vocab, words=words, pos=pos, heads=heads, deps=deps)
     assert doc[0].orth_ == "Give"
     assert doc[0].text == "Give"
     assert doc[0].text_with_ws == "Give "
@@ -97,88 +90,70 @@ def test_doc_token_api_vectors():
     assert doc[0].similarity(doc[1]) == cosine
 
 
-def test_doc_token_api_ancestors(en_tokenizer):
+def test_doc_token_api_ancestors(en_vocab):
     # the structure of this sentence depends on the English annotation scheme
-    text = "Yesterday I saw a dog that barked loudly."
-    heads = [2, 1, 0, 1, -2, 1, -2, -1, -6]
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
+    heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
+    doc = Doc(en_vocab, words=words, heads=heads)
     assert [t.text for t in doc[6].ancestors] == ["dog", "saw"]
     assert [t.text for t in doc[1].ancestors] == ["saw"]
     assert [t.text for t in doc[2].ancestors] == []
-
     assert doc[2].is_ancestor(doc[7])
     assert not doc[6].is_ancestor(doc[2])
 
 
-def test_doc_token_api_head_setter(en_tokenizer):
-    text = "Yesterday I saw a dog that barked loudly."
-    heads = [2, 1, 0, 1, -2, 1, -2, -1, -6]
+def test_doc_token_api_head_setter(en_vocab):
+    words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
+    heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
     deps = ["dep"] * len(heads)
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
-
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert doc[6].n_lefts == 1
     assert doc[6].n_rights == 1
     assert doc[6].left_edge.i == 5
     assert doc[6].right_edge.i == 7
-
     assert doc[4].n_lefts == 1
     assert doc[4].n_rights == 1
     assert doc[4].left_edge.i == 3
     assert doc[4].right_edge.i == 7
-
     assert doc[3].n_lefts == 0
     assert doc[3].n_rights == 0
     assert doc[3].left_edge.i == 3
     assert doc[3].right_edge.i == 3
-
     assert doc[2].left_edge.i == 0
     assert doc[2].right_edge.i == 8
 
     doc[6].head = doc[3]
-
     assert doc[6].n_lefts == 1
     assert doc[6].n_rights == 1
     assert doc[6].left_edge.i == 5
     assert doc[6].right_edge.i == 7
-
     assert doc[3].n_lefts == 0
     assert doc[3].n_rights == 1
     assert doc[3].left_edge.i == 3
     assert doc[3].right_edge.i == 7
-
     assert doc[4].n_lefts == 1
     assert doc[4].n_rights == 0
     assert doc[4].left_edge.i == 3
     assert doc[4].right_edge.i == 7
-
     assert doc[2].left_edge.i == 0
     assert doc[2].right_edge.i == 8
 
     doc[0].head = doc[5]
-
     assert doc[5].left_edge.i == 0
     assert doc[6].left_edge.i == 0
     assert doc[3].left_edge.i == 0
     assert doc[4].left_edge.i == 0
     assert doc[2].left_edge.i == 0
-
     # head token must be from the same document
-    doc2 = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc2 = Doc(en_vocab, words=words, heads=heads)
     with pytest.raises(ValueError):
         doc[0].head = doc2[0]
-
     # test sentence starts when two sentences are joined
-    text = "This is one sentence. This is another sentence."
-    heads = [0, -1, -2, -3, -4, 0, -1, -2, -3, -4]
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab,
-        words=[t.text for t in tokens],
-        heads=heads,
-        deps=["dep"] * len(heads),
-    )
+    # fmt: off
+    words = ["This", "is", "one", "sentence", ".", "This", "is", "another", "sentence", "."]
+    heads = [0, 0, 0, 0, 0, 5, 5, 5, 5, 5]
+    # fmt: on
+    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     # initially two sentences
     assert doc[0].is_sent_start
     assert doc[5].is_sent_start
@@ -186,7 +161,6 @@ def test_doc_token_api_head_setter(en_tokenizer):
     assert doc[0].right_edge == doc[4]
     assert doc[5].left_edge == doc[5]
     assert doc[5].right_edge == doc[9]
-
     # modifying with a sentence doesn't change sent starts
     doc[2].head = doc[3]
     assert doc[0].is_sent_start
@@ -195,7 +169,6 @@ def test_doc_token_api_head_setter(en_tokenizer):
     assert doc[0].right_edge == doc[4]
     assert doc[5].left_edge == doc[5]
     assert doc[5].right_edge == doc[9]
-
     # attach the second sentence to the first, resulting in one sentence
     doc[5].head = doc[0]
     assert doc[0].is_sent_start
@@ -252,28 +225,28 @@ def test_tokenlast_has_sent_end_true():
 
 
 def test_token_api_conjuncts_chain(en_vocab):
-    words = "The boy and the girl and the man went .".split()
-    heads = [1, 7, -1, 1, -3, -1, 1, -3, 0, -1]
+    words = ["The", "boy", "and", "the", "girl", "and", "the", "man", "went", "."]
+    heads = [1, 8, 1, 4, 1, 4, 7, 4, 8, 8]
     deps = ["det", "nsubj", "cc", "det", "conj", "cc", "det", "conj", "ROOT", "punct"]
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [w.text for w in doc[1].conjuncts] == ["girl", "man"]
     assert [w.text for w in doc[4].conjuncts] == ["boy", "man"]
     assert [w.text for w in doc[7].conjuncts] == ["boy", "girl"]
 
 
 def test_token_api_conjuncts_simple(en_vocab):
-    words = "They came and went .".split()
-    heads = [1, 0, -1, -2, -1]
+    words = ["They", "came", "and", "went", "."]
+    heads = [1, 1, 1, 1, 3]
     deps = ["nsubj", "ROOT", "cc", "conj", "dep"]
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [w.text for w in doc[1].conjuncts] == ["went"]
     assert [w.text for w in doc[3].conjuncts] == ["came"]
 
 
 def test_token_api_non_conjuncts(en_vocab):
-    words = "They came .".split()
-    heads = [1, 0, -1]
+    words = ["They", "came", "."]
+    heads = [1, 1, 1]
     deps = ["nsubj", "ROOT", "punct"]
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [w.text for w in doc[0].conjuncts] == []
     assert [w.text for w in doc[1].conjuncts] == []
diff --git a/spacy/tests/lang/de/test_parser.py b/spacy/tests/lang/de/test_parser.py
index e2154b4c0..8c858a4cb 100644
--- a/spacy/tests/lang/de/test_parser.py
+++ b/spacy/tests/lang/de/test_parser.py
@@ -1,30 +1,26 @@
-from ...util import get_doc
+from spacy.tokens import Doc
 
 
-def test_de_parser_noun_chunks_standard_de(de_tokenizer):
-    text = "Eine Tasse steht auf dem Tisch."
-    heads = [1, 1, 0, -1, 1, -2, -4]
+def test_de_parser_noun_chunks_standard_de(de_vocab):
+    words = ["Eine", "Tasse", "steht", "auf", "dem", "Tisch", "."]
+    heads = [1, 2, 2, 2, 5, 3, 2]
     pos = ["DET", "NOUN", "VERB", "ADP", "DET", "NOUN", "PUNCT"]
     deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "punct"]
-    tokens = de_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
-    )
+    doc = Doc(de_vocab, words=words, pos=pos, deps=deps, heads=heads)
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 2
     assert chunks[0].text_with_ws == "Eine Tasse "
     assert chunks[1].text_with_ws == "dem Tisch "
 
 
-def test_de_extended_chunk(de_tokenizer):
-    text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
-    heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
+def test_de_extended_chunk(de_vocab):
+    # fmt: off
+    words = ["Die", "Sängerin", "singt", "mit", "einer", "Tasse", "Kaffee", "Arien", "."]
+    heads = [1, 2, 2, 2, 5, 3, 5, 2, 2]
     pos = ["DET", "NOUN", "VERB", "ADP", "DET", "NOUN", "NOUN", "NOUN", "PUNCT"]
     deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "nk", "oa", "punct"]
-    tokens = de_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
-    )
+    # fmt: on
+    doc = Doc(de_vocab, words=words, pos=pos, deps=deps, heads=heads)
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 3
     assert chunks[0].text_with_ws == "Die Sängerin "
diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py
index fa3a134bd..0189a26d4 100644
--- a/spacy/tests/lang/en/test_noun_chunks.py
+++ b/spacy/tests/lang/en/test_noun_chunks.py
@@ -2,13 +2,10 @@ import numpy
 from spacy.attrs import HEAD, DEP
 from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
 from spacy.lang.en.syntax_iterators import noun_chunks
-
+from spacy.tokens import Doc
 import pytest
 
 
-from ...util import get_doc
-
-
 def test_noun_chunks_is_parsed(en_tokenizer):
     """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
     """
@@ -19,9 +16,9 @@ def test_noun_chunks_is_parsed(en_tokenizer):
 
 def test_en_noun_chunks_not_nested(en_vocab):
     words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
-    heads = [1, 0, 4, 3, -1, -2, -5]
+    heads = [1, 1, 6, 6, 3, 3, 1]
     deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     doc.from_array(
         [HEAD, DEP],
         numpy.asarray(
diff --git a/spacy/tests/lang/en/test_parser.py b/spacy/tests/lang/en/test_parser.py
index 4d06ff8ef..426605566 100644
--- a/spacy/tests/lang/en/test_parser.py
+++ b/spacy/tests/lang/en/test_parser.py
@@ -1,63 +1,51 @@
-from ...util import get_doc
+from spacy.tokens import Doc
 
 
-def test_en_parser_noun_chunks_standard(en_tokenizer):
-    text = "A base phrase should be recognized."
-    heads = [2, 1, 3, 2, 1, 0, -1]
+def test_en_parser_noun_chunks_standard(en_vocab):
+    words = ["A", "base", "phrase", "should", "be", "recognized", "."]
+    heads = [2, 2, 5, 5, 5, 5, 5]
     pos = ["DET", "ADJ", "NOUN", "AUX", "VERB", "VERB", "PUNCT"]
     deps = ["det", "amod", "nsubjpass", "aux", "auxpass", "ROOT", "punct"]
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
-    )
+    doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 1
     assert chunks[0].text_with_ws == "A base phrase "
 
 
-def test_en_parser_noun_chunks_coordinated(en_tokenizer):
+def test_en_parser_noun_chunks_coordinated(en_vocab):
     # fmt: off
-    text = "A base phrase and a good phrase are often the same."
-    heads = [2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4]
+    words = ["A", "base", "phrase", "and", "a", "good", "phrase", "are", "often", "the", "same", "."]
+    heads = [2, 2, 7, 2, 6, 6, 2, 7, 7, 10, 7, 7]
     pos = ["DET", "NOUN", "NOUN", "CCONJ", "DET", "ADJ", "NOUN", "VERB", "ADV", "DET", "ADJ", "PUNCT"]
     deps = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT", "advmod", "det", "attr", "punct"]
     # fmt: on
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
-    )
+    doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 2
     assert chunks[0].text_with_ws == "A base phrase "
     assert chunks[1].text_with_ws == "a good phrase "
 
 
-def test_en_parser_noun_chunks_pp_chunks(en_tokenizer):
-    text = "A phrase with another phrase occurs."
-    heads = [1, 4, -1, 1, -2, 0, -1]
+def test_en_parser_noun_chunks_pp_chunks(en_vocab):
+    words = ["A", "phrase", "with", "another", "phrase", "occurs", "."]
+    heads = [1, 5, 1, 4, 2, 5, 5]
     pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB", "PUNCT"]
     deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT", "punct"]
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
-    )
+    doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 2
     assert chunks[0].text_with_ws == "A phrase "
     assert chunks[1].text_with_ws == "another phrase "
 
 
-def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
+def test_en_parser_noun_chunks_appositional_modifiers(en_vocab):
     # fmt: off
-    text = "Sam, my brother, arrived to the house."
-    heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4]
+    words = ["Sam", ",", "my", "brother", ",", "arrived", "to", "the", "house", "."]
+    heads = [5, 0, 3, 0, 0, 5, 5, 8, 6, 5]
     pos = ["PROPN", "PUNCT", "DET", "NOUN", "PUNCT", "VERB", "ADP", "DET", "NOUN", "PUNCT"]
     deps = ["nsubj", "punct", "poss", "appos", "punct", "ROOT", "prep", "det", "pobj", "punct"]
     # fmt: on
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
-    )
+    doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 3
     assert chunks[0].text_with_ws == "Sam "
@@ -65,15 +53,12 @@ def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
     assert chunks[2].text_with_ws == "the house "
 
 
-def test_en_parser_noun_chunks_dative(en_tokenizer):
-    text = "She gave Bob a raise."
-    heads = [1, 0, -1, 1, -3, -4]
+def test_en_parser_noun_chunks_dative(en_vocab):
+    words = ["She", "gave", "Bob", "a", "raise", "."]
+    heads = [1, 1, 1, 4, 1, 1]
     pos = ["PRON", "VERB", "PROPN", "DET", "NOUN", "PUNCT"]
     deps = ["nsubj", "ROOT", "dative", "det", "dobj", "punct"]
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
-    )
+    doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads)
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 3
     assert chunks[0].text_with_ws == "She "
diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py
index ee1e6be17..39d8d3b59 100644
--- a/spacy/tests/lang/en/test_sbd.py
+++ b/spacy/tests/lang/en/test_sbd.py
@@ -1,15 +1,16 @@
 import pytest
+from spacy.tokens import Doc
 
-from ...util import get_doc, apply_transition_sequence
+from ...util import apply_transition_sequence
 
 
-@pytest.mark.parametrize("text", ["A test sentence"])
+@pytest.mark.parametrize("words", [["A", "test", "sentence"]])
 @pytest.mark.parametrize("punct", [".", "!", "?", ""])
-def test_en_sbd_single_punct(en_tokenizer, text, punct):
-    heads = [2, 1, 0, -1] if punct else [2, 1, 0]
+def test_en_sbd_single_punct(en_vocab, words, punct):
+    heads = [2, 2, 2, 2] if punct else [2, 2, 2]
     deps = ["dep"] * len(heads)
-    tokens = en_tokenizer(text + punct)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    words = [*words, punct] if punct else words
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert len(doc) == 4 if punct else 3
     assert len(list(doc.sents)) == 1
     assert sum(len(sent) for sent in doc.sents) == len(doc)
@@ -18,17 +19,16 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct):
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
-def test_en_sentence_breaks(en_tokenizer, en_parser):
+def test_en_sentence_breaks(en_vocab, en_parser):
     # fmt: off
-    text = "This is a sentence . This is another one ."
-    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3]
+    words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "one", "."]
+    heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
     deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
             "attr", "punct"]
     transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct", "B-ROOT",
                   "L-nsubj", "S", "L-attr", "R-attr", "D", "R-punct"]
     # fmt: on
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     apply_transition_sequence(en_parser, doc, transition)
     assert len(list(doc.sents)) == 2
     for token in doc:
diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py
index bcf103b65..3810323bf 100644
--- a/spacy/tests/lang/ru/test_lemmatizer.py
+++ b/spacy/tests/lang/ru/test_lemmatizer.py
@@ -1,6 +1,5 @@
 import pytest
-
-from ...util import get_doc
+from spacy.tokens import Doc
 
 
 def test_ru_doc_lemmatization(ru_lemmatizer):
@@ -11,7 +10,7 @@ def test_ru_doc_lemmatization(ru_lemmatizer):
         "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
         "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
     ]
-    doc = get_doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
+    doc = Doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
     doc = ru_lemmatizer(doc)
     lemmas = [token.lemma_ for token in doc]
     assert lemmas == ["мама", "мыть", "рама"]
@@ -28,7 +27,7 @@ def test_ru_doc_lemmatization(ru_lemmatizer):
     ],
 )
 def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
-    doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"])
+    doc = Doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"])
     result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
     assert sorted(result_lemmas) == lemmas
 
@@ -51,7 +50,7 @@ def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
 def test_ru_lemmatizer_works_with_different_pos_homonyms(
     ru_lemmatizer, text, pos, morph, lemma
 ):
-    doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph])
+    doc = Doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph])
     result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
     assert result_lemmas == [lemma]
 
@@ -66,13 +65,13 @@ def test_ru_lemmatizer_works_with_different_pos_homonyms(
     ],
 )
 def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma):
-    doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph])
+    doc = Doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph])
     result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0])
     assert result_lemmas == [lemma]
 
 
 def test_ru_lemmatizer_punct(ru_lemmatizer):
-    doc = get_doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"])
+    doc = Doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"])
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
-    doc = get_doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
+    doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py
index 458cdadd5..3791d8021 100644
--- a/spacy/tests/lang/sv/test_noun_chunks.py
+++ b/spacy/tests/lang/sv/test_noun_chunks.py
@@ -1,6 +1,5 @@
 import pytest
-
-from ...util import get_doc
+from spacy.tokens import Doc
 
 
 def test_noun_chunks_is_parsed_sv(sv_tokenizer):
@@ -16,21 +15,21 @@ SV_NP_TEST_EXAMPLES = [
         "En student läste en bok",  # A student read a book
         ["DET", "NOUN", "VERB", "DET", "NOUN"],
         ["det", "nsubj", "ROOT", "det", "dobj"],
-        [1, 1, 0, 1, -2],
+        [1, 2, 2, 4, 2],
         ["En student", "en bok"],
     ),
     (
         "Studenten läste den bästa boken.",  # The student read the best book
         ["NOUN", "VERB", "DET", "ADJ", "NOUN", "PUNCT"],
         ["nsubj", "ROOT", "det", "amod", "dobj", "punct"],
-        [1, 0, 2, 1, -3, -4],
+        [1, 1, 4, 4, 1, 1],
         ["Studenten", "den bästa boken"],
     ),
     (
         "De samvetslösa skurkarna hade stulit de största juvelerna på söndagen",  # The remorseless crooks had stolen the largest jewels that sunday
         ["DET", "ADJ", "NOUN", "VERB", "VERB", "DET", "ADJ", "NOUN", "ADP", "NOUN"],
         ["det", "amod", "nsubj", "aux", "root", "det", "amod", "dobj", "case", "nmod"],
-        [2, 1, 2, 1, 0, 2, 1, -3, 1, -5],
+        [2, 2, 4, 4, 4, 7, 7, 4, 9, 4],
         ["De samvetslösa skurkarna", "de största juvelerna", "på söndagen"],
     ),
 ]
@@ -41,12 +40,9 @@ SV_NP_TEST_EXAMPLES = [
 )
 def test_sv_noun_chunks(sv_tokenizer, text, pos, deps, heads, expected_noun_chunks):
     tokens = sv_tokenizer(text)
-
     assert len(heads) == len(pos)
-    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, pos=pos
-    )
-
+    words = [t.text for t in tokens]
+    doc = Doc(tokens.vocab, words=words, heads=heads, deps=deps, pos=pos)
     noun_chunks = list(doc.noun_chunks)
     assert len(noun_chunks) == len(expected_noun_chunks)
     for i, np in enumerate(noun_chunks):
diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py
index 6361a10ce..e18a8f6d8 100644
--- a/spacy/tests/matcher/test_dependency_matcher.py
+++ b/spacy/tests/matcher/test_dependency_matcher.py
@@ -4,16 +4,15 @@ import re
 import copy
 from mock import Mock
 from spacy.matcher import DependencyMatcher
-from ..util import get_doc
+from spacy.tokens import Doc
 
 
 @pytest.fixture
 def doc(en_vocab):
-    text = "The quick brown fox jumped over the lazy fox"
-    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
+    words = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "fox"]
+    heads = [3, 3, 3, 4, 4, 4, 8, 8, 5]
     deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"]
-    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
-    return doc
+    return Doc(en_vocab, words=words, heads=heads, deps=deps)
 
 
 @pytest.fixture
@@ -236,10 +235,10 @@ def test_dependency_matcher_callback(en_vocab, doc):
 @pytest.mark.parametrize("op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20)])
 def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
     # two sentences to test that all matches are within the same sentence
-    doc = get_doc(
+    doc = Doc(
         en_vocab,
         words=["a", "b", "c", "d", "e"] * 2,
-        heads=[0, -1, -2, -3, -4] * 2,
+        heads=[0, 0, 0, 0, 0, 5, 5, 5, 5, 5],
         deps=["dep"] * 10,
     )
     match_count = 0
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 9caf284a3..522356ffc 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -3,7 +3,6 @@ import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
 from spacy.tokens import Doc, Span
-from ..util import get_doc
 
 
 def test_matcher_phrase_matcher(en_vocab):
@@ -140,10 +139,10 @@ def test_phrase_matcher_string_attrs(en_vocab):
     pos1 = ["PRON", "VERB", "NOUN"]
     words2 = ["Yes", ",", "you", "hate", "dogs", "very", "much"]
     pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"]
-    pattern = get_doc(en_vocab, words=words1, pos=pos1)
+    pattern = Doc(en_vocab, words=words1, pos=pos1)
     matcher = PhraseMatcher(en_vocab, attr="POS")
     matcher.add("TEST", [pattern])
-    doc = get_doc(en_vocab, words=words2, pos=pos2)
+    doc = Doc(en_vocab, words=words2, pos=pos2)
     matches = matcher(doc)
     assert len(matches) == 1
     match_id, start, end = matches[0]
@@ -158,10 +157,10 @@ def test_phrase_matcher_string_attrs_negative(en_vocab):
     pos1 = ["PRON", "VERB", "NOUN"]
     words2 = ["matcher:POS-PRON", "matcher:POS-VERB", "matcher:POS-NOUN"]
     pos2 = ["X", "X", "X"]
-    pattern = get_doc(en_vocab, words=words1, pos=pos1)
+    pattern = Doc(en_vocab, words=words1, pos=pos1)
     matcher = PhraseMatcher(en_vocab, attr="POS")
     matcher.add("TEST", [pattern])
-    doc = get_doc(en_vocab, words=words2, pos=pos2)
+    doc = Doc(en_vocab, words=words2, pos=pos2)
     matches = matcher(doc)
     assert len(matches) == 0
 
diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py
index 41da7cf49..544701a4c 100644
--- a/spacy/tests/parser/test_nonproj.py
+++ b/spacy/tests/parser/test_nonproj.py
@@ -2,8 +2,7 @@ import pytest
 from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle
 from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree, is_nonproj_arc
 from spacy.pipeline._parser_internals import nonproj
-
-from ..util import get_doc
+from spacy.tokens import Doc
 
 
 @pytest.fixture
@@ -74,16 +73,10 @@ def test_parser_is_nonproj_tree(
     assert is_nonproj_tree(multirooted_tree) is True
 
 
-def test_parser_pseudoprojectivity(en_tokenizer):
+def test_parser_pseudoprojectivity(en_vocab):
     def deprojectivize(proj_heads, deco_labels):
-        tokens = en_tokenizer("whatever " * len(proj_heads))
-        rel_proj_heads = [head - i for i, head in enumerate(proj_heads)]
-        doc = get_doc(
-            tokens.vocab,
-            words=[t.text for t in tokens],
-            deps=deco_labels,
-            heads=rel_proj_heads,
-        )
+        words = ["whatever "] * len(proj_heads)
+        doc = Doc(en_vocab, words=words, deps=deco_labels, heads=proj_heads)
         nonproj.deprojectivize(doc)
         return [t.head.i for t in doc], [token.dep_ for token in doc]
 
@@ -94,49 +87,39 @@ def test_parser_pseudoprojectivity(en_tokenizer):
     labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"]
     labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"]
     # fmt: on
-
     assert nonproj.decompose("X||Y") == ("X", "Y")
     assert nonproj.decompose("X") == ("X", "")
     assert nonproj.is_decorated("X||Y") is True
     assert nonproj.is_decorated("X") is False
-
     nonproj._lift(0, tree)
     assert tree == [2, 2, 2]
-
     assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7
     assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10
-
     # fmt: off
     proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
     assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
     assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
                            "nsubj", "acl||dobj", "punct"]
-
     deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
     assert deproj_heads == nonproj_tree
     assert undeco_labels == labels
-
     proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2)
     assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
     assert deco_labels == ["advmod||aux", "root", "det", "nsubj", "advmod",
                            "det", "dobj", "det", "nmod", "aux", "nmod||dobj",
                            "advmod", "det", "amod", "punct"]
-
     deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
     assert deproj_heads == nonproj_tree2
     assert undeco_labels == labels2
-
     # if decoration is wrong such that there is no head with the desired label
     # the structure is kept and the label is undecorated
     proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2]
     deco_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj",
                    "acl||iobj", "punct"]
-
     deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
     assert deproj_heads == proj_heads
     assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
                              "nsubj", "acl", "punct"]
-
     # if there are two potential new heads, the first one is chosen even if
     # it"s wrong
     proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 9e760c1e7..8648f2018 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,9 +1,11 @@
 import pytest
-
 from spacy.lang.en import English
-from ..util import get_doc, apply_transition_sequence, make_tempdir
-from ... import util
-from ...training import Example
+from spacy.training import Example
+from spacy.tokens import Doc
+from spacy import util
+
+from ..util import apply_transition_sequence, make_tempdir
+
 
 TRAIN_DATA = [
     (
@@ -23,12 +25,11 @@ TRAIN_DATA = [
 ]
 
 
-def test_parser_root(en_tokenizer):
-    text = "i don't have other assistance"
-    heads = [3, 2, 1, 0, 1, -2]
+def test_parser_root(en_vocab):
+    words = ["i", "do", "n't", "have", "other", "assistance"]
+    heads = [3, 3, 3, 3, 5, 3]
     deps = ["nsubj", "aux", "neg", "ROOT", "amod", "dobj"]
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     for t in doc:
         assert t.dep != 0, t.text
 
@@ -36,13 +37,9 @@ def test_parser_root(en_tokenizer):
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
-@pytest.mark.parametrize("text", ["Hello"])
-def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
-    )
-
+@pytest.mark.parametrize("words", [["Hello"]])
+def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
+    doc = Doc(en_vocab, words=words, heads=[0], deps=["ROOT"])
     assert len(doc) == 1
     with en_parser.step_through(doc) as _:  # noqa: F841
         pass
@@ -52,24 +49,22 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
-def test_parser_initial(en_tokenizer, en_parser):
-    text = "I ate the pizza with anchovies."
-    # heads = [1, 0, 1, -2, -3, -1, -5]
+def test_parser_initial(en_vocab, en_parser):
+    words = ["I", "ate", "the", "pizza", "with", "anchovies", "."]
     transition = ["L-nsubj", "S", "L-det"]
-    tokens = en_tokenizer(text)
-    apply_transition_sequence(en_parser, tokens, transition)
-    assert tokens[0].head.i == 1
-    assert tokens[1].head.i == 1
-    assert tokens[2].head.i == 3
-    assert tokens[3].head.i == 3
+    doc = Doc(en_vocab, words=words)
+    apply_transition_sequence(en_parser, doc, transition)
+    assert doc[0].head.i == 1
+    assert doc[1].head.i == 1
+    assert doc[2].head.i == 3
+    assert doc[3].head.i == 3
 
 
-def test_parser_parse_subtrees(en_tokenizer, en_parser):
-    text = "The four wheels on the bus turned quickly"
-    heads = [2, 1, 4, -1, 1, -2, 0, -1]
+def test_parser_parse_subtrees(en_vocab, en_parser):
+    words = ["The", "four", "wheels", "on", "the", "bus", "turned", "quickly"]
+    heads = [2, 2, 6, 2, 5, 3, 6, 6]
     deps = ["dep"] * len(heads)
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert len(list(doc[2].lefts)) == 2
     assert len(list(doc[2].rights)) == 1
     assert len(list(doc[2].children)) == 3
@@ -79,15 +74,12 @@ def test_parser_parse_subtrees(en_tokenizer, en_parser):
     assert len(list(doc[2].subtree)) == 6
 
 
-def test_parser_merge_pp(en_tokenizer):
-    text = "A phrase with another phrase occurs"
-    heads = [1, 4, -1, 1, -2, 0]
+def test_parser_merge_pp(en_vocab):
+    words = ["A", "phrase", "with", "another", "phrase", "occurs"]
+    heads = [1, 5, 1, 4, 2, 5]
     deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"]
     pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"]
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos
-    )
+    doc = Doc(en_vocab, words=words, deps=deps, heads=heads, pos=pos)
     with doc.retokenize() as retokenizer:
         for np in doc.noun_chunks:
             retokenizer.merge(np, attrs={"lemma": np.lemma_})
@@ -100,12 +92,11 @@ def test_parser_merge_pp(en_tokenizer):
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
-def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
-    text = "a b c d e"
-
+def test_parser_arc_eager_finalize_state(en_vocab, en_parser):
+    words = ["a", "b", "c", "d", "e"]
     # right branching
     transition = ["R-nsubj", "D", "R-nsubj", "R-nsubj", "D", "R-ROOT"]
-    tokens = en_tokenizer(text)
+    tokens = Doc(en_vocab, words=words)
     apply_transition_sequence(en_parser, tokens, transition)
 
     assert tokens[0].n_lefts == 0
@@ -140,7 +131,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
 
     # left branching
     transition = ["S", "S", "S", "L-nsubj", "L-nsubj", "L-nsubj", "L-nsubj"]
-    tokens = en_tokenizer(text)
+    tokens = Doc(en_vocab, words=words)
     apply_transition_sequence(en_parser, tokens, transition)
 
     assert tokens[0].n_lefts == 0
@@ -177,10 +168,10 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
 def test_parser_set_sent_starts(en_vocab):
     # fmt: off
     words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n']
-    heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1]
+    heads = [1, 1, 1, 30, 4, 4, 7, 4, 7, 17, 14, 14, 11, 14, 17, 16, 17, 6, 17, 20, 11, 20, 26, 22, 26, 26, 20, 26, 29, 31, 31, 25, 31, 32, 17, 4, 4, 36]
     deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', '']
     # fmt: on
-    doc = get_doc(en_vocab, words=words, deps=deps, heads=heads)
+    doc = Doc(en_vocab, words=words, deps=deps, heads=heads)
     for i in range(len(words)):
         if i == 0 or i == 3:
             assert doc[i].is_sent_start is True
@@ -201,24 +192,21 @@ def test_overfitting_IO():
         for dep in annotations.get("deps", []):
             parser.add_label(dep)
     optimizer = nlp.begin_training()
-
     for i in range(100):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses["parser"] < 0.0001
-
     # test the trained model
     test_text = "I like securities."
     doc = nlp(test_text)
-    assert doc[0].dep_ is "nsubj"
-    assert doc[2].dep_ is "dobj"
-    assert doc[3].dep_ is "punct"
-
+    assert doc[0].dep_ == "nsubj"
+    assert doc[2].dep_ == "dobj"
+    assert doc[3].dep_ == "punct"
     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
-        assert doc2[0].dep_ is "nsubj"
-        assert doc2[2].dep_ is "dobj"
-        assert doc2[3].dep_ is "punct"
+        assert doc2[0].dep_ == "nsubj"
+        assert doc2[2].dep_ == "dobj"
+        assert doc2[3].dep_ == "punct"
diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py
index f181a799a..8ca4039a2 100644
--- a/spacy/tests/parser/test_parse_navigate.py
+++ b/spacy/tests/parser/test_parse_navigate.py
@@ -1,59 +1,75 @@
 import pytest
-
-from ..util import get_doc
+from spacy.tokens import Doc
 
 
 @pytest.fixture
-def text():
-    return """
-It was a bright cold day in April, and the clocks were striking thirteen.
-Winston Smith, his chin nuzzled into his breast in an effort to escape the
-vile wind, slipped quickly through the glass doors of Victory Mansions,
-though not quickly enough to prevent a swirl of gritty dust from entering
-along with him.
-
-The hallway smelt of boiled cabbage and old rag mats. At one end of it a
-coloured poster, too large for indoor display, had been tacked to the wall.
-It depicted simply an enormous face, more than a metre wide: the face of a
-man of about forty-five, with a heavy black moustache and ruggedly handsome
-features. Winston made for the stairs. It was no use trying the lift. Even at
-the best of times it was seldom working, and at present the electric current
-was cut off during daylight hours. It was part of the economy drive in
-preparation for Hate Week. The flat was seven flights up, and Winston, who
-was thirty-nine and had a varicose ulcer above his right ankle, went slowly,
-resting several times on the way. On each landing, opposite the lift-shaft,
-the poster with the enormous face gazed from the wall. It was one of those
-pictures which are so contrived that the eyes follow you about when you move.
-BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
-"""
+def words():
+    # fmt: off
+    return [
+        "\n", "It", "was", "a", "bright", "cold", "day", "in", "April", ",",
+        "and", "the", "clocks", "were", "striking", "thirteen", ".", "\n",
+        "Winston", "Smith", ",", "his", "chin", "nuzzled", "into", "his",
+        "breast", "in", "an", "effort", "to", "escape", "the", "\n", "vile",
+        "wind", ",", "slipped", "quickly", "through", "the", "glass", "doors",
+        "of", "Victory", "Mansions", ",", "\n", "though", "not", "quickly",
+        "enough", "to", "prevent", "a", "swirl", "of", "gritty", "dust",
+        "from", "entering", "\n", "along", "with", "him", ".", "\n\n", "The",
+        "hallway", "smelt", "of", "boiled", "cabbage", "and", "old", "rag",
+        "mats", ".", "At", "one", "end", "of", "it", "a", "\n", "coloured",
+        "poster", ",", "too", "large", "for", "indoor", "display", ",", "had",
+        "been", "tacked", "to", "the", "wall", ".", "\n", "It", "depicted",
+        "simply", "an", "enormous", "face", ",", "more", "than", "a", "metre",
+        "wide", ":", "the", "face", "of", "a", "\n", "man", "of", "about",
+        "forty", "-", "five", ",", "with", "a", "heavy", "black", "moustache",
+        "and", "ruggedly", "handsome", "\n", "features", ".", "Winston", "made",
+        "for", "the", "stairs", ".", "It", "was", "no", "use", "trying", "the",
+        "lift", ".", "Even", "at", "\n", "the", "best", "of", "times", "it",
+        "was", "seldom", "working", ",", "and", "at", "present", "the",
+        "electric", "current", "\n", "was", "cut", "off", "during", "daylight",
+        "hours", ".", "It", "was", "part", "of", "the", "economy", "drive",
+        "in", "\n", "preparation", "for", "Hate", "Week", ".", "The", "flat",
+        "was", "seven", "flights", "up", ",", "and", "Winston", ",", "who",
+        "\n", "was", "thirty", "-", "nine", "and", "had", "a", "varicose",
+        "ulcer", "above", "his", "right", "ankle", ",", "went", "slowly", ",",
+        "\n", "resting", "several", "times", "on", "the", "way", ".", "On",
+        "each", "landing", ",", "opposite", "the", "lift", "-", "shaft", ",",
+        "\n", "the", "poster", "with", "the", "enormous", "face", "gazed",
+        "from", "the", "wall", ".", "It", "was", "one", "of", "those", "\n",
+        "pictures", "which", "are", "so", "contrived", "that", "the", "eyes",
+        "follow", "you", "about", "when", "you", "move", ".", "\n", "BIG",
+        "BROTHER", "IS", "WATCHING", "YOU", ",", "the", "caption", "beneath",
+        "it", "ran", ".", "\n", ]
+    # fmt: on
 
 
 @pytest.fixture
 def heads():
     # fmt: off
-    return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2,
-            -1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
-            -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14,
-            1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1,
-            0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10,
-            9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1,
-            2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1,
-            3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0,
-            -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1,
-            -1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1,
-            -2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1,
-            1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2,
-            1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2,
-            -10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
-            0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
-            1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1,
-            -1, 0, -1, -1]
+    return [
+        1, 2, 2, 6, 6, 6, 2, 6, 7, 2, 2, 12, 14, 14, 2, 14, 14, 16, 19, 23, 23,
+        22, 23, 23, 23, 26, 24, 23, 29, 27, 31, 29, 35, 32, 35, 31, 23, 23, 37,
+        37, 42, 42, 39, 42, 45, 43, 37, 46, 37, 50, 51, 37, 53, 51, 55, 53, 55,
+        58, 56, 53, 59, 60, 60, 62, 63, 23, 65, 68, 69, 69, 69, 72, 70, 72, 76,
+        76, 72, 69, 96, 80, 78, 80, 81, 86, 83, 86, 96, 96, 89, 96, 89, 92, 90,
+        96, 96, 96, 96, 96, 99, 97, 96, 100, 103, 103, 103, 107, 107, 103, 107,
+        111, 111, 112, 113, 107, 103, 116, 136, 116, 120, 118, 117, 120, 125,
+        125, 125, 121, 116, 116, 131, 131, 131, 127, 131, 134, 131, 134, 136,
+        136, 139, 139, 139, 142, 140, 139, 145, 145, 147, 145, 147, 150, 148,
+        145, 153, 162, 153, 156, 162, 156, 157, 162, 162, 162, 162, 162, 162,
+        172, 165, 169, 169, 172, 169, 172, 162, 172, 172, 176, 174, 172, 179,
+        179, 179, 180, 183, 181, 179, 184, 185, 185, 187, 190, 188, 179, 193,
+        194, 194, 196, 194, 196, 194, 194, 218, 200, 204, 202, 200, 207, 207,
+        204, 204, 204, 212, 212, 209, 212, 216, 216, 213, 200, 194, 218, 218,
+        220, 218, 224, 222, 222, 227, 225, 218, 246, 231, 229, 246, 246, 237,
+        237, 237, 233, 246, 238, 241, 246, 241, 245, 245, 242, 246, 246, 249,
+        247, 246, 252, 252, 252, 253, 257, 255, 254, 259, 257, 261, 259, 265,
+        264, 265, 261, 265, 265, 270, 270, 267, 252, 271, 274, 275, 275, 276,
+        283, 283, 280, 283, 280, 281, 283, 283, 284]
     # fmt: on
 
 
-def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+def test_parser_parse_navigate_consistency(en_vocab, words, heads):
+    doc = Doc(en_vocab, words=words, heads=heads)
     for head in doc:
         for child in head.lefts:
             assert child.head == head
@@ -61,15 +77,8 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
             assert child.head == head
 
 
-def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab,
-        words=[t.text for t in tokens],
-        heads=heads,
-        deps=["dep"] * len(heads),
-    )
-
+def test_parser_parse_navigate_child_consistency(en_vocab, words, heads):
+    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     lefts = {}
     rights = {}
     for head in doc:
@@ -99,9 +108,8 @@ def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
         assert not children
 
 
-def test_parser_parse_navigate_edges(en_tokenizer, text, heads):
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+def test_parser_parse_navigate_edges(en_vocab, words, heads):
+    doc = Doc(en_vocab, words=words, heads=heads)
     for token in doc:
         subtree = list(token.subtree)
         debug = "\t".join((token.text, token.left_edge.text, subtree[0].text))
diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py
index 3672dabea..2b80272d6 100644
--- a/spacy/tests/parser/test_space_attachment.py
+++ b/spacy/tests/parser/test_space_attachment.py
@@ -1,42 +1,40 @@
 import pytest
+from spacy.tokens import Doc
 
-from spacy.tokens.doc import Doc
-
-from ..util import get_doc, apply_transition_sequence
+from ..util import apply_transition_sequence
 
 
-def test_parser_space_attachment(en_tokenizer):
-    text = "This is a test.\nTo ensure  spaces are attached well."
-    heads = [1, 0, 1, -2, -3, -1, 1, 4, -1, 2, 1, 0, -1, -2]
+def test_parser_space_attachment(en_vocab):
+    # fmt: off
+    words = ["This", "is", "a", "test", ".", "\n", "To", "ensure", " ", "spaces", "are", "attached", "well", "."]
+    heads = [1, 1, 3, 1, 1, 4, 7, 11, 7, 11, 11, 11, 11, 11]
+    # fmt: on
     deps = ["dep"] * len(heads)
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     for sent in doc.sents:
         if len(sent) == 1:
             assert not sent[-1].is_space
 
 
-def test_parser_sentence_space(en_tokenizer):
+def test_parser_sentence_space(en_vocab):
     # fmt: off
-    text = "I look forward to using Thingamajig.  I've been told it will make my life easier..."
-    heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
+    words = ["I", "look", "forward", "to", "using", "Thingamajig", ".", " ", "I", "'ve", "been", "told", "it", "will", "make", "my", "life", "easier", "..."]
+    heads = [1, 1, 1, 1, 3, 4, 1, 6, 11, 11, 11, 11, 14, 14, 11, 16, 17, 14, 11]
     deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
             "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
             "poss", "nsubj", "ccomp", "punct"]
     # fmt: on
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert len(list(doc.sents)) == 2
 
 
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
-def test_parser_space_attachment_leading(en_tokenizer, en_parser):
-    text = "\t \n This is a sentence ."
-    heads = [1, 1, 0, 1, -2, -3]
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads)
+def test_parser_space_attachment_leading(en_vocab, en_parser):
+    words = ["\t", "\n", "This", "is", "a", "sentence", "."]
+    heads = [1, 2, 2, 4, 2, 2]
+    doc = Doc(en_vocab, words=words, heads=heads)
     assert doc[0].is_space
     assert doc[1].is_space
     assert doc[2].text == "This"
@@ -50,18 +48,16 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser):
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
-def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
-    text = "This is \t a \t\n \n sentence . \n\n \n"
-    heads = [1, 0, -1, 2, -1, -4, -5, -1]
+def test_parser_space_attachment_intermediate_trailing(en_vocab, en_parser):
+    words = ["This", "is", "\t", "a", "\t\n", "\n", "sentence", ".", "\n\n", "\n"]
+    heads = [1, 1, 1, 5, 3, 1, 1, 6]
     transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct"]
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads)
+    doc = Doc(en_vocab, words=words, heads=heads)
     assert doc[2].is_space
     assert doc[4].is_space
     assert doc[5].is_space
     assert doc[8].is_space
     assert doc[9].is_space
-
     apply_transition_sequence(en_parser, doc, transition)
     for token in doc:
         assert token.dep != 0 or token.is_space
@@ -72,7 +68,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
 @pytest.mark.skip(
     reason="The step_through API was removed (but should be brought back)"
 )
-def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
+def test_parser_space_attachment_space(en_parser, text, length):
     doc = Doc(en_parser.vocab, words=text)
     assert len(doc) == length
     with en_parser.step_through(doc) as _:  # noqa: F841
diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py
index a66b34bc0..b9e5894dd 100644
--- a/spacy/tests/pipeline/test_attributeruler.py
+++ b/spacy/tests/pipeline/test_attributeruler.py
@@ -4,8 +4,9 @@ from spacy.training import Example
 from spacy.lang.en import English
 from spacy.pipeline import AttributeRuler
 from spacy import util, registry
+from spacy.tokens import Doc
 
-from ..util import get_doc, make_tempdir
+from ..util import make_tempdir
 
 
 @pytest.fixture
@@ -66,7 +67,6 @@ def test_attributeruler_init(nlp, pattern_dicts):
     a = nlp.add_pipe("attribute_ruler")
     for p in pattern_dicts:
         a.add(**p)
-
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
     assert doc[2].morph_ == "Case=Nom|Number=Plur"
@@ -129,7 +129,7 @@ def test_attributeruler_rule_order(nlp):
         {"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "NOUN"}},
     ]
     a.add_patterns(patterns)
-    doc = get_doc(
+    doc = Doc(
         nlp.vocab,
         words=["This", "is", "a", "test", "."],
         tags=["DT", "VBZ", "DT", "NN", "."],
@@ -141,13 +141,12 @@ def test_attributeruler_rule_order(nlp):
 def test_attributeruler_tag_map(nlp, tag_map):
     a = AttributeRuler(nlp.vocab)
     a.load_from_tag_map(tag_map)
-    doc = get_doc(
+    doc = Doc(
         nlp.vocab,
         words=["This", "is", "a", "test", "."],
         tags=["DT", "VBZ", "DT", "NN", "."],
     )
     doc = a(doc)
-
     for i in range(len(doc)):
         if i == 4:
             assert doc[i].pos_ == "PUNCT"
@@ -160,13 +159,12 @@ def test_attributeruler_tag_map(nlp, tag_map):
 def test_attributeruler_morph_rules(nlp, morph_rules):
     a = AttributeRuler(nlp.vocab)
     a.load_from_morph_rules(morph_rules)
-    doc = get_doc(
+    doc = Doc(
         nlp.vocab,
         words=["This", "is", "the", "test", "."],
         tags=["DT", "VBZ", "DT", "NN", "."],
     )
     doc = a(doc)
-
     for i in range(len(doc)):
         if i != 2:
             assert doc[i].pos_ == ""
@@ -193,7 +191,6 @@ def test_attributeruler_indices(nlp):
 
     text = "This is a test."
     doc = nlp(text)
-
     for i in range(len(doc)):
         if i == 1:
             assert doc[i].lemma_ == "was"
@@ -205,12 +202,10 @@ def test_attributeruler_indices(nlp):
             assert doc[i].lemma_ == "cat"
         else:
             assert doc[i].morph_ == ""
-
     # raises an error when trying to modify a token outside of the match
     a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
     with pytest.raises(ValueError):
         doc = nlp(text)
-
     # raises an error when trying to modify a token outside of the match
     a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=10)
     with pytest.raises(ValueError):
@@ -220,7 +215,6 @@ def test_attributeruler_indices(nlp):
 def test_attributeruler_patterns_prop(nlp, pattern_dicts):
     a = nlp.add_pipe("attribute_ruler")
     a.add_patterns(pattern_dicts)
-
     for p1, p2 in zip(pattern_dicts, a.patterns):
         assert p1["patterns"] == p2["patterns"]
         assert p1["attrs"] == p2["attrs"]
@@ -231,18 +225,15 @@ def test_attributeruler_patterns_prop(nlp, pattern_dicts):
 def test_attributeruler_serialize(nlp, pattern_dicts):
     a = nlp.add_pipe("attribute_ruler")
     a.add_patterns(pattern_dicts)
-
     text = "This is a test."
     attrs = ["ORTH", "LEMMA", "MORPH"]
     doc = nlp(text)
-
     # bytes roundtrip
     a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())
     assert a.to_bytes() == a_reloaded.to_bytes()
     doc1 = a_reloaded(nlp.make_doc(text))
     numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))
     assert a.patterns == a_reloaded.patterns
-
     # disk roundtrip
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py
index ee9e34df3..025ac04af 100644
--- a/spacy/tests/pipeline/test_functions.py
+++ b/spacy/tests/pipeline/test_functions.py
@@ -1,57 +1,38 @@
 import pytest
 from spacy.pipeline.functions import merge_subtokens
 from spacy.language import Language
-from spacy.tokens import Span
-
-from ..util import get_doc
+from spacy.tokens import Span, Doc
 
 
 @pytest.fixture
-def doc(en_tokenizer):
+def doc(en_vocab):
     # fmt: off
-    text = "This is a sentence. This is another sentence. And a third."
-    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 1, 1, 1, 0]
+    words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "sentence", ".", "And", "a", "third", "."]
+    heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 11, 12, 13, 13]
     deps = ["nsubj", "ROOT", "subtok", "attr", "punct", "nsubj", "ROOT",
             "subtok", "attr", "punct", "subtok", "subtok", "subtok", "ROOT"]
     # fmt: on
-    tokens = en_tokenizer(text)
-    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    return Doc(en_vocab, words=words, heads=heads, deps=deps)
 
 
 @pytest.fixture
-def doc2(en_tokenizer):
-    text = "I like New York in Autumn."
-    heads = [1, 0, 1, -2, -3, -1, -5]
+def doc2(en_vocab):
+    words = ["I", "like", "New", "York", "in", "Autumn", "."]
+    heads = [1, 1, 3, 1, 1, 4, 1]
     tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."]
     pos = ["PRON", "VERB", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
     deps = ["ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
-    tokens = en_tokenizer(text)
-    doc = get_doc(
-        tokens.vocab,
-        words=[t.text for t in tokens],
-        heads=heads,
-        tags=tags,
-        pos=pos,
-        deps=deps,
-    )
-    doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])]
+    doc = Doc(en_vocab, words=words, heads=heads, tags=tags, pos=pos, deps=deps)
+    doc.ents = [Span(doc, 2, 4, label="GPE")]
     return doc
 
 
 def test_merge_subtokens(doc):
     doc = merge_subtokens(doc)
-    # get_doc() doesn't set spaces, so the result is "And a third ."
-    assert [t.text for t in doc] == [
-        "This",
-        "is",
-        "a sentence",
-        ".",
-        "This",
-        "is",
-        "another sentence",
-        ".",
-        "And a third .",
-    ]
+    # No `spaces` are passed to Doc, so every token gets a trailing space and the merged text is "And a third ."
+    # fmt: off
+    assert [t.text for t in doc] == ["This", "is", "a sentence", ".", "This", "is", "another sentence", ".", "And a third ."]
+    # fmt: on
 
 
 def test_factories_merge_noun_chunks(doc2):
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index 30f66fb1d..d841ee24b 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -9,7 +9,7 @@ from spacy.lang.en import English
 from spacy.lookups import Lookups
 from spacy.tokens import Doc, Span
 
-from ..util import get_doc, make_tempdir
+from ..util import make_tempdir
 
 
 @pytest.mark.parametrize(
@@ -88,12 +88,9 @@ def test_issue242(en_tokenizer):
         doc.ents += tuple(matches)
 
 
-def test_issue309(en_tokenizer):
+def test_issue309(en_vocab):
     """Test Issue #309: SBD fails on empty string"""
-    tokens = en_tokenizer(" ")
-    doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
-    )
+    doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"])
     assert len(doc) == 1
     sents = list(doc.sents)
     assert len(sents) == 1
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 71ed2ea03..dce3e8298 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -14,7 +14,7 @@ from spacy.tokens import Doc, Span, Token
 from spacy.attrs import HEAD, DEP
 from spacy.matcher import Matcher
 
-from ..util import make_tempdir, get_doc
+from ..util import make_tempdir
 
 
 def test_issue1506():
@@ -197,32 +197,21 @@ def test_issue1807():
 def test_issue1834():
     """Test that sentence boundaries & parse/tag flags are not lost
     during serialization."""
-    string = "This is a first sentence . And another one"
-    words = string.split()
-    doc = get_doc(Vocab(), words=words)
+    words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"]
+    doc = Doc(Vocab(), words=words)
     doc[6].is_sent_start = True
     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
     assert new_doc[6].sent_start
     assert not new_doc.has_annotation("DEP")
     assert not new_doc.has_annotation("TAG")
-    doc = get_doc(
+    doc = Doc(
         Vocab(),
         words=words,
         tags=["TAG"] * len(words),
-        heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
+        heads=[0, 0, 0, 0, 0, 0, 6, 6, 6],
         deps=["dep"] * len(words),
     )
-    print(
-        doc.has_annotation("DEP"),
-        [t.head.i for t in doc],
-        [t.is_sent_start for t in doc],
-    )
     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
-    print(
-        new_doc.has_annotation("DEP"),
-        [t.head.i for t in new_doc],
-        [t.is_sent_start for t in new_doc],
-    )
     assert new_doc[6].sent_start
     assert new_doc.has_annotation("DEP")
     assert new_doc.has_annotation("TAG")
diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py
index 3bea5d3f6..c4c755153 100644
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ b/spacy/tests/regression/test_issue2001-2500.py
@@ -7,7 +7,7 @@ from spacy.training import iob_to_biluo
 from spacy.lang.it import Italian
 from spacy.lang.en import English
 
-from ..util import add_vecs_to_vocab, get_doc
+from ..util import add_vecs_to_vocab
 
 
 @pytest.mark.skip(
@@ -69,9 +69,10 @@ def test_issue2219(en_vocab):
     assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
 
 
-def test_issue2361(de_tokenizer):
+def test_issue2361(de_vocab):
     chars = ("&lt;", "&gt;", "&amp;", "&quot;")
-    doc = de_tokenizer('< > & " ')
+    words = ["<", ">", "&", '"']
+    doc = Doc(de_vocab, words=words, deps=["dep"] * len(words))
     html = render(doc)
     for char in chars:
         assert char in html
@@ -105,7 +106,7 @@ def test_issue2385_biluo(tags):
 
 def test_issue2396(en_vocab):
     words = ["She", "created", "a", "test", "for", "spacy"]
-    heads = [1, 0, 1, -2, -1, -1]
+    heads = [1, 1, 3, 1, 3, 4]
     deps = ["dep"] * len(heads)
     matrix = numpy.array(
         [
@@ -118,7 +119,7 @@ def test_issue2396(en_vocab):
         ],
         dtype=numpy.int32,
     )
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     span = doc[:]
     assert (doc.get_lca_matrix() == matrix).all()
     assert (span.get_lca_matrix() == matrix).all()
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index 9267a7346..5895b616e 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -12,8 +12,6 @@ from spacy.compat import pickle
 import numpy
 import random
 
-from ..util import get_doc
-
 
 def test_issue2564():
     """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
@@ -117,12 +115,14 @@ def test_issue2754(en_tokenizer):
 
 def test_issue2772(en_vocab):
     """Test that deprojectivization doesn't mess up sentence boundaries."""
-    words = "When we write or communicate virtually , we can hide our true feelings .".split()
+    # fmt: off
+    words = ["When", "we", "write", "or", "communicate", "virtually", ",", "we", "can", "hide", "our", "true", "feelings", "."]
+    # fmt: on
     # A tree with a non-projective (i.e. crossing) arc
     # The arcs (0, 4) and (2, 9) cross.
-    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
+    heads = [4, 2, 9, 2, 2, 4, 9, 9, 9, 9, 12, 12, 9, 9]
     deps = ["dep"] * len(heads)
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert doc[1].is_sent_start is False
 
 
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index d848467dd..a64dc53e4 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -10,10 +10,8 @@ from spacy.vocab import Vocab
 from spacy.attrs import ENT_IOB, ENT_TYPE
 from spacy.compat import pickle
 from spacy import displacy
-import numpy
-
 from spacy.vectors import Vectors
-from ..util import get_doc
+import numpy
 
 
 def test_issue3002():
@@ -47,7 +45,7 @@ def test_issue3009(en_vocab):
     words = ["also", "has", "to", "do", "with"]
     tags = ["RB", "VBZ", "TO", "VB", "IN"]
     pos = ["ADV", "VERB", "ADP", "VERB", "ADP"]
-    doc = get_doc(en_vocab, words=words, tags=tags, pos=pos)
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos)
     matcher = Matcher(en_vocab)
     for i, pattern in enumerate(patterns):
         matcher.add(str(i), [pattern])
@@ -61,19 +59,15 @@ def test_issue3012(en_vocab):
     words = ["This", "is", "10", "%", "."]
     tags = ["DT", "VBZ", "CD", "NN", "."]
     pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
-    ents = [(2, 4, "PERCENT")]
-    doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
+    ents = [("PERCENT", 2, 4)]
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
     assert doc.has_annotation("TAG")
-
     expected = ("10", "NUM", "CD", "PERCENT")
     assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
-
     header = [ENT_IOB, ENT_TYPE]
     ent_array = doc.to_array(header)
     doc.from_array(header, ent_array)
-
     assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
-
     # Serializing then deserializing
     doc_bytes = doc.to_bytes()
     doc2 = Doc(en_vocab).from_bytes(doc_bytes)
@@ -85,12 +79,8 @@ def test_issue3199():
     is available. To make this test future-proof, we're constructing a Doc
     with a new Vocab here and a parse tree to make sure the noun chunks run.
     """
-    doc = get_doc(
-        Vocab(),
-        words=["This", "is", "a", "sentence"],
-        heads=[0, -1, -2, -3],
-        deps=["dep"] * 4,
-    )
+    words = ["This", "is", "a", "sentence"]
+    doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
     assert list(doc[0:3].noun_chunks) == []
 
 
@@ -147,9 +137,9 @@ def test_issue3288(en_vocab):
     """Test that retokenization works correctly via displaCy when punctuation
     is merged onto the preceeding token and tensor is resized."""
     words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
-    heads = [1, 0, -1, 1, 0, 1, -2, -3]
+    heads = [1, 1, 1, 4, 4, 6, 4, 4]
     deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
     displacy.render(doc)
 
diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py
index 8c483d877..a79be6638 100644
--- a/spacy/tests/regression/test_issue3501-4000.py
+++ b/spacy/tests/regression/test_issue3501-4000.py
@@ -20,7 +20,7 @@ import spacy
 import srsly
 import numpy
 
-from ..util import make_tempdir, get_doc
+from ..util import make_tempdir
 
 
 @pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
@@ -355,7 +355,7 @@ def test_issue3882(en_vocab):
     """Test that displaCy doesn't serialize the doc.user_data when making a
     copy of the Doc.
     """
-    doc = Doc(en_vocab, words=["Hello", "world"])
+    doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"])
     doc.user_data["test"] = set()
     parse_deps(doc)
 
@@ -398,10 +398,10 @@ def test_issue3962(en_vocab):
     This is achieved by setting the head to itself if it would lie out of the span otherwise."""
     # fmt: off
     words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
-    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
+    heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7]
     deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
     # fmt: on
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     span2 = doc[1:5]  # "jests at scars ,"
     doc2 = span2.as_doc()
     doc2_json = doc2.to_json()
@@ -436,10 +436,10 @@ def test_issue3962_long(en_vocab):
     This is achieved by setting the head to itself if it would lie out of the span otherwise."""
     # fmt: off
     words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
-    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
+    heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7]
     deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
     # fmt: on
-    two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     span2 = two_sent_doc[1:7]  # "jests at scars. They never"
     doc2 = span2.as_doc()
     doc2_json = doc2.to_json()
diff --git a/spacy/tests/regression/test_issue5001-5500.py b/spacy/tests/regression/test_issue5001-5500.py
new file mode 100644
index 000000000..dbfe78679
--- /dev/null
+++ b/spacy/tests/regression/test_issue5001-5500.py
@@ -0,0 +1,138 @@
+import numpy
+from spacy.tokens import Doc, DocBin
+from spacy.attrs import DEP, POS, TAG
+from spacy.lang.en import English
+from spacy.language import Language
+from spacy.lang.en.syntax_iterators import noun_chunks
+from spacy.vocab import Vocab
+import spacy
+import pytest
+
+from ...util import make_tempdir
+
+
+def test_issue5048(en_vocab):
+    words = ["This", "is", "a", "sentence"]
+    pos_s = ["DET", "VERB", "DET", "NOUN"]
+    spaces = [" ", " ", " ", ""]
+    deps_s = ["dep", "adj", "nn", "atm"]
+    tags_s = ["DT", "VBZ", "DT", "NN"]
+    strings = en_vocab.strings
+    for w in words:
+        strings.add(w)
+    deps = [strings.add(d) for d in deps_s]
+    pos = [strings.add(p) for p in pos_s]
+    tags = [strings.add(t) for t in tags_s]
+    attrs = [POS, DEP, TAG]
+    array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    doc.from_array(attrs, array)
+    v1 = [(token.text, token.pos_, token.tag_) for token in doc]
+    doc2 = Doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
+    v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
+    assert v1 == v2
+
+
+def test_issue5082():
+    # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens
+    nlp = English()
+    vocab = nlp.vocab
+    array1 = numpy.asarray([0.1, 0.5, 0.8], dtype=numpy.float32)
+    array2 = numpy.asarray([-0.2, -0.6, -0.9], dtype=numpy.float32)
+    array3 = numpy.asarray([0.3, -0.1, 0.7], dtype=numpy.float32)
+    array4 = numpy.asarray([0.5, 0, 0.3], dtype=numpy.float32)
+    array34 = numpy.asarray([0.4, -0.05, 0.5], dtype=numpy.float32)
+    vocab.set_vector("I", array1)
+    vocab.set_vector("like", array2)
+    vocab.set_vector("David", array3)
+    vocab.set_vector("Bowie", array4)
+    text = "I like David Bowie"
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]}
+    ]
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
+    parsed_vectors_1 = [t.vector for t in nlp(text)]
+    assert len(parsed_vectors_1) == 4
+    numpy.testing.assert_array_equal(parsed_vectors_1[0], array1)
+    numpy.testing.assert_array_equal(parsed_vectors_1[1], array2)
+    numpy.testing.assert_array_equal(parsed_vectors_1[2], array3)
+    numpy.testing.assert_array_equal(parsed_vectors_1[3], array4)
+    nlp.add_pipe("merge_entities")
+    parsed_vectors_2 = [t.vector for t in nlp(text)]
+    assert len(parsed_vectors_2) == 3
+    numpy.testing.assert_array_equal(parsed_vectors_2[0], array1)
+    numpy.testing.assert_array_equal(parsed_vectors_2[1], array2)
+    numpy.testing.assert_array_equal(parsed_vectors_2[2], array34)
+
+
+def test_issue5137():
+    @Language.factory("my_component")
+    class MyComponent:
+        def __init__(self, nlp, name="my_component", categories="all_categories"):
+            self.nlp = nlp
+            self.categories = categories
+            self.name = name
+
+        def __call__(self, doc):
+            pass
+
+        def to_disk(self, path, **kwargs):
+            pass
+
+        def from_disk(self, path, **cfg):
+            pass
+
+    nlp = English()
+    my_component = nlp.add_pipe("my_component")
+    assert my_component.categories == "all_categories"
+    with make_tempdir() as tmpdir:
+        nlp.to_disk(tmpdir)
+        overrides = {"components": {"my_component": {"categories": "my_categories"}}}
+        nlp2 = spacy.load(tmpdir, config=overrides)
+        assert nlp2.get_pipe("my_component").categories == "my_categories"
+
+
+def test_issue5141(en_vocab):
+    """ Ensure an empty DocBin does not crash on serialization """
+    doc_bin = DocBin(attrs=["DEP", "HEAD"])
+    assert list(doc_bin.get_docs(en_vocab)) == []
+    doc_bin_bytes = doc_bin.to_bytes()
+    doc_bin_2 = DocBin().from_bytes(doc_bin_bytes)
+    assert list(doc_bin_2.get_docs(en_vocab)) == []
+
+
+def test_issue5152():
+    # Test that the comparison between a Span and a Token goes well
+    # There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
+    nlp = English()
+    text = nlp("Talk about being boring!")
+    text_var = nlp("Talk of being boring!")
+    y = nlp("Let")
+    span = text[0:3]  # Talk about being
+    span_2 = text[0:3]  # Talk about being
+    span_3 = text_var[0:3]  # Talk of being
+    token = y[0]  # Let
+    with pytest.warns(UserWarning):
+        assert span.similarity(token) == 0.0
+    assert span.similarity(span_2) == 1.0
+    with pytest.warns(UserWarning):
+        assert span_2.similarity(span_3) < 1.0
+
+
+def test_issue5458():
+    # Test that the noun chunker does not generate overlapping spans
+    # fmt: off
+    words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
+    vocab = Vocab(strings=words)
+    deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
+    pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
+    heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
+    # fmt: on
+    en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps)
+    en_doc.noun_chunks_iterator = noun_chunks
+
+    # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
+    nlp = English()
+    merge_nps = nlp.create_pipe("merge_noun_chunks")
+    merge_nps(en_doc)
diff --git a/spacy/tests/regression/test_issue5048.py b/spacy/tests/regression/test_issue5048.py
deleted file mode 100644
index bc52ae82f..000000000
--- a/spacy/tests/regression/test_issue5048.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import numpy
-from spacy.tokens import Doc
-from spacy.attrs import DEP, POS, TAG
-
-from ..util import get_doc
-
-
-def test_issue5048(en_vocab):
-    words = ["This", "is", "a", "sentence"]
-    pos_s = ["DET", "VERB", "DET", "NOUN"]
-    spaces = [" ", " ", " ", ""]
-    deps_s = ["dep", "adj", "nn", "atm"]
-    tags_s = ["DT", "VBZ", "DT", "NN"]
-
-    strings = en_vocab.strings
-
-    for w in words:
-        strings.add(w)
-    deps = [strings.add(d) for d in deps_s]
-    pos = [strings.add(p) for p in pos_s]
-    tags = [strings.add(t) for t in tags_s]
-
-    attrs = [POS, DEP, TAG]
-    array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")
-
-    doc = Doc(en_vocab, words=words, spaces=spaces)
-    doc.from_array(attrs, array)
-    v1 = [(token.text, token.pos_, token.tag_) for token in doc]
-
-    doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
-    v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
-    assert v1 == v2
diff --git a/spacy/tests/regression/test_issue5082.py b/spacy/tests/regression/test_issue5082.py
deleted file mode 100644
index 76f3a552e..000000000
--- a/spacy/tests/regression/test_issue5082.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import numpy as np
-from spacy.lang.en import English
-
-
-def test_issue5082():
-    # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens
-    nlp = English()
-    vocab = nlp.vocab
-    array1 = np.asarray([0.1, 0.5, 0.8], dtype=np.float32)
-    array2 = np.asarray([-0.2, -0.6, -0.9], dtype=np.float32)
-    array3 = np.asarray([0.3, -0.1, 0.7], dtype=np.float32)
-    array4 = np.asarray([0.5, 0, 0.3], dtype=np.float32)
-    array34 = np.asarray([0.4, -0.05, 0.5], dtype=np.float32)
-
-    vocab.set_vector("I", array1)
-    vocab.set_vector("like", array2)
-    vocab.set_vector("David", array3)
-    vocab.set_vector("Bowie", array4)
-
-    text = "I like David Bowie"
-    patterns = [
-        {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]}
-    ]
-    ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)
-    parsed_vectors_1 = [t.vector for t in nlp(text)]
-    assert len(parsed_vectors_1) == 4
-    np.testing.assert_array_equal(parsed_vectors_1[0], array1)
-    np.testing.assert_array_equal(parsed_vectors_1[1], array2)
-    np.testing.assert_array_equal(parsed_vectors_1[2], array3)
-    np.testing.assert_array_equal(parsed_vectors_1[3], array4)
-    nlp.add_pipe("merge_entities")
-    parsed_vectors_2 = [t.vector for t in nlp(text)]
-    assert len(parsed_vectors_2) == 3
-    np.testing.assert_array_equal(parsed_vectors_2[0], array1)
-    np.testing.assert_array_equal(parsed_vectors_2[1], array2)
-    np.testing.assert_array_equal(parsed_vectors_2[2], array34)
diff --git a/spacy/tests/regression/test_issue5137.py b/spacy/tests/regression/test_issue5137.py
deleted file mode 100644
index cc7a9bd38..000000000
--- a/spacy/tests/regression/test_issue5137.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import spacy
-from spacy.language import Language
-from spacy.lang.en import English
-from spacy.tests.util import make_tempdir
-
-
-def test_issue5137():
-    @Language.factory("my_component")
-    class MyComponent:
-        def __init__(self, nlp, name="my_component", categories="all_categories"):
-            self.nlp = nlp
-            self.categories = categories
-            self.name = name
-
-        def __call__(self, doc):
-            pass
-
-        def to_disk(self, path, **kwargs):
-            pass
-
-        def from_disk(self, path, **cfg):
-            pass
-
-    nlp = English()
-    my_component = nlp.add_pipe("my_component")
-    assert my_component.categories == "all_categories"
-
-    with make_tempdir() as tmpdir:
-        nlp.to_disk(tmpdir)
-        overrides = {"components": {"my_component": {"categories": "my_categories"}}}
-        nlp2 = spacy.load(tmpdir, config=overrides)
-        assert nlp2.get_pipe("my_component").categories == "my_categories"
diff --git a/spacy/tests/regression/test_issue5141.py b/spacy/tests/regression/test_issue5141.py
deleted file mode 100644
index 845454583..000000000
--- a/spacy/tests/regression/test_issue5141.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from spacy.tokens import DocBin
-
-
-def test_issue5141(en_vocab):
-    """ Ensure an empty DocBin does not crash on serialization """
-    doc_bin = DocBin(attrs=["DEP", "HEAD"])
-    assert list(doc_bin.get_docs(en_vocab)) == []
-    doc_bin_bytes = doc_bin.to_bytes()
-
-    doc_bin_2 = DocBin().from_bytes(doc_bin_bytes)
-    assert list(doc_bin_2.get_docs(en_vocab)) == []
diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py
deleted file mode 100644
index c7a70a99c..000000000
--- a/spacy/tests/regression/test_issue5152.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from spacy.lang.en import English
-import pytest
-
-
-def test_issue5152():
-    # Test that the comparison between a Span and a Token, goes well
-    # There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
-    nlp = English()
-    text = nlp("Talk about being boring!")
-    text_var = nlp("Talk of being boring!")
-    y = nlp("Let")
-    span = text[0:3]  # Talk about being
-    span_2 = text[0:3]  # Talk about being
-    span_3 = text_var[0:3]  # Talk of being
-    token = y[0]  # Let
-    with pytest.warns(UserWarning):
-        assert span.similarity(token) == 0.0
-    assert span.similarity(span_2) == 1.0
-    with pytest.warns(UserWarning):
-        assert span_2.similarity(span_3) < 1.0
diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py
deleted file mode 100644
index a7a2959df..000000000
--- a/spacy/tests/regression/test_issue5458.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from spacy.lang.en import English
-from spacy.lang.en.syntax_iterators import noun_chunks
-from spacy.tests.util import get_doc
-from spacy.vocab import Vocab
-
-
-def test_issue5458():
-    # Test that the noun chuncker does not generate overlapping spans
-    # fmt: off
-    words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
-    vocab = Vocab(strings=words)
-    dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
-    pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
-    heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10]
-    # fmt: on
-
-    en_doc = get_doc(vocab, words, pos_tags, heads, dependencies)
-    en_doc.noun_chunks_iterator = noun_chunks
-
-    # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
-    nlp = English()
-    merge_nps = nlp.create_pipe("merge_noun_chunks")
-    merge_nps(en_doc)
diff --git a/spacy/tests/regression/test_issue5918.py b/spacy/tests/regression/test_issue5918.py
index 66280f012..db957709c 100644
--- a/spacy/tests/regression/test_issue5918.py
+++ b/spacy/tests/regression/test_issue5918.py
@@ -1,5 +1,6 @@
 from spacy.lang.en import English
 from spacy.pipeline import merge_entities
+import pytest
 
 
 def test_issue5918():
@@ -22,6 +23,7 @@ def test_issue5918():
     assert len(doc.ents) == 3
     # make it so that the third span's head is within the entity (ent_iob=I)
     # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents.
-    doc[29].head = doc[33]
+    with pytest.warns(UserWarning):
+        doc[29].head = doc[33]
     doc = merge_entities(doc)
     assert len(doc.ents) == 3
diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py
index 1fa0eeaa1..040dd657f 100644
--- a/spacy/tests/test_displacy.py
+++ b/spacy/tests/test_displacy.py
@@ -1,15 +1,13 @@
 import pytest
 from spacy import displacy
 from spacy.displacy.render import DependencyRenderer, EntityRenderer
-from spacy.tokens import Span
+from spacy.tokens import Span, Doc
 from spacy.lang.fa import Persian
 
-from .util import get_doc
-
 
 def test_displacy_parse_ents(en_vocab):
     """Test that named entities on a Doc are converted into displaCy's format."""
-    doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
+    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
     doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
     ents = displacy.parse_ents(doc)
     assert isinstance(ents, dict)
@@ -20,11 +18,11 @@ def test_displacy_parse_ents(en_vocab):
 def test_displacy_parse_deps(en_vocab):
     """Test that deps and tags on a Doc are converted into displaCy's format."""
     words = ["This", "is", "a", "sentence"]
-    heads = [1, 0, 1, -2]
+    heads = [1, 1, 3, 1]
     pos = ["DET", "VERB", "DET", "NOUN"]
     tags = ["DT", "VBZ", "DT", "NN"]
     deps = ["nsubj", "ROOT", "det", "attr"]
-    doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps)
     deps = displacy.parse_deps(doc)
     assert isinstance(deps, dict)
     assert deps["words"] == [
@@ -53,7 +51,7 @@ def test_displacy_invalid_arcs():
 
 def test_displacy_spans(en_vocab):
     """Test that displaCy can render Spans."""
-    doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
+    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
     doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
     html = displacy.render(doc[1:4], style="ent")
     assert html.startswith("<div")
@@ -70,9 +68,9 @@ def test_displacy_rtl():
     # These are (likely) wrong, but it's just for testing
     pos = ["PRO", "ADV", "N_PL", "V_SUB"]  # needs to match lang.fa.tag_map
     deps = ["foo", "bar", "foo", "baz"]
-    heads = [1, 0, 1, -2]
+    heads = [1, 0, 3, 1]
     nlp = Persian()
-    doc = get_doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
+    doc = Doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
     doc.ents = [Span(doc, 1, 3, label="TEST")]
     html = displacy.render(doc, page=True, style="dep")
     assert "direction: rtl" in html
@@ -90,7 +88,7 @@ def test_displacy_render_wrapper(en_vocab):
         return "TEST" + html + "TEST"
 
     displacy.set_render_wrapper(wrapper)
-    doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
+    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
     doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
     html = displacy.render(doc, style="ent")
     assert html.startswith("TEST<div")
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 6e3604ce8..a1406c14a 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -5,7 +5,6 @@ from spacy.training import Example
 from spacy.training.iob_utils import biluo_tags_from_offsets
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
-from .util import get_doc
 from spacy.lang.en import English
 from spacy.tokens import Doc
 
@@ -137,11 +136,8 @@ def test_las_per_type(en_vocab):
     scorer = Scorer()
     examples = []
     for input_, annot in test_las_apple:
-        doc = get_doc(
-            en_vocab,
-            words=input_.split(" "),
-            heads=([h - i for i, h in enumerate(annot["heads"])]),
-            deps=annot["deps"],
+        doc = Doc(
+            en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"],
         )
         gold = {"heads": annot["heads"], "deps": annot["deps"]}
         example = Example.from_dict(doc, gold)
@@ -161,11 +157,8 @@ def test_las_per_type(en_vocab):
     scorer = Scorer()
     examples = []
     for input_, annot in test_las_apple:
-        doc = get_doc(
-            en_vocab,
-            words=input_.split(" "),
-            heads=([h - i for i, h in enumerate(annot["heads"])]),
-            deps=annot["deps"],
+        doc = Doc(
+            en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"],
         )
         gold = {"heads": annot["heads"], "deps": annot["deps"]}
         doc[0].dep_ = "compound"
@@ -188,10 +181,10 @@ def test_ner_per_type(en_vocab):
     scorer = Scorer()
     examples = []
     for input_, annot in test_ner_cardinal:
-        doc = get_doc(
+        doc = Doc(
             en_vocab,
             words=input_.split(" "),
-            ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
+            ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
         )
         entities = biluo_tags_from_offsets(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
@@ -213,10 +206,10 @@ def test_ner_per_type(en_vocab):
     scorer = Scorer()
     examples = []
     for input_, annot in test_ner_apple:
-        doc = get_doc(
+        doc = Doc(
             en_vocab,
             words=input_.split(" "),
-            ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
+            ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
         )
         entities = biluo_tags_from_offsets(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index b09487965..4cab5b015 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -12,13 +12,14 @@ from thinc.api import compounding
 import pytest
 import srsly
 
-from ..util import make_tempdir, get_doc
+from ..util import make_tempdir
 
 
 @pytest.fixture
-def doc():
+def doc(en_vocab):
+    nlp = English()  # make sure we get a new vocab every time
     # fmt: off
-    text = "Sarah's sister flew to Silicon Valley via London."
+    words = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
     tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
     pos = ["PROPN", "PART", "NOUN", "VERB", "ADP", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
     morphs = ["NounType=prop|Number=sing", "Poss=yes", "Number=sing", "Tense=past|VerbForm=fin",
@@ -26,15 +27,12 @@ def doc():
               "NounType=prop|Number=sing", "PunctType=peri"]
     # head of '.' is intentionally nonprojective for testing
     heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
-    heads = [head - i for i, head in enumerate(heads)]
     deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
     lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
-    ents = ((0, 2, "PERSON"), (5, 7, "LOC"), (8, 9, "GPE"))
+    ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9))
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}
     # fmt: on
-    nlp = English()
-    words = [t.text for t in nlp.make_doc(text)]
-    doc = get_doc(
+    doc = Doc(
         nlp.vocab,
         words=words,
         tags=tags,
@@ -212,41 +210,24 @@ def test_json2docs_no_ner(en_vocab):
 
 
 def test_split_sentences(en_vocab):
+    # fmt: off
     words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"]
-    doc = Doc(en_vocab, words=words)
-    gold_words = [
-        "I",
-        "flew",
-        "to",
-        "San",
-        "Francisco",
-        "Valley",
-        "had",
-        "loads",
-        "of",
-        "fun",
-    ]
+    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of", "fun"]
     sent_starts = [True, False, False, False, False, False, True, False, False, False]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
     example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
     assert example.text == "I flew to San Francisco Valley had loads of fun "
     split_examples = example.split_sents()
     assert len(split_examples) == 2
     assert split_examples[0].text == "I flew to San Francisco Valley "
     assert split_examples[1].text == "had loads of fun "
-
+    # fmt: off
     words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of fun"]
-    doc = Doc(en_vocab, words=words)
-    gold_words = [
-        "I",
-        "flew",
-        "to",
-        "San Francisco",
-        "Valley",
-        "had",
-        "loads of",
-        "fun",
-    ]
+    gold_words = ["I", "flew", "to", "San Francisco", "Valley", "had", "loads of", "fun"]
     sent_starts = [True, False, False, False, False, True, False, False]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
     example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
     assert example.text == "I flew to San Francisco Valley had loads of fun "
     split_examples = example.split_sents()
@@ -479,7 +460,6 @@ def test_roundtrip_docs_to_docbin(doc):
     heads = [t.head.i for t in doc]
     cats = doc.cats
     ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
-
     # roundtrip to DocBin
     with make_tempdir() as tmpdir:
         # use a separate vocab to test that all labels are added
@@ -600,7 +580,6 @@ def test_tuple_format_implicit():
 
 def test_tuple_format_implicit_invalid():
     """Test that an error is thrown for an implicit invalid field"""
-
     train_data = [
         ("Uber blew through $1 million a week", {"frumble": [(0, 4, "ORG")]}),
         (
@@ -609,7 +588,6 @@ def test_tuple_format_implicit_invalid():
         ),
         ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
     ]
-
     with pytest.raises(KeyError):
         _train_tuples(train_data)
 
@@ -619,11 +597,9 @@ def _train_tuples(train_data):
     ner = nlp.add_pipe("ner")
     ner.add_label("ORG")
     ner.add_label("LOC")
-
     train_examples = []
     for t in train_data:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-
     optimizer = nlp.begin_training()
     for i in range(5):
         losses = {}
@@ -639,17 +615,14 @@ def test_split_sents(merged_dict):
         merged_dict,
     )
     assert example.text == "Hi there everyone It is just me"
-
     split_examples = example.split_sents()
     assert len(split_examples) == 2
     assert split_examples[0].text == "Hi there everyone "
     assert split_examples[1].text == "It is just me"
-
     token_annotation_1 = split_examples[0].to_dict()["token_annotation"]
     assert token_annotation_1["ORTH"] == ["Hi", "there", "everyone"]
     assert token_annotation_1["TAG"] == ["INTJ", "ADV", "PRON"]
     assert token_annotation_1["SENT_START"] == [1, 0, 0]
-
     token_annotation_2 = split_examples[1].to_dict()["token_annotation"]
     assert token_annotation_2["ORTH"] == ["It", "is", "just", "me"]
     assert token_annotation_2["TAG"] == ["PRON", "AUX", "ADV", "PRON"]
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index 6c67d2ee1..ef7b4d00d 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -2,11 +2,7 @@ import numpy
 import tempfile
 import contextlib
 import srsly
-
-from spacy import Errors
-from spacy.tokens import Doc, Span
-from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH
-
+from spacy.tokens import Doc
 from spacy.vocab import Vocab
 from spacy.util import make_tempdir  # noqa: F401
 
@@ -18,35 +14,6 @@ def make_tempfile(mode="r"):
     f.close()
 
 
-def get_doc(
-    vocab,
-    words=[],
-    pos=None,
-    heads=None,
-    deps=None,
-    tags=None,
-    ents=None,
-    lemmas=None,
-    morphs=None,
-):
-    """Create Doc object from given vocab, words and annotations."""
-    if heads is not None:
-        heads = [i + head for i, head in enumerate(heads)]
-    if ents is not None:
-        ents = [(vocab.strings[ent_type], start, end) for start, end, ent_type in ents]
-    return Doc(
-        vocab,
-        words=words,
-        pos=pos,
-        heads=heads,
-        deps=deps,
-        tags=tags,
-        ents=ents,
-        lemmas=lemmas,
-        morphs=morphs,
-    )
-
-
 def get_batch(batch_size):
     vocab = Vocab()
     docs = []
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index c5f1f6801..f81e4a96b 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -200,8 +200,8 @@ cdef class Doc:
         sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
             the same length as words, to assign as token.is_sent_start. Will be
             overridden by heads if heads is provided. Defaults to None.
-        ents (Optional[List[Span]]): A list of spans to assign as doc.ents.
-            Defaults to None.
+        ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of
+            (label, start, end) tuples to assign as doc.ents. Defaults to None.
 
         DOCS: https://nightly.spacy.io/api/doc#init
         """
@@ -665,7 +665,7 @@ cdef class Doc:
             cdef attr_t kb_id
             cdef int ent_start, ent_end
             for ent_info in ents:
-                entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info)
+                entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info, self.vocab)
                 for token_index in range(ent_start, ent_end):
                     if token_index in tokens_in_ents.keys():
                         raise ValueError(Errors.E103.format(
@@ -1583,7 +1583,7 @@ def fix_attributes(doc, attributes):
         attributes[ENT_TYPE] = attributes["ent_type"]
 
 
-def get_entity_info(ent_info):
+def get_entity_info(ent_info, vocab):
     if isinstance(ent_info, Span):
         ent_type = ent_info.label
         ent_kb_id = ent_info.kb_id
@@ -1596,4 +1596,6 @@ def get_entity_info(ent_info):
         ent_type, ent_kb_id, start, end = ent_info
     else:
         ent_id, ent_kb_id, ent_type, start, end = ent_info
+    if isinstance(ent_type, str):
+        ent_type = vocab.strings.add(ent_type)
     return ent_type, ent_kb_id, start, end
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 3344704bf..371b4a06a 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -172,7 +172,7 @@ cdef class Example:
         return output
 
     def get_aligned_ner(self):
-        if not self.y.is_nered:
+        if not self.y.has_annotation("ENT_IOB"):
             return [None] * len(self.x)  # should this be 'missing' instead of 'None' ?
         x_ents = self.get_aligned_spans_y2x(self.y.ents)
         # Default to 'None' for missing values
@@ -221,7 +221,7 @@ cdef class Example:
     def split_sents(self):
         """ Split the token annotations into multiple Examples based on
         sent_starts and return a list of the new Examples"""
-        if not self.reference.is_sentenced:
+        if not self.reference.has_annotation("SENT_START"):
             return [self]
 
         align = self.alignment.y2x
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 52f94a83d..648ade5f6 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -25,26 +25,27 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 >
 > # Construction 2
 > from spacy.tokens import Doc
+>
 > words = ["hello", "world", "!"]
 > spaces = [True, False, False]
 > doc = Doc(nlp.vocab, words=words, spaces=spaces)
 > ```
 
-| Name           | Description                                                                                                                                                                                    |
-| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`        | A storage container for lexical types. ~~Vocab~~                                                                                                                                               |
-| `words`        | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                             |
-| `spaces`       | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~   |
-| _keyword-only_ |                                                                                                                                                                                                |
-| `user\_data`   | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                             |
-| tags           | A list of strings, of the same length as words, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
-| pos            | A list of strings, of the same length as words, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
-| morphs         | A list of strings, of the same length as words, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                          |
-| lemmas         | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                          |
-| heads          | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ |
-| deps           | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
-| sent_starts    | A list of values, of the same length as words, to assign as token.is_sent_start. Will be overridden by heads if heads is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~    |
-| ents           | A list of spans to assign as doc.ents. Defaults to `None`. ~~Optional[List[Span]]~~                                                                                                            |
+| Name                                     | Description                                                                                                                                                                                                       |
+| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`                                  | A storage container for lexical types. ~~Vocab~~                                                                                                                                                                  |
+| `words`                                  | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                                                |
+| `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~                      |
+| _keyword-only_                           |                                                                                                                                                                                                                   |
+| `user\_data`                             | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                                                |
+| `tags` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                             |
+| `pos` <Tag variant="new">3</Tag>         | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                             |
+| `morphs` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                           |
+| `lemmas` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                           |
+| `heads` <Tag variant="new">3</Tag>       | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~                |
+| `deps` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                             |
+| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~                |
+| `ents` <Tag variant="new">3</Tag>        | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ |
 
 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
 
@@ -281,6 +282,19 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
 
 Check whether the doc contains annotation on a token attribute.
 
+<Infobox title="Changed in v3.0" variant="warning">
+
+This method replaces the previous boolean attributes like `Doc.is_tagged`,
+`Doc.is_parsed` or `Doc.is_sentenced`.
+
+```diff
+doc = nlp("This is a text")
+- assert doc.is_parsed
++ assert doc.has_annotation("DEP")
+```
+
+</Infobox>
+
 | Name               | Description                                                                                         |
 | ------------------ | --------------------------------------------------------------------------------------------------- |
 | `attr`             | The attribute string name or int ID. ~~Union[int, str]~~                                            |
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 5abeb5707..406ba4b75 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -530,6 +530,8 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
   [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
   patterns as the second argument (instead of a variable number of arguments).
   The `on_match` callback becomes an optional keyword argument.
+- The `Doc` flags like `Doc.is_parsed` or `Doc.is_tagged` have been replaced by
+  [`Doc.has_annotation`](/api/doc#has_annotation).
 - The `spacy.gold` module has been renamed to
   [`spacy.training`](%%GITHUB_SPACY/spacy/training).
 - The `PRON_LEMMA` symbol and `-PRON-` as an indicator for pronoun lemmas has
@@ -807,10 +809,11 @@ nlp = spacy.blank("en")
 
 ### Migrating Doc flags {#migrating-doc-flags}
 
-The `Doc` flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and
-`Doc.is_sentenced` are deprecated in v3 and replaced by
+The [`Doc`](/api/doc) flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and
+`Doc.is_sentenced` are deprecated in v3.0 and replaced by the
 [`Doc.has_annotation`](/api/doc#has_annotation) method, which refers to the
-token attribute symbols (the same symbols used in `Matcher` patterns):
+token attribute symbols (the same symbols used in [`Matcher`](/api/matcher)
+patterns):
 
 ```diff
 doc = nlp(text)

From 3abc4a5adb9c29605de89ab984190f64d88190b4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 21 Sep 2020 22:58:03 +0200
Subject: [PATCH 079/516] Slightly tidy doc.ents.__set__

---
 spacy/tokens/doc.pyx | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index f81e4a96b..b82bab294 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -663,11 +663,14 @@ cdef class Doc:
             tokens_in_ents = {}
             cdef attr_t entity_type
             cdef attr_t kb_id
-            cdef int ent_start, ent_end
+            cdef int ent_start, ent_end, token_index
             for ent_info in ents:
-                entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info, self.vocab)
+                entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info)
+                if isinstance(entity_type_, str):
+                    self.vocab.strings.add(entity_type_)
+                entity_type = self.vocab.strings.as_int(entity_type_)
                 for token_index in range(ent_start, ent_end):
-                    if token_index in tokens_in_ents.keys():
+                    if token_index in tokens_in_ents:
                         raise ValueError(Errors.E103.format(
                             span1=(tokens_in_ents[token_index][0],
                                    tokens_in_ents[token_index][1],
@@ -1583,7 +1586,7 @@ def fix_attributes(doc, attributes):
         attributes[ENT_TYPE] = attributes["ent_type"]
 
 
-def get_entity_info(ent_info, vocab):
+def get_entity_info(ent_info):
     if isinstance(ent_info, Span):
         ent_type = ent_info.label
         ent_kb_id = ent_info.kb_id
@@ -1596,6 +1599,4 @@ def get_entity_info(ent_info, vocab):
         ent_type, ent_kb_id, start, end = ent_info
     else:
         ent_id, ent_kb_id, ent_type, start, end = ent_info
-    if isinstance(ent_type, str):
-        ent_type = vocab.strings.add(ent_type)
     return ent_type, ent_kb_id, start, end

From fa5c416db646b919153a362c02f842c7a19dbb9e Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 21 Sep 2020 23:09:22 +0200
Subject: [PATCH 080/516] initialize through nlp object and with train_corpus

---
 spacy/cli/debug_model.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 3d76cdbde..017bcd239 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -1,5 +1,9 @@
+import warnings
 from typing import Dict, Any, Optional, Iterable
 from pathlib import Path
+
+from spacy.training import Example
+from spacy.util import dot_to_object
 from wasabi import msg
 from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation, set_gpu_allocator
@@ -71,12 +75,10 @@ def debug_model_cli(
             exits=1,
         )
     model = pipe.model
-    # call _link_components directly as we won't call nlp.begin_training
-    nlp._link_components()
-    debug_model(nlp, model, print_settings=print_settings)
+    debug_model(config, nlp, model, print_settings=print_settings)
 
 
-def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
+def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
     if not isinstance(model, Model):
         msg.fail(
             f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
@@ -93,10 +95,21 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] =
 
     # STEP 1: Initializing the model and printing again
     X = _get_docs()
-    _set_output_dim(nO=7, model=model)
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
-        model.initialize(X=X)
+        # msg.info(f"Could not initialize the model with dummy data - using the train_corpus.")
+        try:
+            train_corpus = dot_to_object(config, config["training"]["train_corpus"])
+            nlp.begin_training(lambda: train_corpus(nlp))
+            msg.info("Initialized the model with the training corpus.")
+        except ValueError:
+            try:
+                _set_output_dim(nO=7, model=model)
+                nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
+                msg.info("Initialized the model with dummy data.")
+            except:
+                msg.fail("Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.", exits=1)
+
     if print_settings.get("print_after_init"):
         msg.divider(f"STEP 1 - after initialization")
         _print_model(model, print_settings)
@@ -114,8 +127,7 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] =
         if tok2vec:
             tok2vec.predict(X)
         Y, get_dX = model.begin_update(X)
-        # simulate a goldY value
-        if not goldY:
+        if goldY is None:
             goldY = _simulate_gold(Y)
         dY = get_gradient(goldY, Y, model.ops)
         get_dX(dY)

From 45b29c4a5b926c8f85b0a2ed4a9b8be13c5bf7eb Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 21 Sep 2020 23:17:23 +0200
Subject: [PATCH 081/516] cleanup

---
 spacy/cli/debug_model.py | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 017bcd239..1d27c7c52 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -78,7 +78,9 @@ def debug_model_cli(
     debug_model(config, nlp, model, print_settings=print_settings)
 
 
-def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
+def debug_model(
+    config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None
+):
     if not isinstance(model, Model):
         msg.fail(
             f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
@@ -97,7 +99,6 @@ def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str,
     X = _get_docs()
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
-        # msg.info(f"Could not initialize the model with dummy data - using the train_corpus.")
         try:
             train_corpus = dot_to_object(config, config["training"]["train_corpus"])
             nlp.begin_training(lambda: train_corpus(nlp))
@@ -108,7 +109,10 @@ def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str,
                 nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
                 msg.info("Initialized the model with dummy data.")
             except:
-                msg.fail("Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.", exits=1)
+                msg.fail(
+                    "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
+                    exits=1,
+                )
 
     if print_settings.get("print_after_init"):
         msg.divider(f"STEP 1 - after initialization")
@@ -121,7 +125,6 @@ def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str,
     tok2vec = None
     if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
         tok2vec = nlp.get_pipe("tok2vec")
-        tok2vec.model.initialize(X=X)
     goldY = None
     for e in range(3):
         if tok2vec:
@@ -145,17 +148,17 @@ def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str,
     msg.good(f"Succesfully ended analysis - model looks good.")
 
 
+def get_gradient(goldY, Y, ops):
+    return ops.asarray(Y) - ops.asarray(goldY)
+
+
 def _simulate_gold(element, counter=1):
     if isinstance(element, Iterable):
         for i in range(len(element)):
-            element[i] = _simulate_gold(element[i], counter+i)
+            element[i] = _simulate_gold(element[i], counter + i)
         return element
     else:
-        return 1/counter
-
-
-def get_gradient(goldY, Y, ops):
-    return ops.asarray(Y) - ops.asarray(goldY)
+        return 1 / counter
 
 
 def _sentences():
@@ -229,12 +232,3 @@ def _print_matrix(value):
     sample_matrix = sample_matrix[0:5]
     result = result + str(sample_matrix)
     return result
-
-
-def _set_output_dim(model, nO):
-    # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx
-    if model.has_dim("nO") is None:
-        model.set_dim("nO", nO)
-    if model.has_ref("output_layer"):
-        if model.get_ref("output_layer").has_dim("nO") is None:
-            model.get_ref("output_layer").set_dim("nO", nO)
\ No newline at end of file

From 69f7e52c26ef545fb9e39cd748666ae451318c77 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 22 Sep 2020 09:10:06 +0200
Subject: [PATCH 082/516] Update README.md

---
 spacy/tests/README.md | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/spacy/tests/README.md b/spacy/tests/README.md
index 86bbd52da..833dc9266 100644
--- a/spacy/tests/README.md
+++ b/spacy/tests/README.md
@@ -38,18 +38,17 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji #
 
 ## Dos and don'ts
 
-To keep the behaviour of the tests consistent and predictable, we try to follow a few basic conventions:
+To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions:
 
 - **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
 - If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
-- Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behaviour, use `assert not` in your test.
-- Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behaviour, consider adding an additional simpler version.
+- Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test.
+- Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behavior, consider adding an additional simpler version.
 - If tests require **loading the models**, they should be added to the [`spacy-models`](https://github.com/explosion/spacy-models) tests.
-- Before requiring the models, always make sure there is no other way to test the particular behaviour. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this.
+- Before requiring the models, always make sure there is no other way to test the particular behavior. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this.
 - **Avoid unnecessary imports.** There should never be a need to explicitly import spaCy at the top of a file, and many components are available as [fixtures](#fixtures). You should also avoid wildcard imports (`from module import *`).
 - If you're importing from spaCy, **always use absolute imports**. For example: `from spacy.language import Language`.
-- Don't forget the **unicode declarations** at the top of each file. This way, unicode strings won't have to be prefixed with `u`.
-- Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behaviour at a time.
+- Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behavior at a time.
 
 ## Parameters
 
@@ -77,7 +76,7 @@ To test for combinations of parameters, you can add several `parametrize` marker
 @pytest.mark.parametrize('punct', ['.', '!', '?'])
 ```
 
-This will run the test with all combinations of the two parameters `text` and `punct`. **Use this feature sparingly**, though, as it can easily cause unneccessary or undesired test bloat.
+This will run the test with all combinations of the two parameters `text` and `punct`. **Use this feature sparingly**, though, as it can easily cause unnecessary or undesired test bloat.
 
 ## Fixtures
 
@@ -104,9 +103,9 @@ If all tests in a file require a specific configuration, or use the same complex
 
 Our new test setup comes with a few handy utility functions that can be imported from [`util.py`](util.py).
 
-### Constructing a `Doc` object manually with
+### Constructing a `Doc` object manually
 
-Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need ia a `Doc` object with annotations like heads, POS tags or the dependency parse, you can construct it manually.
+Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need is a `Doc` object with annotations like heads, POS tags or the dependency parse, you can construct it manually.
 
 ```python
 def test_doc_token_api_strings(en_vocab):

From beb766d0a09509a7d91518e60c990489789978e0 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 22 Sep 2020 09:15:57 +0200
Subject: [PATCH 083/516] Add test

---
 spacy/tests/doc/test_doc_api.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 2c22926e9..163de5ab0 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -455,3 +455,16 @@ def test_is_flags_deprecated(en_tokenizer):
         doc.is_nered
     with pytest.deprecated_call():
         doc.is_sentenced
+
+
+def test_doc_set_ents():
+    """Test that both strings and integers can be used to set entities in
+    tuple format via doc.ents."""
+    words = ["a", "b", "c", "d", "e"]
+    doc = Doc(Vocab(), words=words)
+    doc.ents = [("HELLO", 0, 2), (doc.vocab.strings.add("WORLD"), 3, 5)]
+    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
+    vocab = Vocab()
+    ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
+    doc = Doc(vocab, words=words, ents=ents)
+    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]

From fc9c78da25202322c9ec042b529a6a3f91d48e4d Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 22 Sep 2020 09:23:47 +0200
Subject: [PATCH 084/516] Add MorphAnalysis to API sidebar

---
 website/meta/sidebars.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index e27817c92..28915ebb7 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -119,6 +119,7 @@
                     { "text": "Corpus", "url": "/api/corpus" },
                     { "text": "KnowledgeBase", "url": "/api/kb" },
                     { "text": "Lookups", "url": "/api/lookups" },
+                    { "text": "MorphAnalysis", "url": "/api/morphanalysis" },
                     { "text": "Morphology", "url": "/api/morphology" },
                     { "text": "Scorer", "url": "/api/scorer" },
                     { "text": "StringStore", "url": "/api/stringstore" },

From 844db6ff12441f63f51d4d9921cdaf4e6af61a04 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 22 Sep 2020 09:31:47 +0200
Subject: [PATCH 085/516] Update architecture overview

---
 website/docs/usage/101/_architecture.md | 32 ++++++++++++-------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md
index 98011f173..6e9120022 100644
--- a/website/docs/usage/101/_architecture.md
+++ b/website/docs/usage/101/_architecture.md
@@ -65,22 +65,22 @@ Matchers help you find and extract information from [`Doc`](/api/doc) objects
 based on match patterns describing the sequences you're looking for. A matcher
 operates on a `Doc` and gives you access to the matched tokens **in context**.
 
-| Name                                          | Description                                                                                                                                                                         |
-| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [`Matcher`](/api/matcher)                     | Match sequences of tokens, based on pattern rules, similar to regular expressions.                                                                                                  |
-| [`PhraseMatcher`](/api/phrasematcher)         | Match sequences of tokens based on phrases.                                                                                                                                         |
-| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using the [Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |
+| Name                                          | Description                                                                                                                                                                        |
+| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`Matcher`](/api/matcher)                     | Match sequences of tokens, based on pattern rules, similar to regular expressions.                                                                                                 |
+| [`PhraseMatcher`](/api/phrasematcher)         | Match sequences of tokens based on phrases.                                                                                                                                        |
+| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using [Semgrex operators](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |
 
 ### Other classes {#architecture-other}
 
-| Name                                             | Description                                                                                                      |
-| ------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------- |
-| [`Vocab`](/api/vocab)                            | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects.               |
-| [`StringStore`](/api/stringstore)                | Map strings to and from hash values.                                                                             |
-| [`Vectors`](/api/vectors)                        | Container class for vector data keyed by string.                                                                 |
-| [`Lookups`](/api/lookups)                        | Container for convenient access to large lookup tables and dictionaries.                                         |
-| [`Morphology`](/api/morphology)                  | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
-| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis.                                                                                        |
-| [`KnowledgeBase`](/api/kb)                       | Storage for entities and aliases of a knowledge base for entity linking.                                         |
-| [`Scorer`](/api/scorer)                          | Compute evaluation scores.                                                                                       |
-| [`Corpus`](/api/corpus)                          | Class for managing annotated corpora for training and evaluation data.                                           |
+| Name                                             | Description                                                                                        |
+| ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
+| [`Vocab`](/api/vocab)                            | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. |
+| [`StringStore`](/api/stringstore)                | Map strings to and from hash values.                                                               |
+| [`Vectors`](/api/vectors)                        | Container class for vector data keyed by string.                                                   |
+| [`Lookups`](/api/lookups)                        | Container for convenient access to large lookup tables and dictionaries.                           |
+| [`Morphology`](/api/morphology)                  | Store morphological analyses and map them to and from hash values.                                 |
+| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis.                                                                          |
+| [`KnowledgeBase`](/api/kb)                       | Storage for entities and aliases of a knowledge base for entity linking.                           |
+| [`Scorer`](/api/scorer)                          | Compute evaluation scores.                                                                         |
+| [`Corpus`](/api/corpus)                          | Class for managing annotated corpora for training and evaluation data.                             |

From e05d6d358d04166779093d2acff0e2c3bb95fe04 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 22 Sep 2020 09:36:37 +0200
Subject: [PATCH 086/516] Update API sidebar MorphAnalysis link

---
 website/meta/sidebars.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 28915ebb7..c5404b68e 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -119,7 +119,7 @@
                     { "text": "Corpus", "url": "/api/corpus" },
                     { "text": "KnowledgeBase", "url": "/api/kb" },
                     { "text": "Lookups", "url": "/api/lookups" },
-                    { "text": "MorphAnalysis", "url": "/api/morphanalysis" },
+                    { "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" },
                     { "text": "Morphology", "url": "/api/morphology" },
                     { "text": "Scorer", "url": "/api/scorer" },
                     { "text": "StringStore", "url": "/api/stringstore" },

From 6316d5f3989a53e4868cd346256fa614bd49e711 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 22 Sep 2020 09:45:34 +0200
Subject: [PATCH 087/516] Improve messages in project CLI [ci skip]

---
 spacy/cli/project/assets.py | 1 +
 spacy/cli/project/run.py    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py
index 8a3aaff25..58f59a3f9 100644
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@@ -66,6 +66,7 @@ def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
                 branch=asset["git"].get("branch"),
                 sparse=sparse_checkout,
             )
+            msg.good(f"Downloaded asset {dest}")
         else:
             url = asset.get("url")
             if not url:
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index d7e1075f3..69c49fba7 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -59,7 +59,7 @@ def project_run(
         for dep in cmd.get("deps", []):
             if not (project_dir / dep).exists():
                 err = f"Missing dependency specified by command '{subcommand}': {dep}"
-                err_help = "Maybe you forgot to run the 'project assets' command?"
+                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                 err_kwargs = {"exits": 1} if not dry else {}
                 msg.fail(err, err_help, **err_kwargs)
         with working_dir(project_dir) as current_dir:

From f9af7d365c228a8113e6db66d5bc4941c2546d88 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 22 Sep 2020 09:45:41 +0200
Subject: [PATCH 088/516] Update docs [ci skip]

---
 website/docs/api/language.md              |  2 +-
 website/docs/usage/linguistic-features.md | 14 ++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index ffdae9ec6..a7b9c0d88 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -187,7 +187,7 @@ more efficient than processing texts one-by-one.
 > ```python
 > texts = ["One document.", "...", "Lots of documents"]
 > for doc in nlp.pipe(texts, batch_size=50):
->     assert doc.is_parsed
+>     assert doc.has_annotation("DEP")
 > ```
 
 | Name                                       | Description                                                                                                                                                         |
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index a229c18e9..914e18acb 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -205,9 +205,10 @@ acquired from [WordNet](https://wordnet.princeton.edu/).
 spaCy features a fast and accurate syntactic dependency parser, and has a rich
 API for navigating the tree. The parser also powers the sentence boundary
 detection, and lets you iterate over base noun phrases, or "chunks". You can
-check whether a [`Doc`](/api/doc) object has been parsed with the
-`doc.is_parsed` attribute, which returns a boolean value. If this attribute is
-`False`, the default sentence iterator will raise an exception.
+check whether a [`Doc`](/api/doc) object has been parsed by calling
+`doc.has_annotation("DEP")`, which checks whether the attribute `Token.dep` has
+been set and returns a boolean value. If the result is `False`, the default sentence
+iterator will raise an exception.
 
 <Infobox title="Dependency label scheme" emoji="📖">
 
@@ -1705,9 +1706,10 @@ and can still be overwritten by the parser.
 <Infobox title="Important note" variant="warning">
 
 To prevent inconsistent state, you can only set boundaries **before** a document
-is parsed (and `doc.is_parsed` is `False`). To ensure that your component is
-added in the right place, you can set `before='parser'` or `first=True` when
-adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
+is parsed (and `doc.has_annotation("DEP")` is `False`). To ensure that your
+component is added in the right place, you can set `before='parser'` or
+`first=True` when adding it to the pipeline using
+[`nlp.add_pipe`](/api/language#add_pipe).
 
 </Infobox>
 

From 135de82a2d7073d535d1ffd1e4254e5dca37c046 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 22 Sep 2020 10:22:06 +0200
Subject: [PATCH 089/516] add textcat to quickstart

---
 spacy/cli/templates/quickstart_training.jinja | 48 ++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 0db4c8a59..2c7ce024b 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -93,6 +93,29 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 {% endif -%}
 
+{% if "textcat" in components %}
+[components.textcat]
+factory = "textcat"
+
+{% if optimize == "accuracy" %}
+[components.textcat.model]
+@architectures = "spacy.TextCatEnsemble.v1"
+exclusive_classes = false
+width = 64
+conv_depth = 2
+embed_size = 2000
+window_size = 1
+ngram_size = 1
+nO = null
+
+{% else -%}
+[components.textcat.model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+{%- endif %}
+{%- endif %}
+
 {# NON-TRANSFORMER PIPELINE #}
 {% else -%}
 
@@ -167,10 +190,33 @@ nO = null
 @architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode.width}
 {% endif %}
+
+{% if "textcat" in components %}
+[components.textcat]
+factory = "textcat"
+
+{% if optimize == "accuracy" %}
+[components.textcat.model]
+@architectures = "spacy.TextCatEnsemble.v1"
+exclusive_classes = false
+width = 64
+conv_depth = 2
+embed_size = 2000
+window_size = 1
+ngram_size = 1
+nO = null
+
+{% else -%}
+[components.textcat.model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+{%- endif %}
+{%- endif %}
 {% endif %}
 
 {% for pipe in components %}
-{% if pipe not in ["tagger", "parser", "ner"] %}
+{% if pipe not in ["tagger", "parser", "ner", "textcat"] %}
 {# Other components defined by the user: we just assume they're factories #}
 [components.{{ pipe }}]
 factory = "{{ pipe }}"

From db7126ead9675d70212c33ab9f09d2f67d72cf77 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 22 Sep 2020 10:31:26 +0200
Subject: [PATCH 090/516] Increment version

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index ec3c168a5..b57bbeda2 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a20"
+__version__ = "3.0.0a21"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 396b33257f7dff646040067c2ed7872d8c194f8b Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 22 Sep 2020 10:40:05 +0200
Subject: [PATCH 091/516] add entity_linker to jinja template

---
 spacy/cli/init_config.py                      |  2 +-
 spacy/cli/templates/quickstart_training.jinja | 34 ++++++++++++++++++-
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index e70195e15..5203c5dbb 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -36,7 +36,7 @@ def init_config_cli(
     """
     Generate a starter config.cfg for training. Based on your requirements
     specified via the CLI arguments, this command generates a config with the
-    optimal settings for you use case. This includes the choice of architecture,
+    optimal settings for your use case. This includes the choice of architecture,
     pretrained weights and related hyperparameters.
 
     DOCS: https://nightly.spacy.io/api/cli#init-config
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 2c7ce024b..0674f0964 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -93,6 +93,22 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 {% endif -%}
 
+{% if "entity_linker" in components -%}
+[components.entity_linker]
+factory = "entity_linker"
+get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
+incl_context = true
+incl_prior = true
+
+[components.entity_linker.model]
+@architectures = "spacy.EntityLinker.v1"
+nO = null
+
+[components.entity_linker.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+{% endif -%}
+
 {% if "textcat" in components %}
 [components.textcat]
 factory = "textcat"
@@ -191,6 +207,22 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 {% endif %}
 
+{% if "entity_linker" in components -%}
+[components.entity_linker]
+factory = "entity_linker"
+get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
+incl_context = true
+incl_prior = true
+
+[components.entity_linker.model]
+@architectures = "spacy.EntityLinker.v1"
+nO = null
+
+[components.entity_linker.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
 {% if "textcat" in components %}
 [components.textcat]
 factory = "textcat"
@@ -216,7 +248,7 @@ ngram_size = 1
 {% endif %}
 
 {% for pipe in components %}
-{% if pipe not in ["tagger", "parser", "ner", "textcat"] %}
+{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %}
 {# Other components defined by the user: we just assume they're factories #}
 [components.{{ pipe }}]
 factory = "{{ pipe }}"

From e931f4d75771dc63b2573e2cbd7c834de96def7d Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 22 Sep 2020 10:56:43 +0200
Subject: [PATCH 092/516] add textcat score

---
 spacy/cli/templates/quickstart_training.jinja | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 0674f0964..0e83b9bdb 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -323,3 +323,6 @@ ents_f = {{ (1.0 / components|length)|round(2) }}
 ents_p = 0.0
 ents_r = 0.0
 {%- endif -%}
+{%- if "textcat" in components %}
+cats_score = {{ (1.0 / components|length)|round(2) }}
+{%- endif -%}

From b556a1080893202651d473fc93c4b9010ee01665 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 22 Sep 2020 11:50:19 +0200
Subject: [PATCH 093/516] rename converts in_to_out

---
 spacy/cli/_util.py                            |  4 +--
 spacy/cli/convert.py                          | 14 ++++-----
 spacy/errors.py                               |  2 +-
 spacy/tests/regression/test_issue4001-4500.py |  4 +--
 spacy/tests/regression/test_issue4501-5000.py |  6 ++--
 spacy/tests/test_cli.py                       | 30 +++++++++----------
 spacy/tests/test_scorer.py                    |  6 ++--
 spacy/tests/training/test_training.py         | 26 ++++++++--------
 spacy/training/__init__.py                    |  4 +--
 spacy/training/converters/__init__.py         |  8 ++---
 ...conll_ner2docs.py => conll_ner_to_docs.py} |  2 +-
 .../{conllu2docs.py => conllu_to_docs.py}     | 12 ++++----
 .../{iob2docs.py => iob_to_docs.py}           |  4 +--
 .../{json2docs.py => json_to_docs.py}         |  6 ++--
 spacy/training/example.pyx                    | 18 +++++------
 spacy/training/gold_io.pyx                    |  4 +--
 spacy/training/iob_utils.py                   | 14 ++++-----
 website/docs/api/data-formats.md              |  2 +-
 website/docs/api/top-level.md                 | 18 +++++------
 website/docs/usage/processing-pipelines.md    |  6 ++--
 website/docs/usage/v3.md                      | 15 +++++-----
 21 files changed, 103 insertions(+), 102 deletions(-)
 rename spacy/training/converters/{conll_ner2docs.py => conll_ner_to_docs.py} (99%)
 rename spacy/training/converters/{conllu2docs.py => conllu_to_docs.py} (97%)
 rename spacy/training/converters/{iob2docs.py => iob_to_docs.py} (95%)
 rename spacy/training/converters/{json2docs.py => json_to_docs.py} (82%)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 797a701b9..21a4e54ce 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -378,7 +378,7 @@ def git_sparse_checkout(repo, subpath, dest, branch):
         # Looking for this 'rev-list' command in the git --help? Hah.
         cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
         ret = run_command(cmd, capture=True)
-        git_repo = _from_http_to_git(repo)
+        git_repo = _http_to_git(repo)
         # Now pass those missings into another bit of git internals
         missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
         if not missings:
@@ -414,7 +414,7 @@ def get_git_version(
     return (int(version[0]), int(version[1]))
 
 
-def _from_http_to_git(repo: str) -> str:
+def _http_to_git(repo: str) -> str:
     if repo.startswith("http://"):
         repo = repo.replace(r"http://", r"https://")
     if repo.startswith(r"https://"):
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index ad89b9976..8f8234c61 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -9,7 +9,7 @@ import sys
 from ._util import app, Arg, Opt
 from ..training import docs_to_json
 from ..tokens import DocBin
-from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
+from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
 
 
 # Converters are matched by file extension except for ner/iob, which are
@@ -18,12 +18,12 @@ from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2do
 # imported from /converters.
 
 CONVERTERS = {
-    "conllubio": conllu2docs,
-    "conllu": conllu2docs,
-    "conll": conllu2docs,
-    "ner": conll_ner2docs,
-    "iob": iob2docs,
-    "json": json2docs,
+    "conllubio": conllu_to_docs,
+    "conllu": conllu_to_docs,
+    "conll": conllu_to_docs,
+    "ner": conll_ner_to_docs,
+    "iob": iob_to_docs,
+    "json": json_to_docs,
 }
 
 
diff --git a/spacy/errors.py b/spacy/errors.py
index f276c4d1a..153f8da0c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -69,7 +69,7 @@ class Warnings:
             "in problems with the vocab further on in the pipeline.")
     W030 = ("Some entities could not be aligned in the text \"{text}\" with "
             "entities \"{entities}\". Use "
-            "`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
+            "`spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)`"
             " to check the alignment. Misaligned entities ('-') will be "
             "ignored during training.")
     W033 = ("Training a new {model} using a model with no lexeme normalization "
diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py
index 4e58c347e..7b7ddfe0d 100644
--- a/spacy/tests/regression/test_issue4001-4500.py
+++ b/spacy/tests/regression/test_issue4001-4500.py
@@ -3,7 +3,7 @@ from spacy.pipeline import Pipe
 from spacy.matcher import PhraseMatcher, Matcher
 from spacy.tokens import Doc, Span, DocBin
 from spacy.training import Example, Corpus
-from spacy.training.converters import json2docs
+from spacy.training.converters import json_to_docs
 from spacy.vocab import Vocab
 from spacy.lang.en import English
 from spacy.util import minibatch, ensure_path, load_model
@@ -425,7 +425,7 @@ def test_issue4402():
     attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
     with make_tempdir() as tmpdir:
         output_file = tmpdir / "test4402.spacy"
-        docs = json2docs([json_data])
+        docs = json_to_docs([json_data])
         data = DocBin(docs=docs, attrs=attrs).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)
diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py
index 9454d7f0c..e351858f5 100644
--- a/spacy/tests/regression/test_issue4501-5000.py
+++ b/spacy/tests/regression/test_issue4501-5000.py
@@ -1,7 +1,7 @@
 import pytest
 from spacy.tokens import Doc, Span, DocBin
 from spacy.training import Example
-from spacy.training.converters.conllu2docs import conllu2docs
+from spacy.training.converters.conllu_to_docs import conllu_to_docs
 from spacy.lang.en import English
 from spacy.kb import KnowledgeBase
 from spacy.vocab import Vocab
@@ -82,7 +82,7 @@ def test_issue4651_without_phrase_matcher_attr():
 
 def test_issue4665():
     """
-    conllu2json should not raise an exception if the HEAD column contains an
+    conllu_to_docs should not raise an exception if the HEAD column contains an
     underscore
     """
     input_data = """
@@ -105,7 +105,7 @@ def test_issue4665():
 17	.	_	PUNCT	.	_	_	punct	_	_
 18	]	_	PUNCT	-RRB-	_	_	punct	_	_
 """
-    conllu2docs(input_data)
+    conllu_to_docs(input_data)
 
 
 def test_issue4674():
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index a9c9d8ca5..7141a11ff 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,7 +1,7 @@
 import pytest
 from click import NoSuchOption
-from spacy.training import docs_to_json, biluo_tags_from_offsets
-from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
+from spacy.training import docs_to_json, offsets_to_biluo_tags
+from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
@@ -14,7 +14,7 @@ import os
 from .util import make_tempdir
 
 
-def test_cli_converters_conllu2json():
+def test_cli_converters_conllu_to_json():
     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
     lines = [
         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
@@ -23,7 +23,7 @@ def test_cli_converters_conllu2json():
         "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
     ]
     input_data = "\n".join(lines)
-    converted_docs = conllu2docs(input_data, n_sents=1)
+    converted_docs = conllu_to_docs(input_data, n_sents=1)
     assert len(converted_docs) == 1
     converted = [docs_to_json(converted_docs)]
     assert converted[0]["id"] == 0
@@ -39,7 +39,7 @@ def test_cli_converters_conllu2json():
     ent_offsets = [
         (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
     ]
-    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
     assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
 
 
@@ -62,9 +62,9 @@ def test_cli_converters_conllu2json():
         ),
     ],
 )
-def test_cli_converters_conllu2json_name_ner_map(lines):
+def test_cli_converters_conllu_to_json_name_ner_map(lines):
     input_data = "\n".join(lines)
-    converted_docs = conllu2docs(
+    converted_docs = conllu_to_docs(
         input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}
     )
     assert len(converted_docs) == 1
@@ -83,11 +83,11 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
     ent_offsets = [
         (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
     ]
-    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
     assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
 
 
-def test_cli_converters_conllu2json_subtokens():
+def test_cli_converters_conllu_to_json_subtokens():
     # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
     lines = [
         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
@@ -98,7 +98,7 @@ def test_cli_converters_conllu2json_subtokens():
         "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
     ]
     input_data = "\n".join(lines)
-    converted_docs = conllu2docs(
+    converted_docs = conllu_to_docs(
         input_data, n_sents=1, merge_subtokens=True, append_morphology=True
     )
     assert len(converted_docs) == 1
@@ -132,11 +132,11 @@ def test_cli_converters_conllu2json_subtokens():
     ent_offsets = [
         (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
     ]
-    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
     assert biluo_tags == ["O", "U-PER", "O", "O"]
 
 
-def test_cli_converters_iob2json():
+def test_cli_converters_iob_to_docs():
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
         "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -144,7 +144,7 @@ def test_cli_converters_iob2json():
         "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
     ]
     input_data = "\n".join(lines)
-    converted_docs = iob2docs(input_data, n_sents=10)
+    converted_docs = iob_to_docs(input_data, n_sents=10)
     assert len(converted_docs) == 1
     converted = docs_to_json(converted_docs)
     assert converted["id"] == 0
@@ -161,7 +161,7 @@ def test_cli_converters_iob2json():
         assert ent.text in ["New York City", "London"]
 
 
-def test_cli_converters_conll_ner2json():
+def test_cli_converters_conll_ner_to_docs():
     lines = [
         "-DOCSTART- -X- O O",
         "",
@@ -211,7 +211,7 @@ def test_cli_converters_conll_ner2json():
         ".\t.\t_\tO",
     ]
     input_data = "\n".join(lines)
-    converted_docs = conll_ner2docs(input_data, n_sents=10)
+    converted_docs = conll_ner_to_docs(input_data, n_sents=10)
     assert len(converted_docs) == 1
     converted = docs_to_json(converted_docs)
     assert converted["id"] == 0
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index a1406c14a..2825f1703 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -2,7 +2,7 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal
 import pytest
 from pytest import approx
 from spacy.training import Example
-from spacy.training.iob_utils import biluo_tags_from_offsets
+from spacy.training.iob_utils import offsets_to_biluo_tags
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from spacy.lang.en import English
@@ -186,7 +186,7 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
         )
-        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        entities = offsets_to_biluo_tags(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
         # a hack for sentence boundaries
         example.predicted[1].is_sent_start = False
@@ -211,7 +211,7 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
         )
-        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        entities = offsets_to_biluo_tags(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
         # a hack for sentence boundaries
         example.predicted[1].is_sent_start = False
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 4cab5b015..a04e6aadd 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -1,9 +1,9 @@
 import numpy
-from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
-from spacy.training import spans_from_biluo_tags, iob_to_biluo
+from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment
+from spacy.training import biluo_tags_to_spans, iob_to_biluo
 from spacy.training import Corpus, docs_to_json
 from spacy.training.example import Example
-from spacy.training.converters import json2docs
+from spacy.training.converters import json_to_docs
 from spacy.training.augment import make_orth_variants_example
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
@@ -69,7 +69,7 @@ def test_gold_biluo_U(en_vocab):
     spaces = [True, True, True, False, True]
     doc = Doc(en_vocab, words=words, spaces=spaces)
     entities = [(len("I flew to "), len("I flew to London"), "LOC")]
-    tags = biluo_tags_from_offsets(doc, entities)
+    tags = offsets_to_biluo_tags(doc, entities)
     assert tags == ["O", "O", "O", "U-LOC", "O"]
 
 
@@ -78,7 +78,7 @@ def test_gold_biluo_BL(en_vocab):
     spaces = [True, True, True, True, False, True]
     doc = Doc(en_vocab, words=words, spaces=spaces)
     entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")]
-    tags = biluo_tags_from_offsets(doc, entities)
+    tags = offsets_to_biluo_tags(doc, entities)
     assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"]
 
 
@@ -87,7 +87,7 @@ def test_gold_biluo_BIL(en_vocab):
     spaces = [True, True, True, True, True, False, True]
     doc = Doc(en_vocab, words=words, spaces=spaces)
     entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-    tags = biluo_tags_from_offsets(doc, entities)
+    tags = offsets_to_biluo_tags(doc, entities)
     assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
 
 
@@ -100,7 +100,7 @@ def test_gold_biluo_overlap(en_vocab):
         (len("I flew to "), len("I flew to San Francisco"), "LOC"),
     ]
     with pytest.raises(ValueError):
-        biluo_tags_from_offsets(doc, entities)
+        offsets_to_biluo_tags(doc, entities)
 
 
 def test_gold_biluo_misalign(en_vocab):
@@ -109,7 +109,7 @@ def test_gold_biluo_misalign(en_vocab):
     doc = Doc(en_vocab, words=words, spaces=spaces)
     entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
     with pytest.warns(UserWarning):
-        tags = biluo_tags_from_offsets(doc, entities)
+        tags = offsets_to_biluo_tags(doc, entities)
     assert tags == ["O", "O", "O", "-", "-", "-"]
 
 
@@ -155,7 +155,7 @@ def test_example_from_dict_some_ner(en_vocab):
 
 
 @pytest.mark.filterwarnings("ignore::UserWarning")
-def test_json2docs_no_ner(en_vocab):
+def test_json_to_docs_no_ner(en_vocab):
     data = [
         {
             "id": 1,
@@ -191,7 +191,7 @@ def test_json2docs_no_ner(en_vocab):
             ],
         }
     ]
-    docs = json2docs(data)
+    docs = json_to_docs(data)
     assert len(docs) == 1
     for doc in docs:
         assert not doc.has_annotation("ENT_IOB")
@@ -358,9 +358,9 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
     biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
     offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
     doc = en_tokenizer(text)
-    biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
+    biluo_tags_converted = offsets_to_biluo_tags(doc, offsets)
     assert biluo_tags_converted == biluo_tags
-    offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
+    offsets_converted = biluo_tags_to_offsets(doc, biluo_tags)
     offsets_converted = [ent for ent in offsets if ent[2]]
     assert offsets_converted == offsets
 
@@ -368,7 +368,7 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
 def test_biluo_spans(en_tokenizer):
     doc = en_tokenizer("I flew to Silicon Valley via London.")
     biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
-    spans = spans_from_biluo_tags(doc, biluo_tags)
+    spans = biluo_tags_to_spans(doc, biluo_tags)
     spans = [span for span in spans if span.label_]
     assert len(spans) == 2
     assert spans[0].text == "Silicon Valley"
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 35e67f696..9172dde25 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -2,8 +2,8 @@ from .corpus import Corpus  # noqa: F401
 from .example import Example, validate_examples  # noqa: F401
 from .align import Alignment  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
-from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags  # noqa: F401
-from .iob_utils import spans_from_biluo_tags, tags_to_entities  # noqa: F401
+from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401
+from .iob_utils import biluo_tags_to_spans, tags_to_entities  # noqa: F401
 from .gold_io import docs_to_json, read_json_file  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
 from .loggers import console_logger, wandb_logger  # noqa: F401
diff --git a/spacy/training/converters/__init__.py b/spacy/training/converters/__init__.py
index 15f025a08..e91b6aaa6 100644
--- a/spacy/training/converters/__init__.py
+++ b/spacy/training/converters/__init__.py
@@ -1,4 +1,4 @@
-from .iob2docs import iob2docs  # noqa: F401
-from .conll_ner2docs import conll_ner2docs  # noqa: F401
-from .json2docs import json2docs  # noqa: F401
-from .conllu2docs import conllu2docs  # noqa: F401
+from .iob_to_docs import iob_to_docs  # noqa: F401
+from .conll_ner_to_docs import conll_ner_to_docs  # noqa: F401
+from .json_to_docs import json_to_docs  # noqa: F401
+from .conllu_to_docs import conllu_to_docs  # noqa: F401
diff --git a/spacy/training/converters/conll_ner2docs.py b/spacy/training/converters/conll_ner_to_docs.py
similarity index 99%
rename from spacy/training/converters/conll_ner2docs.py
rename to spacy/training/converters/conll_ner_to_docs.py
index 8dcaf2599..3b851039c 100644
--- a/spacy/training/converters/conll_ner2docs.py
+++ b/spacy/training/converters/conll_ner_to_docs.py
@@ -7,7 +7,7 @@ from ...tokens import Doc, Span
 from ...util import load_model
 
 
-def conll_ner2docs(
+def conll_ner_to_docs(
     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
     """
diff --git a/spacy/training/converters/conllu2docs.py b/spacy/training/converters/conllu_to_docs.py
similarity index 97%
rename from spacy/training/converters/conllu2docs.py
rename to spacy/training/converters/conllu_to_docs.py
index b4d8b3ac4..18a2b6a93 100644
--- a/spacy/training/converters/conllu2docs.py
+++ b/spacy/training/converters/conllu_to_docs.py
@@ -1,13 +1,13 @@
 import re
 
-from .conll_ner2docs import n_sents_info
-from ...training import iob_to_biluo, spans_from_biluo_tags
+from .conll_ner_to_docs import n_sents_info
+from ...training import iob_to_biluo, biluo_tags_to_spans
 from ...tokens import Doc, Token, Span
 from ...vocab import Vocab
 from wasabi import Printer
 
 
-def conllu2docs(
+def conllu_to_docs(
     input_data,
     n_sents=10,
     append_morphology=False,
@@ -78,7 +78,7 @@ def read_conllx(
         if lines:
             while lines[0].startswith("#"):
                 lines.pop(0)
-            doc = doc_from_conllu_sentence(
+            doc = conllu_sentence_to_doc(
                 vocab,
                 lines,
                 ner_tag_pattern,
@@ -128,7 +128,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
     return iob_to_biluo(iob)
 
 
-def doc_from_conllu_sentence(
+def conllu_sentence_to_doc(
     vocab,
     lines,
     ner_tag_pattern,
@@ -215,7 +215,7 @@ def doc_from_conllu_sentence(
         doc[i]._.merged_lemma = lemmas[i]
         doc[i]._.merged_spaceafter = spaces[i]
     ents = get_entities(lines, ner_tag_pattern, ner_map)
-    doc.ents = spans_from_biluo_tags(doc, ents)
+    doc.ents = biluo_tags_to_spans(doc, ents)
 
     if merge_subtokens:
         doc = merge_conllu_subtokens(lines, doc)
diff --git a/spacy/training/converters/iob2docs.py b/spacy/training/converters/iob_to_docs.py
similarity index 95%
rename from spacy/training/converters/iob2docs.py
rename to spacy/training/converters/iob_to_docs.py
index 2f6742fea..bfd981649 100644
--- a/spacy/training/converters/iob2docs.py
+++ b/spacy/training/converters/iob_to_docs.py
@@ -1,13 +1,13 @@
 from wasabi import Printer
 
-from .conll_ner2docs import n_sents_info
+from .conll_ner_to_docs import n_sents_info
 from ...vocab import Vocab
 from ...training import iob_to_biluo, tags_to_entities
 from ...tokens import Doc, Span
 from ...util import minibatch
 
 
-def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
+def iob_to_docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
     """
     Convert IOB files with one sentence per line and tags separated with '|'
     into Doc objects so they can be saved. IOB and IOB2 are accepted.
diff --git a/spacy/training/converters/json2docs.py b/spacy/training/converters/json_to_docs.py
similarity index 82%
rename from spacy/training/converters/json2docs.py
rename to spacy/training/converters/json_to_docs.py
index 342f94848..d7df1d6f9 100644
--- a/spacy/training/converters/json2docs.py
+++ b/spacy/training/converters/json_to_docs.py
@@ -1,12 +1,12 @@
 import srsly
 from ..gold_io import json_iterate, json_to_annotations
-from ..example import annotations2doc
+from ..example import annotations_to_doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
 from ...lang.xx import MultiLanguage
 
 
-def json2docs(input_data, model=None, **kwargs):
+def json_to_docs(input_data, model=None, **kwargs):
     nlp = load_model(model) if model is not None else MultiLanguage()
     if not isinstance(input_data, bytes):
         if not isinstance(input_data, str):
@@ -17,6 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
         for json_para in json_to_annotations(json_doc):
             example_dict = _fix_legacy_dict_data(json_para)
             tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-            doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
+            doc = annotations_to_doc(nlp.vocab, tok_dict, doc_dict)
             docs.append(doc)
     return docs
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 371b4a06a..fbf05b224 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -7,13 +7,13 @@ from ..tokens.span cimport Span
 from ..tokens.span import Span
 from ..attrs import IDS
 from .align import Alignment
-from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
-from .iob_utils import spans_from_biluo_tags
+from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
+from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
 from ..pipeline._parser_internals import nonproj
 
 
-cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
+cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
     """ Create a Doc from dictionaries with token and doc annotations. """
     attrs, array = _annot2array(vocab, tok_annot, doc_annot)
     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
@@ -92,7 +92,7 @@ cdef class Example:
             tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
         return Example(
             predicted,
-            annotations2doc(predicted.vocab, tok_dict, doc_dict)
+            annotations_to_doc(predicted.vocab, tok_dict, doc_dict)
         )
 
     @property
@@ -176,7 +176,7 @@ cdef class Example:
             return [None] * len(self.x)  # should this be 'missing' instead of 'None' ?
         x_ents = self.get_aligned_spans_y2x(self.y.ents)
         # Default to 'None' for missing values
-        x_tags = biluo_tags_from_offsets(
+        x_tags = offsets_to_biluo_tags(
             self.x,
             [(e.start_char, e.end_char, e.label_) for e in x_ents],
             missing=None
@@ -195,7 +195,7 @@ cdef class Example:
         return {
             "doc_annotation": {
                 "cats": dict(self.reference.cats),
-                "entities": biluo_tags_from_doc(self.reference),
+                "entities": doc_to_biluo_tags(self.reference),
                 "links": self._links_to_dict()
             },
             "token_annotation": {
@@ -295,12 +295,12 @@ def _add_entities_to_doc(doc, ner_data):
     elif isinstance(ner_data[0], tuple):
         return _add_entities_to_doc(
             doc,
-            biluo_tags_from_offsets(doc, ner_data)
+            offsets_to_biluo_tags(doc, ner_data)
         )
     elif isinstance(ner_data[0], str) or ner_data[0] is None:
         return _add_entities_to_doc(
             doc,
-            spans_from_biluo_tags(doc, ner_data)
+            biluo_tags_to_spans(doc, ner_data)
         )
     elif isinstance(ner_data[0], Span):
         # Ugh, this is super messy. Really hard to set O entities
@@ -388,7 +388,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
         # This is annoying but to convert the offsets we need a Doc
         # that has the target tokenization.
         reference = Doc(vocab, words=words, spaces=spaces)
-        biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
+        biluo = offsets_to_biluo_tags(reference, biluo_or_offsets)
     else:
         biluo = biluo_or_offsets
     ent_iobs = []
diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx
index b58df0d71..524da0a16 100644
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@@ -3,7 +3,7 @@ import srsly
 from .. import util
 from ..errors import Warnings
 from ..tokens import Doc
-from .iob_utils import biluo_tags_from_offsets, tags_to_entities
+from .iob_utils import offsets_to_biluo_tags, tags_to_entities
 import json
 
 
@@ -32,7 +32,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
             if ent.kb_id_:
                 link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
                 json_para["links"].append(link_dict)
-        biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
+        biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
         attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
         include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
         for j, sent in enumerate(doc.sents):
diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py
index ceb5e16b8..63deed3a5 100644
--- a/spacy/training/iob_utils.py
+++ b/spacy/training/iob_utils.py
@@ -50,15 +50,15 @@ def _consume_ent(tags):
         return [start] + middle + [end]
 
 
-def biluo_tags_from_doc(doc, missing="O"):
-    return biluo_tags_from_offsets(
+def doc_to_biluo_tags(doc, missing="O"):
+    return offsets_to_biluo_tags(
         doc,
         [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
         missing=missing,
     )
 
 
-def biluo_tags_from_offsets(doc, entities, missing="O"):
+def offsets_to_biluo_tags(doc, entities, missing="O"):
     """Encode labelled spans into per-token tags, using the
     Begin/In/Last/Unit/Out scheme (BILUO).
 
@@ -80,7 +80,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
         >>> text = 'I like London.'
         >>> entities = [(len('I like '), len('I like London'), 'LOC')]
         >>> doc = nlp.tokenizer(text)
-        >>> tags = biluo_tags_from_offsets(doc, entities)
+        >>> tags = offsets_to_biluo_tags(doc, entities)
         >>> assert tags == ["O", "O", 'U-LOC', "O"]
     """
     # Ensure no overlapping entity labels exist
@@ -143,7 +143,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
     return biluo
 
 
-def spans_from_biluo_tags(doc, tags):
+def biluo_tags_to_spans(doc, tags):
     """Encode per-token tags following the BILUO scheme into Span object, e.g.
     to overwrite the doc.ents.
 
@@ -161,7 +161,7 @@ def spans_from_biluo_tags(doc, tags):
     return spans
 
 
-def offsets_from_biluo_tags(doc, tags):
+def biluo_tags_to_offsets(doc, tags):
     """Encode per-token tags following the BILUO scheme into entity offsets.
 
     doc (Doc): The document that the BILUO tags refer to.
@@ -172,7 +172,7 @@ def offsets_from_biluo_tags(doc, tags):
         `end` will be character-offset integers denoting the slice into the
         original string.
     """
-    spans = spans_from_biluo_tags(doc, tags)
+    spans = biluo_tags_to_spans(doc, tags)
     return [(span.start_char, span.end_char, span.label_) for span in spans]
 
 
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 3a214428b..e3b3900be 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -275,7 +275,7 @@ $ python -m spacy convert ./data.json ./output.spacy
 > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes
 > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token
 > representing a `PERSON` entity. The
-> [`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets) function
+> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function
 > can help you convert entity offsets to the right format.
 
 ```python
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 7afe02403..2c082ae0b 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -619,7 +619,7 @@ sequences in the batch.
 
 ## Training data and alignment {#gold source="spacy/training"}
 
-### training.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}
+### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
 
 Encode labelled spans into per-token tags, using the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit,
@@ -635,11 +635,11 @@ single-token entity.
 > #### Example
 >
 > ```python
-> from spacy.training import biluo_tags_from_offsets
+> from spacy.training import offsets_to_biluo_tags
 >
 > doc = nlp("I like London.")
 > entities = [(7, 13, "LOC")]
-> tags = biluo_tags_from_offsets(doc, entities)
+> tags = offsets_to_biluo_tags(doc, entities)
 > assert tags == ["O", "O", "U-LOC", "O"]
 > ```
 
@@ -649,7 +649,7 @@ single-token entity.
 | `entities`  | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ |
 | **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~                                                                                    |
 
-### training.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}
+### training.biluo_tags_to_offsets {#biluo_tags_to_offsets tag="function"}
 
 Encode per-token tags following the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets.
@@ -657,11 +657,11 @@ Encode per-token tags following the
 > #### Example
 >
 > ```python
-> from spacy.training import offsets_from_biluo_tags
+> from spacy.training import biluo_tags_to_offsets
 >
 > doc = nlp("I like London.")
 > tags = ["O", "O", "U-LOC", "O"]
-> entities = offsets_from_biluo_tags(doc, tags)
+> entities = biluo_tags_to_offsets(doc, tags)
 > assert entities == [(7, 13, "LOC")]
 > ```
 
@@ -671,7 +671,7 @@ Encode per-token tags following the
 | `entities`  | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
 | **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~                                                                                 |
 
-### training.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"}
+### training.biluo_tags_to_spans {#biluo_tags_to_spans tag="function" new="2.1"}
 
 Encode per-token tags following the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) into
@@ -681,11 +681,11 @@ token-based tags, e.g. to overwrite the `doc.ents`.
 > #### Example
 >
 > ```python
-> from spacy.training import spans_from_biluo_tags
+> from spacy.training import biluo_tags_to_spans
 >
 > doc = nlp("I like London.")
 > tags = ["O", "O", "U-LOC", "O"]
-> doc.ents = spans_from_biluo_tags(doc, tags)
+> doc.ents = biluo_tags_to_spans(doc, tags)
 > ```
 
 | Name        | Description                                                                                                                                                                                                                                                  |
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 3d756215f..97806dc2a 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1501,7 +1501,7 @@ add those entities to the `doc.ents`, you can wrap it in a custom pipeline
 component function and pass it the token texts from the `Doc` object received by
 the component.
 
-The [`training.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very
+The [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans) function is very
 helpful here, because it takes a `Doc` object and token-based BILUO tags and
 returns a sequence of `Span` objects in the `Doc` with added labels. So all your
 wrapper has to do is compute the entity spans and overwrite the `doc.ents`.
@@ -1516,14 +1516,14 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`.
 ```python
 ### {highlight="1,8-9"}
 import your_custom_entity_recognizer
-from spacy.training import offsets_from_biluo_tags
+from spacy.training import biluo_tags_to_spans
 from spacy.language import Language
 
 @Language.component("custom_ner_wrapper")
 def custom_ner_wrapper(doc):
     words = [token.text for token in doc]
     custom_entities = your_custom_entity_recognizer(words)
-    doc.ents = spans_from_biluo_tags(doc, custom_entities)
+    doc.ents = biluo_tags_to_spans(doc, custom_entities)
     return doc
 ```
 
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 406ba4b75..b3c586fe1 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -968,16 +968,17 @@ python -m spacy package ./output ./packages
 
 #### Data utilities and gold module {#migrating-gold}
 
-The `spacy.gold` module has been renamed to `spacy.training`. This mostly
+The `spacy.gold` module has been renamed to `spacy.training`, and the conversion
+utilities now follow the naming pattern `x_to_y`. This mostly
 affects internals, but if you've been using the span offset conversion utilities
-[`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets),
-[`offsets_from_biluo_tags`](/api/top-level#offsets_from_biluo_tags) or
-[`spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags), you'll have to
-change your imports:
+[`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags),
+[`biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets) or
+[`biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), you'll have to
+update the function names and imports:
 
 ```diff
-- from spacy.gold import biluo_tags_from_offsets, spans_from_biluo_tags
-+ from spacy.training import biluo_tags_from_offsets, spans_from_biluo_tags
+- from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, spans_from_biluo_tags
++ from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, biluo_tags_to_spans
 ```
 
 #### Migration notes for plugin maintainers {#migrating-plugins}

From e1b8090b9bdc880ede79bab5f269e3c352e17183 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 22 Sep 2020 12:01:06 +0200
Subject: [PATCH 094/516] few more fixes

---
 spacy/tests/test_cli.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 7141a11ff..99e83eccf 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -14,7 +14,7 @@ import os
 from .util import make_tempdir
 
 
-def test_cli_converters_conllu_to_json():
+def test_cli_converters_conllu_to_docs():
     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
     lines = [
         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
@@ -62,7 +62,7 @@ def test_cli_converters_conllu_to_json():
         ),
     ],
 )
-def test_cli_converters_conllu_to_json_name_ner_map(lines):
+def test_cli_converters_conllu_to_docs_name_ner_map(lines):
     input_data = "\n".join(lines)
     converted_docs = conllu_to_docs(
         input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}
@@ -87,7 +87,7 @@ def test_cli_converters_conllu_to_json_name_ner_map(lines):
     assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
 
 
-def test_cli_converters_conllu_to_json_subtokens():
+def test_cli_converters_conllu_to_docs_subtokens():
     # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
     lines = [
         "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",

From 085a1c8e2b4b3a136025ef693bb6e7537d88729f Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 22 Sep 2020 12:06:40 +0200
Subject: [PATCH 095/516] add no_output_layer to TextCatBOW config

---
 spacy/cli/templates/quickstart_training.jinja | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 0e83b9bdb..a0d9f78ac 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -129,6 +129,7 @@ nO = null
 @architectures = "spacy.TextCatBOW.v1"
 exclusive_classes = false
 ngram_size = 1
+no_output_layer = false
 {%- endif %}
 {%- endif %}
 
@@ -243,6 +244,7 @@ nO = null
 @architectures = "spacy.TextCatBOW.v1"
 exclusive_classes = false
 ngram_size = 1
+no_output_layer = false
 {%- endif %}
 {%- endif %}
 {% endif %}

From 5e3b796b122fc9b1125f350b5dcda625fd9740f0 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 22 Sep 2020 12:24:39 +0200
Subject: [PATCH 096/516] Validate section refs in debug config

---
 spacy/cli/debug_config.py | 27 +++++++++++++++++++++++++--
 spacy/tests/test_cli.py   | 15 ++++++++++++++-
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index 7930d0674..d07a0bb2d 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -2,7 +2,7 @@ from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
 from thinc.api import Config
-from thinc.config import VARIABLE_RE
+from thinc.config import VARIABLE_RE, ConfigValidationError
 import typer
 
 from ._util import Arg, Opt, show_validation_error, parse_config_overrides
@@ -51,7 +51,10 @@ def debug_config(
     msg.divider("Config validation")
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
-        nlp, _ = util.load_model_from_config(config)
+        nlp, resolved = util.load_model_from_config(config)
+        # Use the resolved config here in case user has one function returning
+        # a dict of corpora etc.
+        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
     msg.good("Config is valid")
     if show_vars:
         variables = get_variables(config)
@@ -93,3 +96,23 @@ def get_variables(config: Config) -> Dict[str, Any]:
         value = util.dot_to_object(config, path)
         result[variable] = repr(value)
     return result
+
+
+def check_section_refs(config: Config, fields: List[str]) -> None:
+    """Validate fields in the config that refer to other sections or values
+    (e.g. in the corpora) and make sure that those references exist.
+    """
+    errors = []
+    for field in fields:
+        # If the field doesn't exist in the config, we ignore it
+        try:
+            value = util.dot_to_object(config, field)
+        except KeyError:
+            continue
+        try:
+            util.dot_to_object(config, value)
+        except KeyError:
+            msg = f"not a valid section reference: {value}"
+            errors.append({"loc": field.split("."), "msg": msg})
+    if errors:
+        raise ConfigValidationError(config, errors)
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index a9c9d8ca5..1bc246fef 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -7,7 +7,8 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
-from thinc.config import ConfigValidationError
+from spacy.cli.debug_config import check_section_refs
+from thinc.config import ConfigValidationError, Config
 import srsly
 import os
 
@@ -413,3 +414,15 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
     assert string_to_list(value, intify=False) == ["1", "2", "3"]
     assert string_to_list(value, intify=True) == [1, 2, 3]
+
+
+def test_check_section_refs():
+    config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
+    config = Config(config)
+    # Valid section reference
+    check_section_refs(config, ["a.b.c"])
+    # Section that doesn't exist in this config
+    check_section_refs(config, ["x.y.z"])
+    # Invalid section reference
+    with pytest.raises(ConfigValidationError):
+        check_section_refs(config, ["a.b.c", "f.g"])

From d53c84b6d6717375ee91d2847a3d0f24beafd8d1 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 22 Sep 2020 13:54:44 +0200
Subject: [PATCH 097/516] avoid None callback (#6100)

---
 spacy/pipeline/tok2vec.py            |  2 +-
 spacy/tests/pipeline/test_tok2vec.py | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 721c67a19..9ab4e42b7 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -127,7 +127,7 @@ class Tok2Vec(Pipe):
         tokvecs = self.model.predict(docs)
         batch_id = Tok2VecListener.get_batch_id(docs)
         for listener in self.listeners:
-            listener.receive(batch_id, tokvecs, None)
+            listener.receive(batch_id, tokvecs, lambda dX: [])
         return tokvecs
 
     def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 2e514f490..6041657d3 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -169,3 +169,22 @@ def test_tok2vec_listener():
     nlp.select_pipes(disable="tok2vec")
     assert nlp.pipe_names == ["tagger"]
     nlp("Running the pipeline with the Tok2Vec component disabled.")
+
+
+def test_tok2vec_listener_callback():
+    orig_config = Config().from_str(cfg_string)
+    nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp.pipe_names == ["tok2vec", "tagger"]
+    tagger = nlp.get_pipe("tagger")
+    tok2vec = nlp.get_pipe("tok2vec")
+    nlp._link_components()
+    docs = [nlp.make_doc("A random sentence")]
+    tok2vec.model.initialize(X=docs)
+    gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]
+    label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")]
+    tagger.model.initialize(X=docs, Y=label_sample)
+    docs = [nlp.make_doc("Another entirely random sentence")]
+    tok2vec.predict(docs)
+    Y, get_dX = tagger.model.begin_update(docs)
+    # ensure that the backprop call works (and doesn't hit a 'None' callback)
+    assert get_dX(Y) is not None

From b1a7d6c528e08c4a80594ae6338cacb22bf8b5b1 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 22 Sep 2020 14:42:51 +0200
Subject: [PATCH 098/516] Refactor seen token detection

---
 spacy/errors.py      |  4 ++--
 spacy/tokens/doc.pyx | 24 ++----------------------
 2 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index de4ffde3c..27091810d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -690,8 +690,8 @@ class Errors:
              "in more than one span in entities, blocked, missing or outside.")
     E1011 = ("Unsupported default '{default}' in doc.set_ents. Available "
              "options: {modes}")
-    E1012 = ("Spans provided to doc.set_ents must be provided as a list of "
-             "`Span` objects.")
+    E1012 = ("Entity spans and blocked/missing/outside spans should be "
+             "provided to doc.set_ents as lists of `Span` objects.")
     E1013 = ("Unable to set entity for span with empty label. Entity spans are "
              "required to have a label. To set entity information as missing "
              "or blocked, use the keyword arguments with doc.set_ents.")
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 34742e587..4bf6f0e5e 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -8,6 +8,7 @@ from libc.stdint cimport int32_t, uint64_t
 import copy
 from collections import Counter
 from enum import Enum
+import itertools
 import numpy
 import srsly
 from thinc.api import get_array_module
@@ -742,28 +743,7 @@ cdef class Doc:
 
         # Find all tokens covered by spans and check that none are overlapping
         seen_tokens = set()
-        for span in entities:
-            if not isinstance(span, Span):
-                raise ValueError(Errors.E1012.format(span=span))
-            for i in range(span.start, span.end):
-                if i in seen_tokens:
-                    raise ValueError(Errors.E1010.format(i=i))
-                seen_tokens.add(i)
-        for span in blocked:
-            if not isinstance(span, Span):
-                raise ValueError(Errors.E1012.format(span=span))
-            for i in range(span.start, span.end):
-                if i in seen_tokens:
-                    raise ValueError(Errors.E1010.format(i=i))
-                seen_tokens.add(i)
-        for span in missing:
-            if not isinstance(span, Span):
-                raise ValueError(Errors.E1012.format(span=span))
-            for i in range(span.start, span.end):
-                if i in seen_tokens:
-                    raise ValueError(Errors.E1010.format(i=i))
-                seen_tokens.add(i)
-        for span in outside:
+        for span in itertools.chain.from_iterable([entities, blocked, missing, outside]):
             if not isinstance(span, Span):
                 raise ValueError(Errors.E1012.format(span=span))
             for i in range(span.start, span.end):

From 46250293705b946b762242b0beea38f313412c58 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 22 Sep 2020 19:04:49 +0200
Subject: [PATCH 099/516] Add pin for pyrsistent<0.17.0 (#6116)

Add pin for pyrsistent<0.17.0 since pyrsistent>=0.17.1 is only
compatible with python3.5+.
---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index b93def651..367eef111 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,7 @@ plac>=0.9.6,<1.2.0
 pathlib==1.0.1; python_version < "3.4"
 tqdm>=4.38.0,<5.0.0
 # Optional dependencies
+pyrsistent<0.17.0
 jsonschema>=2.6.0,<3.1.0
 # Development dependencies
 cython>=0.25

From 9b4979407d989aab01c9734c697ac73004abefe8 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 22 Sep 2020 21:52:42 +0200
Subject: [PATCH 100/516] Fix overlapping German noun chunks (#6112)

Add a similar fix as in #5470 to prevent the German noun chunks iterator
from producing overlapping spans.
---
 spacy/lang/de/syntax_iterators.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py
index 73c1b1a6e..c5513abc0 100644
--- a/spacy/lang/de/syntax_iterators.py
+++ b/spacy/lang/de/syntax_iterators.py
@@ -38,9 +38,13 @@ def noun_chunks(doclike):
     close_app = doc.vocab.strings.add("nk")
 
     rbracket = 0
+    prev_end = -1
     for i, word in enumerate(doclike):
         if i < rbracket:
             continue
+        # Prevent nested chunks from being produced
+        if word.left_edge.i <= prev_end:
+            continue
         if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
             rbracket = word.i + 1
             # try to extend the span to the right
@@ -48,6 +52,7 @@ def noun_chunks(doclike):
             for rdep in doc[word.i].rights:
                 if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
                     rbracket = rdep.i + 1
+            prev_end = rbracket - 1
             yield word.left_edge.i, rbracket, np_label
 
 

From e0e793be4d8146768e722c23d16cf7c5b170155e Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 22 Sep 2020 21:53:06 +0200
Subject: [PATCH 101/516] fix KB IO (#6118)

---
 spacy/kb.pxd                               |  1 -
 spacy/kb.pyx                               | 47 ++++++++++++----------
 spacy/tests/pipeline/test_entity_linker.py | 23 +++++++++++
 3 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 695693666..4a71b26a2 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -140,7 +140,6 @@ cdef class KnowledgeBase:
         self._entries.push_back(entry)
         self._aliases_table.push_back(alias)
 
-    cpdef from_disk(self, loc)
     cpdef set_entities(self, entity_list, freq_list, vector_list)
 
 
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index b24ed3a20..ff5382c24 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -9,7 +9,8 @@ from libcpp.vector cimport vector
 
 from pathlib import Path
 import warnings
-from os import path
+
+from spacy import util
 
 from .typedefs cimport hash_t
 from .errors import Errors, Warnings
@@ -319,8 +320,14 @@ cdef class KnowledgeBase:
         return 0.0
 
 
-    def to_disk(self, loc):
-        cdef Writer writer = Writer(loc)
+    def to_disk(self, path):
+        path = util.ensure_path(path)
+        if path.is_dir():
+            raise ValueError(Errors.E928.format(loc=path))
+        if not path.parent.exists():
+            path.parent.mkdir(parents=True)
+
+        cdef Writer writer = Writer(path)
         writer.write_header(self.get_size_entities(), self.entity_vector_length)
 
         # dumping the entity vectors in their original order
@@ -359,7 +366,13 @@ cdef class KnowledgeBase:
 
         writer.close()
 
-    cpdef from_disk(self, loc):
+    def from_disk(self, path):
+        path = util.ensure_path(path)
+        if path.is_dir():
+            raise ValueError(Errors.E928.format(loc=path))
+        if not path.exists():
+            raise ValueError(Errors.E929.format(loc=path))
+
         cdef hash_t entity_hash
         cdef hash_t alias_hash
         cdef int64_t entry_index
@@ -369,7 +382,7 @@ cdef class KnowledgeBase:
         cdef AliasC alias
         cdef float vector_element
 
-        cdef Reader reader = Reader(loc)
+        cdef Reader reader = Reader(path)
 
         # STEP 0: load header and initialize KB
         cdef int64_t nr_entities
@@ -450,16 +463,13 @@ cdef class KnowledgeBase:
 
 
 cdef class Writer:
-    def __init__(self, object loc):
-        if isinstance(loc, Path):
-            loc = bytes(loc)
-        if path.exists(loc):
-            if path.isdir(loc):
-                raise ValueError(Errors.E928.format(loc=loc))
-        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+    def __init__(self, path):
+        assert isinstance(path, Path)
+        content = bytes(path)
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
         self._fp = fopen(<char*>bytes_loc, 'wb')
         if not self._fp:
-            raise IOError(Errors.E146.format(path=loc))
+            raise IOError(Errors.E146.format(path=path))
         fseek(self._fp, 0, 0)
 
     def close(self):
@@ -496,14 +506,9 @@ cdef class Writer:
 
 
 cdef class Reader:
-    def __init__(self, object loc):
-        if isinstance(loc, Path):
-            loc = bytes(loc)
-        if not path.exists(loc):
-            raise ValueError(Errors.E929.format(loc=loc))
-        if path.isdir(loc):
-            raise ValueError(Errors.E928.format(loc=loc))
-        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+    def __init__(self, path):
+        content = bytes(path)
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
         self._fp = fopen(<char*>bytes_loc, 'rb')
         if not self._fp:
             PyErr_SetFromErrno(IOError)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index c43d2c58e..88e0646b3 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -144,6 +144,29 @@ def test_kb_empty(nlp):
         entity_linker.begin_training(lambda: [])
 
 
+def test_kb_serialize(nlp):
+    """Test serialization of the KB"""
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    with make_tempdir() as d:
+        # normal read-write behaviour
+        mykb.to_disk(d / "kb")
+        mykb.from_disk(d / "kb")
+        mykb.to_disk(d / "kb.file")
+        mykb.from_disk(d / "kb.file")
+        mykb.to_disk(d / "new" / "kb")
+        mykb.from_disk(d / "new" / "kb")
+        # allow overwriting an existing file
+        mykb.to_disk(d / "kb.file")
+        with pytest.raises(ValueError):
+            # can not write to a directory
+            mykb.to_disk(d)
+        with pytest.raises(ValueError):
+            # can not read from a directory
+            mykb.from_disk(d)
+        with pytest.raises(ValueError):
+            # can not read from an unknown file
+            mykb.from_disk(d / "unknown" / "kb")
+
 def test_candidate_generation(nlp):
     """Test correct candidate generation"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

From e4acb286582477caaf5486833781c5802374d171 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 22 Sep 2020 21:53:33 +0200
Subject: [PATCH 102/516] Fix norm in retokenizer split (#6111)

Parallel to behavior in merge, reset norm on original token in
retokenizer split.
---
 spacy/tests/doc/test_retokenize_split.py | 19 +++++++++++++++++++
 spacy/tokens/_retokenize.pyx             |  1 +
 2 files changed, 20 insertions(+)

diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index d074fddc6..d84c846de 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -198,3 +198,22 @@ def test_doc_retokenizer_realloc(en_vocab):
         token = doc[0]
         heads = [(token, 0)] * len(token)
         retokenizer.split(doc[token.i], list(token.text), heads=heads)
+
+
+def test_doc_retokenizer_split_norm(en_vocab):
+    """#6060: reset norm in split"""
+    text = "The quick brownfoxjumpsoverthe lazy dog w/ white spots"
+    doc = Doc(en_vocab, words=text.split())
+
+    # Set custom norm on the w/ token.
+    doc[5].norm_ = "with"
+
+    # Retokenize to split out the words in the token at doc[2].
+    token = doc[2]
+    with doc.retokenize() as retokenizer:
+      retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"], heads=[(token, idx) for idx in range(5)])
+
+    assert doc[9].text  == "w/"
+    assert doc[9].norm_ == "with"
+    assert doc[5].text  == "over"
+    assert doc[5].norm_ == "over"
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index abc9b731b..4a030bef6 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -355,6 +355,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
         lex = doc.vocab.get(doc.mem, orth)
         token.lex = lex
         token.lemma = 0  # reset lemma
+        token.norm = 0  # reset norm
         if to_process_tensor:
             # setting the tensors of the split tokens to array of zeros
             doc.tensor[token_index + i] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")

From 86a08f819d192e50beff97e1b90c12f0daba2975 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 22 Sep 2020 21:54:52 +0200
Subject: [PATCH 103/516] tok2vec.update instead of predict (#6113)

---
 spacy/cli/debug_model.py             | 2 +-
 spacy/tests/pipeline/test_tok2vec.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 1d27c7c52..7f8e1dabc 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -128,7 +128,7 @@ def debug_model(
     goldY = None
     for e in range(3):
         if tok2vec:
-            tok2vec.predict(X)
+            tok2vec.update([Example.from_dict(x, {}) for x in X])
         Y, get_dX = model.begin_update(X)
         if goldY is None:
             goldY = _simulate_gold(Y)
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 6041657d3..985314217 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -184,7 +184,7 @@ def test_tok2vec_listener_callback():
     label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")]
     tagger.model.initialize(X=docs, Y=label_sample)
     docs = [nlp.make_doc("Another entirely random sentence")]
-    tok2vec.predict(docs)
+    tok2vec.update([Example.from_dict(x, {}) for x in docs])
     Y, get_dX = tagger.model.begin_update(docs)
     # assure that the backprop call works (and doesn't hit a 'None' callback)
     assert get_dX(Y) is not None

From 4a56ea72b545ea1162ae85d3b1ccc37f809182ec Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 23 Sep 2020 09:15:07 +0200
Subject: [PATCH 104/516] fallbacks for old names

---
 spacy/training/iob_utils.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py
index 63deed3a5..03a502912 100644
--- a/spacy/training/iob_utils.py
+++ b/spacy/training/iob_utils.py
@@ -50,6 +50,10 @@ def _consume_ent(tags):
         return [start] + middle + [end]
 
 
+def biluo_tags_from_doc(doc, missing="O"):
+    return doc_to_biluo_tags(doc, missing)
+
+
 def doc_to_biluo_tags(doc, missing="O"):
     return offsets_to_biluo_tags(
         doc,
@@ -58,6 +62,10 @@ def doc_to_biluo_tags(doc, missing="O"):
     )
 
 
+def biluo_tags_from_offsets(doc, entities, missing="O"):
+    return offsets_to_biluo_tags(doc, entities, missing)
+
+
 def offsets_to_biluo_tags(doc, entities, missing="O"):
     """Encode labelled spans into per-token tags, using the
     Begin/In/Last/Unit/Out scheme (BILUO).
@@ -143,6 +151,10 @@ def offsets_to_biluo_tags(doc, entities, missing="O"):
     return biluo
 
 
+def spans_from_biluo_tags(doc, tags):
+    return biluo_tags_to_spans(doc, tags)
+
+
 def biluo_tags_to_spans(doc, tags):
     """Encode per-token tags following the BILUO scheme into Span object, e.g.
     to overwrite the doc.ents.
@@ -161,6 +173,10 @@ def biluo_tags_to_spans(doc, tags):
     return spans
 
 
+def offsets_from_biluo_tags(doc, tags):
+    return biluo_tags_to_offsets(doc, tags)
+
+
 def biluo_tags_to_offsets(doc, tags):
     """Encode per-token tags following the BILUO scheme into entity offsets.
 

From 556f3e4652a33eb1465e1f886310653d8e3d2fd2 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 23 Sep 2020 09:24:28 +0200
Subject: [PATCH 105/516] add pooling to NEL's TransformerListener

---
 spacy/cli/templates/quickstart_training.jinja | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index a0d9f78ac..c55374899 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -107,6 +107,9 @@ nO = null
 [components.entity_linker.model.tok2vec]
 @architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0
+
+[components.entity_linker.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {% endif -%}
 
 {% if "textcat" in components %}

From f976bab710dae664501e6fecd7360053a080090e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 09:30:09 +0200
Subject: [PATCH 106/516] Remove empty file [ci skip]

---
 spacy/lang/cs/test_text.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 spacy/lang/cs/test_text.py

diff --git a/spacy/lang/cs/test_text.py b/spacy/lang/cs/test_text.py
deleted file mode 100644
index e69de29bb..000000000

From d8f661c9103b6b0a09de5b0e25428782d6736006 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 09:30:26 +0200
Subject: [PATCH 107/516] Update docs [ci skip]

---
 README.md                   |   4 +-
 website/meta/languages.json | 239 +++++++++++++++++-------------------
 2 files changed, 113 insertions(+), 130 deletions(-)

diff --git a/README.md b/README.md
index d23051af0..61cefb69a 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ be used in real products.
 
 spaCy comes with
 [pretrained pipelines](https://spacy.io/models) and vectors, and
-currently supports tokenization for **59+ languages**. It features
+currently supports tokenization for **60+ languages**. It features
 state-of-the-art speed, convolutional **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management.
 spaCy is commercial open-source software, released under the MIT license.
@@ -69,7 +69,7 @@ it.
 
 ## Features
 
-- Support for **59+ languages**
+- Support for **60+ languages**
 - **Trained pipelines**
 - Multi-task learning with pretrained **transformers** like BERT
 - Pretrained **word vectors**
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 493f96c49..5ef3a6469 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -1,21 +1,11 @@
 {
     "languages": [
-        {
-            "code": "zh",
-            "name": "Chinese",
-            "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"],
-            "dependencies": [
-                {
-                    "name": "Jieba",
-                    "url": "https://github.com/fxsjy/jieba"
-                },
-                {
-                    "name": "PKUSeg",
-                    "url": "https://github.com/lancopku/PKUSeg-python"
-                }
-            ],
-            "has_examples": true
-        },
+        { "code": "af", "name": "Afrikaans" },
+        { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true },
+        { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true },
+        { "code": "bn", "name": "Bengali", "has_examples": true },
+        { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
+        { "code": "cs", "name": "Czech", "has_examples": true },
         {
             "code": "da",
             "name": "Danish",
@@ -23,39 +13,10 @@
             "has_examples": true,
             "models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"]
         },
-        {
-            "code": "nl",
-            "name": "Dutch",
-            "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
-            "example": "Dit is een zin.",
-            "has_examples": true
-        },
-        {
-            "code": "en",
-            "name": "English",
-            "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"],
-            "starters": [
-                "en_vectors_web_lg",
-                "en_trf_bertbaseuncased_lg",
-                "en_trf_robertabase_lg",
-                "en_trf_distilbertbaseuncased_lg",
-                "en_trf_xlnetbasecased_lg"
-            ],
-            "example": "This is a sentence.",
-            "has_examples": true
-        },
-        {
-            "code": "fr",
-            "name": "French",
-            "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"],
-            "example": "C'est une phrase.",
-            "has_examples": true
-        },
         {
             "code": "de",
             "name": "German",
-            "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"],
-            "starters": ["de_trf_bertbasecased_lg"],
+            "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"],
             "example": "Dies ist ein Satz.",
             "has_examples": true
         },
@@ -66,6 +27,46 @@
             "example": "Αυτή είναι μια πρόταση.",
             "has_examples": true
         },
+        {
+            "code": "en",
+            "name": "English",
+            "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"],
+            "starters": ["en_vectors_web_lg"],
+            "example": "This is a sentence.",
+            "has_examples": true
+        },
+        {
+            "code": "es",
+            "name": "Spanish",
+            "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"],
+            "example": "Esto es una frase.",
+            "has_examples": true
+        },
+        { "code": "et", "name": "Estonian" },
+        { "code": "eu", "name": "Basque", "has_examples": true },
+        { "code": "fa", "name": "Persian", "has_examples": true },
+        { "code": "fi", "name": "Finnish", "has_examples": true },
+        {
+            "code": "fr",
+            "name": "French",
+            "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"],
+            "example": "C'est une phrase.",
+            "has_examples": true
+        },
+        { "code": "ga", "name": "Irish" },
+        { "code": "gu", "name": "Gujarati", "has_examples": true },
+        { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
+        { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true },
+        { "code": "hr", "name": "Croatian", "has_examples": true },
+        { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true },
+        { "code": "hy", "name": "Armenian", "has_examples": true },
+        {
+            "code": "id",
+            "name": "Indonesian",
+            "example": "Ini adalah sebuah kalimat.",
+            "has_examples": true
+        },
+        { "code": "is", "name": "Icelandic" },
         {
             "code": "it",
             "name": "Italian",
@@ -88,12 +89,37 @@
             "example": "これは文章です。",
             "has_examples": true
         },
+        { "code": "kn", "name": "Kannada", "has_examples": true },
+        {
+            "code": "ko",
+            "name": "Korean",
+            "dependencies": [
+                {
+                    "name": "mecab-ko",
+                    "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md"
+                },
+                { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" },
+                { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" }
+            ],
+            "example": "이것은 문장입니다.",
+            "has_examples": true
+        },
+        { "code": "lb", "name": "Luxembourgish", "has_examples": true },
+        {
+            "code": "lij",
+            "name": "Ligurian",
+            "example": "Sta chì a l'é unna fraxe.",
+            "has_examples": true
+        },
         {
             "code": "lt",
             "name": "Lithuanian",
             "has_examples": true,
             "models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"]
         },
+        { "code": "lv", "name": "Latvian" },
+        { "code": "ml", "name": "Malayalam", "has_examples": true },
+        { "code": "mr", "name": "Marathi" },
         {
             "code": "nb",
             "name": "Norwegian Bokmål",
@@ -101,6 +127,14 @@
             "has_examples": true,
             "models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"]
         },
+        { "code": "ne", "name": "Nepali", "has_examples": true },
+        {
+            "code": "nl",
+            "name": "Dutch",
+            "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
+            "example": "Dit is een zin.",
+            "has_examples": true
+        },
         {
             "code": "pl",
             "name": "Polish",
@@ -122,69 +156,26 @@
             "has_examples": true,
             "models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"]
         },
-        {
-            "code": "es",
-            "name": "Spanish",
-            "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"],
-            "example": "Esto es una frase.",
-            "has_examples": true
-        },
-        { "code": "sv", "name": "Swedish", "has_examples": true },
-        { "code": "fi", "name": "Finnish", "has_examples": true },
-        { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true },
         {
             "code": "ru",
             "name": "Russian",
             "has_examples": true,
             "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
         },
-        {
-            "code": "uk",
-            "name": "Ukrainian",
-            "has_examples": true,
-            "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
-        },
-        { "code": "hr", "name": "Croatian", "has_examples": true },
-        { "code": "eu", "name": "Basque", "has_examples": true },
-        { "code": "yo", "name": "Yoruba", "has_examples": true },
-        { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
-        { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
-        { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
-        { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true },
-        { "code": "fa", "name": "Persian", "has_examples": true },
-        { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true },
-        { "code": "tt", "name": "Tatar", "has_examples": true },
-        { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true },
+        { "code": "sa", "name": "Sanskrit", "has_examples": true },
         { "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true },
-        { "code": "ga", "name": "Irish" },
-        { "code": "bn", "name": "Bengali", "has_examples": true },
-        { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true },
-        { "code": "mr", "name": "Marathi" },
-        { "code": "kn", "name": "Kannada" },
-        { "code": "ta", "name": "Tamil", "has_examples": true },
-        {
-            "code": "id",
-            "name": "Indonesian",
-            "example": "Ini adalah sebuah kalimat.",
-            "has_examples": true
-        },
-        { "code": "tl", "name": "Tagalog" },
-        { "code": "af", "name": "Afrikaans" },
-        { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true },
-        { "code": "cs", "name": "Czech" },
-        { "code": "is", "name": "Icelandic" },
-        { "code": "lv", "name": "Latvian" },
-        { "code": "sr", "name": "Serbian" },
-        { "code": "sk", "name": "Slovak" },
+        { "code": "sk", "name": "Slovak", "has_examples": true },
         { "code": "sl", "name": "Slovenian" },
-        { "code": "lb", "name": "Luxembourgish" },
         {
             "code": "sq",
             "name": "Albanian",
             "example": "Kjo është një fjali.",
             "has_examples": true
         },
-        { "code": "et", "name": "Estonian" },
+        { "code": "sr", "name": "Serbian", "has_examples": true },
+        { "code": "sv", "name": "Swedish", "has_examples": true },
+        { "code": "ta", "name": "Tamil", "has_examples": true },
+        { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true },
         {
             "code": "th",
             "name": "Thai",
@@ -194,51 +185,43 @@
             "example": "นี่คือประโยค",
             "has_examples": true
         },
+        { "code": "tl", "name": "Tagalog" },
+        { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
+        { "code": "tt", "name": "Tatar", "has_examples": true },
         {
-            "code": "ko",
-            "name": "Korean",
-            "dependencies": [
-                {
-                    "name": "mecab-ko",
-                    "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md"
-                },
-                { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" },
-                { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" }
-            ],
-            "example": "이것은 문장입니다.",
-            "has_examples": true
+            "code": "uk",
+            "name": "Ukrainian",
+            "has_examples": true,
+            "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
         },
+        { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true },
         {
             "code": "vi",
             "name": "Vietnamese",
             "dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }]
         },
-        {
-            "code": "lij",
-            "name": "Ligurian",
-            "example": "Sta chì a l'é unna fraxe.",
-            "has_examples": true
-        },
-        {
-            "code": "hy",
-            "name": "Armenian",
-            "has_examples": true
-        },
-        {
-            "code": "gu",
-            "name": "Gujarati",
-            "has_examples": true
-        },
-        {
-            "code": "ml",
-            "name": "Malayalam",
-            "has_examples": true
-        },
         {
             "code": "xx",
             "name": "Multi-language",
             "models": ["xx_ent_wiki_sm"],
             "example": "This is a sentence about Facebook."
+        },
+        { "code": "yo", "name": "Yoruba", "has_examples": true },
+        {
+            "code": "zh",
+            "name": "Chinese",
+            "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"],
+            "dependencies": [
+                {
+                    "name": "Jieba",
+                    "url": "https://github.com/fxsjy/jieba"
+                },
+                {
+                    "name": "PKUSeg",
+                    "url": "https://github.com/lancopku/PKUSeg-python"
+                }
+            ],
+            "has_examples": true
         }
     ],
     "licenses": [

From 930b116f004bf4413851da6710712a77ae118dbb Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 09:35:21 +0200
Subject: [PATCH 108/516] Update docs [ci skip]

---
 website/docs/usage/v3.md         | 5 ++++-
 website/src/widgets/languages.js | 4 ++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 406ba4b75..28bd02e3e 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -88,7 +88,10 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
 - **Architectures: ** [TransformerModel](/api/architectures#TransformerModel),
   [TransformerListener](/api/architectures#TransformerListener),
   [Tok2VecTransformer](/api/architectures#Tok2VecTransformer)
-- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf)
+- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf),
+  [`de_dep_news_trf`](/models/de#de_dep_news_trf),
+  [`es_dep_news_trf`](/models/es#es_dep_news_trf),
+  [`fr_dep_news_trf`](/models/fr#fr_dep_news_trf)
 - **Implementation:**
   [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
 
diff --git a/website/src/widgets/languages.js b/website/src/widgets/languages.js
index bb26e57cd..74d850182 100644
--- a/website/src/widgets/languages.js
+++ b/website/src/widgets/languages.js
@@ -22,7 +22,7 @@ const Language = ({ name, code, models }) => (
         <Td>
             {models && models.length ? (
                 <Link to={`/models/${code}`}>
-                    {models.length} {models.length === 1 ? 'model' : 'models'}
+                    {models.length} {models.length === 1 ? 'package' : 'packages'}
                 </Link>
             ) : (
                 <em>none yet</em>
@@ -51,7 +51,7 @@ const Languages = () => (
                                 <Th>Language</Th>
                                 <Th>Code</Th>
                                 <Th>Language Data</Th>
-                                <Th>Models</Th>
+                                <Th>Pipelines</Th>
                             </Tr>
                         </thead>
                         <tbody>

From 566d0487538c547dc40c14a80341c92a73378399 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 09:43:51 +0200
Subject: [PATCH 109/516] Fix project repo link [ci skip]

---
 website/src/widgets/project.js | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/website/src/widgets/project.js b/website/src/widgets/project.js
index 8d309394d..9e23d60ea 100644
--- a/website/src/widgets/project.js
+++ b/website/src/widgets/project.js
@@ -16,7 +16,8 @@ export default function Project({
 }) {
     const repoArg = repo ? ` --repo ${repo}` : ''
     const text = `${COMMAND} ${id}${repoArg}`
-    const url = `${repo || projectsRepo}/${id}`
+    const defaultRepo = `https://github.com/${projectsRepo}`
+    const url = `${repo || defaultRepo}/${id}`
     const header = (
         <>
             {title}:{' '}

From 61235445db66b66181d76d217c92d2501128f699 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 09:45:32 +0200
Subject: [PATCH 110/516] Update README.md [ci skip]

---
 README.md | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 61cefb69a..3e5e5febe 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ state-of-the-art speed, convolutional **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management.
 spaCy is commercial open-source software, released under the MIT license.
 
-💫 **Version 2.3 out now!**
+💫 **Version 3.0 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -29,16 +29,17 @@ spaCy is commercial open-source software, released under the MIT license.
 
 ## 📖 Documentation
 
-| Documentation   |                                                                |
-| --------------- | -------------------------------------------------------------- |
-| [spaCy 101]     | New to spaCy? Here's everything you need to know!              |
-| [Usage Guides]  | How to use spaCy and its features.                             |
-| [New in v3.0]   | New features, backwards incompatibilities and migration guide. |
-| [API Reference] | The detailed reference for spaCy's API.                        |
-| [Models]        | Download statistical language models for spaCy.                |
-| [Universe]      | Libraries, extensions, demos, books and courses.               |
-| [Changelog]     | Changes and version history.                                   |
-| [Contribute]    | How to contribute to the spaCy project and code base.          |
+| Documentation       |                                                                |
+| ------------------- | -------------------------------------------------------------- |
+| [spaCy 101]         | New to spaCy? Here's everything you need to know!              |
+| [Usage Guides]      | How to use spaCy and its features.                             |
+| [New in v3.0]       | New features, backwards incompatibilities and migration guide. |
+| [Project Templates] | End-to-end workflows you can clone, modify and run.            |
+| [API Reference]     | The detailed reference for spaCy's API.                        |
+| [Models]            | Download statistical language models for spaCy.                |
+| [Universe]          | Libraries, extensions, demos, books and courses.               |
+| [Changelog]         | Changes and version history.                                   |
+| [Contribute]        | How to contribute to the spaCy project and code base.          |
 
 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3
@@ -46,6 +47,7 @@ spaCy is commercial open-source software, released under the MIT license.
 [api reference]: https://spacy.io/api/
 [models]: https://spacy.io/models
 [universe]: https://spacy.io/universe
+[project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
 

From 6ca06cb62cdbcddd1071fcc05871d675704c47a2 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 10:14:27 +0200
Subject: [PATCH 111/516] Update docs and formatting [ci skip]

---
 spacy/cli/templates/quickstart_training.jinja |  2 +-
 website/docs/api/top-level.md                 | 19 ++++++++++
 website/docs/usage/v3.md                      | 30 ++++++++-------
 website/src/components/infobox.js             | 37 ++++++++++---------
 4 files changed, 56 insertions(+), 32 deletions(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index c55374899..7241c5116 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -327,7 +327,7 @@ sents_f = 0.0
 ents_f = {{ (1.0 / components|length)|round(2) }}
 ents_p = 0.0
 ents_r = 0.0
-{%- endif -%}
+{%- endif %}
 {%- if "textcat" in components %}
 cats_score = {{ (1.0 / components|length)|round(2) }}
 {%- endif -%}
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 2c082ae0b..f36be0806 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -632,6 +632,12 @@ the beginning of a multi-token entity, `I` the inside of an entity of three or
 more tokens, and `L` the end of an entity of two or more tokens. `U` denotes a
 single-token entity.
 
+<Infobox title="Changed in v3.0" variant="warning" id="biluo_tags_from_offsets">
+
+This function was previously available as `spacy.gold.biluo_tags_from_offsets`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
@@ -647,6 +653,7 @@ single-token entity.
 | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `doc`       | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. ~~Doc~~                                                             |
 | `entities`  | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ |
+| `missing`   | The label used for missing values, e.g. if tokenization doesn't align with the entity offsets. Defaults to `"O"`. ~~str~~                                                                  |
 | **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~                                                                                    |
 
 ### training.biluo_tags_to_offsets {#biluo_tags_to_offsets tag="function"}
@@ -654,6 +661,12 @@ single-token entity.
 Encode per-token tags following the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets.
 
+<Infobox title="Changed in v3.0" variant="warning" id="offsets_from_biluo_tags">
+
+This function was previously available as `spacy.gold.offsets_from_biluo_tags`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
@@ -678,6 +691,12 @@ Encode per-token tags following the
 [`Span`](/api/span) objects. This can be used to create entity spans from
 token-based tags, e.g. to overwrite the `doc.ents`.
 
+<Infobox title="Changed in v3.0" variant="warning" id="spans_from_biluo_tags">
+
+This function was previously available as `spacy.gold.spans_from_biluo_tags`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 88935e720..91d97cae2 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -551,17 +551,19 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 
 ### Removed or renamed API {#incompat-removed}
 
-| Removed                                                  | Replacement                                                                                                  |
-| -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ |
-| `Language.disable_pipes`                                 | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
-| `GoldParse`                                              | [`Example`](/api/example)                                                                                    |
-| `GoldCorpus`                                             | [`Corpus`](/api/corpus)                                                                                      |
-| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump`          | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk)                   |
-| `Matcher.pipe`, `PhraseMatcher.pipe`                     | not needed                                                                                                   |
-| `spacy init-model`                                       | [`spacy init vocab`](/api/cli#init-vocab)                                                                    |
-| `spacy debug-data`                                       | [`spacy debug data`](/api/cli#debug-data)                                                                    |
-| `spacy profile`                                          | [`spacy debug profile`](/api/cli#debug-profile)                                                              |
-| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated                                                                          |
+| Removed                                                                                      | Replacement                                                                                                                                                                                                              |
+| -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `Language.disable_pipes`                                                                     | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe)                                                                                                             |
+| `Doc.is_tagged`, `Doc.is_parsed`, ...                                                        | [`Doc.has_annotation`](/api/doc#has_annotation)                                                                                                                                                                          |
+| `GoldParse`                                                                                  | [`Example`](/api/example)                                                                                                                                                                                                |
+| `GoldCorpus`                                                                                 | [`Corpus`](/api/corpus)                                                                                                                                                                                                  |
+| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump`                                              | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk)                                                                                                                               |
+| `Matcher.pipe`, `PhraseMatcher.pipe`                                                         | not needed                                                                                                                                                                                                               |
+| `gold.offsets_from_biluo_tags`, `gold.spans_from_biluo_tags`, `gold.biluo_tags_from_offsets` | [`training.biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets), [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), [`training.offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) |
+| `spacy init-model`                                                                           | [`spacy init vocab`](/api/cli#init-vocab)                                                                                                                                                                                |
+| `spacy debug-data`                                                                           | [`spacy debug data`](/api/cli#debug-data)                                                                                                                                                                                |
+| `spacy profile`                                                                              | [`spacy debug profile`](/api/cli#debug-profile)                                                                                                                                                                          |
+| `spacy link`, `util.set_data_path`, `util.get_data_path`                                     | not needed, symlinks are deprecated                                                                                                                                                                                      |
 
 The following deprecated methods, attributes and arguments were removed in v3.0.
 Most of them have been **deprecated for a while** and many would previously
@@ -971,9 +973,9 @@ python -m spacy package ./output ./packages
 
 #### Data utilities and gold module {#migrating-gold}
 
-The `spacy.gold` module has been renamed to `spacy.training` and the conversion 
-utilities now follow the naming format of `x_to_y`. This mostly
-affects internals, but if you've been using the span offset conversion utilities
+The `spacy.gold` module has been renamed to `spacy.training` and the conversion
+utilities now follow the naming format of `x_to_y`. This mostly affects
+internals, but if you've been using the span offset conversion utilities
 [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags),
 [`biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets) or
 [`biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), you'll have to
diff --git a/website/src/components/infobox.js b/website/src/components/infobox.js
index 968b6cea8..b5a7af545 100644
--- a/website/src/components/infobox.js
+++ b/website/src/components/infobox.js
@@ -20,24 +20,27 @@ export default function Infobox({
         [classes.danger]: variant === 'danger',
     })
     return (
-        <aside className={infoboxClassNames} id={id}>
-            {title && (
-                <h4 className={classes.title}>
-                    {variant !== 'default' && !emoji && (
-                        <Icon width={18} name={variant} inline className={classes.icon} />
-                    )}
-                    <span className={classes.titleText}>
-                        {emoji && (
-                            <span className={classes.emoji} aria-hidden="true">
-                                {emoji}
-                            </span>
+        <>
+            {id && <a id={id} />}
+            <aside className={infoboxClassNames}>
+                {title && (
+                    <h4 className={classes.title}>
+                        {variant !== 'default' && !emoji && (
+                            <Icon width={18} name={variant} inline className={classes.icon} />
                         )}
-                        {title}
-                    </span>
-                </h4>
-            )}
-            {children}
-        </aside>
+                        <span className={classes.titleText}>
+                            {emoji && (
+                                <span className={classes.emoji} aria-hidden="true">
+                                    {emoji}
+                                </span>
+                            )}
+                            {title}
+                        </span>
+                    </h4>
+                )}
+                {children}
+            </aside>
+        </>
     )
 }
 

From ae5dacf75f490c1b64257235cc2e4c93306d226e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 10:14:34 +0200
Subject: [PATCH 112/516] Tidy up and add types

---
 spacy/training/iob_utils.py | 54 +++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 30 deletions(-)

diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py
index 03a502912..91fc40205 100644
--- a/spacy/training/iob_utils.py
+++ b/spacy/training/iob_utils.py
@@ -1,9 +1,11 @@
+from typing import List, Tuple, Iterable, Union, Iterator
 import warnings
+
 from ..errors import Errors, Warnings
-from ..tokens import Span
+from ..tokens import Span, Doc
 
 
-def iob_to_biluo(tags):
+def iob_to_biluo(tags: Iterable[str]) -> List[str]:
     out = []
     tags = list(tags)
     while tags:
@@ -12,7 +14,7 @@ def iob_to_biluo(tags):
     return out
 
 
-def biluo_to_iob(tags):
+def biluo_to_iob(tags: Iterable[str]) -> List[str]:
     out = []
     for tag in tags:
         if tag is None:
@@ -23,12 +25,12 @@ def biluo_to_iob(tags):
     return out
 
 
-def _consume_os(tags):
+def _consume_os(tags: List[str]) -> Iterator[str]:
     while tags and tags[0] == "O":
         yield tags.pop(0)
 
 
-def _consume_ent(tags):
+def _consume_ent(tags: List[str]) -> List[str]:
     if not tags:
         return []
     tag = tags.pop(0)
@@ -50,11 +52,7 @@ def _consume_ent(tags):
         return [start] + middle + [end]
 
 
-def biluo_tags_from_doc(doc, missing="O"):
-    return doc_to_biluo_tags(doc, missing)
-
-
-def doc_to_biluo_tags(doc, missing="O"):
+def doc_to_biluo_tags(doc: Doc, missing: str = "O"):
     return offsets_to_biluo_tags(
         doc,
         [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
@@ -62,11 +60,9 @@ def doc_to_biluo_tags(doc, missing="O"):
     )
 
 
-def biluo_tags_from_offsets(doc, entities, missing="O"):
-    return offsets_to_biluo_tags(doc, entities, missing)
-
-
-def offsets_to_biluo_tags(doc, entities, missing="O"):
+def offsets_to_biluo_tags(
+    doc: Doc, entities: Iterable[Tuple[int, int, Union[str, int]]], missing: str = "O"
+) -> List[str]:
     """Encode labelled spans into per-token tags, using the
     Begin/In/Last/Unit/Out scheme (BILUO).
 
@@ -77,7 +73,7 @@ def offsets_to_biluo_tags(doc, entities, missing="O"):
         the original string.
     RETURNS (list): A list of unicode strings, describing the tags. Each tag
         string will be of the form either "", "O" or "{action}-{label}", where
-        action is one of "B", "I", "L", "U". The string "-" is used where the
+        action is one of "B", "I", "L", "U". The missing label is used where the
         entity offsets don't align with the tokenization in the `Doc` object.
         The training algorithm will view these as missing values. "O" denotes a
         non-entity token. "B" denotes the beginning of a multi-token entity,
@@ -93,7 +89,6 @@ def offsets_to_biluo_tags(doc, entities, missing="O"):
     """
     # Ensure no overlapping entity labels exist
     tokens_in_ents = {}
-
     starts = {token.idx: token.i for token in doc}
     ends = {token.idx + len(token): token.i for token in doc}
     biluo = ["-" for _ in doc]
@@ -117,7 +112,6 @@ def offsets_to_biluo_tags(doc, entities, missing="O"):
                         )
                     )
                 tokens_in_ents[token_index] = (start_char, end_char, label)
-
             start_token = starts.get(start_char)
             end_token = ends.get(end_char)
             # Only interested if the tokenization is correct
@@ -151,11 +145,7 @@ def offsets_to_biluo_tags(doc, entities, missing="O"):
     return biluo
 
 
-def spans_from_biluo_tags(doc, tags):
-    return biluo_tags_to_spans(doc, tags)
-
-
-def biluo_tags_to_spans(doc, tags):
+def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]:
     """Encode per-token tags following the BILUO scheme into Span object, e.g.
     to overwrite the doc.ents.
 
@@ -173,11 +163,9 @@ def biluo_tags_to_spans(doc, tags):
     return spans
 
 
-def offsets_from_biluo_tags(doc, tags):
-    return biluo_tags_to_offsets(doc, tags)
-
-
-def biluo_tags_to_offsets(doc, tags):
+def biluo_tags_to_offsets(
+    doc: Doc, tags: Iterable[str]
+) -> List[Tuple[int, int, Union[str, int]]]:
     """Encode per-token tags following the BILUO scheme into entity offsets.
 
     doc (Doc): The document that the BILUO tags refer to.
@@ -192,8 +180,8 @@ def biluo_tags_to_offsets(doc, tags):
     return [(span.start_char, span.end_char, span.label_) for span in spans]
 
 
-def tags_to_entities(tags):
-    """ Note that the end index returned by this function is inclusive.
+def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]:
+    """Note that the end index returned by this function is inclusive.
     To use it for Span creation, increment the end by 1."""
     entities = []
     start = None
@@ -225,3 +213,9 @@ def tags_to_entities(tags):
         else:
             raise ValueError(Errors.E068.format(tag=tag))
     return entities
+
+
+# Fallbacks to make backwards-compat easier
+offsets_from_biluo_tags = biluo_tags_to_offsets
+spans_from_biluo_tags = biluo_tags_to_spans
+biluo_tags_from_offsets = offsets_to_biluo_tags

From 20b0ec5dcf5b97a3c406ec6bd7aa3f32223c63fa Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 23 Sep 2020 10:37:12 +0200
Subject: [PATCH 113/516] avoid logging performance of frozen components

---
 spacy/cli/train.py        | 6 ++++--
 spacy/training/loggers.py | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index bf3749c9e..811a3ba86 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -152,7 +152,8 @@ def train(
         exclude=frozen_components,
     )
     msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
-    print_row, finalize_logger = train_logger(nlp)
+    with nlp.select_pipes(disable=[*frozen_components]):
+        print_row, finalize_logger = train_logger(nlp)
 
     try:
         progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
@@ -163,7 +164,8 @@ def train(
                 progress.close()
                 print_row(info)
                 if is_best_checkpoint and output_path is not None:
-                    update_meta(T_cfg, nlp, info)
+                    with nlp.select_pipes(disable=[*frozen_components]):
+                        update_meta(T_cfg, nlp, info)
                     with nlp.use_params(optimizer.averages):
                         nlp.to_disk(output_path / "model-best")
                 progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index 92b598033..dddf20169 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -11,9 +11,11 @@ def console_logger():
     def setup_printer(
         nlp: "Language",
     ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
+        # We assume here that only the components that should be trained & logged are enabled
+        logged_pipes = nlp.pipe_names
         score_cols = list(nlp.config["training"]["score_weights"])
         score_widths = [max(len(col), 6) for col in score_cols]
-        loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
+        loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
         loss_widths = [max(len(col), 8) for col in loss_cols]
         table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
         table_header = [col.upper() for col in table_header]
@@ -26,7 +28,7 @@ def console_logger():
             try:
                 losses = [
                     "{0:.2f}".format(float(info["losses"][pipe_name]))
-                    for pipe_name in nlp.pipe_names
+                    for pipe_name in logged_pipes
                 ]
             except KeyError as e:
                 raise KeyError(

From 6435458d517e1ca689d2bcf6f996df59218957bf Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 23 Sep 2020 12:12:38 +0200
Subject: [PATCH 114/516] simplify expression

---
 spacy/cli/train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 811a3ba86..2900ef379 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -152,7 +152,7 @@ def train(
         exclude=frozen_components,
     )
     msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
-    with nlp.select_pipes(disable=[*frozen_components]):
+    with nlp.select_pipes(disable=frozen_components):
         print_row, finalize_logger = train_logger(nlp)
 
     try:
@@ -164,7 +164,7 @@ def train(
                 progress.close()
                 print_row(info)
                 if is_best_checkpoint and output_path is not None:
-                    with nlp.select_pipes(disable=[*frozen_components]):
+                    with nlp.select_pipes(disable=frozen_components):
                         update_meta(T_cfg, nlp, info)
                     with nlp.use_params(optimizer.averages):
                         nlp.to_disk(output_path / "model-best")

From 02b69dd0d532fb4c8835868332268e2f6eead511 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 12:56:54 +0200
Subject: [PATCH 115/516] Update models directory [ci skip]

---
 website/src/templates/models.js | 108 +++++++++++++-------------------
 1 file changed, 44 insertions(+), 64 deletions(-)

diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 5061972b8..5d705048b 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -12,7 +12,6 @@ import Tag from '../components/tag'
 import { H2, Label } from '../components/typography'
 import Icon from '../components/icon'
 import Link from '../components/link'
-import Grid from '../components/grid'
 import Infobox from '../components/infobox'
 import Accordion from '../components/accordion'
 import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
@@ -31,10 +30,16 @@ const MODEL_META = {
     wiki: 'Wikipedia',
     uas: 'Unlabelled dependencies',
     las: 'Labelled dependencies',
+    token_acc: 'Tokenization',
+    tok: 'Tokenization',
     tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
-    ents_f: 'Entities (F-score)',
-    ents_p: 'Entities (precision)',
-    ents_r: 'Entities (recall)',
+    tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
+    ents_f: 'Named entities (F-score)',
+    ents_p: 'Named entities (precision)',
+    ents_r: 'Named entities (recall)',
+    sent_f: 'Sentence segmentation (F-score)',
+    sent_p: 'Sentence segmentation (precision)',
+    sent_r: 'Sentence segmentation (recall)',
     cpu: 'words per second on CPU',
     gpu: 'words per second on GPU',
     pipeline: 'Active processing pipeline components in order',
@@ -83,25 +88,19 @@ function formatVectors(data) {
 }
 
 function formatAccuracy(data) {
-    if (!data) return null
-    const labels = {
-        las: 'LAS',
-        uas: 'UAS',
-        tags_acc: 'TAG',
-        ents_f: 'NER F',
-        ents_p: 'NER P',
-        ents_r: 'NER R',
-    }
-    const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key)
-    const isNer = key => key.startsWith('ents_')
+    if (!data) return []
     return Object.keys(data)
-        .filter(key => labels[key])
-        .map(key => ({
-            label: labels[key],
-            value: data[key].toFixed(2),
-            help: MODEL_META[key],
-            type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null,
-        }))
+        .map(label => {
+            const value = data[label]
+            return isNaN(value)
+                ? null
+                : {
+                      label,
+                      value: value.toFixed(2),
+                      help: MODEL_META[label],
+                  }
+        })
+        .filter(item => item)
 }
 
 function formatModelMeta(data) {
@@ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
         { label: 'Author', content: author },
         { label: 'License', content: license },
     ]
-    const accuracy = [
-        {
-            label: 'Syntax Accuracy',
-            items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null,
-        },
-        {
-            label: 'NER Accuracy',
-            items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'ner') : null,
-        },
-    ]
 
     const error = (
         <Infobox title="Unable to load model details from GitHub" variant="danger">
@@ -209,7 +198,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
             </p>
         </Infobox>
     )
-
     return (
         <Section id={name}>
             <H2
@@ -254,33 +242,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
                     )}
                 </tbody>
             </Table>
-            <Grid cols={2} gutterBottom={hasInteractiveCode || !!labels}>
-                {accuracy &&
-                    accuracy.map(({ label, items }, i) =>
-                        !items ? null : (
-                            <Table fixed key={i}>
-                                <thead>
-                                    <Tr>
-                                        <Th colSpan={2}>{label}</Th>
-                                    </Tr>
-                                </thead>
-                                <tbody>
-                                    {items.map((item, i) => (
-                                        <Tr key={i}>
-                                            <Td>
-                                                <Label>
-                                                    {item.label}{' '}
-                                                    {item.help && <Help>{item.help}</Help>}
-                                                </Label>
-                                            </Td>
-                                            <Td num>{item.value}</Td>
-                                        </Tr>
-                                    ))}
-                                </tbody>
-                            </Table>
-                        )
-                    )}
-            </Grid>
             {meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
             {hasInteractiveCode && (
                 <CodeBlock title="Try out the model" lang="python" executable={true}>
@@ -288,7 +249,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
                         `import spacy`,
                         `from spacy.lang.${langId}.examples import sentences `,
                         ``,
-                        `nlp = spacy.load('${name}')`,
+                        `nlp = spacy.load("${name}")`,
                         `doc = nlp(sentences[0])`,
                         `print(doc.text)`,
                         `for token in doc:`,
@@ -296,6 +257,25 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
                     ].join('\n')}
                 </CodeBlock>
             )}
+            {meta.accuracy && (
+                <Accordion id={`${name}-accuracy`} title="Accuracy Evaluation">
+                    <Table>
+                        <tbody>
+                            {meta.accuracy.map(({ label, value, help }) => (
+                                <Tr key={`${name}-${label}`}>
+                                    <Td nowrap>
+                                        <InlineCode>{label.toUpperCase()}</InlineCode>
+                                    </Td>
+                                    <Td>{help}</Td>
+                                    <Td num style={{ textAlign: 'right' }}>
+                                        {value}
+                                    </Td>
+                                </Tr>
+                            ))}
+                        </tbody>
+                    </Table>
+                </Accordion>
+            )}
             {labels && (
                 <Accordion id={`${name}-labels`} title="Label Scheme">
                     <p>
@@ -313,7 +293,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
                                 const labelNames = labels[pipe] || []
                                 const help = LABEL_SCHEME_META[pipe]
                                 return (
-                                    <Tr key={pipe} evenodd={false} key={pipe}>
+                                    <Tr key={`${name}-${pipe}`} evenodd={false} key={pipe}>
                                         <Td style={{ width: '20%' }}>
                                             <Label>
                                                 {pipe} {help && <Help>{help}</Help>}
@@ -343,7 +323,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
 const Models = ({ pageContext, repo, children }) => {
     const [initialized, setInitialized] = useState(false)
     const [compatibility, setCompatibility] = useState({})
-    const { id, title, meta } = pageContext
+    const { id, title, meta, hasExamples } = pageContext
     const { models, isStarters } = meta
     const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
 
@@ -360,7 +340,6 @@ const Models = ({ pageContext, repo, children }) => {
 
     const modelTitle = title
     const modelTeaser = `Available trained pipelines for ${title}`
-
     const starterTitle = `${title} starters`
     const starterTeaser = `Available transfer learning starter packs for ${title}`
 
@@ -392,6 +371,7 @@ const Models = ({ pageContext, repo, children }) => {
                             baseUrl={baseUrl}
                             repo={repo}
                             licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
+                            hasExamples={meta.hasExamples}
                         />
                     ))
                 }

From a9da33c4d97abb0e9a09795d2383b3be3a10a3e9 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 13:00:56 +0200
Subject: [PATCH 116/516] Fix infobox with ID [ci skip]

---
 website/src/components/infobox.js | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/website/src/components/infobox.js b/website/src/components/infobox.js
index b5a7af545..6df8426b8 100644
--- a/website/src/components/infobox.js
+++ b/website/src/components/infobox.js
@@ -1,4 +1,4 @@
-import React from 'react'
+import React, { Fragment } from 'react'
 import PropTypes from 'prop-types'
 import classNames from 'classnames'
 
@@ -14,13 +14,14 @@ export default function Infobox({
     className,
     children,
 }) {
+    const Wrapper = id ? 'div' : Fragment
     const infoboxClassNames = classNames(classes.root, className, {
         [classes.list]: !!list,
         [classes.warning]: variant === 'warning',
         [classes.danger]: variant === 'danger',
     })
     return (
-        <>
+        <Wrapper>
             {id && <a id={id} />}
             <aside className={infoboxClassNames}>
                 {title && (
@@ -40,7 +41,7 @@ export default function Infobox({
                 )}
                 {children}
             </aside>
-        </>
+        </Wrapper>
     )
 }
 

From 7745d77a38a131f6ffec9b4ae43da8ef799c228e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 13:21:42 +0200
Subject: [PATCH 117/516] Fix whitespace in template [ci skip]

---
 spacy/cli/templates/quickstart_training.jinja | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 7241c5116..53fd99ee8 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -280,7 +280,7 @@ vectors = "{{ word_vectors }}"
 {% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
-{% endif %}
+{% endif -%}
 dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
 

From 6c85fab3167a468953b23b25d4a25a7fbdb478cd Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 23 Sep 2020 13:35:09 +0200
Subject: [PATCH 118/516] state_type and extra_state_tokens instead of
 nr_feature_tokens

---
 spacy/cli/templates/quickstart_training.jinja | 12 ++++---
 spacy/ml/models/parser.py                     | 31 +++++++++----------
 spacy/pipeline/dep_parser.pyx                 |  3 +-
 spacy/pipeline/ner.pyx                        |  3 +-
 .../tests/serialize/test_serialize_config.py  |  9 ++++--
 website/docs/api/architectures.md             | 22 +++++++------
 website/docs/usage/embeddings-transformers.md |  3 +-
 7 files changed, 48 insertions(+), 35 deletions(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 7241c5116..9dde2237b 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -59,7 +59,8 @@ factory = "parser"
 
 [components.parser.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "deps"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = false
@@ -79,7 +80,8 @@ factory = "ner"
 
 [components.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = false
@@ -183,7 +185,8 @@ factory = "parser"
 
 [components.parser.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "deps"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = true
@@ -200,7 +203,8 @@ factory = "ner"
 
 [components.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = true
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 868f9d6d2..0e10932d5 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -11,7 +11,8 @@ from ...tokens import Doc
 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
-    nr_feature_tokens: int,
+    state_type: str,
+    extra_state_tokens: bool,
     hidden_width: int,
     maxout_pieces: int,
     use_upper: bool = True,
@@ -40,20 +41,12 @@ def build_tb_parser_model(
 
     tok2vec (Model[List[Doc], List[Floats2d]]):
         Subnetwork to map tokens into vector representations.
-    nr_feature_tokens (int): The number of tokens in the context to use to
-        construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The
-        2, 8 and 13 feature sets are designed for the parser, while the 3 and 6
-        feature sets are designed for the NER. The recommended feature sets are
-        3 for NER, and 8 for the dependency parser.
-
-        TODO: This feature should be split into two, state_type: ["deps", "ner"]
-        and extra_state_features: [True, False]. This would map into:
-
-        (deps, False): 8
-        (deps, True): 13
-        (ner, False): 3
-        (ner, True): 6
-
+    state_type (str):
+        String value denoting the type of parser model: "deps" or "ner"
+    extra_state_tokens (bool): Whether or not to use additional tokens in the context
+        to construct the state vector. Defaults to `False`, which means 3 and 8
+        for the NER and parser respectively. When set to `True`, this would become 6
+        feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
         Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
@@ -68,8 +61,14 @@ def build_tb_parser_model(
         Usually inferred from data at the beginning of training, or loaded from
         disk.
     """
+    if state_type == "deps":
+        nr_feature_tokens = 13 if extra_state_tokens else 8
+    elif state_type == "ner":
+        nr_feature_tokens = 6 if extra_state_tokens else 3
+    else:
+        raise ValueError(f"unknown state type {state_type}")  # TODO error
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
+    tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))
     tok2vec.set_dim("nO", hidden_width)
     lower = PrecomputableAffine(
         nO=hidden_width if use_upper else nO,
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index edd791e40..7d8c63815 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -15,7 +15,8 @@ from ..training import validate_examples
 default_model_config = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "deps"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index 2fa5c6392..fc4f03473 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -13,7 +13,8 @@ from ..training import validate_examples
 default_model_config = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 1e17b3212..abfd4d725 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -67,7 +67,8 @@ width = ${components.tok2vec.model.width}
 parser_config_string = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 99
+state_type = "deps"
+extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
 
@@ -95,7 +96,11 @@ def my_parser():
         MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
     )
     parser = build_tb_parser_model(
-        tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
+        tok2vec=tok2vec,
+        state_type="deps",
+        extra_state_tokens=True,
+        hidden_width=65,
+        maxout_pieces=5,
     )
     return parser
 
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 30d863b17..0d283d805 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -414,7 +414,8 @@ one component.
 > ```ini
 > [model]
 > @architectures = "spacy.TransitionBasedParser.v1"
-> nr_feature_tokens = 6
+> state_type = "ner"
+> extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
 >
@@ -446,15 +447,16 @@ consists of either two or three subnetworks:
   state representation. If not present, the output from the lower model is used
   as action scores directly.
 
-| Name                | Description                                                                                                                                                                                                                                                                                                                                                             |
-| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`           | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
-| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~       |
-| `hidden_width`      | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
-| `maxout_pieces`     | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
-| `use_upper`         | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
-| `nO`                | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
-| **CREATES**         | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
+| Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "dependencies". ~~str~~                                                                                                                                                                                                                                                                               |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
+| `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
+| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~                                                                                                                                                                                                                             |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~                                                                                                                                                                                                                                                                                           |
 
 ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index a855d703c..d61172a5b 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -448,7 +448,8 @@ factory = "ner"
 
 [nlp.pipeline.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = false

From 7489d02deaae09f1d0901122c7c40c71f0e85560 Mon Sep 17 00:00:00 2001
From: Muhammad Fahmi Rasyid <ufarasfa@gmail.com>
Date: Wed, 23 Sep 2020 19:02:26 +0700
Subject: [PATCH 119/516] Update Indonesian Example Phrases   (#6124)

* create contributor agreement

* Update Indonesian example. (see  #1107)

Update Indonesian examples with more appropriate phrases. The current phrases contain sensitive and violent words.
---
 .github/contributors/rasyidf.md | 106 ++++++++++++++++++++++++++++++++
 spacy/lang/id/examples.py       |   4 +-
 2 files changed, 108 insertions(+), 2 deletions(-)
 create mode 100644 .github/contributors/rasyidf.md

diff --git a/.github/contributors/rasyidf.md b/.github/contributors/rasyidf.md
new file mode 100644
index 000000000..4a70547a3
--- /dev/null
+++ b/.github/contributors/rasyidf.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your 
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                    |
+|------------------------------- | ------------------------ |
+| Name                           | Muhammad Fahmi Rasyid    |
+| Company name (if applicable)   |                          |
+| Title or role (if applicable)  |                          |
+| Date                           | 2020-09-23               |
+| GitHub username                | rasyidf                  |
+| Website (optional)             | http://rasyidf.github.io |
diff --git a/spacy/lang/id/examples.py b/spacy/lang/id/examples.py
index 56ac9165e..7b4a4e513 100644
--- a/spacy/lang/id/examples.py
+++ b/spacy/lang/id/examples.py
@@ -11,8 +11,8 @@ Example sentences to test spaCy and its language models.
 
 
 sentences = [
-    "Al Qaidah mengklaim bom mobil yang menewaskan 60 Orang di Mali",
-    "Abu Sayyaf mengeksekusi sandera warga Filipina",
+    "Indonesia merupakan negara kepulauan yang kaya akan budaya.",
+    "Berapa banyak warga yang dibutuhkan saat kerja bakti?",
     "Penyaluran pupuk berasal dari lima lokasi yakni Bontang, Kalimantan Timur, Surabaya, Banyuwangi, Semarang, dan Makassar.",
     "PT Pupuk Kaltim telah menyalurkan 274.707 ton pupuk bersubsidi ke wilayah penyaluran di 14 provinsi.",
     "Jakarta adalah kota besar yang nyaris tidak pernah tidur."

From e4e7f5b00d46b0a6f75e419c509fbd0c73927121 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 15:44:40 +0200
Subject: [PATCH 120/516] Update docs [ci skip]

---
 website/docs/usage/_benchmarks-models.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index 33163f306..028746db0 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -28,7 +28,7 @@ on training Stanza on this corpus to allow direct comparison.
 
 | System                                                                         |  POS |  UAS |  LAS |
 | ------------------------------------------------------------------------------ | ---: | ---: | ---: |
-| spaCy RoBERTa (2020)                                                           |      |      |      |
+| spaCy RoBERTa (2020)                                                           | 97.8 | 96.6 | 94.7 |
 | spaCy CNN (2020)                                                               |      |      |      |
 | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 |
 | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019)             | 97.3 | 97.2 | 95.7 |
@@ -37,7 +37,8 @@ on training Stanza on this corpus to allow direct comparison.
 
 **Accuracy on the Penn Treebank.** See
 [NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
-results.
+results. For spaCy's evaluation, see the
+[project template](https://github.com/explosion/projects/tree/v3/benchmarks/parsing_penn_treebank).
 
 </figcaption>
 

From 76bbed3466519d384834715f48f240140c43e02e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 16:00:03 +0200
Subject: [PATCH 121/516] Use Literal type for nr_feature_tokens

---
 requirements.txt                               |  1 +
 setup.cfg                                      |  1 +
 spacy/compat.py                                |  5 +++++
 spacy/ml/models/parser.py                      |  3 ++-
 spacy/tests/serialize/test_serialize_config.py | 14 ++++++++++++--
 5 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 4d6c1dfd0..a8b237aa1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,6 +20,7 @@ pytokenizations
 setuptools
 packaging
 importlib_metadata>=0.20; python_version < "3.8"
+typing_extensions>=3.7.4; python_version < "3.8"
 # Development dependencies
 cython>=0.25
 pytest>=4.6.5
diff --git a/setup.cfg b/setup.cfg
index dd0975800..9831402d1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -57,6 +57,7 @@ install_requires =
     setuptools
     packaging
     importlib_metadata>=0.20; python_version < "3.8"
+    typing_extensions>=3.7.4; python_version < "3.8"
 
 [options.entry_points]
 console_scripts =
diff --git a/spacy/compat.py b/spacy/compat.py
index 2d51ff0ae..6eca18b80 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -22,6 +22,11 @@ try:
 except ImportError:
     cupy = None
 
+try:  # Python 3.8+
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal  # noqa: F401
+
 from thinc.api import Optimizer  # noqa: F401
 
 pickle = pickle
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 868f9d6d2..68cc20e9b 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -2,6 +2,7 @@ from typing import Optional, List
 from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
 from thinc.types import Floats2d
 
+from ...compat import Literal
 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
@@ -11,7 +12,7 @@ from ...tokens import Doc
 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
-    nr_feature_tokens: int,
+    nr_feature_tokens: Literal[3, 6, 8, 13],
     hidden_width: int,
     maxout_pieces: int,
     use_upper: bool = True,
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 1e17b3212..5f25cbfe1 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -67,7 +67,7 @@ width = ${components.tok2vec.model.width}
 parser_config_string = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 99
+nr_feature_tokens = 3
 hidden_width = 66
 maxout_pieces = 2
 
@@ -95,7 +95,7 @@ def my_parser():
         MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
     )
     parser = build_tb_parser_model(
-        tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
+        tok2vec=tok2vec, nr_feature_tokens=8, hidden_width=65, maxout_pieces=5
     )
     return parser
 
@@ -340,3 +340,13 @@ def test_config_auto_fill_extra_fields():
     assert "extra" not in nlp.config["training"]
     # Make sure the config generated is valid
     load_model_from_config(nlp.config)
+
+
+def test_config_validate_literal():
+    nlp = English()
+    config = Config().from_str(parser_config_string)
+    config["model"]["nr_feature_tokens"] = 666
+    with pytest.raises(ConfigValidationError):
+        nlp.add_pipe("parser", config=config)
+    config["model"]["nr_feature_tokens"] = 13
+    nlp.add_pipe("parser", config=config)

From 50a4425cdaed350653368c9c350f95717e9414d9 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 16:03:32 +0200
Subject: [PATCH 122/516] Adjust docs

---
 spacy/ml/models/parser.py         | 4 ++--
 website/docs/api/architectures.md | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 68cc20e9b..5d091c590 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -42,8 +42,8 @@ def build_tb_parser_model(
     tok2vec (Model[List[Doc], List[Floats2d]]):
         Subnetwork to map tokens into vector representations.
     nr_feature_tokens (int): The number of tokens in the context to use to
-        construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The
-        2, 8 and 13 feature sets are designed for the parser, while the 3 and 6
+        construct the state vector. Valid choices are 3, 6, 8 and 13. The
+        8 and 13 feature sets are designed for the parser, while the 3 and 6
         feature sets are designed for the NER. The recommended feature sets are
         3 for NER, and 8 for the dependency parser.
 
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 30d863b17..8797b2f31 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -449,7 +449,7 @@ consists of either two or three subnetworks:
 | Name                | Description                                                                                                                                                                                                                                                                                                                                                             |
 | ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `tok2vec`           | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
-| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~       |
+| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `3`, `6`, `8` and `13`. The `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~                      |
 | `hidden_width`      | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
 | `maxout_pieces`     | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |
 | `use_upper`         | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |

From dd2292793f3bbd7cdfd2cf42bad205ec7428016a Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 23 Sep 2020 16:53:49 +0200
Subject: [PATCH 123/516] 'parser' instead of 'deps' for state_type

---
 spacy/cli/templates/quickstart_training.jinja  | 4 ++--
 spacy/ml/models/parser.py                      | 4 ++--
 spacy/pipeline/dep_parser.pyx                  | 2 +-
 spacy/tests/serialize/test_serialize_config.py | 4 ++--
 website/docs/api/architectures.md              | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 9dde2237b..bc7e206f5 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -59,7 +59,7 @@ factory = "parser"
 
 [components.parser.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-state_type = "deps"
+state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
@@ -185,7 +185,7 @@ factory = "parser"
 
 [components.parser.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-state_type = "deps"
+state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 0e10932d5..b6e4b8d8a 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -42,7 +42,7 @@ def build_tb_parser_model(
     tok2vec (Model[List[Doc], List[Floats2d]]):
         Subnetwork to map tokens into vector representations.
     state_type (str):
-        String value denoting the type of parser model: "deps" or "ner"
+        String value denoting the type of parser model: "parser" or "ner"
     extra_state_tokens (bool): Whether or not to use additional tokens in the context
         to construct the state vector. Defaults to `False`, which means 3 and 8
         for the NER and parser respectively. When set to `True`, this would become 6
@@ -61,7 +61,7 @@ def build_tb_parser_model(
         Usually inferred from data at the beginning of training, or loaded from
         disk.
     """
-    if state_type == "deps":
+    if state_type == "parser":
         nr_feature_tokens = 13 if extra_state_tokens else 8
     elif state_type == "ner":
         nr_feature_tokens = 6 if extra_state_tokens else 3
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 7d8c63815..a49475c8e 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -15,7 +15,7 @@ from ..training import validate_examples
 default_model_config = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-state_type = "deps"
+state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index abfd4d725..10e0e132b 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -67,7 +67,7 @@ width = ${components.tok2vec.model.width}
 parser_config_string = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-state_type = "deps"
+state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
@@ -97,7 +97,7 @@ def my_parser():
     )
     parser = build_tb_parser_model(
         tok2vec=tok2vec,
-        state_type="deps",
+        state_type="parser",
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 0d283d805..ef2666ec0 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -450,7 +450,7 @@ consists of either two or three subnetworks:
 | Name                 | Description                                                                                                                                                                                                                                                                                                                                                             |
 | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                              |
-| `state_type`         | Which task to extract features for. Possible values are "ner" and "dependencies". ~~str~~                                                                                                                                                                                                                                                                               |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~                                                                                                                                                                                                                                                                                     |
 | `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~                                                                                                                                                                                                       |
 | `hidden_width`       | The width of the hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                                                  |
 | `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~                                                                                      |

From 3c3863654e2804223a30c8ed3cae3d2e73147ca6 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 16:54:43 +0200
Subject: [PATCH 124/516] Increment version [ci skip]

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index b57bbeda2..b0cdd562c 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a21"
+__version__ = "3.0.0a22"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 25b34bba9406a3185406e79e8b0e45048e7f3914 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 23 Sep 2020 16:57:14 +0200
Subject: [PATCH 125/516] throw custom error when state_type is invalid

---
 spacy/errors.py           | 2 ++
 spacy/ml/models/parser.py | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 153f8da0c..47a134c1f 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -480,6 +480,8 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E917 = ("Received invalid value {value} for 'state_type' in "
+            "TransitionBasedParser: only 'parser' or 'ner' are valid options.")
     E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
             "values are an instance of spacy.vocab.Vocab or True to create one"
             " (default).")
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index b6e4b8d8a..dbea6b507 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -2,6 +2,7 @@ from typing import Optional, List
 from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
 from thinc.types import Floats2d
 
+from ... import Errors
 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
@@ -66,7 +67,7 @@ def build_tb_parser_model(
     elif state_type == "ner":
         nr_feature_tokens = 6 if extra_state_tokens else 3
     else:
-        raise ValueError(f"unknown state type {state_type}")  # TODO error
+        raise ValueError(Errors.E917.format(value=state_type))
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
     tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))
     tok2vec.set_dim("nO", hidden_width)

From 5a9fdbc8ad8e6e03968b78e026b8ee75e4c4a3e1 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 23 Sep 2020 17:32:14 +0200
Subject: [PATCH 126/516] state_type as Literal

---
 spacy/ml/models/parser.py                      |  5 +++--
 spacy/tests/serialize/test_serialize_config.py | 10 ++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index dbea6b507..2c40bb3ab 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -2,7 +2,8 @@ from typing import Optional, List
 from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
 from thinc.types import Floats2d
 
-from ... import Errors
+from ...errors import Errors
+from ...compat import Literal
 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
@@ -12,7 +13,7 @@ from ...tokens import Doc
 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
-    state_type: str,
+    state_type: Literal["parser", "ner"],
     extra_state_tokens: bool,
     hidden_width: int,
     maxout_pieces: int,
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 10e0e132b..6aad59272 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -345,3 +345,13 @@ def test_config_auto_fill_extra_fields():
     assert "extra" not in nlp.config["training"]
     # Make sure the config generated is valid
     load_model_from_config(nlp.config)
+
+
+def test_config_validate_literal():
+    nlp = English()
+    config = Config().from_str(parser_config_string)
+    config["model"]["state_type"] = "nonsense"
+    with pytest.raises(ConfigValidationError):
+        nlp.add_pipe("parser", config=config)
+    config["model"]["state_type"] = "ner"
+    nlp.add_pipe("parser", config=config)
\ No newline at end of file

From b816ace4bbd158524865b7e995da8fa23ee0bc2b Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 23 Sep 2020 17:33:13 +0200
Subject: [PATCH 127/516] format

---
 spacy/tests/serialize/test_serialize_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 6aad59272..ec7544456 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -354,4 +354,4 @@ def test_config_validate_literal():
     with pytest.raises(ConfigValidationError):
         nlp.add_pipe("parser", config=config)
     config["model"]["state_type"] = "ner"
-    nlp.add_pipe("parser", config=config)
\ No newline at end of file
+    nlp.add_pipe("parser", config=config)

From 3f77eb749c411f78dc21135deb446ad8d5fde76c Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 19:50:15 +0200
Subject: [PATCH 128/516] Increment version [ci skip]

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index b0cdd562c..8d019897b 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a22"
+__version__ = "3.0.0a23"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From f25f05c503c83949c9831028e221f3d024358889 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 20:03:04 +0200
Subject: [PATCH 129/516] Adjust sort order [ci skip]

---
 spacy/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/util.py b/spacy/util.py
index 93000ea27..025fe5288 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -61,7 +61,7 @@ LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta",
 # Default order of sections in the config.cfg. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
 # fmt: off
-CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"]
+CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"]
 # fmt: on
 
 

From c8bda92243b7752ad88be46e071368376704fb2b Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 20:05:02 +0200
Subject: [PATCH 130/516] Update benchmarks [ci skip]

---
 website/docs/usage/_benchmarks-models.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index 028746db0..c5ce95e2f 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -28,7 +28,7 @@ on training Stanza on this corpus to allow direct comparison.
 
 | System                                                                         |  POS |  UAS |  LAS |
 | ------------------------------------------------------------------------------ | ---: | ---: | ---: |
-| spaCy RoBERTa (2020)                                                           | 97.8 | 96.6 | 94.7 |
+| spaCy RoBERTa (2020)                                                           | 98.0 | 96.8 | 95.0 |
 | spaCy CNN (2020)                                                               |      |      |      |
 | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 |
 | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019)             | 97.3 | 97.2 | 95.7 |

From 02008e9a55ea0d4a3ac41cb2324d89c9f837abcd Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 23 Sep 2020 22:02:31 +0200
Subject: [PATCH 131/516] Update docs [ci skip]

---
 website/docs/usage/_benchmarks-models.md | 40 +++++++++++-------------
 website/docs/usage/facts-figures.md      | 19 +++++++++++
 website/src/widgets/landing.js           |  2 +-
 3 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index c5ce95e2f..1fe6e2bff 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -4,21 +4,16 @@ import { Help } from 'components/typography'; import Link from 'components/link'
 
 <figure>
 
-| System                                                                    |            Parser |            Tagger |  NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
-| ------------------------------------------------------------------------- | ----------------: | ----------------: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
-| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3)                |                   |                   |      |                                                                     |                                                                 6k |
-| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)                  |                   |                   |      |                                                                     |                                                                    |
-| `en_core_web_lg` (spaCy v2)                                               |              91.9 |              97.2 | 85.9 |                                                                 10k |                                                                    |
-| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup> | _n/a_<sup>2</sup> | _n/a_<sup>2</sup> | 88.8 |                                                                 234 |                                                                 2k |
-| <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link>        |                 - |              97.9 | 89.3 |                                                                     |                                                                    |
+| System                                                     | Parser | Tagger |  NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
+| ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
+| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) |        |        |      |                                                                     |                                                                 6k |
+| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)   |        |        |      |                                                                     |                                                                    |
+| `en_core_web_lg` (spaCy v2)                                |   91.9 |   97.2 | 85.9 |                                                                 10k |                                                                    |
 
 <figcaption class="caption">
 
 **Accuracy and speed on the
-[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**<br />**1. **
-[Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). **2. ** _Coming soon_:
-Qi et al. don't report parsing and tagging results on OntoNotes. We're working
-on training Stanza on this corpus to allow direct comparison.
+[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**
 
 </figcaption>
 
@@ -26,19 +21,22 @@ on training Stanza on this corpus to allow direct comparison.
 
 <figure>
 
-| System                                                                         |  POS |  UAS |  LAS |
-| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
-| spaCy RoBERTa (2020)                                                           | 98.0 | 96.8 | 95.0 |
-| spaCy CNN (2020)                                                               |      |      |      |
-| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 |
-| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019)             | 97.3 | 97.2 | 95.7 |
+| Named Entity Recognition Model                                                 | OntoNotes | CoNLL '03 |
+| ------------------------------------------------------------------------------ | --------: | --------- |
+| spaCy RoBERTa (2020)                                                           |
+| spaCy CNN (2020)                                                               |           |
+| spaCy CNN (2017)                                                               |      86.4 |
+| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup>      |      88.8 |
+| <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link><sup>2</sup> |      89.7 |
 
 <figcaption class="caption">
 
-**Accuracy on the Penn Treebank.** See
-[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
-results. For spaCy's evaluation, see the
-[project template](https://github.com/explosion/projects/tree/v3/benchmarks/parsing_penn_treebank).
+**Named entity recognition accuracy** on the
+[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) and
+[CoNLL-2003](https://www.aclweb.org/anthology/W03-0419.pdf) corpora. See
+[NLP-progress](http://nlpprogress.com/english/named_entity_recognition.html) for
+more results. **1. ** [Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf).
+**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/)
 
 </figcaption>
 
diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md
index 75f92070a..ad6776b2c 100644
--- a/website/docs/usage/facts-figures.md
+++ b/website/docs/usage/facts-figures.md
@@ -61,6 +61,25 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
 
 <Benchmarks />
 
+<figure>
+
+| System                                                                         |  UAS |  LAS |
+| ------------------------------------------------------------------------------ | ---: | ---: |
+| spaCy RoBERTa (2020)                                                           | 96.8 | 95.0 |
+| spaCy CNN (2020)                                                               | 93.7 | 91.8 |
+| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 |
+| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019)             | 97.2 | 95.7 |
+
+<figcaption class="caption">
+
+**Accuracy on the Penn Treebank.** See
+[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
+results.
+
+</figcaption>
+
+</figure>
+
 <Project id="benchmarks/parsing_penn_treebank">
 
 The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone
diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index 2e75c893a..6fe7f4cdf 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -297,7 +297,7 @@ const Landing = ({ data }) => {
                         to run.
                     </p>
                     <p>
-                        <Button to="/usage/facts-figures#benchmarks">See details</Button>
+                        <Button to="/usage/facts-figures#benchmarks">More results</Button>
                     </p>
                 </LandingCol>
 

From e2ffe51fb5c18b18397930d976fe323f75d02863 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 10:13:41 +0200
Subject: [PATCH 132/516] Update docs [ci skip]

---
 website/docs/usage/_benchmarks-models.md |  4 ++--
 website/docs/usage/facts-figures.md      | 16 +++++-----------
 website/docs/usage/projects.md           |  2 +-
 website/gatsby-config.js                 |  1 +
 4 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index 1fe6e2bff..a00229867 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -12,8 +12,8 @@ import { Help } from 'components/typography'; import Link from 'components/link'
 
 <figcaption class="caption">
 
-**Accuracy and speed on the
-[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**
+**Full pipeline accuracy and speed** on the
+[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.
 
 </figcaption>
 
diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md
index ad6776b2c..743dae74d 100644
--- a/website/docs/usage/facts-figures.md
+++ b/website/docs/usage/facts-figures.md
@@ -65,28 +65,22 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
 
 | System                                                                         |  UAS |  LAS |
 | ------------------------------------------------------------------------------ | ---: | ---: |
-| spaCy RoBERTa (2020)                                                           | 96.8 | 95.0 |
-| spaCy CNN (2020)                                                               | 93.7 | 91.8 |
+| spaCy RoBERTa (2020)<sup>1</sup>                                               | 96.8 | 95.0 |
+| spaCy CNN (2020)<sup>1</sup>                                                   | 93.7 | 91.8 |
 | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 |
 | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019)             | 97.2 | 95.7 |
 
 <figcaption class="caption">
 
-**Accuracy on the Penn Treebank.** See
+**Dependency parsing accuracy** on the Penn Treebank. See
 [NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
-results.
+results. **1. ** Project template:
+[`benchmarks/parsing_penn_treebank`](%%GITHUB_PROJECTS/benchmarks/parsing_penn_treebank).
 
 </figcaption>
 
 </figure>
 
-<Project id="benchmarks/parsing_penn_treebank">
-
-The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone
-our project template.
-
-</Project>
-
 <!-- TODO: ## Citing spaCy {#citation}
 
 -->
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 95e20525a..8e093e8d6 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -213,7 +213,7 @@ a quick web demo. It looks pretty similar to a config file used to define CI
 pipelines.
 
 ```yaml
-https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml
+%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
 ```
 
 | Section       | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
diff --git a/website/gatsby-config.js b/website/gatsby-config.js
index 5e3b5b537..c1a2f9ab9 100644
--- a/website/gatsby-config.js
+++ b/website/gatsby-config.js
@@ -24,6 +24,7 @@ const branch = isNightly ? 'develop' : 'master'
 // Those variables are going to be replaced in the Markdown, e.g. %%GITHUB_SPACY
 const replacements = {
     GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`,
+    GITHUB_PROJECTS: `https://github.com/${site.projectsRepo}`,
 }
 
 /**

From ae51f580c1cd8a4168253d326fd9c1356fc88844 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 10:27:33 +0200
Subject: [PATCH 133/516] Fix handling of score_weights

---
 spacy/cli/templates/quickstart_training.jinja | 18 ---------
 spacy/cli/train.py                            |  5 ++-
 spacy/lang/bn/__init__.py                     |  1 -
 spacy/lang/el/__init__.py                     |  1 -
 spacy/lang/en/__init__.py                     |  1 -
 spacy/lang/fa/__init__.py                     |  1 -
 spacy/lang/fr/__init__.py                     |  1 -
 spacy/lang/nb/__init__.py                     |  1 -
 spacy/lang/nl/__init__.py                     |  1 -
 spacy/lang/pl/__init__.py                     |  1 -
 spacy/lang/ru/__init__.py                     |  1 -
 spacy/lang/sv/__init__.py                     |  1 -
 spacy/lang/uk/__init__.py                     |  1 -
 spacy/language.py                             | 20 ++++++----
 spacy/pipeline/dep_parser.pyx                 | 10 ++++-
 spacy/pipeline/entityruler.py                 |  8 +++-
 spacy/pipeline/lemmatizer.py                  |  1 -
 spacy/pipeline/morphologizer.pyx              |  3 +-
 spacy/pipeline/ner.pyx                        |  3 +-
 spacy/pipeline/sentencizer.pyx                |  1 -
 spacy/pipeline/senter.pyx                     |  1 -
 spacy/pipeline/tagger.pyx                     |  1 -
 spacy/pipeline/textcat.py                     | 23 ++++++-----
 spacy/schemas.py                              |  2 +-
 spacy/tests/pipeline/test_pipe_factories.py   | 23 ++++++++---
 spacy/util.py                                 | 11 ++++++
 website/docs/api/language.md                  | 39 +++++++++----------
 website/docs/usage/training.md                |  7 ++--
 28 files changed, 95 insertions(+), 92 deletions(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index a0ffa8f52..9a8b9d1d7 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -317,21 +317,3 @@ start = 100
 stop = 1000
 compound = 1.001
 {% endif %}
-
-[training.score_weights]
-{%- if "tagger" in components %}
-tag_acc = {{ (1.0 / components|length)|round(2) }}
-{%- endif -%}
-{%- if "parser" in components %}
-dep_uas = 0.0
-dep_las = {{ (1.0 / components|length)|round(2) }}
-sents_f = 0.0
-{%- endif %}
-{%- if "ner" in components %}
-ents_f = {{ (1.0 / components|length)|round(2) }}
-ents_p = 0.0
-ents_r = 0.0
-{%- endif %}
-{%- if "textcat" in components %}
-cats_score = {{ (1.0 / components|length)|round(2) }}
-{%- endif -%}
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 2900ef379..3485a4ff2 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -209,6 +209,8 @@ def create_train_batches(iterator, batcher, max_epochs: int):
 def create_evaluation_callback(
     nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
 ) -> Callable[[], Tuple[float, Dict[str, float]]]:
+    weights = {key: value for key, value in weights.items() if value is not None}
+
     def evaluate() -> Tuple[float, Dict[str, float]]:
         dev_examples = list(dev_corpus(nlp))
         scores = nlp.evaluate(dev_examples)
@@ -368,7 +370,8 @@ def update_meta(
 ) -> None:
     nlp.meta["performance"] = {}
     for metric in training["score_weights"]:
-        nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
+        if metric is not None:
+            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
     for pipe_name in nlp.pipe_names:
         nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 
diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index 270185a4b..923e29a17 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -25,7 +25,6 @@ class Bengali(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 0c5e0672b..1a7b19914 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -30,7 +30,6 @@ class Greek(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index 1a595b6e7..bf7e9987f 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -29,7 +29,6 @@ class English(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index 244534120..f3a6635dc 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -28,7 +28,6 @@ class Persian(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index 42241cd8a..72e641d1f 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -33,7 +33,6 @@ class French(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index 28a2f0bf2..9672dfd6e 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -28,7 +28,6 @@ class Norwegian(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index 1526e41f5..15b6b9de2 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -30,7 +30,6 @@ class Dutch(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 7ddad9893..573dbc6f9 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -35,7 +35,6 @@ class Polish(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "pos_lookup", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index be770e3ec..4a296dd23 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -25,7 +25,6 @@ class Russian(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "pymorphy2", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 6db74cd39..ea314f487 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -31,7 +31,6 @@ class Swedish(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "rule", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index e9936cf7d..006a1cf7f 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -25,7 +25,6 @@ class Ukrainian(Language):
     "lemmatizer",
     assigns=["token.lemma"],
     default_config={"model": None, "mode": "pymorphy2", "lookups": None},
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
diff --git a/spacy/language.py b/spacy/language.py
index 4dffd9679..0b7deacad 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -248,9 +248,15 @@ class Language:
         self._config["nlp"]["pipeline"] = list(self.component_names)
         self._config["nlp"]["disabled"] = list(self.disabled)
         self._config["components"] = pipeline
-        if not self._config["training"].get("score_weights"):
-            combined_score_weights = combine_score_weights(score_weights)
-            self._config["training"]["score_weights"] = combined_score_weights
+        # We're merging the existing score weights back into the combined
+        # weights to make sure we're preserving custom settings in the config
+        # but also reflect updates (e.g. new components added)
+        prev_score_weights = self._config["training"].get("score_weights", {})
+        combined_score_weights = combine_score_weights(score_weights)
+        combined_score_weights.update(prev_score_weights)
+        # Combine the scores a second time to normalize them
+        combined_score_weights = combine_score_weights([combined_score_weights])
+        self._config["training"]["score_weights"] = combined_score_weights
         if not srsly.is_json_serializable(self._config):
             raise ValueError(Errors.E961.format(config=self._config))
         return self._config
@@ -412,7 +418,6 @@ class Language:
         assigns: Iterable[str] = SimpleFrozenList(),
         requires: Iterable[str] = SimpleFrozenList(),
         retokenizes: bool = False,
-        scores: Iterable[str] = SimpleFrozenList(),
         default_score_weights: Dict[str, float] = SimpleFrozenDict(),
         func: Optional[Callable] = None,
     ) -> Callable:
@@ -430,12 +435,11 @@ class Language:
             e.g. "token.ent_id". Used for pipeline analyis.
         retokenizes (bool): Whether the component changes the tokenization.
             Used for pipeline analysis.
-        scores (Iterable[str]): All scores set by the component if it's trainable,
-            e.g. ["ents_f", "ents_r", "ents_p"].
         default_score_weights (Dict[str, float]): The scores to report during
             training, and their default weight towards the final score used to
             select the best model. Weights should sum to 1.0 per component and
-            will be combined and normalized for the whole pipeline.
+            will be combined and normalized for the whole pipeline. If a weight
+            is None, the score won't be shown in the logs or be weighted.
         func (Optional[Callable]): Factory function if not used as a decorator.
 
         DOCS: https://nightly.spacy.io/api/language#factory
@@ -475,7 +479,7 @@ class Language:
                 default_config=default_config,
                 assigns=validate_attrs(assigns),
                 requires=validate_attrs(requires),
-                scores=scores,
+                scores=list(default_score_weights.keys()),
                 default_score_weights=default_score_weights,
                 retokenizes=retokenizes,
             )
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index a49475c8e..a447434d2 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -43,8 +43,14 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
         "min_action_freq": 30,
         "model": DEFAULT_PARSER_MODEL,
     },
-    scores=["dep_uas", "dep_las", "dep_las_per_type", "sents_p", "sents_r", "sents_f"],
-    default_score_weights={"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0},
+    default_score_weights={
+        "dep_uas": 0.5,
+        "dep_las": 0.5,
+        "dep_las_per_type": None,
+        "sents_p": None,
+        "sents_r": None,
+        "sents_f": 0.0,
+    },
 )
 def make_parser(
     nlp: Language,
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 24bbb067f..9166a69b8 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -25,8 +25,12 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
         "overwrite_ents": False,
         "ent_id_sep": DEFAULT_ENT_ID_SEP,
     },
-    scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_entity_ruler(
     nlp: Language,
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 0fd3482c4..c30d09f62 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -21,7 +21,6 @@ from .. import util
         "lookups": None,
         "overwrite": False,
     },
-    scores=["lemma_acc"],
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 62ad9e0eb..5fee9a900 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -49,8 +49,7 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
     "morphologizer",
     assigns=["token.morph", "token.pos"],
     default_config={"model": DEFAULT_MORPH_MODEL},
-    scores=["pos_acc", "morph_acc", "morph_per_feat"],
-    default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5},
+    default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
 )
 def make_morphologizer(
     nlp: Language,
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index fc4f03473..c9b0a5031 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -39,8 +39,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
         "update_with_oracle_cut_size": 100,
         "model": DEFAULT_NER_MODEL,
     },
-    scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
+    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
 
 )
 def make_ner(
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 5700c2b98..2882f6f8b 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -15,7 +15,6 @@ from .. import util
     "sentencizer",
     assigns=["token.is_sent_start", "doc.sents"],
     default_config={"punct_chars": None},
-    scores=["sents_p", "sents_r", "sents_f"],
     default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
 )
 def make_sentencizer(
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index a7eb721fd..da85a9cf2 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -36,7 +36,6 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
     "senter",
     assigns=["token.is_sent_start"],
     default_config={"model": DEFAULT_SENTER_MODEL},
-    scores=["sents_p", "sents_r", "sents_f"],
     default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
 )
 def make_senter(nlp: Language, name: str, model: Model):
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 0d78047ae..3efe29916 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -42,7 +42,6 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
     "tagger",
     assigns=["token.tag"],
     default_config={"model": DEFAULT_TAGGER_MODEL},
-    scores=["tag_acc"],
     default_score_weights={"tag_acc": 1.0},
 )
 def make_tagger(nlp: Language, name: str, model: Model):
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index e7cb62a0d..6b8c0ca65 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -62,18 +62,17 @@ subword_features = true
         "positive_label": None,
         "model": DEFAULT_TEXTCAT_MODEL,
     },
-    scores=[
-        "cats_score",
-        "cats_score_desc",
-        "cats_p",
-        "cats_r",
-        "cats_f",
-        "cats_macro_f",
-        "cats_macro_auc",
-        "cats_f_per_type",
-        "cats_macro_auc_per_type",
-    ],
-    default_score_weights={"cats_score": 1.0},
+    default_score_weights={
+        "cats_score": 1.0,
+        "cats_score_desc": None,
+        "cats_p": None,
+        "cats_r": None,
+        "cats_f": None,
+        "cats_macro_f": None,
+        "cats_macro_auc": None,
+        "cats_f_per_type": None,
+        "cats_macro_auc_per_type": None,
+    },
 )
 def make_textcat(
     nlp: Language,
diff --git a/spacy/schemas.py b/spacy/schemas.py
index b0f26dcd7..e34841008 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -211,7 +211,7 @@ class ConfigSchemaTraining(BaseModel):
     seed: Optional[StrictInt] = Field(..., title="Random seed")
     gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
-    score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
+    score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     optimizer: Optimizer = Field(..., title="The optimizer to use")
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index 881460704..4ab1c4248 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -359,12 +359,8 @@ def test_language_factories_scores():
     func = lambda nlp, name: lambda doc: doc
     weights1 = {"a1": 0.5, "a2": 0.5}
     weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
-    Language.factory(
-        f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
-    )
-    Language.factory(
-        f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
-    )
+    Language.factory(f"{name}1", default_score_weights=weights1, func=func)
+    Language.factory(f"{name}2", default_score_weights=weights2, func=func)
     meta1 = Language.get_factory_meta(f"{name}1")
     assert meta1.default_score_weights == weights1
     meta2 = Language.get_factory_meta(f"{name}2")
@@ -376,6 +372,21 @@ def test_language_factories_scores():
     cfg = nlp.config["training"]
     expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
     assert cfg["score_weights"] == expected_weights
+    # Test with custom defaults
+    config = nlp.config.copy()
+    config["training"]["score_weights"]["a1"] = 0.0
+    config["training"]["score_weights"]["b3"] = 1.0
+    nlp = English.from_config(config)
+    score_weights = nlp.config["training"]["score_weights"]
+    expected = {"a1": 0.0, "a2": 0.15, "b1": 0.06, "b2": 0.21, "b3": 0.59}
+    assert score_weights == expected
+    # Test with null values
+    config = nlp.config.copy()
+    config["training"]["score_weights"]["a1"] = None
+    nlp = English.from_config(config)
+    score_weights = nlp.config["training"]["score_weights"]
+    expected = {"a1": None, "a2": 0.15, "b1": 0.06, "b2": 0.21, "b3": 0.58}  # rounding :(
+    assert score_weights == expected
 
 
 def test_pipe_factories_from_source():
diff --git a/spacy/util.py b/spacy/util.py
index 025fe5288..f7c5cff59 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1209,8 +1209,19 @@ def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]:
     weights (List[dict]): The weights defined by the components.
     RETURNS (Dict[str, float]): The combined and normalized weights.
     """
+    # We first need to extract all None/null values for score weights that
+    # shouldn't be shown in the table *or* be weighted
     result = {}
+    all_weights = []
     for w_dict in weights:
+        filtered_weights = {}
+        for key, value in w_dict.items():
+            if value is None:
+                result[key] = None
+            else:
+                filtered_weights[key] = value
+        all_weights.append(filtered_weights)
+    for w_dict in all_weights:
         # We need to account for weights that don't sum to 1.0 and normalize
         # the score weights accordingly, then divide score by the number of
         # components.
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index a7b9c0d88..dd3cc57dd 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -145,17 +145,16 @@ examples, see the
 > )
 > ```
 
-| Name                    | Description                                                                                                                                                                                                                                      |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `name`                  | The name of the component factory. ~~str~~                                                                                                                                                                                                       |
-| _keyword-only_          |                                                                                                                                                                                                                                                  |
-| `default_config`        | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~                                                                                                                                                   |
-| `assigns`               | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~                                                                               |
-| `requires`              | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~                                                                               |
-| `retokenizes`           | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~                                                                                                                             |
-| `scores`                | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~                                                                     |
-| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
-| `func`                  | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~                                                                                                                                                 |
+| Name                    | Description                                                                                                                                                                                                                                                                                                                        |
+| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `name`                  | The name of the component factory. ~~str~~                                                                                                                                                                                                                                                                                         |
+| _keyword-only_          |                                                                                                                                                                                                                                                                                                                                    |
+| `default_config`        | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~                                                                                                                                                                                                                                     |
+| `assigns`               | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~                                                                                                                                                                 |
+| `requires`              | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~                                                                                                                                                                 |
+| `retokenizes`           | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~                                                                                                                                                                                                               |
+| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
+| `func`                  | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~                                                                                                                                                                                                                                   |
 
 ## Language.\_\_call\_\_ {#call tag="method"}
 
@@ -1036,12 +1035,12 @@ provided by the [`@Language.component`](/api/language#component) or
 component is defined and stored on the `Language` class for each component
 instance and factory instance.
 
-| Name                    | Description                                                                                                                                                                                                                                      |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `factory`               | The name of the registered component factory. ~~str~~                                                                                                                                                                                            |
-| `default_config`        | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~                                                                                                                                                   |
-| `assigns`               | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~                                                                               |
-| `requires`              | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~                                                                               |
-| `retokenizes`           | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~                                                                                                                             |
-| `scores`                | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~                                                                     |
-| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
+| Name                    | Description                                                                                                                                                                                                                                                                                                                        |
+| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `factory`               | The name of the registered component factory. ~~str~~                                                                                                                                                                                                                                                                              |
+| `default_config`        | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~                                                                                                                                                                                                                                     |
+| `assigns`               | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~                                                                                                                                                                 |
+| `requires`              | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~                                                                                                                                                                 |
+| `retokenizes`           | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~                                                                                                                                                                                                               |
+| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
+| `scores`                | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Based on the `default_score_weights` and used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~                                                                                                               |
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index b63145636..65afd0eb4 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -470,6 +470,7 @@ score.
 ```ini
 [training.score_weights]
 dep_las = 0.4
+dep_uas = null
 ents_f = 0.4
 tag_acc = 0.2
 token_acc = 0.0
@@ -481,9 +482,9 @@ you generate a config for a given pipeline, the score weights are generated by
 combining and normalizing the default score weights of the pipeline components.
 The default score weights are defined by each pipeline component via the
 `default_score_weights` setting on the
-[`@Language.component`](/api/language#component) or
-[`@Language.factory`](/api/language#factory). By default, all pipeline
-components are weighted equally.
+[`@Language.factory`](/api/language#factory) decorator. By default, all pipeline
+components are weighted equally. If a score weight is set to `null`, it will be
+excluded from the logs and the score won't be weighted.
 
 <Accordion title="Understanding the training output and score types" spaced>
 

From 17a6b0a1731321380914d3638e7e3bc25fd23a28 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 24 Sep 2020 10:30:42 +0200
Subject: [PATCH 134/516] Make project pull order insensitive (#6131)

---
 spacy/cli/project/pull.py | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py
index edcd410bd..3119d3a12 100644
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@@ -27,19 +27,32 @@ def project_pull_cli(
 
 
 def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
+    # TODO: We don't have tests for this :(. It would take a bit of mockery to
+    # set up. I guess see if it breaks first?
     config = load_project_config(project_dir)
     if remote in config.get("remotes", {}):
         remote = config["remotes"][remote]
     storage = RemoteStorage(project_dir, remote)
-    for cmd in config.get("commands", []):
-        deps = [project_dir / dep for dep in cmd.get("deps", [])]
-        if any(not dep.exists() for dep in deps):
-            continue
-        cmd_hash = get_command_hash("", "", deps, cmd["script"])
-        for output_path in cmd.get("outputs", []):
-            url = storage.pull(output_path, command_hash=cmd_hash)
-            yield url, output_path
+    commands = list(config.get("commands", []))
+    # We use a while loop here because we don't know how the commands
+    # will be ordered. A command might need dependencies from one that's later
+    # in the list.
+    while commands:
+        for i, cmd in enumerate(list(commands)):
+            deps = [project_dir / dep for dep in cmd.get("deps", [])]
+            if all(dep.exists() for dep in deps):
+                cmd_hash = get_command_hash("", "", deps, cmd["script"])
+                for output_path in cmd.get("outputs", []):
+                    url = storage.pull(output_path, command_hash=cmd_hash)
+                    yield url, output_path
 
-        out_locs = [project_dir / out for out in cmd.get("outputs", [])]
-        if all(loc.exists() for loc in out_locs):
-            update_lockfile(project_dir, cmd)
+                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
+                if all(loc.exists() for loc in out_locs):
+                    update_lockfile(project_dir, cmd)
+                # Drop the handled command by index (i indexes the snapshot, which
+                # still matches commands here), then break to rescan the rest.
+                commands.pop(i)
+                break
+        else:
+            # If we didn't break the for loop, break the while loop.
+            break

From c645c4e7ceddbd819b7a56e56f013bb8447dea4b Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 24 Sep 2020 10:31:17 +0200
Subject: [PATCH 135/516] fix micro PRF for textcat (#6130)

* fix micro PRF for textcat

* small fix
---
 spacy/scorer.py                      |  8 ++++----
 spacy/tests/pipeline/test_textcat.py | 29 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/spacy/scorer.py b/spacy/scorer.py
index da22d59d4..c50de3d43 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -240,7 +240,7 @@ class Scorer:
                             pred_per_feat[field].add((gold_i, feat))
             for field in per_feat:
                 per_feat[field].score_set(
-                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
+                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
                 )
         result = {k: v.to_dict() for k, v in per_feat.items()}
         return {f"{attr}_per_feat": result}
@@ -418,9 +418,9 @@ class Scorer:
                     f_per_type[pred_label].fp += 1
         micro_prf = PRFScore()
         for label_prf in f_per_type.values():
-            micro_prf.tp = label_prf.tp
-            micro_prf.fn = label_prf.fn
-            micro_prf.fp = label_prf.fp
+            micro_prf.tp += label_prf.tp
+            micro_prf.fn += label_prf.fn
+            micro_prf.fp += label_prf.fp
         n_cats = len(f_per_type) + 1e-100
         macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
         macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 99b5132ca..232b53e1d 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -8,6 +8,7 @@ from spacy.language import Language
 from spacy.pipeline import TextCategorizer
 from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
+from spacy.scorer import Scorer
 
 from ..util import make_tempdir
 from ...cli.train import verify_textcat_config
@@ -224,3 +225,31 @@ def test_positive_class_not_binary():
     assert textcat.labels == ("SOME", "THING", "POS")
     with pytest.raises(ValueError):
         verify_textcat_config(nlp, pipe_config)
+
+def test_textcat_evaluation():
+    train_examples = []
+    nlp = English()
+    ref1 = nlp("one")
+    ref1.cats = {"winter": 1.0, "summer": 1.0, "spring": 1.0, "autumn": 1.0}
+    pred1 = nlp("one")
+    pred1.cats = {"winter": 1.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0}
+    train_examples.append(Example(pred1, ref1))
+
+    ref2 = nlp("two")
+    ref2.cats = {"winter": 0.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0}
+    pred2 = nlp("two")
+    pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
+    train_examples.append(Example(pred2, ref2))
+
+    scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"])
+    assert scores["cats_f_per_type"]["winter"]["p"] == 1/2
+    assert scores["cats_f_per_type"]["winter"]["r"] == 1/1
+    assert scores["cats_f_per_type"]["summer"]["p"] == 0
+    assert scores["cats_f_per_type"]["summer"]["r"] == 0/1
+    assert scores["cats_f_per_type"]["spring"]["p"] == 1/1
+    assert scores["cats_f_per_type"]["spring"]["r"] == 1/2
+    assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2
+    assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2
+
+    assert scores["cats_micro_p"] == 4/5
+    assert scores["cats_micro_r"] == 4/6

From 4bbe41f017ffc6334a35f2a682804cf6365dfd9e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 10:42:47 +0200
Subject: [PATCH 136/516] Fix combined scores and update test

---
 spacy/language.py                           |  7 ++-----
 spacy/tests/pipeline/test_pipe_factories.py |  4 ++--
 spacy/util.py                               | 10 ++++++++--
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 0b7deacad..a52391419 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -251,11 +251,8 @@ class Language:
         # We're merging the existing score weights back into the combined
         # weights to make sure we're preserving custom settings in the config
         # but also reflect updates (e.g. new components added)
-        prev_score_weights = self._config["training"].get("score_weights", {})
-        combined_score_weights = combine_score_weights(score_weights)
-        combined_score_weights.update(prev_score_weights)
-        # Combine the scores a second time to normalize them
-        combined_score_weights = combine_score_weights([combined_score_weights])
+        prev_weights = self._config["training"].get("score_weights", {})
+        combined_score_weights = combine_score_weights(score_weights, prev_weights)
         self._config["training"]["score_weights"] = combined_score_weights
         if not srsly.is_json_serializable(self._config):
             raise ValueError(Errors.E961.format(config=self._config))
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index 4ab1c4248..4c197005e 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -378,14 +378,14 @@ def test_language_factories_scores():
     config["training"]["score_weights"]["b3"] = 1.0
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": 0.0, "a2": 0.15, "b1": 0.06, "b2": 0.21, "b3": 0.59}
+    expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
     assert score_weights == expected
     # Test with null values
     config = nlp.config.copy()
     config["training"]["score_weights"]["a1"] = None
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": None, "a2": 0.15, "b1": 0.06, "b2": 0.21, "b3": 0.58}  # rounding :(
+    expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
     assert score_weights == expected
 
 
diff --git a/spacy/util.py b/spacy/util.py
index f7c5cff59..709da8d29 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1202,11 +1202,16 @@ def get_arg_names(func: Callable) -> List[str]:
     return list(set([*argspec.args, *argspec.kwonlyargs]))
 
 
-def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]:
+def combine_score_weights(
+    weights: List[Dict[str, float]],
+    overrides: Dict[str, Optional[Union[float, int]]] = SimpleFrozenDict(),
+) -> Dict[str, float]:
     """Combine and normalize score weights defined by components, e.g.
     {"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}.
 
     weights (List[dict]): The weights defined by the components.
+    overrides (Dict[str, Optional[Union[float, int]]]): Existing scores that
+        should be preserved.
     RETURNS (Dict[str, float]): The combined and normalized weights.
     """
     # We first need to extract all None/null values for score weights that
@@ -1216,6 +1221,7 @@ def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]:
     for w_dict in weights:
         filtered_weights = {}
         for key, value in w_dict.items():
+            value = overrides.get(key, value)
             if value is None:
                 result[key] = None
             else:
@@ -1227,7 +1233,7 @@ def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]:
         # components.
         total = sum(w_dict.values())
         for key, value in w_dict.items():
-            weight = round(value / total / len(weights), 2)
+            weight = round(value / total / len(all_weights), 2)
             result[key] = result.get(key, 0.0) + weight
     return result
 

From 4eb39b5c43c74f8eabc1b2a8fa3b68e8baa02d3a Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 11:04:35 +0200
Subject: [PATCH 137/516] Fix logging

---
 spacy/errors.py           |  1 +
 spacy/training/loggers.py | 16 +++++++++++-----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 47a134c1f..ee2091225 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -480,6 +480,7 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})")
     E917 = ("Received invalid value {value} for 'state_type' in "
             "TransitionBasedParser: only 'parser' or 'ner' are valid options.")
     E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index dddf20169..d35b5a4bd 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -13,7 +13,8 @@ def console_logger():
     ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
         # we assume here that only components are enabled that should be trained & logged
         logged_pipes = nlp.pipe_names
-        score_cols = list(nlp.config["training"]["score_weights"])
+        score_weights = nlp.config["training"]["score_weights"]
+        score_cols = [col for col, value in score_weights.items() if value is not None]
         score_widths = [max(len(col), 6) for col in score_cols]
         loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
         loss_widths = [max(len(col), 8) for col in loss_cols]
@@ -40,10 +41,15 @@ def console_logger():
                 ) from None
             scores = []
             for col in score_cols:
-                score = float(info["other_scores"].get(col, 0.0))
-                if col != "speed":
-                    score *= 100
-                scores.append("{0:.2f}".format(score))
+                score = info["other_scores"].get(col, 0.0)
+                try:
+                    score = float(score)
+                    if col != "speed":
+                        score *= 100
+                    scores.append("{0:.2f}".format(score))
+                except TypeError:
+                    err = Errors.E916.format(name=col, score_type=type(score))
+                    raise TypeError(err) from None
             data = (
                 [info["epoch"], info["step"]]
                 + losses

From f69fea8b252ac5f28c4daac40046df507ab6f07f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 11:29:07 +0200
Subject: [PATCH 138/516] Improve error handling around non-number scores

---
 spacy/cli/train.py        | 7 ++++++-
 spacy/errors.py           | 4 ++++
 spacy/training/loggers.py | 2 +-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 3485a4ff2..eabc82be0 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -214,7 +214,12 @@ def create_evaluation_callback(
     def evaluate() -> Tuple[float, Dict[str, float]]:
         dev_examples = list(dev_corpus(nlp))
         scores = nlp.evaluate(dev_examples)
-        # Calculate a weighted sum based on score_weights for the main score
+        # Calculate a weighted sum based on score_weights for the main score.
+        # We can only consider scores that are ints/floats, not dicts like
+        # entity scores per type etc.
+        for key, value in scores.items():
+            if key in weights and not isinstance(value, (int, float)):
+                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
         try:
             weighted_score = sum(
                 scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
diff --git a/spacy/errors.py b/spacy/errors.py
index ee2091225..dce5cf51c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -480,6 +480,10 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
+            "float or int but got: {score_type}. To exclude the score from the "
+            "final score, set its weight to null in the [training.score_weights] "
+            "section of your training config.")
     E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})")
     E917 = ("Received invalid value {value} for 'state_type' in "
             "TransitionBasedParser: only 'parser' or 'ner' are valid options.")
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index d35b5a4bd..0f054d433 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -49,7 +49,7 @@ def console_logger():
                     scores.append("{0:.2f}".format(score))
                 except TypeError:
                     err = Errors.E916.format(name=col, score_type=type(score))
-                    raise TypeError(err) from None
+                    raise ValueError(err) from None
             data = (
                 [info["epoch"], info["step"]]
                 + losses

From 8eaacaae97f0caf77576e843a8d6bcf866c79236 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 24 Sep 2020 12:36:51 +0200
Subject: [PATCH 139/516] Refactor Doc.ents setter to use Doc.set_ents

Additional changes:

* Entity spans with missing labels are ignored
* Fix ent_kb_id setting in `Doc.set_ents`
---
 spacy/tests/doc/test_add_entities.py |  4 +--
 spacy/tests/doc/test_doc_api.py      |  2 +-
 spacy/tokens/doc.pyx                 | 50 ++++++----------------------
 3 files changed, 14 insertions(+), 42 deletions(-)

diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 40aff8e31..615ab9e5b 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -29,10 +29,10 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     ner.begin_training(lambda: [_ner_example(ner)])
     ner(doc)
 
-    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
+    doc.ents = [("ANIMAL", 3, 4)]
     assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
 
-    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
+    doc.ents = [("WORD", 0, 2)]
     assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
 
 
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 892b65cf4..e5e72fe2a 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -534,4 +534,4 @@ def test_doc_ents_setter():
     vocab = Vocab()
     ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
     doc = Doc(vocab, words=words, ents=ents)
-    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
\ No newline at end of file
+    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 4bf6f0e5e..670c7440f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -673,49 +673,16 @@ cdef class Doc:
             # TODO:
             # 1. Test basic data-driven ORTH gazetteer
             # 2. Test more nuanced date and currency regex
-            tokens_in_ents = {}
-            cdef attr_t entity_type
-            cdef attr_t kb_id
-            cdef int ent_start, ent_end, token_index
+            cdef attr_t entity_type, kb_id
+            cdef int ent_start, ent_end
+            ent_spans = []
             for ent_info in ents:
                 entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info)
                 if isinstance(entity_type_, str):
                     self.vocab.strings.add(entity_type_)
-                entity_type = self.vocab.strings.as_int(entity_type_)
-                for token_index in range(ent_start, ent_end):
-                    if token_index in tokens_in_ents:
-                        raise ValueError(Errors.E103.format(
-                            span1=(tokens_in_ents[token_index][0],
-                                   tokens_in_ents[token_index][1],
-                                   self.vocab.strings[tokens_in_ents[token_index][2]]),
-                            span2=(ent_start, ent_end, self.vocab.strings[entity_type])))
-                    tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
-            cdef int i
-            for i in range(self.length):
-                # default values
-                entity_type = 0
-                kb_id = 0
-
-                # Set ent_iob to Outside (2) by default
-                ent_iob = 2
-
-                # overwrite if the token was part of a specified entity
-                if i in tokens_in_ents.keys():
-                    ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
-                    if entity_type is None or entity_type <= 0:
-                        # Only allow labelled spans
-                        print(i, ent_start, ent_end, entity_type)
-                        raise ValueError(Errors.E1013)
-                    elif ent_start == i:
-                        # Marking the start of an entity
-                        ent_iob = 3
-                    else:
-                        # Marking the inside of an entity
-                        ent_iob = 1
-
-                self.c[i].ent_type = entity_type
-                self.c[i].ent_kb_id = kb_id
-                self.c[i].ent_iob = ent_iob
+                span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id)
+                ent_spans.append(span)
+            self.set_ents(ent_spans, default=SetEntsDefault.outside)
 
     def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside):
         """Set entity annotation.
@@ -734,6 +701,9 @@ cdef class Doc:
         if default not in SetEntsDefault.values():
             raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault)))
 
+        # Ignore spans with missing labels
+        entities = [ent for ent in entities if ent.label > 0]
+
         if blocked is None:
             blocked = tuple()
         if missing is None:
@@ -742,6 +712,7 @@ cdef class Doc:
             outside = tuple()
 
         # Find all tokens covered by spans and check that none are overlapping
+        cdef int i
         seen_tokens = set()
         for span in itertools.chain.from_iterable([entities, blocked, missing, outside]):
             if not isinstance(span, Span):
@@ -761,6 +732,7 @@ cdef class Doc:
                 else:
                     self.c[i].ent_iob = 1
                 self.c[i].ent_type = span.label
+                self.c[i].ent_kb_id = span.kb_id
         for span in blocked:
             for i in range(span.start, span.end):
                 self.c[i].ent_iob = 3

From d7ab6a2ffe8e11ee644286ea815bae8cf59bfabb Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 12:37:21 +0200
Subject: [PATCH 140/516] Update docs [ci skip]

---
 website/docs/usage/_benchmarks-models.md | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index a00229867..4b25418b5 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -22,12 +22,13 @@ import { Help } from 'components/typography'; import Link from 'components/link'
 <figure>
 
 | Named Entity Recognition Model                                                 | OntoNotes | CoNLL '03 |
-| ------------------------------------------------------------------------------ | --------: | --------- |
-| spaCy RoBERTa (2020)                                                           |
-| spaCy CNN (2020)                                                               |           |
-| spaCy CNN (2017)                                                               |      86.4 |
-| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup>      |      88.8 |
-| <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link><sup>2</sup> |      89.7 |
+| ------------------------------------------------------------------------------ | --------: | --------: |
+| spaCy RoBERTa (2020)                                                           |           |      92.2 |
+| spaCy CNN (2020)                                                               |           |      88.4 |
+| spaCy CNN (2017)                                                               |      86.4 |           |
+| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup>      |      88.8 |      92.1 |
+| <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link><sup>2</sup> |      89.7 |      93.1 |
+| BERT Base<sup>3</sup>                                                          |         - |      92.4 |
 
 <figcaption class="caption">
 
@@ -36,7 +37,8 @@ import { Help } from 'components/typography'; import Link from 'components/link'
 [CoNLL-2003](https://www.aclweb.org/anthology/W03-0419.pdf) corpora. See
 [NLP-progress](http://nlpprogress.com/english/named_entity_recognition.html) for
 more results. **1. ** [Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf).
-**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/)
+**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/). **3.
+** [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805).
 
 </figcaption>
 

From be56c0994b09a8ba5042eb563d05ea5bb7f75a6d Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 12:40:25 +0200
Subject: [PATCH 141/516] Add [training.before_to_disk] callback

---
 spacy/cli/train.py       | 18 ++++++++++++++++++
 spacy/default_config.cfg |  2 ++
 spacy/errors.py          |  3 +++
 spacy/schemas.py         |  1 +
 4 files changed, 24 insertions(+)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index eabc82be0..6d61c2425 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -97,6 +97,7 @@ def train(
     dev_corpus = dot_to_object(config, T_cfg["dev_corpus"])
     batcher = T_cfg["batcher"]
     train_logger = T_cfg["logger"]
+    before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"])
     # Components that shouldn't be updated during training
     frozen_components = T_cfg["frozen_components"]
     # Sourced components that require resume_training
@@ -167,6 +168,7 @@ def train(
                     with nlp.select_pipes(disable=frozen_components):
                         update_meta(T_cfg, nlp, info)
                     with nlp.use_params(optimizer.averages):
+                        nlp = before_to_disk(nlp)
                         nlp.to_disk(output_path / "model-best")
                 progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
                 progress.set_description(f"Epoch {info['epoch']}")
@@ -179,6 +181,7 @@ def train(
                 f"Aborting and saving the final best model. "
                 f"Encountered exception: {str(e)}"
             )
+            nlp = before_to_disk(nlp)
             nlp.to_disk(output_path / "model-final")
         raise e
     finally:
@@ -233,6 +236,21 @@ def create_evaluation_callback(
     return evaluate
 
 
+def create_before_to_disk_callback(
+    callback: Optional[Callable[[Language], Language]]
+) -> Callable[[Language], Language]:
+    def before_to_disk(nlp: Language) -> Language:
+        if not callback:
+            return nlp
+        modified_nlp = callback(nlp)
+        if not isinstance(modified_nlp, Language):
+            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
+            raise ValueError(err)
+        return modified_nlp
+
+    return before_to_disk
+
+
 def train_while_improving(
     nlp: Language,
     optimizer: Optimizer,
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 5cd97a0eb..6f8c0aa00 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -72,6 +72,8 @@ frozen_components = []
 dev_corpus = "corpora.dev"
 # Location in the config where the train corpus is defined
 train_corpus = "corpora.train"
+# Optional callback before nlp object is saved to disk after training
+before_to_disk = null
 
 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"
diff --git a/spacy/errors.py b/spacy/errors.py
index dce5cf51c..d67f01a1d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -480,6 +480,9 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E914 = ("Executing {name} callback failed. Expected the function to "
+            "return the nlp object but got: {value}. Maybe you forgot to return "
+            "the modified object in your function?")
     E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
             "float or int but got: {score_type}. To exclude the score from the "
             "final score, set its weight to null in the [training.score_weights] "
diff --git a/spacy/schemas.py b/spacy/schemas.py
index e34841008..6a9a82d06 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -217,6 +217,7 @@ class ConfigSchemaTraining(BaseModel):
     optimizer: Optimizer = Field(..., title="The optimizer to use")
     logger: Logger = Field(..., title="The logger to track training progress")
     frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
+    before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
     # fmt: on
 
     class Config:

From 138c8d45dbd1372fafe6b280fdedf33790d20d32 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 12:43:39 +0200
Subject: [PATCH 142/516] Update docs

---
 website/docs/api/data-formats.md | 45 ++++++++++++++++----------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index e3b3900be..6f156fe37 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -180,26 +180,27 @@ single corpus once and then divide it up into `train` and `dev` partitions.
 This section defines settings and controls for the training and evaluation
 process that are used when you run [`spacy train`](/api/cli#train).
 
-| Name                  | Description                                                                                                                                                                                                  |
-| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                       |
-| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
-| `dev_corpus`          | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~                                                                                                              |
-| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                               |
-| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                    |
-| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                              |
-| `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                            |
-| `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                              |
-| `lookups`             | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~                                                     |
-| `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                              |
-| `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                    |
-| `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                      |
-| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~                                                                                                              |
-| `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~                                          |
-| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                |
-| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                              |
-| `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                          |
-| `vectors`             | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~                                             |
+| Name                  | Description                                                                                                                                                                                                                               |
+| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                                                    |
+| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                              |
+| `before_to_disk`      | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
+| `dev_corpus`          | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~                                                                                                                                           |
+| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                                                            |
+| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                 |
+| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                           |
+| `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                                         |
+| `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                                                           |
+| `lookups`             | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~                                                                                  |
+| `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                                                           |
+| `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                                                 |
+| `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                   |
+| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~                                                                                                                                           |
+| `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~                                                                       |
+| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                             |
+| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                           |
+| `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                                                       |
+| `vectors`             | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~                                                                          |
 
 ### pretraining {#config-pretraining tag="section,optional"}
 
@@ -275,8 +276,8 @@ $ python -m spacy convert ./data.json ./output.spacy
 > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes
 > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token
 > representing a `PERSON` entity. The
-> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function
-> can help you convert entity offsets to the right format.
+> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function can
+> help you convert entity offsets to the right format.
 
 ```python
 ### Example structure

From 1c63f02f99d6c3d663c4a9cfb0e3395986bd7598 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 24 Sep 2020 12:51:16 +0200
Subject: [PATCH 143/516] Add API docs

---
 website/docs/api/doc.md | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 7175f6e7f..e10d9d077 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -219,6 +219,30 @@ alignment mode `"strict".
 | `alignment_mode`                     | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
 | **RETURNS**                          | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   |
 
+## Doc.set_ents {#set_ents tag="method" new="3"}
+
+Set the named entities in the document.
+
+> #### Example
+>
+> ```python
+> from spacy.tokens import Span
+> doc = nlp("Mr. Best flew to New York on Saturday morning.")
+> doc.set_ents([Span(doc, 0, 2, "PERSON")])
+> ents = list(doc.ents)
+> assert ents[0].label_ == "PERSON"
+> assert ents[0].text == "Mr. Best"
+> ```
+
+| Name           | Description                                                                                                                                                                               |
+| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `entities`     | Spans with labels to set as entities. ~~List[Span]~~                                                                                                                                      |
+| _keyword-only_ |                                                                                                                                                                                           |
+| `blocked`      | Spans to set as "blocked" (never an entity) for spaCy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~                                        |
+| `missing`      | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~                                                                                                                   |
+| `outside`      | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~                                                                                                                            |
+| `default`      | How to set entity annotation for tokens outside of any provided spans. Options: "blocked", "missing", "outside" and "unmodified" (preserve current state). Defaults to "outside". ~~str~~ |
+
 ## Doc.similarity {#similarity tag="method" model="vectors"}
 
 Make a semantic similarity estimate. The default estimate is cosine similarity
@@ -542,7 +566,6 @@ objects, if the entity recognizer has been applied.
 > ```python
 > doc = nlp("Mr. Best flew to New York on Saturday morning.")
 > ents = list(doc.ents)
-> assert ents[0].label == 346
 > assert ents[0].label_ == "PERSON"
 > assert ents[0].text == "Mr. Best"
 > ```

From 3dd5f409ec874fbb57cf020577eeff03b5c98bc6 Mon Sep 17 00:00:00 2001
From: walterhenry <55140654+walterhenry@users.noreply.github.com>
Date: Thu, 24 Sep 2020 13:15:28 +0200
Subject: [PATCH 144/516] Proofreading

Proofread some API docs
---
 website/docs/api/architectures.md      | 14 +++++++-------
 website/docs/api/attributeruler.md     |  4 ++--
 website/docs/api/cli.md                |  4 ++--
 website/docs/api/data-formats.md       |  4 ++--
 website/docs/api/dependencyparser.md   |  3 +--
 website/docs/api/doc.md                |  2 +-
 website/docs/api/entitylinker.md       |  2 +-
 website/docs/api/entityrecognizer.md   |  4 ++--
 website/docs/api/entityruler.md        |  2 +-
 website/docs/api/example.md            |  8 ++++----
 website/docs/api/language.md           | 16 +++++++---------
 website/docs/api/lemmatizer.md         |  2 +-
 website/docs/api/matcher.md            |  2 +-
 website/docs/api/morphology.md         |  4 ++--
 website/docs/api/pipeline-functions.md |  2 +-
 website/docs/api/sentencerecognizer.md |  2 +-
 website/docs/api/sentencizer.md        |  4 ++--
 website/docs/api/span.md               |  2 +-
 18 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 30d863b17..698e1ee56 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 
 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build a mixed representations. The features used
+a feed-forward subnetwork to build mixed representations. The features used
 are the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying
 definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from
 pretrained static vectors can also be incorporated into the concatenated
@@ -170,7 +170,7 @@ representation.
 > nC = 8
 > ```
 
-Construct an embedded representations based on character embeddings, using a
+Construct an embedded representation based on character embeddings, using a
 feed-forward network. A fixed number of UTF-8 byte characters are used for each
 word, taken from the beginning and end of the word equally. Padding is used in
 the center for words that are too short.
@@ -392,7 +392,7 @@ a single token vector given zero or more wordpiece vectors.
 > ```
 
 Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
-**not** allow multiple components to share the transformer weights, and does
+**not** allow multiple components to share the transformer weights and does
 **not** allow the transformer to set annotations into the [`Doc`](/api/doc)
 object, but it's a **simpler solution** if you only need the transformer within
 one component.
@@ -436,7 +436,7 @@ might find [this tutorial](https://explosion.ai/blog/parsing-english-in-python)
 helpful for background information. The neural network state prediction model
 consists of either two or three subnetworks:
 
-- **tok2vec**: Map each token into a vector representations. This subnetwork is
+- **tok2vec**: Map each token into a vector representation. This subnetwork is
   run once for each batch.
 - **lower**: Construct a feature-specific vector for each `(token, feature)`
   pair. This is also run once for each batch. Constructing the state
@@ -573,14 +573,14 @@ architecture is usually less accurate than the ensemble, but runs faster.
 > nO = null
 > ```
 
-An ngram "bag-of-words" model. This architecture should run much faster than the
+An n-gram "bag-of-words" model. This architecture should run much faster than the
 others, but may not be as accurate, especially if texts are short.
 
 | Name                | Description                                                                                                                                                                                        |
 | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                         |
 | `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~                                                |
-| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~                                                               |
+| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                               |
 | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
 | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                   |
 
@@ -594,7 +594,7 @@ into the "real world". This requires 3 main components:
   synonyms and prior probabilities.
 - A candidate generation step to produce a set of likely identifiers, given a
   certain textual mention.
-- A Machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the
+- A machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the
   most plausible ID from the set of candidates.
 
 ### spacy.EntityLinker.v1 {#EntityLinker}
diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md
index 53c8c46cf..60fda6bda 100644
--- a/website/docs/api/attributeruler.md
+++ b/website/docs/api/attributeruler.md
@@ -71,7 +71,7 @@ pattern_dicts = [
 
 ## AttributeRuler.\_\_call\_\_ {#call tag="method"}
 
-Apply the attribute ruler to a Doc, setting token attributes for tokens matched
+Apply the attribute ruler to a `Doc`, setting token attributes for tokens matched
 by the provided patterns.
 
 | Name        | Description                      |
@@ -256,6 +256,6 @@ serialization by passing in the string names via the `exclude` argument.
 | Name       | Description                                                    |
 | ---------- | -------------------------------------------------------------- |
 | `vocab`    | The shared [`Vocab`](/api/vocab).                              |
-| `patterns` | The Matcher patterns. You usually don't want to exclude this.  |
+| `patterns` | The `Matcher` patterns. You usually don't want to exclude this.  |
 | `attrs`    | The attributes to set. You usually don't want to exclude this. |
 | `indices`  | The token indices. You usually don't want to exclude this.     |
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 8449d23e1..2a216f5f8 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -81,7 +81,7 @@ $ python -m spacy info [model] [--markdown] [--silent]
 Find all trained pipeline packages installed in the current environment and
 check whether they are compatible with the currently installed version of spaCy.
 Should be run after upgrading spaCy via `pip install -U spacy` to ensure that
-all installed packages are can be used with the new version. It will show a list
+all installed packages can be used with the new version. It will show a list
 of packages and their installed versions. If any package is out of date, the
 latest compatible versions and command for updating are shown.
 
@@ -406,7 +406,7 @@ File       /path/to/spacy/training/corpus.py (line 18)
 
 ### debug data {#debug-data tag="command"}
 
-Analyze, debug, and validate your training and development data. Get useful
+Analyze, debug and validate your training and development data. Get useful
 stats, and find problems like invalid entity annotations, cyclic dependencies,
 low data labels and more.
 
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 79ecb08b3..7c7b58a15 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -188,7 +188,7 @@ Typically, the extension for these binary files is `.spacy`, and they are used
 as input format for specifying a [training corpus](/api/corpus) and for spaCy's
 CLI [`train`](/api/cli#train) command. The built-in
 [`convert`](/api/cli#convert) command helps you convert spaCy's previous
-[JSON format](#json-input) to the new binary format format. It also supports
+[JSON format](#json-input) to the new binary format. It also supports
 conversion of the `.conllu` format used by the
 [Universal Dependencies corpora](https://github.com/UniversalDependencies).
 
@@ -252,7 +252,7 @@ $ python -m spacy convert ./data.json ./output.spacy
 
 <Accordion title="Sample JSON data" spaced>
 
-Here's an example of dependencies, part-of-speech tags and names entities, taken
+Here's an example of dependencies, part-of-speech tags and named entities, taken
 from the English Wall Street Journal portion of the Penn Treebank:
 
 ```json
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index 8af4455d3..7e809c642 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -21,8 +21,7 @@ non-projective parses.
 The parser is trained using an **imitation learning objective**. It follows the
 actions predicted by the current weights, and at each state, determines which
 actions are compatible with the optimal parse that could be reached from the
-current state. The weights such that the scores assigned to the set of optimal
-actions is increased, while scores assigned to other actions are decreased. Note
+current state. The weights are updated such that the scores assigned to the set of optimal actions are increased, while scores assigned to other actions are decreased. Note
 that more than one action may be optimal for a given state.
 
 ## Config and implementation {#config}
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 88dc62c2a..b4097ddb7 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -445,7 +445,7 @@ Mark a span for merging. The `attrs` will be applied to the resulting token (if
 they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
 underlying lexeme (if they're context-independent lexical attributes like
 `LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a
-dictionary mapping attribute names to values as the `"_"` key.
+dictionary mapping attribute names to values as the `"_"` key.
 
 > #### Example
 >
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 9cb35b487..890548f0e 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -94,7 +94,7 @@ providing custom registered functions.
 
 ## EntityLinker.\_\_call\_\_ {#call tag="method"}
 
-Apply the pipe to one document. The document is modified in place, and returned.
+Apply the pipe to one document. The document is modified in place and returned.
 This usually happens under the hood when the `nlp` object is called on a text
 and all pipeline components are applied to the `Doc` in order. Both
 [`__call__`](/api/entitylinker#call) and [`pipe`](/api/entitylinker#pipe)
diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 8af73f44b..d22dae12c 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -43,7 +43,7 @@ architectures and their arguments and hyperparameters.
 
 | Setting                       | Description                                                                                                                                                                                                                                         |
 | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `moves`                       | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]                                                                                                                                       |
+| `moves`                       | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~                                                                                                                                       |
 | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
 | `model`                       | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~                                                 |
 
@@ -83,7 +83,7 @@ shortcut for this and instantiate the component using its string name and
 
 ## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
 
-Apply the pipe to one document. The document is modified in place, and returned.
+Apply the pipe to one document. The document is modified in place and returned.
 This usually happens under the hood when the `nlp` object is called on a text
 and all pipeline components are applied to the `Doc` in order. Both
 [`__call__`](/api/entityrecognizer#call) and
diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md
index 7be44bc95..7b7e5b635 100644
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@@ -256,6 +256,6 @@ Get all patterns that were added to the entity ruler.
 | Name              | Description                                                                                                           |
 | ----------------- | --------------------------------------------------------------------------------------------------------------------- |
 | `matcher`         | The underlying matcher used to process token patterns. ~~Matcher~~                                                    |
-| `phrase_matcher`  | The underlying phrase matcher, used to process phrase patterns. ~~PhraseMatcher~~                                     |
+| `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~                                     |
 | `token_patterns`  | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
 | `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~                             |
diff --git a/website/docs/api/example.md b/website/docs/api/example.md
index 668c8028f..2811f4d91 100644
--- a/website/docs/api/example.md
+++ b/website/docs/api/example.md
@@ -33,8 +33,8 @@ both documents.
 
 | Name           | Description                                                                                                              |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
-| `predicted`    | The document containing (partial) predictions. Can not be `None`. ~~Doc~~                                                |
-| `reference`    | The document containing gold-standard annotations. Can not be `None`. ~~Doc~~                                            |
+| `predicted`    | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~                                                |
+| `reference`    | The document containing gold-standard annotations. Cannot be `None`. ~~Doc~~                                            |
 | _keyword-only_ |                                                                                                                          |
 | `alignment`    | An object holding the alignment between the tokens of the `predicted` and `reference` documents. ~~Optional[Alignment]~~ |
 
@@ -58,8 +58,8 @@ see the [training format documentation](/api/data-formats#dict-input).
 
 | Name           | Description                                                               |
 | -------------- | ------------------------------------------------------------------------- |
-| `predicted`    | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ |
-| `example_dict` | `Dict[str, obj]`                                                          | The gold-standard annotations as a dictionary. Can not be `None`. ~~Dict[str, Any]~~ |
+| `predicted`    | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ |
+| `example_dict` | The gold-standard annotations as a dictionary. Cannot be `None`. ~~Dict[str, Any]~~ |
 | **RETURNS**    | The newly constructed object. ~~Example~~                                 |
 
 ## Example.text {#text tag="property"}
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index ffdae9ec6..92663c44a 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -46,9 +46,7 @@ information in [`Language.meta`](/api/language#meta) and not to configure the
 ## Language.from_config {#from_config tag="classmethod" new="3"}
 
 Create a `Language` object from a loaded config. Will set up the tokenizer and
-language data, add pipeline components based on the pipeline and components
-define in the config and validate the results. If no config is provided, the
-default config of the given language is used. This is also how spaCy loads a
+language data, add pipeline components based on the pipeline and components defined in the config and validate the results. If no config is provided, the default config of the given language is used. This is also how spaCy loads a
 model under the hood based on its [`config.cfg`](/api/data-formats#config).
 
 > #### Example
@@ -107,7 +105,7 @@ decorator. For more details and examples, see the
 | `assigns`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
 | `requires`     | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
 | `retokenizes`  | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~                                               |
-| `func`         | Optional function if not used a a decorator. ~~Optional[Callable[[Doc], Doc]]~~                                                                                    |
+| `func`         | Optional function if not used as a decorator. ~~Optional[Callable[[Doc], Doc]]~~                                                                                    |
 
 ## Language.factory {#factory tag="classmethod"}
 
@@ -155,7 +153,7 @@ examples, see the
 | `retokenizes`           | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~                                                                                                                             |
 | `scores`                | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~                                                                     |
 | `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
-| `func`                  | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~                                                                                                                                                 |
+| `func`                  | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~                                                                                                                                                 |
 
 ## Language.\_\_call\_\_ {#call tag="method"}
 
@@ -602,7 +600,7 @@ does nothing.
 
 ## Language.enable_pipe {#enable_pipe tag="method" new="3"}
 
-Enable a previously disable component (e.g. via
+Enable a previously disabled component (e.g. via
 [`Language.disable_pipes`](/api/language#disable_pipes)) so it's run as part of
 the pipeline, [`nlp.pipeline`](/api/language#pipeline). If the component is
 already enabled, this method does nothing.
@@ -629,7 +627,7 @@ pipeline will be restored to the initial state at the end of the block.
 Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method
 you can use to undo your changes. You can specify either `disable` (as a list or
 string), or `enable`. In the latter case, all components not in the `enable`
-list, will be disabled. Under the hood, this method calls into
+list will be disabled. Under the hood, this method calls into
 [`disable_pipe`](/api/language#disable_pipe) and
 [`enable_pipe`](/api/language#enable_pipe).
 
@@ -662,7 +660,7 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
 | -------------- | ------------------------------------------------------------------------------------------------------ |
 | _keyword-only_ |                                                                                                        |
 | `disable`      | Name(s) of pipeline components to disable. ~~Optional[Union[str, Iterable[str]]]~~                     |
-| `enable`       | Names(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~     |
+| `enable`       | Name(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~     |
 | **RETURNS**    | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ |
 
 ## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"}
@@ -874,7 +872,7 @@ Loads state from a directory, including all data that was saved with the
 
 <Infobox variant="warning" title="Important note">
 
-Keep in mind that this method **only loads serialized state** and doesn't set up
+Keep in mind that this method **only loads the serialized state** and doesn't set up
 the `nlp` object. This means that it requires the correct language class to be
 initialized and all pipeline components to be added to the pipeline. If you want
 to load a serialized pipeline from a directory, you should use
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index f9978dcf9..3693429c4 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -38,7 +38,7 @@ The default config is defined by the pipeline component factory and describes
 how the component should be configured. You can override its settings via the
 `config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
 [`config.cfg` for training](/usage/training#config). For examples of the lookups
-data formats used by the lookup and rule-based lemmatizers, see
+data format used by the lookup and rule-based lemmatizers, see
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
 
 > #### Example
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 1f1946be5..3b885727b 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -61,7 +61,7 @@ matched:
 | `!` | Negate the pattern, by requiring it to match exactly 0 times.    |
 | `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
 | `+` | Require the pattern to match 1 or more times.                    |
-| `*` | Allow the pattern to match zero or more times.                   |
+| `*` | Allow the pattern to match 0 or more times.                   |
 
 Token patterns can also map to a **dictionary of properties** instead of a
 single value to indicate whether the expected value is a member of a list or how
diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md
index 5d5324061..e64f26bdd 100644
--- a/website/docs/api/morphology.md
+++ b/website/docs/api/morphology.md
@@ -12,7 +12,7 @@ container storing a single morphological analysis.
 
 ## Morphology.\_\_init\_\_ {#init tag="method"}
 
-Create a Morphology object.
+Create a `Morphology` object.
 
 > #### Example
 >
@@ -101,7 +101,7 @@ representation.
 | Name         | Description                                                                                                                                             |
 | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~                                                                                          |
-| **RETURNS**  | The morphological features as in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
+| **RETURNS**  | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
 
 ## Attributes {#attributes}
 
diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md
index 0dc03a16a..8bb52d0f9 100644
--- a/website/docs/api/pipeline-functions.md
+++ b/website/docs/api/pipeline-functions.md
@@ -26,7 +26,7 @@ Merge noun chunks into a single token. Also available via the string name
 
 <Infobox variant="warning">
 
-Since noun chunks require part-of-speech tags and the dependency parse, make
+Since noun chunks require part-of-speech tags and a dependency parse, make
 sure to add this component _after_ the `"tagger"` and `"parser"` components. By
 default, `nlp.add_pipe` will add components to the end of the pipeline and after
 all other components.
diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md
index acf94fb8e..131ef26ce 100644
--- a/website/docs/api/sentencerecognizer.md
+++ b/website/docs/api/sentencerecognizer.md
@@ -202,7 +202,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
 ## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
-current model to make predictions similar to an initial model, to try to address
+current model to make predictions similar to an initial model to try to address
 the "catastrophic forgetting" problem. This feature is experimental.
 
 > #### Example
diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md
index ae31e4ddf..594a85f74 100644
--- a/website/docs/api/sentencizer.md
+++ b/website/docs/api/sentencizer.md
@@ -8,7 +8,7 @@ api_string_name: sentencizer
 api_trainable: false
 ---
 
-A simple pipeline component, to allow custom sentence boundary detection logic
+A simple pipeline component to allow custom sentence boundary detection logic
 that doesn't require the dependency parse. By default, sentence segmentation is
 performed by the [`DependencyParser`](/api/dependencyparser), so the
 `Sentencizer` lets you implement a simpler, rule-based strategy that doesn't
@@ -130,7 +130,7 @@ Score a batch of examples.
 
 ## Sentencizer.to_disk {#to_disk tag="method"}
 
-Save the sentencizer settings (punctuation characters) a directory. Will create
+Save the sentencizer settings (punctuation characters) to a directory. Will create
 a file `sentencizer.json`. This also happens automatically when you save an
 `nlp` object with a sentencizer added to its pipeline.
 
diff --git a/website/docs/api/span.md b/website/docs/api/span.md
index 1c7bc9592..242ceaed0 100644
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@@ -8,7 +8,7 @@ A slice from a [`Doc`](/api/doc) object.
 
 ## Span.\_\_init\_\_ {#init tag="method"}
 
-Create a Span object from the slice `doc[start : end]`.
+Create a `Span` object from the slice `doc[start : end]`.
 
 > #### Example
 >

From 6836b664330926a401d05f16fe95cf475febff08 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 13:41:25 +0200
Subject: [PATCH 145/516] Update docs and resolve todos [ci skip]

---
 website/docs/usage/_benchmarks-models.md      | 8 ++++----
 website/docs/usage/embeddings-transformers.md | 2 --
 website/docs/usage/facts-figures.md           | 2 +-
 website/docs/usage/linguistic-features.md     | 9 ++++++---
 website/docs/usage/processing-pipelines.md    | 7 +++++--
 website/docs/usage/projects.md                | 5 ++++-
 6 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index 4b25418b5..5b193d3a4 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -1,10 +1,10 @@
 import { Help } from 'components/typography'; import Link from 'components/link'
 
-<!-- TODO: update, add project template -->
+<!-- TODO: update numbers -->
 
 <figure>
 
-| System                                                     | Parser | Tagger |  NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
+| Pipeline                                                   | Parser | Tagger |  NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
 | ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
 | [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) |        |        |      |                                                                     |                                                                 6k |
 | [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)   |        |        |      |                                                                     |                                                                    |
@@ -21,10 +21,10 @@ import { Help } from 'components/typography'; import Link from 'components/link'
 
 <figure>
 
-| Named Entity Recognition Model                                                 | OntoNotes | CoNLL '03 |
+| Named Entity Recognition System                                                | OntoNotes | CoNLL '03 |
 | ------------------------------------------------------------------------------ | --------: | --------: |
 | spaCy RoBERTa (2020)                                                           |           |      92.2 |
-| spaCy CNN (2020)                                                               |           |      88.4 |
+| spaCy CNN (2020)                                                               |      85.3 |      88.4 |
 | spaCy CNN (2017)                                                               |      86.4 |           |
 | [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup>      |      88.8 |      92.1 |
 | <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link><sup>2</sup> |      89.7 |      93.1 |
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index d61172a5b..b00760e62 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -235,8 +235,6 @@ The `Transformer` component sets the
 [`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
 which lets you access the transformers outputs at runtime.
 
-<!-- TODO: update/confirm once we have final models trained -->
-
 ```cli
 $ python -m spacy download en_core_trf_lg
 ```
diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md
index 743dae74d..a31559b04 100644
--- a/website/docs/usage/facts-figures.md
+++ b/website/docs/usage/facts-figures.md
@@ -63,7 +63,7 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
 
 <figure>
 
-| System                                                                         |  UAS |  LAS |
+| Dependency Parsing System                                                      |  UAS |  LAS |
 | ------------------------------------------------------------------------------ | ---: | ---: |
 | spaCy RoBERTa (2020)<sup>1</sup>                                               | 96.8 | 95.0 |
 | spaCy CNN (2020)<sup>1</sup>                                                   | 93.7 | 91.8 |
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 914e18acb..d9a894398 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1654,9 +1654,12 @@ The [`SentenceRecognizer`](/api/sentencerecognizer) is a simple statistical
 component that only provides sentence boundaries. Along with being faster and
 smaller than the parser, its primary advantage is that it's easier to train
 because it only requires annotated sentence boundaries rather than full
-dependency parses.
-
-<!-- TODO: update/confirm usage once we have final models trained -->
+dependency parses. spaCy's [trained pipelines](/models) include both a parser
+and a trained sentence segmenter, which is
+[disabled](/usage/processing-pipelines#disabling) by default. If you only need
+sentence boundaries and no parser, you can use the `enable` and `disable`
+arguments on [`spacy.load`](/api/top-level#spacy.load) to enable the senter and
+disable the parser.
 
 > #### senter vs. parser
 >
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 97806dc2a..dbf0881ac 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -253,8 +253,6 @@ different mechanisms you can use:
 Disabled and excluded component names can be provided to
 [`spacy.load`](/api/top-level#spacy.load) as a list.
 
-<!-- TODO: update with info on our models shipped with optional components -->
-
 > #### 💡 Optional pipeline components
 >
 > The `disable` mechanism makes it easy to distribute pipeline packages with
@@ -262,6 +260,11 @@ Disabled and excluded component names can be provided to
 > your pipeline may include a statistical _and_ a rule-based component for
 > sentence segmentation, and you can choose which one to run depending on your
 > use case.
+>
+> For example, spaCy's [trained pipelines](/models) like
+> [`en_core_web_sm`](/models/en#en_core_web_sm) contain both a `parser` and
+> `senter` that perform sentence segmentation, but the `senter` is disabled by
+> default.
 
 ```python
 # Load the pipeline without the entity recognizer
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 8e093e8d6..6d5746308 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -733,7 +733,10 @@ workflows, but only one can be tracked by DVC.
 <Infobox title="This section is still under construction" emoji="🚧" variant="warning">
 
 The Prodigy integration will require a nightly version of Prodigy that supports
-spaCy v3+.
+spaCy v3+. You can already use annotations created with Prodigy in spaCy v3 by
+exporting your data with
+[`data-to-spacy`](https://prodi.gy/docs/recipes#data-to-spacy) and running
+[`spacy convert`](/api/cli#convert) to convert it to the binary format.
 
 </Infobox>
 

From 5c13e0cf1bdf536c54660340e71742bf0493ea07 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 24 Sep 2020 13:41:55 +0200
Subject: [PATCH 146/516] Remove unused error

---
 spacy/errors.py      | 3 ---
 spacy/tokens/doc.pyx | 2 --
 2 files changed, 5 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 27091810d..998e57f27 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -692,9 +692,6 @@ class Errors:
              "options: {modes}")
     E1012 = ("Entity spans and blocked/missing/outside spans should be "
              "provided to doc.set_ents as lists of `Span` objects.")
-    E1013 = ("Unable to set entity for span with empty label. Entity spans are "
-             "required to have a label. To set entity information as missing "
-             "or blocked, use the keyword arguments with doc.set_ents.")
 
 
 @add_codes
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 670c7440f..b4027f87e 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -725,8 +725,6 @@ cdef class Doc:
         # Set all specified entity information
         for span in entities:
             for i in range(span.start, span.end):
-                if not span.label:
-                    raise ValueError(Errors.E1013)
                 if i == span.start:
                     self.c[i].ent_iob = 3
                 else:

From 92f8b6959a359ff4495205df42f9e86c30aeb8f6 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 13:48:41 +0200
Subject: [PATCH 147/516] Fix typo

---
 spacy/errors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index d67f01a1d..708b7fda8 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -481,7 +481,7 @@ class Errors:
 
     # TODO: fix numbering after merging develop into master
     E914 = ("Executing {name} callback failed. Expected the function to "
-            "returnthe nlp object but got: {value}. Maybe you forgot to return "
+            "return the nlp object but got: {value}. Maybe you forgot to return "
             "the modified object in your function?")
     E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
             "float or int but got: {score_type}. To exclude the score from the "

From 88e54caa1275481a43b1069c8ec6d352f554e333 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 14:32:35 +0200
Subject: [PATCH 148/516] accuracy -> performance

---
 spacy/cli/info.py                | 4 +++-
 spacy/schemas.py                 | 3 +--
 website/docs/api/data-formats.md | 2 +-
 website/src/templates/models.js  | 2 +-
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 2b87163c2..2f2515278 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -91,7 +91,9 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
         meta["source"] = str(model_path.resolve())
     else:
         meta["source"] = str(model_path)
-    return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")}
+    return {
+        k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed")
+    }
 
 
 def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str:
diff --git a/spacy/schemas.py b/spacy/schemas.py
index e34841008..1ff73bccc 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -182,8 +182,7 @@ class ModelMetaSchema(BaseModel):
     sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
     vectors: Dict[str, Any] = Field({}, title="Included word vectors")
     labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
-    accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers")
-    speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers")
+    performance: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy and speed numbers")
     spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
     # fmt: on
 
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index e3b3900be..34565f160 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -518,7 +518,7 @@ source of truth** used for loading a pipeline.
 >     "ner": ["PERSON", "ORG", "PRODUCT"],
 >     "textcat": ["POSITIVE", "NEGATIVE"]
 >   },
->   "accuracy": {
+>   "performance": {
 >     "ents_f": 82.7300930714,
 >     "ents_p": 82.135523614,
 >     "ents_r": 83.3333333333,
diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 5d705048b..413f23dc5 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -117,7 +117,7 @@ function formatModelMeta(data) {
         license: data.license,
         labels: isEmptyObj(data.labels) ? null : data.labels,
         vectors: formatVectors(data.vectors),
-        accuracy: formatAccuracy(data.accuracy),
+        accuracy: formatAccuracy(data.performance),
     }
 }
 

From 3b58a8be2b32b29a4a121bf0ed75ae3cd2920ee9 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 14:32:42 +0200
Subject: [PATCH 149/516] Update docs

---
 website/docs/api/data-formats.md         | 4 ++--
 website/docs/usage/_benchmarks-models.md | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 34565f160..0fc3481a4 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -275,8 +275,8 @@ $ python -m spacy convert ./data.json ./output.spacy
 > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes
 > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token
 > representing a `PERSON` entity. The
-> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function
-> can help you convert entity offsets to the right format.
+> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function can
+> help you convert entity offsets to the right format.
 
 ```python
 ### Example structure
diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index 5b193d3a4..88e79112f 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -7,7 +7,7 @@ import { Help } from 'components/typography'; import Link from 'components/link'
 | Pipeline                                                   | Parser | Tagger |  NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
 | ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
 | [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) |        |        |      |                                                                     |                                                                 6k |
-| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)   |        |        |      |                                                                     |                                                                    |
+| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)   |   92.1 |   97.4 | 87.0 |                                                                  7k |                                                                    |
 | `en_core_web_lg` (spaCy v2)                                |   91.9 |   97.2 | 85.9 |                                                                 10k |                                                                    |
 
 <figcaption class="caption">

From 24e7ac3f2bbdab6a1e124c2770c7545cd08906c8 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 14:43:56 +0200
Subject: [PATCH 150/516] Fix download CLI [ci skip]

---
 spacy/cli/download.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 036aeab17..0e7ec2ea5 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -88,7 +88,6 @@ def get_compatibility() -> dict:
 
 
 def get_version(model: str, comp: dict) -> str:
-    model = get_base_version(model)
     if model not in comp:
         msg.fail(
             f"No compatible package found for '{model}' (spaCy v{about.__version__})",

From 3f751e68f596d1c186e0baa125a6cba1ff6a7995 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 14:45:41 +0200
Subject: [PATCH 151/516] Increment version [ci skip]

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 8d019897b..56b05257a 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a23"
+__version__ = "3.0.0a24"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 6bc5058d137daa28184c0494f9380b7832770c59 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 14:53:34 +0200
Subject: [PATCH 152/516] Update models directory [ci skip]

---
 website/src/templates/models.js | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 413f23dc5..cdfe2e46d 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -32,11 +32,17 @@ const MODEL_META = {
     las: 'Labelled dependencies',
     token_acc: 'Tokenization',
     tok: 'Tokenization',
+    lemma: 'Statistical lemmatization',
+    morph: 'Morphological analysis',
     tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
     tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
+    pos: 'Part-of-speech tags (coarse grained tags, Token.pos)',
     ents_f: 'Named entities (F-score)',
     ents_p: 'Named entities (precision)',
     ents_r: 'Named entities (recall)',
+    ner_f: 'Named entities (F-score)',
+    ner_p: 'Named entities (precision)',
+    ner_r: 'Named entities (recall)',
     sent_f: 'Sentence segmentation (F-score)',
     sent_p: 'Sentence segmentation (precision)',
     sent_r: 'Sentence segmentation (recall)',
@@ -88,11 +94,12 @@ function formatVectors(data) {
 }
 
 function formatAccuracy(data) {
+    const exclude = ['speed']
     if (!data) return []
     return Object.keys(data)
         .map(label => {
             const value = data[label]
-            return isNaN(value)
+            return isNaN(value) || exclude.includes(label)
                 ? null
                 : {
                       label,
@@ -109,6 +116,7 @@ function formatModelMeta(data) {
         version: data.version,
         sizeFull: data.size,
         pipeline: data.pipeline,
+        components: data.components,
         notes: data.notes,
         description: data.description,
         sources: data.sources,
@@ -117,7 +125,8 @@ function formatModelMeta(data) {
         license: data.license,
         labels: isEmptyObj(data.labels) ? null : data.labels,
         vectors: formatVectors(data.vectors),
-        accuracy: formatAccuracy(data.performance),
+        // TODO: remove accuracy fallback
+        accuracy: formatAccuracy(data.accuracy || data.performance),
     }
 }
 

From 0bc214c1028bbc33c101c7cc48c3f1a2dff6c663 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 24 Sep 2020 16:11:33 +0200
Subject: [PATCH 153/516] Fix pull

---
 spacy/cli/project/pull.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py
index 3119d3a12..26676d5b3 100644
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@@ -51,7 +51,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
                     update_lockfile(project_dir, cmd)
                 # We remove the command from the list here, and break, so that
                 # we iterate over the loop again.
-                commands.remove(i)
+                commands.pop(i)
                 break
         else:
             # If we didn't break the for loop, break the while loop.

From d0ef4a4cf5f3d2db1e6624634731ac09b2eeda42 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 16:42:13 +0200
Subject: [PATCH 154/516] Prevent division by zero in score weights

---
 spacy/tests/pipeline/test_pipe_factories.py | 5 +++--
 spacy/util.py                               | 5 ++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index 4c197005e..07648024c 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -345,12 +345,13 @@ def test_language_factories_invalid():
             [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
             {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
         ),
-        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
     ],
 )
 def test_language_factories_combine_score_weights(weights, expected):
     result = combine_score_weights(weights)
-    assert sum(result.values()) in (0.99, 1.0)
+    assert sum(result.values()) in (0.99, 1.0, 0.0)
     assert result == expected
 
 
diff --git a/spacy/util.py b/spacy/util.py
index 709da8d29..ad3298651 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1233,7 +1233,10 @@ def combine_score_weights(
         # components.
         total = sum(w_dict.values())
         for key, value in w_dict.items():
-            weight = round(value / total / len(all_weights), 2)
+            if total == 0:
+                weight = 0.0
+            else:
+                weight = round(value / total / len(all_weights), 2)
             result[key] = result.get(key, 0.0) + weight
     return result
 

From c7eedd3534f551d5d23b0dfddc5e2be603780ddd Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 24 Sep 2020 16:53:59 +0200
Subject: [PATCH 155/516] updates to NEL functionality (#6132)

* NEL: read sentences and ents from reference

* fiddling with sent_start annotations

* add KB serialization test

* KB write additional file with strings.json

* score_links function to calculate NEL P/R/F

* formatting

* documentation
---
 spacy/errors.py                               |   4 +-
 spacy/kb.pyx                                  |  39 ++++--
 spacy/pipeline/entity_linker.py               |  26 +++-
 spacy/scorer.py                               |  68 ++++++++++
 spacy/tests/pipeline/test_entity_linker.py    | 121 +++++++++++++++---
 .../tests/{ => training}/test_new_example.py  |  19 +++
 website/docs/api/entitylinker.md              |  15 +++
 website/docs/api/scorer.md                    |  23 ++++
 8 files changed, 273 insertions(+), 42 deletions(-)
 rename spacy/tests/{ => training}/test_new_example.py (91%)

diff --git a/spacy/errors.py b/spacy/errors.py
index 6fdf8cb57..50d2fea5f 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -517,8 +517,8 @@ class Errors:
             "instead.")
     E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
             "property or default function argument?")
-    E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
-            "provided argument {loc} is an existing directory.")
+    E928 = ("A 'KnowledgeBase' can only be serialized to/from a directory, "
+            "but the provided argument {loc} points to a file.")
     E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
             "not seem to exist.")
     E930 = ("Received invalid get_examples callback in {name}.begin_training. "
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index ff5382c24..bdf652766 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -10,6 +10,8 @@ from libcpp.vector cimport vector
 from pathlib import Path
 import warnings
 
+from spacy.strings import StringStore
+
 from spacy import util
 
 from .typedefs cimport hash_t
@@ -83,6 +85,9 @@ cdef class KnowledgeBase:
     DOCS: https://nightly.spacy.io/api/kb
     """
 
+    contents_loc = "contents"
+    strings_loc = "strings.json"
+
     def __init__(self, Vocab vocab, entity_vector_length):
         """Create a KnowledgeBase."""
         self.mem = Pool()
@@ -319,15 +324,29 @@ cdef class KnowledgeBase:
 
         return 0.0
 
-
     def to_disk(self, path):
         path = util.ensure_path(path)
-        if path.is_dir():
+        if not path.exists():
+            path.mkdir(parents=True)
+        if not path.is_dir():
             raise ValueError(Errors.E928.format(loc=path))
-        if not path.parent.exists():
-            path.parent.mkdir(parents=True)
+        self.write_contents(path / self.contents_loc)
+        self.vocab.strings.to_disk(path / self.strings_loc)
 
-        cdef Writer writer = Writer(path)
+    def from_disk(self, path):
+        path = util.ensure_path(path)
+        if not path.exists():
+            raise ValueError(Errors.E929.format(loc=path))
+        if not path.is_dir():
+            raise ValueError(Errors.E928.format(loc=path))
+        self.read_contents(path / self.contents_loc)
+        kb_strings = StringStore()
+        kb_strings.from_disk(path / self.strings_loc)
+        for string in kb_strings:
+            self.vocab.strings.add(string)
+
+    def write_contents(self, file_path):
+        cdef Writer writer = Writer(file_path)
         writer.write_header(self.get_size_entities(), self.entity_vector_length)
 
         # dumping the entity vectors in their original order
@@ -366,13 +385,7 @@ cdef class KnowledgeBase:
 
         writer.close()
 
-    def from_disk(self, path):
-        path = util.ensure_path(path)
-        if path.is_dir():
-            raise ValueError(Errors.E928.format(loc=path))
-        if not path.exists():
-            raise ValueError(Errors.E929.format(loc=path))
-
+    def read_contents(self, file_path):
         cdef hash_t entity_hash
         cdef hash_t alias_hash
         cdef int64_t entry_index
@@ -382,7 +395,7 @@ cdef class KnowledgeBase:
         cdef AliasC alias
         cdef float vector_element
 
-        cdef Reader reader = Reader(path)
+        cdef Reader reader = Reader(file_path)
 
         # STEP 0: load header and initialize KB
         cdef int64_t nr_entities
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 1debadd82..fec53c77a 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -16,6 +16,7 @@ from ..training import Example, validate_examples
 from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList
 from .. import util
+from ..scorer import Scorer
 
 
 default_model_config = """
@@ -47,6 +48,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
         "incl_context": True,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
     },
+    scores=["nel_micro_p", "nel_micro_r", "nel_micro_f"],
+    default_score_weights={"nel_micro_f": 1.0},
 )
 def make_entity_linker(
     nlp: Language,
@@ -209,12 +212,11 @@ class EntityLinker(Pipe):
             # it does run the model twice :(
             predictions = self.model.predict(docs)
         for eg in examples:
-            sentences = [s for s in eg.predicted.sents]
+            sentences = [s for s in eg.reference.sents]
             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
-                kb_id = kb_ids[
-                    ent.start
-                ]  # KB ID of the first token is the same as the whole span
+            for ent in eg.reference.ents:
+                # KB ID of the first token is the same as the whole span
+                kb_id = kb_ids[ent.start]
                 if kb_id:
                     try:
                         # find the sentence in the list of sentences.
@@ -253,7 +255,7 @@ class EntityLinker(Pipe):
         entity_encodings = []
         for eg in examples:
             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.predicted.ents:
+            for ent in eg.reference.ents:
                 kb_id = kb_ids[ent.start]
                 if kb_id:
                     entity_encoding = self.kb.get_vector(kb_id)
@@ -415,6 +417,18 @@ class EntityLinker(Pipe):
                 for token in ent:
                     token.ent_kb_id_ = kb_id
 
+    def score(self, examples, **kwargs):
+        """Score a batch of examples by delegating to Scorer.score_links.
+
+        examples (Iterable[Example]): The examples to score.
+        RETURNS (Dict[str, Any]): The scores, with self.NIL as negative label.
+
+        DOCS: https://nightly.spacy.io/api/entitylinker#score
+        """
+        validate_examples(examples, "EntityLinker.score")
+        return Scorer.score_links(examples, negative_labels=[self.NIL])
+
+
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:
diff --git a/spacy/scorer.py b/spacy/scorer.py
index c50de3d43..cd3b013cd 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -451,6 +451,74 @@ class Scorer:
             results[f"{attr}_score_desc"] = "macro AUC"
         return results
 
+    @staticmethod
+    def score_links(
+        examples: Iterable[Example], *, negative_labels: Iterable[str]
+    ) -> Dict[str, Any]:
+        """Returns PRF for predicted links on the entity level.
+        To disentangle the performance of the NEL from the NER,
+        this method only evaluates NEL links for entities that overlap
+        between the gold reference and the predictions.
+
+        examples (Iterable[Example]): Examples to score
+        negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
+        RETURNS (Dict[str, Any]): A dictionary containing the scores.
+
+        DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links
+        """
+        f_per_type = {}
+        for example in examples:
+            gold_ent_by_offset = {}
+            for gold_ent in example.reference.ents:
+                gold_ent_by_offset[(gold_ent.start_char, gold_ent.end_char)] = gold_ent
+
+            for pred_ent in example.predicted.ents:
+                gold_span = gold_ent_by_offset.get(
+                    (pred_ent.start_char, pred_ent.end_char), None
+                )
+                # skip predictions without a gold entity at the same offsets (NER errors)
+                if gold_span is None:
+                    continue
+                label = gold_span.label_
+                f_per_type.setdefault(label, PRFScore())
+                gold = gold_span.kb_id_
+                if gold is not None:
+                    pred = pred_ent.kb_id_
+                    if gold in negative_labels and pred in negative_labels:
+                        # ignore true negatives
+                        pass
+                    elif gold == pred:
+                        f_per_type[label].tp += 1
+                    elif gold in negative_labels:
+                        f_per_type[label].fp += 1
+                    elif pred in negative_labels:
+                        f_per_type[label].fn += 1
+                    else:
+                        # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
+                        f_per_type[label].fp += 1
+                        f_per_type[label].fn += 1
+        micro_prf = PRFScore()
+        for label_prf in f_per_type.values():
+            micro_prf.tp += label_prf.tp
+            micro_prf.fn += label_prf.fn
+            micro_prf.fp += label_prf.fp
+        n_labels = len(f_per_type) + 1e-100
+        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
+        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
+        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels
+        results = {
+            f"nel_score": micro_prf.fscore,
+            f"nel_score_desc": "micro F",
+            f"nel_micro_p": micro_prf.precision,
+            f"nel_micro_r": micro_prf.recall,
+            f"nel_micro_f": micro_prf.fscore,
+            f"nel_macro_p": macro_p,
+            f"nel_macro_r": macro_r,
+            f"nel_macro_f": macro_f,
+            f"nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
+        }
+        return results
+
     @staticmethod
     def score_deps(
         examples: Iterable[Example],
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 88e0646b3..878f41a28 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -2,8 +2,10 @@ from typing import Callable, Iterable
 import pytest
 
 from spacy.kb import KnowledgeBase, get_candidates, Candidate
+from spacy.vocab import Vocab
 
 from spacy import util, registry
+from spacy.scorer import Scorer
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.tests.util import make_tempdir
@@ -151,22 +153,15 @@ def test_kb_serialize(nlp):
         # normal read-write behaviour
         mykb.to_disk(d / "kb")
         mykb.from_disk(d / "kb")
-        mykb.to_disk(d / "kb.file")
-        mykb.from_disk(d / "kb.file")
         mykb.to_disk(d / "new" / "kb")
         mykb.from_disk(d / "new" / "kb")
         # allow overwriting an existing file
-        mykb.to_disk(d / "kb.file")
-        with pytest.raises(ValueError):
-            # can not write to a directory
-            mykb.to_disk(d)
-        with pytest.raises(ValueError):
-            # can not read from a directory
-            mykb.from_disk(d)
+        mykb.to_disk(d / "kb")
         with pytest.raises(ValueError):
             # can not read from an unknown file
             mykb.from_disk(d / "unknown" / "kb")
 
+
 def test_candidate_generation(nlp):
     """Test correct candidate generation"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -254,6 +249,41 @@ def test_el_pipe_configuration(nlp):
     assert doc[2].ent_kb_id_ == "Q2"
 
 
+def test_vocab_serialization(nlp):
+    """Test that string information is retained across storage"""
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+
+    # adding entities
+    q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
+    q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
+    q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
+
+    # adding aliases
+    douglas_hash = mykb.add_alias(
+        alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
+    )
+    adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
+
+    candidates = mykb.get_alias_candidates("adam")
+    assert len(candidates) == 1
+    assert candidates[0].entity == q2_hash
+    assert candidates[0].entity_ == "Q2"
+    assert candidates[0].alias == adam_hash
+    assert candidates[0].alias_ == "adam"
+
+    with make_tempdir() as d:
+        mykb.to_disk(d / "kb")
+        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
+        kb_new_vocab.from_disk(d / "kb")
+
+        candidates = kb_new_vocab.get_alias_candidates("adam")
+        assert len(candidates) == 1
+        assert candidates[0].entity == q2_hash
+        assert candidates[0].entity_ == "Q2"
+        assert candidates[0].alias == adam_hash
+        assert candidates[0].alias_ == "adam"
+
+
 def test_append_alias(nlp):
     """Test that we can append additional alias-entity pairs"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -377,16 +407,20 @@ def test_preserving_links_ents_2(nlp):
 TRAIN_DATA = [
     ("Russ Cochran captured his first major title with his son as caddie.",
         {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
     ("Russ Cochran his reprints include EC Comics.",
         {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
     ("Russ Cochran has been publishing comic art.",
         {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
-         "entities": [(0, 12, "PERSON")]}),
+         "entities": [(0, 12, "PERSON")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}),
     ("Russ Cochran was a member of University of Kentucky's golf team.",
         {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
-         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")]}),
+         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
+         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
 ]
 GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on
@@ -395,16 +429,8 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
     nlp = English()
-    nlp.add_pipe("sentencizer")
     vector_length = 3
 
-    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
-    patterns = [
-        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
-    ]
-    ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)
-
     # Convert the texts to docs to make sure we have doc.ents set for the training examples
     train_examples = []
     for text, annotation in TRAIN_DATA:
@@ -446,6 +472,16 @@ def test_overfitting_IO():
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses["entity_linker"] < 0.001
 
+    # adding additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)
+
+    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
     # test the trained model
     predictions = []
     for text, annotation in TRAIN_DATA:
@@ -465,3 +501,46 @@ def test_overfitting_IO():
             for ent in doc2.ents:
                 predictions.append(ent.kb_id_)
         assert predictions == GOLD_entities
+
+
+def test_scorer_links():
+    train_examples = []
+    nlp = English()
+    ref1 = nlp("Julia lives in London happily.")
+    ref1.ents = [
+        Span(ref1, 0, 1, label="PERSON", kb_id="Q2"),
+        Span(ref1, 3, 4, label="LOC", kb_id="Q3"),
+    ]
+    pred1 = nlp("Julia lives in London happily.")
+    pred1.ents = [
+        Span(pred1, 0, 1, label="PERSON", kb_id="Q70"),
+        Span(pred1, 3, 4, label="LOC", kb_id="Q3"),
+    ]
+    train_examples.append(Example(pred1, ref1))
+
+    ref2 = nlp("She loves London.")
+    ref2.ents = [
+        Span(ref2, 0, 1, label="PERSON", kb_id="Q2"),
+        Span(ref2, 2, 3, label="LOC", kb_id="Q13"),
+    ]
+    pred2 = nlp("She loves London.")
+    pred2.ents = [
+        Span(pred2, 0, 1, label="PERSON", kb_id="Q2"),
+        Span(pred2, 2, 3, label="LOC", kb_id="NIL"),
+    ]
+    train_examples.append(Example(pred2, ref2))
+
+    ref3 = nlp("London is great.")
+    ref3.ents = [Span(ref3, 0, 1, label="LOC", kb_id="NIL")]
+    pred3 = nlp("London is great.")
+    pred3.ents = [Span(pred3, 0, 1, label="LOC", kb_id="NIL")]
+    train_examples.append(Example(pred3, ref3))
+
+    scores = Scorer().score_links(train_examples, negative_labels=["NIL"])
+    assert scores["nel_f_per_type"]["PERSON"]["p"] == 1 / 2
+    assert scores["nel_f_per_type"]["PERSON"]["r"] == 1 / 2
+    assert scores["nel_f_per_type"]["LOC"]["p"] == 1 / 1
+    assert scores["nel_f_per_type"]["LOC"]["r"] == 1 / 2
+
+    assert scores["nel_micro_p"] == 2 / 3
+    assert scores["nel_micro_r"] == 2 / 4
diff --git a/spacy/tests/test_new_example.py b/spacy/tests/training/test_new_example.py
similarity index 91%
rename from spacy/tests/test_new_example.py
rename to spacy/tests/training/test_new_example.py
index 597809286..81207b640 100644
--- a/spacy/tests/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@@ -244,3 +244,22 @@ def test_Example_from_dict_with_links_invalid(annots):
     predicted = Doc(vocab, words=annots["words"])
     with pytest.raises(ValueError):
         Example.from_dict(predicted, annots)
+
+
+def test_Example_from_dict_sentences():
+    vocab = Vocab()
+    predicted = Doc(vocab, words=["One", "sentence", ".", "one", "more"])
+    annots = {"sent_starts": [1, 0, 0, 1, 0]}
+    ex = Example.from_dict(predicted, annots)
+    assert len(list(ex.reference.sents)) == 2
+
+    # this currently throws an error - bug or feature?
+    # predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
+    # annots = {"sent_starts": [1, 0, 0, 0, 0]}
+    # ex = Example.from_dict(predicted, annots)
+    # assert len(list(ex.reference.sents)) == 1
+
+    predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
+    annots = {"sent_starts": [1, -1, 0, 0, 0]}
+    ex = Example.from_dict(predicted, annots)
+    assert len(list(ex.reference.sents)) == 1
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 9cb35b487..945a1568a 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -225,6 +225,21 @@ pipe's entity linking model and context encoder. Delegates to
 | `losses`          | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~           |
 | **RETURNS**       | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                              |
 
+## EntityLinker.score {#score tag="method" new="3"}
+
+Score a batch of examples.
+
+> #### Example
+>
+> ```python
+> scores = entity_linker.score(examples)
+> ```
+
+| Name        | Description                                                                                    |
+| ----------- | ---------------------------------------------------------------------------------------------- |
+| `examples`  | The examples to score. ~~Iterable[Example]~~                                                   |
+| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links). ~~Dict[str, float]~~  |
+
 ## EntityLinker.create_optimizer {#create_optimizer tag="method"}
 
 Create an optimizer for the pipeline component.
diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md
index 1c0895bcf..0dbc0de33 100644
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@@ -206,3 +206,26 @@ depends on the scorer settings:
 | `multi_label`    | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~                                                                         |
 | `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~                                                 |
 | **RETURNS**      | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~                                             |
+
+## Scorer.score_links {#score_links tag="staticmethod" new="3"}
+
+Returns PRF for predicted links on the entity level. To disentangle the
+performance of the NEL from the NER, this method only evaluates NEL links for
+entities that overlap between the gold reference and the predictions.
+
+> #### Example
+>
+> ```python
+> scores = Scorer.score_links(
+>     examples,
+>     negative_labels=["NIL", ""]
+> )
+> print(scores["nel_micro_f"])
+> ```
+
+| Name              | Description                                                                                                         |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------- |
+| `examples`        | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| _keyword-only_    |                                                                                                                     |
+| `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~                                       |
+| **RETURNS**       | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~                                                  |

From 59340606b7881928c924e4c11bc59192522fedb8 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 24 Sep 2020 16:54:39 +0200
Subject: [PATCH 156/516] Add option to disable Matcher errors (#6125)

* Add option to disable Matcher errors

* Add option to disable Matcher errors when a doc doesn't contain a
particular type of annotation

Minor additional change:

* Update `AttributeRuler.load_from_morph_rules` to allow direct `MORPH`
values

* Rename suppress_errors to allow_missing

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Refactor annotation checks in Matcher and PhraseMatcher

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
---
 spacy/errors.py                         |  4 ----
 spacy/matcher/matcher.pyx               | 25 ++++++++++++++-----------
 spacy/matcher/phrasematcher.pyx         | 22 ++++++++++++----------
 spacy/pipeline/attributeruler.py        | 18 +++++++++++++-----
 spacy/tests/matcher/test_matcher_api.py |  3 +++
 5 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 50d2fea5f..4216e3936 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -401,10 +401,6 @@ class Errors:
             "Matcher or PhraseMatcher with the attribute {attr}. "
             "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
             "instead of list(nlp.tokenizer.pipe()).")
-    E156 = ("The pipeline needs to include a parser in order to use "
-            "Matcher or PhraseMatcher with the attribute DEP. Try using "
-            "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
-            "list(nlp.tokenizer.pipe()).")
     E157 = ("Can't render negative values for dependency arc start or end. "
             "Make sure that you're passing in absolute token indices, not "
             "relative token offsets.\nstart: {start}, end: {end}, label: "
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index d83f58181..39c7168e4 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -195,7 +195,7 @@ cdef class Matcher:
                 else:
                     yield doc
 
-    def __call__(self, object doclike, *, as_spans=False):
+    def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
         """Find all token sequences matching the supplied pattern.
 
         doclike (Doc or Span): The document to match over.
@@ -215,16 +215,19 @@ cdef class Matcher:
         else:
             raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
         cdef Pool tmp_pool = Pool()
-        if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
-            raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
-        if POS in self._seen_attrs and not doc.has_annotation("POS"):
-            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
-        if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
-            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
-        if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
-            raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
-        if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
-            raise ValueError(Errors.E156.format())
+        if not allow_missing:
+            for attr in (TAG, POS, MORPH, LEMMA, DEP):
+                if attr in self._seen_attrs and not doc.has_annotation(attr):
+                    if attr == TAG:
+                        pipe = "tagger"
+                    elif attr in (POS, MORPH):
+                        pipe = "morphologizer"
+                    elif attr == LEMMA:
+                        pipe = "lemmatizer"
+                    elif attr == DEP:
+                        pipe = "parser"
+                    error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                    raise ValueError(error_msg)
         matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
                                 extensions=self._extensions, predicates=self._extra_predicates)
         final_matches = []
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index b00ba157f..7e99859b5 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -186,16 +186,18 @@ cdef class PhraseMatcher:
             if isinstance(doc, Doc):
                 attrs = (TAG, POS, MORPH, LEMMA, DEP)
                 has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
-                if self.attr == TAG and not has_annotation[TAG]:
-                    raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
-                if self.attr == POS and not has_annotation[POS]:
-                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
-                if self.attr == MORPH and not has_annotation[MORPH]:
-                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
-                if self.attr == LEMMA and not has_annotation[LEMMA]:
-                    raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
-                if self.attr == DEP and not has_annotation[DEP]:
-                    raise ValueError(Errors.E156.format())
+                for attr in attrs:
+                    if self.attr == attr and not has_annotation[attr]:
+                        if attr == TAG:
+                            pipe = "tagger"
+                        elif attr in (POS, MORPH):
+                            pipe = "morphologizer"
+                        elif attr == LEMMA:
+                            pipe = "lemmatizer"
+                        elif attr == DEP:
+                            pipe = "parser"
+                        error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                        raise ValueError(error_msg)
                 if self._validate and any(has_annotation.values()) \
                         and self.attr not in attrs:
                     string_attr = self.vocab.strings[self.attr]
diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index f64fcbc54..0d59a1ba0 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -79,7 +79,7 @@ class AttributeRuler(Pipe):
 
         DOCS: https://nightly.spacy.io/api/attributeruler#call
         """
-        matches = sorted(self.matcher(doc))
+        matches = sorted(self.matcher(doc, allow_missing=True))
 
         for match_id, start, end in matches:
             span = Span(doc, start, end, label=match_id)
@@ -126,8 +126,12 @@ class AttributeRuler(Pipe):
         for tag, attrs in tag_map.items():
             pattern = [{"TAG": tag}]
             attrs, morph_attrs = _split_morph_attrs(attrs)
-            morph = self.vocab.morphology.add(morph_attrs)
-            attrs["MORPH"] = self.vocab.strings[morph]
+            if "MORPH" not in attrs:
+                morph = self.vocab.morphology.add(morph_attrs)
+                attrs["MORPH"] = self.vocab.strings[morph]
+            else:
+                morph = self.vocab.morphology.add(attrs["MORPH"])
+                attrs["MORPH"] = self.vocab.strings[morph]
             self.add([pattern], attrs)
 
     def load_from_morph_rules(
@@ -146,8 +150,12 @@ class AttributeRuler(Pipe):
                 pattern = [{"ORTH": word, "TAG": tag}]
                 attrs = morph_rules[tag][word]
                 attrs, morph_attrs = _split_morph_attrs(attrs)
-                morph = self.vocab.morphology.add(morph_attrs)
-                attrs["MORPH"] = self.vocab.strings[morph]
+                if "MORPH" in attrs:
+                    morph = self.vocab.morphology.add(attrs["MORPH"])
+                    attrs["MORPH"] = self.vocab.strings[morph]
+                elif morph_attrs:
+                    morph = self.vocab.morphology.add(morph_attrs)
+                    attrs["MORPH"] = self.vocab.strings[morph]
                 self.add([pattern], attrs)
 
     def add(
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 04f9585f1..c407595e5 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -316,6 +316,9 @@ def test_attr_pipeline_checks(en_vocab):
         matcher(doc2)
     with pytest.raises(ValueError):
         matcher(doc3)
+    # errors can be suppressed if desired
+    matcher(doc2, allow_missing=True)
+    matcher(doc3, allow_missing=True)
     # TAG, POS, LEMMA require those values
     for attr in ("TAG", "POS", "LEMMA"):
         matcher = Matcher(en_vocab)

From 3c062b3911d70f0f9521653cac6d0a7b85bc272f Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 24 Sep 2020 16:55:09 +0200
Subject: [PATCH 157/516] Add MORPH handling to Matcher (#6107)

* Add MORPH handling to Matcher

* Add `MORPH` to `Matcher` schema
* Rename `_SetMemberPredicate` to `_SetPredicate`
* Add `ISSUBSET` and `ISSUPERSET` operators to `_SetPredicate`
  * Add special handling for normalization and conversion of morph
    values into sets
  * For other attrs, `ISSUBSET` acts like `IN` and `ISSUPERSET` only
    matches for 0 or 1 values

* Update test

* Rename to IS_SUBSET and IS_SUPERSET
---
 spacy/matcher/matcher.pyx                 |  52 +++++++----
 spacy/schemas.py                          |   5 ++
 spacy/tests/matcher/test_matcher_api.py   | 100 ++++++++++++++++++++++
 website/docs/api/matcher.md               |  30 ++++---
 website/docs/usage/rule-based-matching.md |  30 ++++---
 5 files changed, 174 insertions(+), 43 deletions(-)

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 39c7168e4..a4d20ec55 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -17,6 +17,7 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
+from ..tokens.morphanalysis cimport MorphAnalysis
 from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
 
 from ..schemas import validate_token_pattern
@@ -124,7 +125,7 @@ cdef class Matcher:
         key = self._normalize_key(key)
         for pattern in patterns:
             try:
-                specs = _preprocess_pattern(pattern, self.vocab.strings,
+                specs = _preprocess_pattern(pattern, self.vocab,
                     self._extensions, self._extra_predicates)
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
                 for spec in specs:
@@ -663,7 +664,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
     return id_attr.value
 
 
-def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
     """This function interprets the pattern, converting the various bits of
     syntactic sugar before we compile it into a struct with init_pattern.
 
@@ -678,6 +679,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
         extra_predicates.
     """
     tokens = []
+    string_store = vocab.strings
     for spec in token_specs:
         if not spec:
             # Signifier for 'any token'
@@ -688,7 +690,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
         ops = _get_operators(spec)
         attr_values = _get_attr_values(spec, string_store)
         extensions = _get_extensions(spec, string_store, extensions_table)
-        predicates = _get_extra_predicates(spec, extra_predicates)
+        predicates = _get_extra_predicates(spec, extra_predicates, vocab)
         for op in ops:
             tokens.append((op, list(attr_values), list(extensions), list(predicates)))
     return tokens
@@ -732,7 +734,7 @@ def _get_attr_values(spec, string_store):
 class _RegexPredicate:
     operators = ("REGEX",)
 
-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
         self.value = re.compile(value)
@@ -750,13 +752,18 @@ class _RegexPredicate:
         return bool(self.value.search(value))
 
 
-class _SetMemberPredicate:
-    operators = ("IN", "NOT_IN")
+class _SetPredicate:
+    operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")
 
-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
-        self.value = set(get_string_id(v) for v in value)
+        self.vocab = vocab
+        if self.attr == MORPH:
+            # normalize morph strings
+            self.value = set(self.vocab.morphology.add(v) for v in value)
+        else:
+            self.value = set(get_string_id(v) for v in value)
         self.predicate = predicate
         self.is_extension = is_extension
         self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
@@ -768,19 +775,32 @@ class _SetMemberPredicate:
             value = get_string_id(token._.get(self.attr))
         else:
             value = get_token_attr_for_matcher(token.c, self.attr)
+
+        if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
+            if self.attr == MORPH:
+                # break up MORPH into individual Feat=Val values
+                value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
+            else:
+                # IS_SUBSET for other attrs will be equivalent to "IN"
+                # IS_SUPERSET will only match for other attrs with 0 or 1 values
+                value = set([value])
         if self.predicate == "IN":
             return value in self.value
-        else:
+        elif self.predicate == "NOT_IN":
             return value not in self.value
+        elif self.predicate == "IS_SUBSET":
+            return value <= self.value
+        elif self.predicate == "IS_SUPERSET":
+            return value >= self.value
 
     def __repr__(self):
-        return repr(("SetMemberPredicate", self.i, self.attr, self.value, self.predicate))
+        return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
 
 
 class _ComparisonPredicate:
     operators = ("==", "!=", ">=", "<=", ">", "<")
 
-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
         self.value = value
@@ -809,11 +829,13 @@ class _ComparisonPredicate:
             return value < self.value
 
 
-def _get_extra_predicates(spec, extra_predicates):
+def _get_extra_predicates(spec, extra_predicates, vocab):
     predicate_types = {
         "REGEX": _RegexPredicate,
-        "IN": _SetMemberPredicate,
-        "NOT_IN": _SetMemberPredicate,
+        "IN": _SetPredicate,
+        "NOT_IN": _SetPredicate,
+        "IS_SUBSET": _SetPredicate,
+        "IS_SUPERSET": _SetPredicate,
         "==": _ComparisonPredicate,
         "!=": _ComparisonPredicate,
         ">=": _ComparisonPredicate,
@@ -841,7 +863,7 @@ def _get_extra_predicates(spec, extra_predicates):
             value_with_upper_keys = {k.upper(): v for k, v in value.items()}
             for type_, cls in predicate_types.items():
                 if type_ in value_with_upper_keys:
-                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_)
+                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
                     # Don't create a redundant predicates.
                     # This helps with efficiency, as we're caching the results.
                     if predicate.key in seen_predicates:
diff --git a/spacy/schemas.py b/spacy/schemas.py
index eea6639d3..0c85dfe57 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -61,6 +61,8 @@ class TokenPatternString(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
     IN: Optional[List[StrictStr]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
+    IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
+    IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
 
     class Config:
         extra = "forbid"
@@ -77,6 +79,8 @@ class TokenPatternNumber(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
     IN: Optional[List[StrictInt]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
+    IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
+    IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
     EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
     NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
     GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
@@ -115,6 +119,7 @@ class TokenPattern(BaseModel):
     lower: Optional[StringValue] = None
     pos: Optional[StringValue] = None
     tag: Optional[StringValue] = None
+    morph: Optional[StringValue] = None
     dep: Optional[StringValue] = None
     lemma: Optional[StringValue] = None
     shape: Optional[StringValue] = None
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index c407595e5..627110cdd 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -230,6 +230,106 @@ def test_matcher_set_value_operator(en_vocab):
     assert len(matches) == 1
 
 
+def test_matcher_subset_value_operator(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val"
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    assert len(matcher(doc)) == 2
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    assert len(matcher(doc)) == 2
+
+    # IS_SUBSET acts like "IN" for attrs other than MORPH
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUBSET": ["A", "B"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUBSET with an empty list matches nothing
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUBSET": []}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 0
+
+
+def test_matcher_superset_value_operator(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{"MORPH": {"IS_SUPERSET": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    assert len(matcher(doc)) == 0
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    assert len(matcher(doc)) == 1
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUPERSET with more than one value only matches for MORPH
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": ["A", "B"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 0
+
+    # IS_SUPERSET with one value is the same as ==
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": ["A"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUPERSET with an empty value matches everything
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": []}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 3
+
+
+def test_matcher_morph_handling(en_vocab):
+    # order of features in pattern doesn't matter
+    matcher = Matcher(en_vocab)
+    pattern1 = [{"MORPH": {"IN": ["Feat1=Val1|Feat2=Val2"]}}]
+    pattern2 = [{"MORPH": {"IN": ["Feat2=Val2|Feat1=Val1"]}}]
+    matcher.add("M", [pattern1])
+    matcher.add("N", [pattern2])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+
+    doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
+    assert len(matcher(doc)) == 2
+    doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
+    assert len(matcher(doc)) == 2
+
+    # multiple values are split
+    matcher = Matcher(en_vocab)
+    pattern1 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat2=Val2"]}}]
+    pattern2 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]}}]
+    matcher.add("M", [pattern1])
+    matcher.add("N", [pattern2])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+
+    doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
+    assert len(matcher(doc)) == 1
+    doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
+    assert len(matcher(doc)) == 2
+
+
 def test_matcher_regex(en_vocab):
     matcher = Matcher(en_vocab)
     pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 1f1946be5..3f7076a1c 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -30,20 +30,20 @@ pattern keys correspond to a number of
 [`Token` attributes](/api/token#attributes). The supported attributes for
 rule-based matching are:
 
-| Attribute                              |  Description                                                                                                              |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH`                                 | The exact verbatim text of a token. ~~str~~                                                                               |
-| `TEXT` <Tag variant="new">2.1</Tag>    | The exact verbatim text of a token. ~~str~~                                                                               |
-| `LOWER`                                | The lowercase form of the token text. ~~str~~                                                                             |
-|  `LENGTH`                              | The length of the token text. ~~int~~                                                                                     |
-|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
-|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
-|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
-|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | Token text resembles a number, URL, email. ~~bool~~                                                                       |
-|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~                               |
-| `ENT_TYPE`                             | The token's entity label. ~~str~~                                                                                         |
-| `_` <Tag variant="new">2.1</Tag>       | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
-| `OP`                                   | Operator or quantifier to determine how often to match a token pattern. ~~str~~                                           |
+| Attribute                                       |  Description                                                                                                              |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `ORTH`                                          | The exact verbatim text of a token. ~~str~~                                                                               |
+| `TEXT` <Tag variant="new">2.1</Tag>             | The exact verbatim text of a token. ~~str~~                                                                               |
+| `LOWER`                                         | The lowercase form of the token text. ~~str~~                                                                             |
+|  `LENGTH`                                       | The length of the token text. ~~int~~                                                                                     |
+|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`             | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
+|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`             | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
+|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`              | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
+|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`           | Token text resembles a number, URL, email. ~~bool~~                                                                       |
+|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~       |
+| `ENT_TYPE`                                      | The token's entity label. ~~str~~                                                                                         |
+| `_` <Tag variant="new">2.1</Tag>                | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
+| `OP`                                            | Operator or quantifier to determine how often to match a token pattern. ~~str~~                                           |
 
 Operators and quantifiers define **how often** a token pattern should be
 matched:
@@ -79,6 +79,8 @@ it compares to another value.
 | -------------------------- | ------------------------------------------------------------------------------------------------------- |
 | `IN`                       | Attribute value is member of a list. ~~Any~~                                                            |
 | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      |
+| `IS_SUBSET`                | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          |
+| `IS_SUPERSET`              | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
 
 ## Matcher.\_\_init\_\_ {#init tag="method"}
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 7e979b32e..256f4ccb4 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -158,20 +158,20 @@ The available token pattern keys correspond to a number of
 [`Token` attributes](/api/token#attributes). The supported attributes for
 rule-based matching are:
 
-| Attribute                              |  Description                                                                                                              |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH`                                 | The exact verbatim text of a token. ~~str~~                                                                               |
-| `TEXT` <Tag variant="new">2.1</Tag>    | The exact verbatim text of a token. ~~str~~                                                                               |
-| `LOWER`                                | The lowercase form of the token text. ~~str~~                                                                             |
-|  `LENGTH`                              | The length of the token text. ~~int~~                                                                                     |
-|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
-|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
-|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
-|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | Token text resembles a number, URL, email. ~~bool~~                                                                       |
-|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~                               |
-| `ENT_TYPE`                             | The token's entity label. ~~str~~                                                                                         |
-| `_` <Tag variant="new">2.1</Tag>       | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
-| `OP`                                   | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~                           |
+| Attribute                                       |  Description                                                                                                              |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `ORTH`                                          | The exact verbatim text of a token. ~~str~~                                                                               |
+| `TEXT` <Tag variant="new">2.1</Tag>             | The exact verbatim text of a token. ~~str~~                                                                               |
+| `LOWER`                                         | The lowercase form of the token text. ~~str~~                                                                             |
+|  `LENGTH`                                       | The length of the token text. ~~int~~                                                                                     |
+|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`             | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
+|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`             | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
+|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`              | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
+|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`           | Token text resembles a number, URL, email. ~~bool~~                                                                       |
+|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~       |
+| `ENT_TYPE`                                      | The token's entity label. ~~str~~                                                                                         |
+| `_` <Tag variant="new">2.1</Tag>                | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
+| `OP`                                            | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~                           |
 
 <Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
 
@@ -236,6 +236,8 @@ following rich comparison attributes are available:
 | -------------------------- | ------------------------------------------------------------------------------------------------------- |
 | `IN`                       | Attribute value is member of a list. ~~Any~~                                                            |
 | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      |
+| `IS_SUBSET`                | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          |
+| `IS_SUPERSET`              | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
 
 #### Regular expressions {#regex new="2.1"}

From 20b89a97176a5fc2d2c2c01e4f725f3a1d1e928b Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 16:57:02 +0200
Subject: [PATCH 158/516] Increment version [ci skip]

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 56b05257a..ea9f9f33e 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a24"
+__version__ = "3.0.0a25"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 0b52b6904c78cc9e12db962d89db1ab2db38d545 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 17:10:35 +0200
Subject: [PATCH 159/516] Update entity_linker.py

---
 spacy/pipeline/entity_linker.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index fec53c77a..039e2a891 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -48,8 +48,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
         "incl_context": True,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
     },
-    scores=["nel_micro_p", "nel_micro_r", "nel_micro_f"],
-    default_score_weights={"nel_micro_f": 1.0},
+    default_score_weights={
+        "nel_micro_f": 1.0,
+        "nel_micro_r": None,
+        "nel_micro_p": None,
+    },
 )
 def make_entity_linker(
     nlp: Language,
@@ -428,7 +431,6 @@ class EntityLinker(Pipe):
         validate_examples(examples, "EntityLinker.score")
         return Scorer.score_links(examples, negative_labels=[self.NIL])
 
-
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:

From 26e28ed4134734dbc86fedb97339eec47282025a Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 17:11:13 +0200
Subject: [PATCH 160/516] Fix combined scores if multiple components report it

---
 spacy/util.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/util.py b/spacy/util.py
index ad3298651..378ec2823 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1237,7 +1237,9 @@ def combine_score_weights(
                 weight = 0.0
             else:
                 weight = round(value / total / len(all_weights), 2)
-            result[key] = result.get(key, 0.0) + weight
+            prev_weight = result.get(key, 0.0)
+            prev_weight = 0.0 if prev_weight is None else prev_weight
+            result[key] = prev_weight + weight
     return result
 
 

From 2abb4ba9db0d0ec074a7336be8a7395da78eaaa4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 24 Sep 2020 18:13:39 +0200
Subject: [PATCH 161/516] Make a pre-check to speed up alignment cache (#6139)

* Dirty trick to fast-track alignment cache

* Improve alignment cache check

* Fix header

* Fix align cache

* Fix align logic
---
 spacy/training/example.pxd |  3 +++
 spacy/training/example.pyx | 36 +++++++++++++++++++++++++++++-------
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/spacy/training/example.pxd b/spacy/training/example.pxd
index e06e36287..49e239757 100644
--- a/spacy/training/example.pxd
+++ b/spacy/training/example.pxd
@@ -1,4 +1,5 @@
 from ..tokens.doc cimport Doc
+from libc.stdint cimport uint64_t
 
 
 cdef class Example:
@@ -7,3 +8,5 @@ cdef class Example:
     cdef readonly object _cached_alignment
     cdef readonly object _cached_words_x
     cdef readonly object _cached_words_y
+    cdef readonly uint64_t _x_sig
+    cdef readonly uint64_t _y_sig
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 1e7bea5df..6a9815c44 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,6 +1,7 @@
 from collections import Iterable as IterableInstance
 import warnings
 import numpy
+from murmurhash.mrmr cimport hash64
 
 from ..tokens.doc cimport Doc
 from ..tokens.span cimport Span
@@ -97,15 +98,36 @@ cdef class Example:
 
     @property
     def alignment(self):
-        words_x = [token.text for token in self.x]
-        words_y = [token.text for token in self.y]
-        if self._cached_alignment is None or \
-                words_x != self._cached_words_x or \
-                words_y != self._cached_words_y:
-            self._cached_alignment = Alignment.from_strings(words_x, words_y)
+        x_sig = hash64(self.x.c, sizeof(self.x.c[0]) * self.x.length, 0)
+        y_sig = hash64(self.y.c, sizeof(self.y.c[0]) * self.y.length, 0)
+        if self._cached_alignment is None:
+            words_x = [token.text for token in self.x]
+            words_y = [token.text for token in self.y]
+            self._x_sig = x_sig
+            self._y_sig = y_sig
             self._cached_words_x = words_x
             self._cached_words_y = words_y
-        return self._cached_alignment
+            self._cached_alignment = Alignment.from_strings(words_x, words_y)
+            return self._cached_alignment
+        elif self._x_sig == x_sig and self._y_sig == y_sig:
+            # If we have a cached alignment, check whether the cache is invalid
+            # due to retokenization. To make this check fast in loops, we first
+            # check a hash of the TokenC arrays.
+            return self._cached_alignment
+        else:
+            words_x = [token.text for token in self.x]
+            words_y = [token.text for token in self.y]
+            if words_x == self._cached_words_x and words_y == self._cached_words_y:
+                self._x_sig = x_sig
+                self._y_sig = y_sig
+                return self._cached_alignment
+            else:
+                self._cached_alignment = Alignment.from_strings(words_x, words_y)
+                self._cached_words_x = words_x
+                self._cached_words_y = words_y
+                self._x_sig = x_sig
+                self._y_sig = y_sig
+                return self._cached_alignment
 
     def get_aligned(self, field, as_string=False):
         """Return an aligned array for a token attribute."""

From 16475528f735114370d2db48b576106b1a6451e5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 24 Sep 2020 20:38:57 +0200
Subject: [PATCH 162/516] Fix skipped documents in entity scorer (#6137)

* Fix skipped documents in entity scorer

* Add back the skipping of unannotated entities

* Update spacy/scorer.py

* Use more specific NER scorer

* Fix import

* Fix get_ner_prf

* Add scorer

* Fix scorer

Co-authored-by: Ines Montani <ines@ines.io>
---
 spacy/pipeline/ner.pyx | 15 ++++++++--
 spacy/scorer.py        | 64 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 67 insertions(+), 12 deletions(-)

diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index c9b0a5031..fc0dda40d 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -6,7 +6,7 @@ from .transition_parser cimport Parser
 from ._parser_internals.ner cimport BiluoPushDown
 
 from ..language import Language
-from ..scorer import Scorer
+from ..scorer import get_ner_prf, PRFScore
 from ..training import validate_examples
 
 
@@ -117,9 +117,18 @@ cdef class EntityRecognizer(Parser):
         """Score a batch of examples.
 
         examples (Iterable[Example]): The examples to score.
-        RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
+        RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
 
         DOCS: https://nightly.spacy.io/api/entityrecognizer#score
         """
         validate_examples(examples, "EntityRecognizer.score")
-        return Scorer.score_spans(examples, "ents", **kwargs)
+        score_per_type = get_ner_prf(examples)
+        totals = PRFScore()
+        for prf in score_per_type.values():
+            totals += prf
+        return {
+            "ents_p": totals.precision,
+            "ents_r": totals.recall,
+            "ents_f": totals.fscore,
+            "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
+        }
diff --git a/spacy/scorer.py b/spacy/scorer.py
index cd3b013cd..c1795847d 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -1,5 +1,6 @@
 from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
 import numpy as np
+from collections import defaultdict
 
 from .training import Example
 from .tokens import Token, Doc, Span
@@ -23,6 +24,19 @@ class PRFScore:
         self.fp = 0
         self.fn = 0
 
+    def __iadd__(self, other):
+        self.tp += other.tp
+        self.fp += other.fp
+        self.fn += other.fn
+        return self
+
+    def __add__(self, other):
+        return PRFScore(
+            tp=self.tp+other.tp,
+            fp=self.fp+other.fp,
+            fn=self.fn+other.fn
+        )
+
     def score_set(self, cand: set, gold: set) -> None:
         self.tp += len(cand.intersection(gold))
         self.fp += len(cand - gold)
@@ -295,20 +309,19 @@ class Scorer:
             # Find all predidate labels, for all and per type
             gold_spans = set()
             pred_spans = set()
-            # Special case for ents:
-            # If we have missing values in the gold, we can't easily tell
-            # whether our NER predictions are true.
-            # It seems bad but it's what we've always done.
-            if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
-                continue
             for span in getter(gold_doc, attr):
                 gold_span = (span.label_, span.start, span.end - 1)
                 gold_spans.add(gold_span)
                 gold_per_type[span.label_].add((span.label_, span.start, span.end - 1))
             pred_per_type = {label: set() for label in labels}
-            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
-                pred_spans.add((span.label_, span.start, span.end - 1))
-                pred_per_type[span.label_].add((span.label_, span.start, span.end - 1))
+            align_x2y = example.alignment.x2y
+            for pred_span in getter(pred_doc, attr):
+                indices = align_x2y[pred_span.start : pred_span.end].dataXd.ravel()
+                if len(indices):
+                    g_span = gold_doc[indices[0] : indices[-1]]
+                    span = (pred_span.label_, indices[0], indices[-1])
+                    pred_spans.add(span)
+                    pred_per_type[pred_span.label_].add(span)
             # Scores per label
             for k, v in score_per_type.items():
                 if k in pred_per_type:
@@ -613,6 +626,39 @@ class Scorer:
         }
 
 
+def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
+    """Compute per-entity PRFScore objects for a sequence of examples. The
+    results are returned as a dictionary keyed by the entity type. You can
+    add the PRFScore objects to get micro-averaged total.
+    """
+    scores = defaultdict(PRFScore)
+    for eg in examples:
+        if not eg.y.has_annotation("ENT_IOB"):
+            continue
+        golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
+        align_x2y = eg.alignment.x2y
+        preds = set()
+        for pred_ent in eg.x.ents:
+            if pred_ent.label_ not in scores:
+                scores[pred_ent.label_] = PRFScore()
+            indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
+            if len(indices):
+                g_span = eg.y[indices[0] : indices[-1] + 1]
+                # Check we aren't missing annotation on this span. If so,
+                # our prediction is neither right nor wrong, we just
+                # ignore it.
+                if all(token.ent_iob != 0 for token in g_span):
+                    key = (pred_ent.label_, indices[0], indices[-1] + 1)
+                    if key in golds:
+                        scores[pred_ent.label_].tp += 1
+                        golds.remove(key)
+                    else:
+                        scores[pred_ent.label_].fp += 1
+        for label, start, end in golds:
+            scores[label].fn += 1
+    return scores
+
+
 #############################################################################
 #
 # The following implementation of roc_auc_score() is adapted from

From 2aa4d65734dec26d09d3326bf0498a2dafd54817 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 24 Sep 2020 20:41:09 +0200
Subject: [PATCH 163/516] Update docs [ci skip]

---
 website/docs/api/entityrecognizer.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 8af73f44b..6d710f425 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -242,10 +242,10 @@ Score a batch of examples.
 > scores = ner.score(examples)
 > ```
 
-| Name        | Description                                                                                                            |
-| ----------- | ---------------------------------------------------------------------------------------------------------------------- |
-| `examples`  | The examples to score. ~~Iterable[Example]~~                                                                           |
-| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name        | Description                                               |
+| ----------- | --------------------------------------------------------- |
+| `examples`  | The examples to score. ~~Iterable[Example]~~              |
+| **RETURNS** | The scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
 
 ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
 

From 93d7ff309fba4faa805ca105b56a04daefa77f5c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 24 Sep 2020 21:05:27 +0200
Subject: [PATCH 164/516] Remove print

---
 spacy/training/example.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 6a9815c44..f2c78203a 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -310,7 +310,6 @@ def _annot2array(vocab, tok_annot, doc_annot):
 
 
 def _add_entities_to_doc(doc, ner_data):
-    print(ner_data)
     if ner_data is None:
         return
     elif ner_data == []:

From 50f20cf7224edefbfa789755a1415841e6cd647b Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 25 Sep 2020 08:21:30 +0200
Subject: [PATCH 165/516] Revert changes to Scorer.score_spans

---
 spacy/scorer.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/spacy/scorer.py b/spacy/scorer.py
index c1795847d..b2f97e163 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -314,14 +314,9 @@ class Scorer:
                 gold_spans.add(gold_span)
                 gold_per_type[span.label_].add((span.label_, span.start, span.end - 1))
             pred_per_type = {label: set() for label in labels}
-            align_x2y = example.alignment.x2y
-            for pred_span in getter(pred_doc, attr):
-                indices = align_x2y[pred_span.start : pred_span.end].dataXd.ravel()
-                if len(indices):
-                    g_span = gold_doc[indices[0] : indices[-1]]
-                    span = (pred_span.label_, indices[0], indices[-1])
-                    pred_spans.add(span)
-                    pred_per_type[pred_span.label_].add(span)
+            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
+                pred_spans.add((span.label_, span.start, span.end - 1))
+                pred_per_type[span.label_].add((span.label_, span.start, span.end - 1))
             # Scores per label
             for k, v in score_per_type.items():
                 if k in pred_per_type:

From 27c5795ea5b036fda98292a6486353ba4dc47ed3 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 25 Sep 2020 09:23:29 +0200
Subject: [PATCH 166/516] Fix version check in models directory [ci skip]

---
 website/src/templates/models.js | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 3c5e9d2a4..a1a6f3b5a 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -69,7 +69,12 @@ function isStableVersion(v) {
 function getLatestVersion(modelId, compatibility) {
     for (let [version, models] of Object.entries(compatibility)) {
         if (isStableVersion(version) && models[modelId]) {
-            return models[modelId][0]
+            const modelVersions = models[modelId]
+            for (let modelVersion of modelVersions) {
+                if (isStableVersion(modelVersion)) {
+                    return modelVersion
+                }
+            }
         }
     }
 }

From c7956a40474892b8459e5241de965e46ca388980 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 25 Sep 2020 09:25:46 +0200
Subject: [PATCH 167/516] Update models.js [ci skip]

---
 website/src/templates/models.js | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index cdfe2e46d..f67188c0b 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -78,10 +78,15 @@ function isStableVersion(v) {
     return !v.includes('a') && !v.includes('b') && !v.includes('dev') && !v.includes('rc')
 }
 
-function getLatestVersion(modelId, compatibility) {
+function getLatestVersion(modelId, compatibility, prereleases) {
     for (let [version, models] of Object.entries(compatibility)) {
         if (isStableVersion(version) && models[modelId]) {
-            return models[modelId][0]
+            const modelVersions = models[modelId]
+            for (let modelVersion of modelVersions) {
+                if (isStableVersion(modelVersion) || prereleases) {
+                    return modelVersion
+                }
+            }
         }
     }
 }
@@ -147,12 +152,26 @@ const Help = ({ children }) => (
     </span>
 )
 
-const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExamples, licenses }) => {
+const Model = ({
+    name,
+    langId,
+    langName,
+    baseUrl,
+    repo,
+    compatibility,
+    hasExamples,
+    licenses,
+    prereleases,
+}) => {
     const [initialized, setInitialized] = useState(false)
     const [isError, setIsError] = useState(true)
     const [meta, setMeta] = useState({})
     const { type, genre, size } = getModelComponents(name)
-    const version = useMemo(() => getLatestVersion(name, compatibility), [name, compatibility])
+    const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [
+        name,
+        compatibility,
+        prereleases,
+    ])
 
     useEffect(() => {
         window.dispatchEvent(new Event('resize')) // scroll position for progress
@@ -332,7 +351,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
 const Models = ({ pageContext, repo, children }) => {
     const [initialized, setInitialized] = useState(false)
     const [compatibility, setCompatibility] = useState({})
-    const { id, title, meta, hasExamples } = pageContext
+    const { id, title, meta } = pageContext
     const { models, isStarters } = meta
     const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
 
@@ -381,6 +400,7 @@ const Models = ({ pageContext, repo, children }) => {
                             repo={repo}
                             licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
                             hasExamples={meta.hasExamples}
+                            prereleases={site.siteMetadata.nightly}
                         />
                     ))
                 }
@@ -397,6 +417,7 @@ const query = graphql`
     query ModelsQuery {
         site {
             siteMetadata {
+                nightly
                 licenses {
                     id
                     url

From 2cfe9340a1727acf9fcfd23a6ac0c0f2c0215010 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 25 Sep 2020 13:21:20 +0200
Subject: [PATCH 168/516] Link model components [ci skip]

---
 website/src/templates/models.js | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index f67188c0b..8a73a6282 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -11,12 +11,23 @@ import { Table, Tr, Td, Th } from '../components/table'
 import Tag from '../components/tag'
 import { H2, Label } from '../components/typography'
 import Icon from '../components/icon'
-import Link from '../components/link'
+import Link, { OptionalLink } from '../components/link'
 import Infobox from '../components/infobox'
 import Accordion from '../components/accordion'
 import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
 import { isString, isEmptyObj } from '../components/util'
 
+const COMPONENT_LINKS = {
+    tok2vec: '/api/tok2vec',
+    transformer: '/api/transformer',
+    tagger: '/api/tagger',
+    parser: '/api/dependencyparser',
+    ner: '/api/entityrecognizer',
+    lemmatizer: '/api/lemmatizer',
+    attribute_ruler: '/api/attributeruler',
+    senter: '/api/sentencerecognizer',
+}
+
 const MODEL_META = {
     core: 'Vocabulary, syntax, entities, vectors',
     core_sm: 'Vocabulary, syntax, entities',
@@ -146,6 +157,18 @@ function formatSources(data = []) {
     ))
 }
 
+function linkComponents(components = []) {
+    return join(
+        components.map(c => (
+            <Fragment key={c}>
+                <OptionalLink to={COMPONENT_LINKS[c]} hideIcon>
+                    <InlineCode>{c}</InlineCode>
+                </OptionalLink>
+            </Fragment>
+        ))
+    )
+}
+
 const Help = ({ children }) => (
     <span data-tooltip={children}>
         <Icon name="help2" width={16} variant="subtle" inline />
@@ -192,10 +215,8 @@ const Model = ({
 
     const releaseTag = meta.fullName ? `/tag/${meta.fullName}` : ''
     const releaseUrl = `https://github.com/${repo}/releases/${releaseTag}`
-    const pipeline =
-        meta.pipeline && join(meta.pipeline.map(p => <InlineCode key={p}>{p}</InlineCode>))
-    const components =
-        meta.components && join(meta.components.map(p => <InlineCode key={p}>{p}</InlineCode>))
+    const pipeline = linkComponents(meta.pipeline)
+    const components = linkComponents(meta.components)
     const sources = formatSources(meta.sources)
     const author = !meta.url ? meta.author : <Link to={meta.url}>{meta.author}</Link>
     const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null

From 02a1b6ab839f4a07c3cb1fb727c847f58a1c44f9 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 25 Sep 2020 13:21:43 +0200
Subject: [PATCH 169/516] Update links [ci skip]

---
 website/src/templates/models.js | 1 +
 1 file changed, 1 insertion(+)

diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 8a73a6282..f9895334d 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -26,6 +26,7 @@ const COMPONENT_LINKS = {
     lemmatizer: '/api/lemmatizer',
     attribute_ruler: '/api/attributeruler',
     senter: '/api/sentencerecognizer',
+    morphologizer: '/api/morphologizer',
 }
 
 const MODEL_META = {

From 009ba14aafff1769bff408b2069e69245c441d2b Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Fri, 25 Sep 2020 15:47:10 +0200
Subject: [PATCH 170/516] Fix pretraining in train script (#6143)

* update pretraining API in train CLI

* bump thinc to 8.0.0a35

* bump spaCy version to 3.0.0a26

* doc fixes

* small doc fix
---
 pyproject.toml                                |  2 +-
 requirements.txt                              |  2 +-
 setup.cfg                                     |  4 ++--
 spacy/about.py                                |  2 +-
 spacy/cli/train.py                            | 21 +++++++++----------
 spacy/errors.py                               |  2 +-
 spacy/training/corpus.py                      |  4 ++--
 website/docs/api/cli.md                       |  8 +++----
 website/docs/usage/embeddings-transformers.md |  2 +-
 website/docs/usage/training.md                |  2 +-
 10 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 5290660aa..14d2c1e8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a34,<8.0.0a40",
+    "thinc>=8.0.0a35,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index a8b237aa1..b3a95dcff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a34,<8.0.0a40
+thinc>=8.0.0a35,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 9831402d1..b080d4330 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0
diff --git a/spacy/about.py b/spacy/about.py
index ea9f9f33e..fbe772d25 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a25"
+__version__ = "3.0.0a26"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 6d61c2425..cbb0655ef 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -121,20 +121,19 @@ def train(
 
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     if weights_data is not None:
-        tok2vec_path = config["pretraining"].get("tok2vec_model", None)
-        if tok2vec_path is None:
+        tok2vec_component = config["pretraining"]["component"]
+        if tok2vec_component is None:
             msg.fail(
-                f"To pretrained tok2vec weights, the config needs to specify which "
-                f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
+                f"To use pretrained tok2vec weights, [pretraining.component] "
+                f"needs to specify the component that should load them.",
                 exits=1,
             )
-        tok2vec = config
-        for subpath in tok2vec_path.split("."):
-            tok2vec = tok2vec.get(subpath)
-        if not tok2vec:
-            err = f"Could not locate the tok2vec model at {tok2vec_path}"
-            msg.fail(err, exits=1)
-        tok2vec.from_bytes(weights_data)
+        layer = nlp.get_pipe(tok2vec_component).model
+        tok2vec_layer = config["pretraining"]["layer"]
+        if tok2vec_layer:
+            layer = layer.get_ref(tok2vec_layer)
+        layer.from_bytes(weights_data)
+        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
 
     # Create iterator, which yields out info after each optimization step.
     msg.info("Start training")
diff --git a/spacy/errors.py b/spacy/errors.py
index 4216e3936..640419182 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -85,7 +85,7 @@ class Warnings:
             "attribute or operator.")
 
     # TODO: fix numbering after merging develop into master
-    W090 = ("Could not locate any binary .spacy files in path '{path}'.")
+    W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
     W093 = ("Could not find any data to train the {name} on. Is your "
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 11f098993..848692f47 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -49,7 +49,7 @@ def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
         elif path.parts[-1].endswith(file_type):
             locs.append(path)
     if len(locs) == 0:
-        warnings.warn(Warnings.W090.format(path=orig_path))
+        warnings.warn(Warnings.W090.format(path=orig_path, format=file_type))
     return locs
 
 
@@ -200,7 +200,7 @@ class JsonlTexts:
 
         DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call
         """
-        for loc in walk_corpus(self.path, "jsonl"):
+        for loc in walk_corpus(self.path, ".jsonl"):
             records = srsly.read_jsonl(loc)
             for record in records:
                 doc = nlp.make_doc(record["text"])
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 53cd954be..a6cb41e5e 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -754,7 +754,7 @@ in the section `[paths]`.
 </Infobox>
 
 ```cli
-$ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides]
+$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
 ```
 
 | Name              | Description                                                                                                                                                                                |
@@ -778,8 +778,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
 then include a **path to one of these pretrained weights files** in your
 [training config](/usage/training#config) as the `init_tok2vec` setting when you
 train your pipeline. This technique may be especially helpful if you have little
-labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
-for more info.
+labelled data. See the usage docs on
+[pretraining](/usage/embeddings-transformers#pretraining) for more info.
 
 <Infobox title="Changed in v3.0" variant="warning">
 
@@ -794,7 +794,7 @@ auto-generated by setting `--pretraining` on
 </Infobox>
 
 ```cli
-$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [overrides]
+$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
 ```
 
 | Name                    | Description                                                                                                                                                                           |
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index b00760e62..97249bfb2 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -752,7 +752,7 @@ network to model something about word cooccurrence statistics. Predicting
 leading and trailing characters does that more than adequately, as the exact
 word sequence could be recovered with high accuracy if the initial and trailing
 characters are predicted accurately. With the vectors objective, the pretraining
-is use the embedding space learned by an algorithm such as
+uses the embedding space learned by an algorithm such as
 [GloVe](https://nlp.stanford.edu/projects/glove/) or
 [Word2vec](https://code.google.com/archive/p/word2vec/), allowing the model to
 focus on the contextual modelling we actual care about.
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 65afd0eb4..54be6b367 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -175,7 +175,7 @@ sections of a config file are:
 | `paths`       | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI.          |
 | `system`      | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `training`    | Settings and controls for the training and evaluation process.                                                                                                  |
-| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining).                                                                              |
+| `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining).                                                |
 
 <Infobox title="Config format and settings" emoji="📖">
 

From c3b5a3cfff2f2e168073d3935afb3fe005f11627 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 25 Sep 2020 15:56:48 +0200
Subject: [PATCH 171/516] Clean up MorphAnalysisC struct (#6146)

---
 spacy/structs.pxd | 43 -------------------------------------------
 1 file changed, 43 deletions(-)

diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index a01244d7e..4a51bc9e0 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -60,49 +60,6 @@ cdef struct MorphAnalysisC:
     hash_t key
     int length
 
-    attr_t abbr
-    attr_t adp_type
-    attr_t adv_type
-    attr_t animacy
-    attr_t aspect
-    attr_t case
-    attr_t conj_type
-    attr_t connegative
-    attr_t definite
-    attr_t degree
-    attr_t derivation
-    attr_t echo
-    attr_t foreign
-    attr_t gender
-    attr_t hyph
-    attr_t inf_form
-    attr_t mood
-    attr_t negative
-    attr_t number
-    attr_t name_type
-    attr_t noun_type
-    attr_t num_form
-    attr_t num_type
-    attr_t num_value
-    attr_t part_form
-    attr_t part_type
-    attr_t person
-    attr_t polite
-    attr_t polarity
-    attr_t poss
-    attr_t prefix
-    attr_t prep_case
-    attr_t pron_type
-    attr_t punct_side
-    attr_t punct_type
-    attr_t reflex
-    attr_t style
-    attr_t style_variant
-    attr_t tense
-    attr_t typo
-    attr_t verb_form
-    attr_t voice
-    attr_t verb_type
     attr_t* fields
     attr_t* features
 

From 3d8388969e2eede035b2b52db999a99e0fd675f8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 25 Sep 2020 19:07:26 +0200
Subject: [PATCH 172/516] Sort paths for cache consistency

---
 spacy/cli/convert.py     | 2 ++
 spacy/training/corpus.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 8f8234c61..3fc530822 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -209,6 +209,8 @@ def walk_directory(path: Path, converter: str) -> List[Path]:
             continue
         else:
             locs.append(path)
+    # It's good to sort these, in case the ordering messes up cache.
+    locs.sort()
     return locs
 
 
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 848692f47..12bda486e 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -50,6 +50,8 @@ def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
             locs.append(path)
     if len(locs) == 0:
         warnings.warn(Warnings.W090.format(path=orig_path, format=file_type))
+    # It's good to sort these, in case the ordering messes up a cache.
+    locs.sort()
     return locs
 
 

From 26afd3bd90ca175a20b1e8f52abec898655c7fd3 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 25 Sep 2020 21:47:22 +0200
Subject: [PATCH 173/516] Fix iteration order

---
 spacy/training/gold_io.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx
index 524da0a16..8b9f5ab2b 100644
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@@ -62,7 +62,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
     """Read Example dictionaries from a json file or directory."""
     loc = util.ensure_path(loc)
     if loc.is_dir():
-        for filename in loc.iterdir():
+        for filename in sorted(loc.iterdir()):
             yield from read_json_file(loc / filename, limit=limit)
     else:
         with loc.open("rb") as file_:

From 092ce4648e959453cbc25843f7d9afcb234b540e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 25 Sep 2020 22:20:44 +0200
Subject: [PATCH 174/516] Make DocBin output stable data (set iteration)

---
 spacy/tokens/_serialize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index c9a20f6c0..2d4e9af9d 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -171,7 +171,7 @@ class DocBin:
             "tokens": tokens.tobytes("C"),
             "spaces": spaces.tobytes("C"),
             "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
-            "strings": list(self.strings),
+            "strings": list(sorted(self.strings)),
             "cats": self.cats,
             "flags": self.flags,
         }

From 98327f66a9e66366ca3ee99083a5cfd9acfe8d7a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 25 Sep 2020 23:20:50 +0200
Subject: [PATCH 175/516] Fix attributeruler key

---
 spacy/pipeline/attributeruler.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index 0d59a1ba0..52f8b7ece 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -173,7 +173,9 @@ class AttributeRuler(Pipe):
 
         DOCS: https://nightly.spacy.io/api/attributeruler#add
         """
-        self.matcher.add(len(self.attrs), patterns)
+        # This needs to be a string, because otherwise it's interpreted as a
+        # string key.
+        self.matcher.add(f"attr_rules_{len(self.attrs)}", patterns)
         self._attrs_unnormed.append(attrs)
         attrs = normalize_token_attrs(self.vocab, attrs)
         self.attrs.append(attrs)

From 821f37254cf1caca8f943574b4cbaaaea4cfb251 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 26 Sep 2020 00:19:53 +0200
Subject: [PATCH 176/516] Fix attributeruler

---
 spacy/pipeline/attributeruler.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index 52f8b7ece..e1ad91340 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -80,11 +80,14 @@ class AttributeRuler(Pipe):
         DOCS: https://nightly.spacy.io/api/attributeruler#call
         """
         matches = sorted(self.matcher(doc, allow_missing=True))
+        print("Attrs", self.attrs)
+        print("Matches", matches)
 
         for match_id, start, end in matches:
             span = Span(doc, start, end, label=match_id)
-            attrs = self.attrs[span.label]
-            index = self.indices[span.label]
+            attr_id = _parse_key(span.label_)
+            attrs = self.attrs[attr_id]
+            index = self.indices[attr_id]
             try:
                 token = span[index]
             except IndexError:
@@ -173,9 +176,10 @@ class AttributeRuler(Pipe):
 
         DOCS: https://nightly.spacy.io/api/attributeruler#add
         """
-        # This needs to be a string, because otherwise it's interpreted as a
-        # string key.
-        self.matcher.add(f"attr_rules_{len(self.attrs)}", patterns)
+        # We need to make a string here, because otherwise the ID we pass back
+        # will be interpreted as the hash of a string, rather than an ordinal.
+        key = _make_key(len(self.attrs))
+        self.matcher.add(self.vocab.strings.add(key), patterns)
         self._attrs_unnormed.append(attrs)
         attrs = normalize_token_attrs(self.vocab, attrs)
         self.attrs.append(attrs)
@@ -199,7 +203,7 @@ class AttributeRuler(Pipe):
         all_patterns = []
         for i in range(len(self.attrs)):
             p = {}
-            p["patterns"] = self.matcher.get(i)[1]
+            p["patterns"] = self.matcher.get(_make_key(i))[1]
             p["attrs"] = self._attrs_unnormed[i]
             p["index"] = self.indices[i]
             all_patterns.append(p)
@@ -303,6 +307,12 @@ class AttributeRuler(Pipe):
 
         return self
 
+def _make_key(n_attr):
+    return f"attr_rule_{n_attr}"
+
+def _parse_key(key):
+    return int(key.rsplit("_", 1)[1])
+
 
 def _split_morph_attrs(attrs):
     """Split entries from a tag map or morph rules dict into to two dicts, one

From 702edf52a0dcef071b49e0b52af7de6cfc9be140 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 26 Sep 2020 00:23:09 +0200
Subject: [PATCH 177/516] Fix attributeruler

---
 spacy/pipeline/attributeruler.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index e1ad91340..1dc2a10dd 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -79,26 +79,32 @@ class AttributeRuler(Pipe):
 
         DOCS: https://nightly.spacy.io/api/attributeruler#call
         """
-        matches = sorted(self.matcher(doc, allow_missing=True))
-        print("Attrs", self.attrs)
-        print("Matches", matches)
-
-        for match_id, start, end in matches:
+        matches = self.matcher(doc, allow_missing=True)
+        # Sort by the attribute ID, so that later rules have precendence
+        matches = [
+            (_parse_key(self.vocab.strings[m_id]), m_id, s, e)
+            for m_id, s, e in matches
+        ]
+        matches.sort()
+        for attr_id, match_id, start, end in matches:
             span = Span(doc, start, end, label=match_id)
-            attr_id = _parse_key(span.label_)
             attrs = self.attrs[attr_id]
             index = self.indices[attr_id]
             try:
+                # The index can be negative, which makes it annoying to do
+                # the boundscheck. Let Span do it instead.
                 token = span[index]
             except IndexError:
+                # The original exception is just our conditional logic, so we
+                # raise from.
                 raise ValueError(
                     Errors.E1001.format(
                         patterns=self.matcher.get(span.label),
                         span=[t.text for t in span],
                         index=index,
                     )
-                ) from None
-            set_token_attrs(token, attrs)
+                ) from None 
+            set_token_attrs(span[index], attrs)
         return doc
 
     def pipe(self, stream, *, batch_size=128):

From 6c25e60089931e4801a4c74cc807ea31f2c02bee Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sat, 26 Sep 2020 11:12:39 +0200
Subject: [PATCH 178/516] Simplify string match IDs for AttributeRuler

---
 spacy/pipeline/attributeruler.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index 1dc2a10dd..4243ebcfb 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -82,7 +82,7 @@ class AttributeRuler(Pipe):
         matches = self.matcher(doc, allow_missing=True)
         # Sort by the attribute ID, so that later rules have precendence
         matches = [
-            (_parse_key(self.vocab.strings[m_id]), m_id, s, e)
+            (int(self.vocab.strings[m_id]), m_id, s, e)
             for m_id, s, e in matches
         ]
         matches.sort()
@@ -184,7 +184,7 @@ class AttributeRuler(Pipe):
         """
         # We need to make a string here, because otherwise the ID we pass back
         # will be interpreted as the hash of a string, rather than an ordinal.
-        key = _make_key(len(self.attrs))
+        key = str(len(self.attrs))
         self.matcher.add(self.vocab.strings.add(key), patterns)
         self._attrs_unnormed.append(attrs)
         attrs = normalize_token_attrs(self.vocab, attrs)
@@ -209,7 +209,7 @@ class AttributeRuler(Pipe):
         all_patterns = []
         for i in range(len(self.attrs)):
             p = {}
-            p["patterns"] = self.matcher.get(_make_key(i))[1]
+            p["patterns"] = self.matcher.get(str(i))[1]
             p["attrs"] = self._attrs_unnormed[i]
             p["index"] = self.indices[i]
             all_patterns.append(p)
@@ -313,12 +313,6 @@ class AttributeRuler(Pipe):
 
         return self
 
-def _make_key(n_attr):
-    return f"attr_rule_{n_attr}"
-
-def _parse_key(key):
-    return int(key.rsplit("_", 1)[1])
-
 
 def _split_morph_attrs(attrs):
     """Split entries from a tag map or morph rules dict into to two dicts, one

From ca3c9970623f86a1638e9e3b38b3958659f0baaa Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 26 Sep 2020 13:13:57 +0200
Subject: [PATCH 179/516] Improve CLI config validation with latest Thinc

---
 pyproject.toml                                |  2 +-
 requirements.txt                              |  2 +-
 setup.cfg                                     |  4 +--
 spacy/cli/_util.py                            | 26 ++++++++++++-------
 spacy/cli/debug_config.py                     |  6 ++---
 spacy/cli/train.py                            | 15 ++++++++++-
 spacy/tests/lang/zh/test_tokenizer.py         |  2 +-
 spacy/tests/pipeline/test_pipe_factories.py   |  3 +--
 .../tests/serialize/test_serialize_config.py  |  2 +-
 spacy/tests/test_cli.py                       |  2 +-
 10 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 14d2c1e8e..896ad339f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a35,<8.0.0a40",
+    "thinc>=8.0.0a36,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index b3a95dcff..2746ecc37 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a35,<8.0.0a40
+thinc>=8.0.0a36,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index b080d4330..33dabc91f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a35,<8.0.0a40
+    thinc>=8.0.0a36,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a35,<8.0.0a40
+    thinc>=8.0.0a36,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 21a4e54ce..506380b0b 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -10,7 +10,7 @@ from click import NoSuchOption
 from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
-from thinc.config import Config, ConfigValidationError
+from thinc.api import Config, ConfigValidationError
 from configparser import InterpolationError
 import os
 
@@ -226,24 +226,28 @@ def get_checksum(path: Union[Path, str]) -> str:
 def show_validation_error(
     file_path: Optional[Union[str, Path]] = None,
     *,
-    title: str = "Config validation error",
+    title: Optional[str] = None,
+    desc: str = "",
+    show_config: Optional[bool] = None,
     hint_fill: bool = True,
 ):
     """Helper to show custom config validation errors on the CLI.
 
     file_path (str / Path): Optional file path of config file, used in hints.
-    title (str): Title of the custom formatted error.
+    title (str): Override title of custom formatted error.
+    desc (str): Override description of custom formatted error.
+    show_config (bool): Whether to output the config the error refers to.
     hint_fill (bool): Show hint about filling config.
     """
     try:
         yield
-    except (ConfigValidationError, InterpolationError) as e:
-        msg.fail(title, spaced=True)
-        # TODO: This is kinda hacky and we should probably provide a better
-        # helper for this in Thinc
-        err_text = str(e).replace("Config validation error", "").strip()
-        print(err_text)
-        if hint_fill and "field required" in err_text:
+    except ConfigValidationError as e:
+        title = title if title is not None else e.title
+        # Re-generate a new error object with overrides
+        err = e.from_error(e, title="", desc=desc, show_config=show_config)
+        msg.fail(title)
+        print(err.text.strip())
+        if hint_fill and "value_error.missing" in err.error_types:
             config_path = file_path if file_path is not None else "config.cfg"
             msg.text(
                 "If your config contains missing values, you can run the 'init "
@@ -252,6 +256,8 @@ def show_validation_error(
             )
             print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
         sys.exit(1)
+    except InterpolationError as e:
+        msg.fail("Config validation error", e, exits=1)
 
 
 def import_code(code_path: Optional[Union[Path, str]]) -> None:
diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index d07a0bb2d..c0c7de7ef 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -1,8 +1,8 @@
 from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
-from thinc.api import Config
-from thinc.config import VARIABLE_RE, ConfigValidationError
+from thinc.api import Config, ConfigValidationError
+from thinc.config import VARIABLE_RE
 import typer
 
 from ._util import Arg, Opt, show_validation_error, parse_config_overrides
@@ -115,4 +115,4 @@ def check_section_refs(config: Config, fields: List[str]) -> None:
             msg = f"not a valid section reference: {value}"
             errors.append({"loc": field.split("."), "msg": msg})
     if errors:
-        raise ConfigValidationError(config, errors)
+        raise ConfigValidationError(config=config, errors=errors)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index cbb0655ef..bc4f17ff3 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -89,7 +89,7 @@ def train(
         nlp, config = util.load_model_from_config(config)
     util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
     if config["training"]["vectors"] is not None:
-        util.load_vectors_into_model(nlp, config["training"]["vectors"])
+        add_vectors(nlp, config["training"]["vectors"])
     raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
     T_cfg = config["training"]
     optimizer = T_cfg["optimizer"]
@@ -195,6 +195,19 @@ def train(
             msg.good(f"Saved pipeline to output directory {final_model_path}")
 
 
+def add_vectors(nlp: Language, vectors: str) -> None:
+    title = f"Config validation error for vectors {vectors}"
+    desc = (
+        "This typically means that there's a problem in the config.cfg included "
+        "with the packaged vectors. Make sure that the vectors package you're "
+        "loading is compatible with the current version of spaCy."
+    )
+    with show_validation_error(
+        title=title, desc=desc, hint_fill=False, show_config=False
+    ):
+        util.load_vectors_into_model(nlp, vectors)
+
+
 def create_train_batches(iterator, batcher, max_epochs: int):
     epoch = 0
     examples = list(iterator)
diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py
index 70e753ba2..741eb0ace 100644
--- a/spacy/tests/lang/zh/test_tokenizer.py
+++ b/spacy/tests/lang/zh/test_tokenizer.py
@@ -1,6 +1,6 @@
 import pytest
 from spacy.lang.zh import Chinese, _get_pkuseg_trie_data
-from thinc.config import ConfigValidationError
+from thinc.api import ConfigValidationError
 
 
 # fmt: off
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index 07648024c..cac394913 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -4,8 +4,7 @@ from spacy.lang.en import English
 from spacy.lang.de import German
 from spacy.tokens import Doc
 from spacy.util import registry, SimpleFrozenDict, combine_score_weights
-from thinc.api import Model, Linear
-from thinc.config import ConfigValidationError
+from thinc.api import Model, Linear, ConfigValidationError
 from pydantic import StrictInt, StrictStr
 
 from ..util import make_tempdir
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index ec7544456..1a5be4bec 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -1,5 +1,5 @@
 import pytest
-from thinc.config import Config, ConfigValidationError
+from thinc.api import Config, ConfigValidationError
 import spacy
 from spacy.lang.en import English
 from spacy.lang.de import German
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index a66ab8de1..caf4ea890 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -8,7 +8,7 @@ from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
 from spacy.cli.debug_config import check_section_refs
-from thinc.config import ConfigValidationError, Config
+from thinc.api import ConfigValidationError, Config
 import srsly
 import os
 

From e06ff8b71da4a3da9cab28ede4450e63ff51b271 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 26 Sep 2020 13:18:08 +0200
Subject: [PATCH 180/516] Update docs [ci skip]

---
 website/docs/usage/v3.md    | 3 ++-
 website/meta/languages.json | 2 +-
 website/meta/universe.json  | 6 ++----
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 91d97cae2..94c50e1ec 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -91,7 +91,8 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
 - **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf),
   [`de_dep_news_trf`](/models/de#de_dep_news_trf),
   [`es_dep_news_trf`](/models/es#es_dep_news_trf),
-  [`fr_dep_news_trf`](/models/fr#fr_dep_news_trf)
+  [`fr_dep_news_trf`](/models/fr#fr_dep_news_trf),
+  [`zh_core_web_trf`](/models/zh#zh_core_web_trf)
 - **Implementation:**
   [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
 
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 5ef3a6469..5b54c1977 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -210,7 +210,7 @@
         {
             "code": "zh",
             "name": "Chinese",
-            "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"],
+            "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg", "zh_core_web_trf"],
             "dependencies": [
                 {
                     "name": "Jieba",
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 010ff3618..74c35bdb8 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2065,10 +2065,8 @@
             "code_example": [
                 "import spacy",
                 "",
-                "nlp = spacy.load(\"en_trf_bertbaseuncased_lg\")",
-                "doc = nlp(\"Apple shares rose on the news. Apple pie is delicious.\")",
-                "print(doc[0].similarity(doc[7]))",
-                "print(doc._.trf_last_hidden_state.shape)"
+                "nlp = spacy.load(\"en_core_web_trf\")",
+                "doc = nlp(\"Apple shares rose on the news. Apple pie is delicious.\")"
             ],
             "author": "Explosion",
             "author_links": {

From b2d07de786752840013daf2c20d3b0ecdc23b394 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 26 Sep 2020 15:16:59 +0200
Subject: [PATCH 181/516] Construct nlp from uninterpolated config before
 training

---
 spacy/cli/train.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index cbb0655ef..e64ee532b 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -78,6 +78,9 @@ def train(
         config = util.load_config(
             config_path, overrides=config_overrides, interpolate=True
         )
+        # Keep a second un-interpolated config so we can preserve variables in
+        # the final nlp object we train and serialize
+        raw_config = util.load_config(config_path, overrides=config_overrides)
     if config["training"]["seed"] is not None:
         fix_random_seed(config["training"]["seed"])
     allocator = config["training"]["gpu_allocator"]
@@ -86,7 +89,7 @@ def train(
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     with show_validation_error(config_path):
-        nlp, config = util.load_model_from_config(config)
+        nlp, config = util.load_model_from_config(raw_config)
     util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
     if config["training"]["vectors"] is not None:
         util.load_vectors_into_model(nlp, config["training"]["vectors"])

From 11e195d3ed1b138a882d90385210c78d9575febc Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sun, 27 Sep 2020 14:00:18 +0200
Subject: [PATCH 182/516] Update ChineseTokenizer

* Allow `pkuseg_model` to be set to `None` on initialization
* Don't save config within tokenizer
* Force convert pkuseg_model to use pickle protocol 4 by reencoding with
`pickle5` on serialization
* Update pkuseg serialization test
---
 spacy/errors.py                       | 13 ++++--
 spacy/lang/zh/__init__.py             | 60 ++++++++++++++-------------
 spacy/tests/lang/zh/test_serialize.py | 11 ++---
 3 files changed, 46 insertions(+), 38 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 640419182..aad49e1ad 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -670,10 +670,15 @@ class Errors:
             "'{token_attrs}'.")
     E999 = ("Unable to merge the `Doc` objects because they do not all share "
             "the same `Vocab`.")
-    E1000 = ("No pkuseg model available. Provide a pkuseg model when "
-             "initializing the pipeline:\n"
-             'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
-             'nlp = Chinese(config=cfg)')
+    E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
+             "specified. Provide the name of a pretrained model or the path to "
+             "a model when initializing the pipeline:\n"
+             'config = {\n'
+             '   "@tokenizers": "spacy.zh.ChineseTokenizer",\n'
+             '   "segmenter": "pkuseg",\n'
+             '   "pkuseg_model": "default", # or "/path/to/pkuseg_model" \n'
+             '}\n'
+             'nlp = Chinese.from_config({"nlp": {"tokenizer": config}})')
     E1001 = ("Target token outside of matched span for match with tokens "
              "'{span}' and offset '{index}' matched by patterns '{patterns}'.")
     E1002 = ("Span index out of range.")
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 5d3bd2a96..d222e78f2 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -15,7 +15,8 @@ from .stop_words import STOP_WORDS
 from ... import util
 
 
-_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
+_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
+_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7.
 
 DEFAULT_CONFIG = """
 [nlp]
@@ -64,7 +65,7 @@ class ChineseTokenizer(DummyTokenizer):
         pkuseg_user_dict: Optional[str] = None,
     ):
         self.vocab = nlp.vocab
-        if isinstance(segmenter, Segmenter):  # we might have the Enum here
+        if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
         self.segmenter = segmenter
         self.pkuseg_model = pkuseg_model
@@ -136,18 +137,6 @@ class ChineseTokenizer(DummyTokenizer):
             warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
             warnings.warn(warn_msg)
 
-    def _get_config(self) -> Dict[str, Any]:
-        return {
-            "segmenter": self.segmenter,
-            "pkuseg_model": self.pkuseg_model,
-            "pkuseg_user_dict": self.pkuseg_user_dict,
-        }
-
-    def _set_config(self, config: Dict[str, Any] = {}) -> None:
-        self.segmenter = config.get("segmenter", Segmenter.char)
-        self.pkuseg_model = config.get("pkuseg_model", None)
-        self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
-
     def to_bytes(self, **kwargs):
         pkuseg_features_b = b""
         pkuseg_weights_b = b""
@@ -157,6 +146,20 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.feature_extractor.save(tempdir)
                 self.pkuseg_seg.model.save(tempdir)
                 tempdir = Path(tempdir)
+                # pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
+                # means that it will be saved with pickle protocol 5 with
+                # python 3.8, which can't be reloaded with python 3.6-3.7.
+                # To try to make the model compatible with python 3.6+, reload
+                # the data with pickle5 and convert it back to protocol 4.
+                try:
+                    import pickle5
+
+                    with open(tempdir / "features.pkl", "rb") as fileh:
+                        features = pickle5.load(fileh)
+                    with open(tempdir / "features.pkl", "wb") as fileh:
+                        pickle5.dump(features, fileh, protocol=4)
+                except:
+                    warnings.warn(_PKUSEG_PICKLE_WARNING)
                 with open(tempdir / "features.pkl", "rb") as fileh:
                     pkuseg_features_b = fileh.read()
                 with open(tempdir / "weights.npz", "rb") as fileh:
@@ -168,7 +171,6 @@ class ChineseTokenizer(DummyTokenizer):
                 sorted(list(self.pkuseg_seg.postprocesser.other_words)),
             )
         serializers = {
-            "cfg": lambda: srsly.json_dumps(self._get_config()),
             "pkuseg_features": lambda: pkuseg_features_b,
             "pkuseg_weights": lambda: pkuseg_weights_b,
             "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
@@ -188,7 +190,6 @@ class ChineseTokenizer(DummyTokenizer):
             pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
 
         deserializers = {
-            "cfg": lambda b: self._set_config(srsly.json_loads(b)),
             "pkuseg_features": deserialize_pkuseg_features,
             "pkuseg_weights": deserialize_pkuseg_weights,
             "pkuseg_processors": deserialize_pkuseg_processors,
@@ -229,6 +230,16 @@ class ChineseTokenizer(DummyTokenizer):
                     path.mkdir(parents=True)
                 self.pkuseg_seg.model.save(path)
                 self.pkuseg_seg.feature_extractor.save(path)
+                # try to convert features.pkl to pickle protocol 4
+                try:
+                    import pickle5
+
+                    with open(path / "features.pkl", "rb") as fileh:
+                        features = pickle5.load(fileh)
+                    with open(path / "features.pkl", "wb") as fileh:
+                        pickle5.dump(features, fileh, protocol=4)
+                except:
+                    warnings.warn(_PKUSEG_PICKLE_WARNING)
 
         def save_pkuseg_processors(path):
             if self.pkuseg_seg:
@@ -241,7 +252,6 @@ class ChineseTokenizer(DummyTokenizer):
                 srsly.write_msgpack(path, data)
 
         serializers = {
-            "cfg": lambda p: srsly.write_json(p, self._get_config()),
             "pkuseg_model": lambda p: save_pkuseg_model(p),
             "pkuseg_processors": lambda p: save_pkuseg_processors(p),
         }
@@ -277,7 +287,6 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.postprocesser.other_words = set(other_words)
 
         serializers = {
-            "cfg": lambda p: self._set_config(srsly.read_json(p)),
             "pkuseg_model": lambda p: load_pkuseg_model(p),
             "pkuseg_processors": lambda p: load_pkuseg_processors(p),
         }
@@ -314,21 +323,14 @@ def try_jieba_import(segmenter: str) -> None:
             raise ImportError(msg) from None
 
 
-def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None:
+def try_pkuseg_import(segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: str) -> None:
     try:
         import pkuseg
 
-        if pkuseg_model:
+        if pkuseg_model is None:
+            return None
+        else:
             return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
-        elif segmenter == Segmenter.pkuseg:
-            msg = (
-                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
-                "was specified. Please provide the name of a pretrained model "
-                "or the path to a model with:\n"
-                'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n'
-                "nlp = Chinese.from_config(cfg)"
-            )
-            raise ValueError(msg)
     except ImportError:
         if segmenter == Segmenter.pkuseg:
             msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py
index 1c6fdf419..5491314e2 100644
--- a/spacy/tests/lang/zh/test_serialize.py
+++ b/spacy/tests/lang/zh/test_serialize.py
@@ -27,9 +27,10 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
 
 @pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
-    nlp = Chinese(
-        meta={
-            "tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}}
-        }
-    )
+    config = {
+        "@tokenizers": "spacy.zh.ChineseTokenizer",
+        "segmenter": "pkuseg",
+        "pkuseg_model": "medicine",
+    }
+    nlp = Chinese.from_config({"nlp": {"tokenizer": config}})
     zh_tokenizer_serialize(nlp.tokenizer)

From 54fe8719355534ec1dd51b20252bf154c25a8be3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sun, 27 Sep 2020 14:37:28 +0200
Subject: [PATCH 183/516] Fix formatting, refactor pickle5 exceptions

---
 spacy/lang/zh/__init__.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index d222e78f2..f9887a4df 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List
 from enum import Enum
 import tempfile
 import srsly
@@ -16,7 +16,7 @@ from ... import util
 
 
 _PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
-_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7.
+_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
 
 DEFAULT_CONFIG = """
 [nlp]
@@ -158,7 +158,9 @@ class ChineseTokenizer(DummyTokenizer):
                         features = pickle5.load(fileh)
                     with open(tempdir / "features.pkl", "wb") as fileh:
                         pickle5.dump(features, fileh, protocol=4)
-                except:
+                except ImportError as e:
+                    raise(e)
+                except Exception:
                     warnings.warn(_PKUSEG_PICKLE_WARNING)
                 with open(tempdir / "features.pkl", "rb") as fileh:
                     pkuseg_features_b = fileh.read()
@@ -238,7 +240,9 @@ class ChineseTokenizer(DummyTokenizer):
                         features = pickle5.load(fileh)
                     with open(path / "features.pkl", "wb") as fileh:
                         pickle5.dump(features, fileh, protocol=4)
-                except:
+                except ImportError as e:
+                    raise(e)
+                except Exception:
                     warnings.warn(_PKUSEG_PICKLE_WARNING)
 
         def save_pkuseg_processors(path):

From 8393dbedad3e122638fc996719e8d29611dd9a24 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sun, 27 Sep 2020 15:15:53 +0200
Subject: [PATCH 184/516] Minor fixes

* Put `cfg` back in serialization
* Add `pickle5` to pytest conf
---
 spacy/lang/zh/__init__.py | 18 +++++++++++++++++-
 spacy/tests/conftest.py   |  1 +
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index f9887a4df..69c7b644d 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional, List
+from typing import Optional, List, Dict, Any
 from enum import Enum
 import tempfile
 import srsly
@@ -137,6 +137,18 @@ class ChineseTokenizer(DummyTokenizer):
             warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
             warnings.warn(warn_msg)
 
+    def _get_config(self) -> Dict[str, Any]:
+        return {
+            "segmenter": self.segmenter,
+            "pkuseg_model": self.pkuseg_model,
+            "pkuseg_user_dict": self.pkuseg_user_dict,
+        }
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.segmenter = config.get("segmenter", Segmenter.char)
+        self.pkuseg_model = config.get("pkuseg_model", None)
+        self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
+
     def to_bytes(self, **kwargs):
         pkuseg_features_b = b""
         pkuseg_weights_b = b""
@@ -173,6 +185,7 @@ class ChineseTokenizer(DummyTokenizer):
                 sorted(list(self.pkuseg_seg.postprocesser.other_words)),
             )
         serializers = {
+            "cfg": lambda: srsly.json_dumps(self._get_config()),
             "pkuseg_features": lambda: pkuseg_features_b,
             "pkuseg_weights": lambda: pkuseg_weights_b,
             "pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
@@ -192,6 +205,7 @@ class ChineseTokenizer(DummyTokenizer):
             pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
 
         deserializers = {
+            "cfg": lambda b: self._set_config(srsly.json_loads(b)),
             "pkuseg_features": deserialize_pkuseg_features,
             "pkuseg_weights": deserialize_pkuseg_weights,
             "pkuseg_processors": deserialize_pkuseg_processors,
@@ -256,6 +270,7 @@ class ChineseTokenizer(DummyTokenizer):
                 srsly.write_msgpack(path, data)
 
         serializers = {
+            "cfg": lambda p: srsly.write_json(p, self._get_config()),
             "pkuseg_model": lambda p: save_pkuseg_model(p),
             "pkuseg_processors": lambda p: save_pkuseg_processors(p),
         }
@@ -291,6 +306,7 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.postprocesser.other_words = set(other_words)
 
         serializers = {
+            "cfg": lambda p: self._set_config(srsly.read_json(p)),
             "pkuseg_model": lambda p: load_pkuseg_model(p),
             "pkuseg_processors": lambda p: load_pkuseg_processors(p),
         }
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 3a9a1f26b..23fc5e98f 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -282,6 +282,7 @@ def zh_tokenizer_jieba():
 @pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
+    pytest.importorskip("pickle5")
     config = {
         "@tokenizers": "spacy.zh.ChineseTokenizer",
         "segmenter": "pkuseg",

From f29d5b9b89f526473f7d431f71d27d7fc967ea36 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 27 Sep 2020 18:39:38 +0200
Subject: [PATCH 185/516] Update docs [ci skip]

---
 website/docs/api/data-formats.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 420c09237..6ff3bfd0d 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -219,7 +219,6 @@ used when you run [`spacy pretrain`](/api/cli#pretrain).
 | `batcher`      | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                      |
 | `component`    | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~                              |
 | `layer`        | The layer to pretrain. If empty, the whole component model will be used. ~~str~~                       |
-|                |
 
 ## Training data {#training}
 

From 39b178999c67fc8512b93a8c83ca90676351d7c9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 27 Sep 2020 20:13:38 +0200
Subject: [PATCH 186/516] Tmp notes

---
 spacy/cli/init_model.py  | 295 +--------------------------------------
 spacy/cli/train.py       |  78 ++---------
 spacy/default_config.cfg |   9 ++
 3 files changed, 21 insertions(+), 361 deletions(-)

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 05bf99ccd..6decb6172 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -13,18 +13,6 @@ import warnings
 from wasabi import msg, Printer
 import typer
 
-from ._util import app, init_cli, Arg, Opt
-from ..vectors import Vectors
-from ..errors import Errors, Warnings
-from ..language import Language
-from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
-
-try:
-    import ftfy
-except ImportError:
-    ftfy = None
-
-
 DEFAULT_OOV_PROB = -20
 
 
@@ -63,7 +51,7 @@ def init_model_cli(
             "'python -m spacy init --help' for an overview of the other "
             "available initialization commands."
         )
-    init_model(
+    init_vocab(
         lang,
         output_dir,
         freqs_loc=freqs_loc,
@@ -77,284 +65,3 @@ def init_model_cli(
         base_model=base_model,
         silent=False,
     )
-
-
-def init_model(
-    lang: str,
-    output_dir: Path,
-    freqs_loc: Optional[Path] = None,
-    clusters_loc: Optional[Path] = None,
-    jsonl_loc: Optional[Path] = None,
-    vectors_loc: Optional[Path] = None,
-    prune_vectors: int = -1,
-    truncate_vectors: int = 0,
-    vectors_name: Optional[str] = None,
-    model_name: Optional[str] = None,
-    base_model: Optional[str] = None,
-    silent: bool = True,
-) -> Language:
-    msg = Printer(no_print=silent, pretty=not silent)
-    if jsonl_loc is not None:
-        if freqs_loc is not None or clusters_loc is not None:
-            settings = ["-j"]
-            if freqs_loc:
-                settings.append("-f")
-            if clusters_loc:
-                settings.append("-c")
-            msg.warn(
-                "Incompatible arguments",
-                "The -f and -c arguments are deprecated, and not compatible "
-                "with the -j argument, which should specify the same "
-                "information. Either merge the frequencies and clusters data "
-                "into the JSONL-formatted file (recommended), or use only the "
-                "-f and -c files, without the other lexical attributes.",
-            )
-        jsonl_loc = ensure_path(jsonl_loc)
-        lex_attrs = srsly.read_jsonl(jsonl_loc)
-    else:
-        clusters_loc = ensure_path(clusters_loc)
-        freqs_loc = ensure_path(freqs_loc)
-        if freqs_loc is not None and not freqs_loc.exists():
-            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
-        lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
-
-    with msg.loading("Creating blank pipeline..."):
-        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
-
-    msg.good("Successfully created blank pipeline")
-    if vectors_loc is not None:
-        add_vectors(
-            msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
-        )
-    vec_added = len(nlp.vocab.vectors)
-    lex_added = len(nlp.vocab)
-    msg.good(
-        "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
-    )
-    if not output_dir.exists():
-        output_dir.mkdir()
-    nlp.to_disk(output_dir)
-    return nlp
-
-
-def open_file(loc: Union[str, Path]) -> IO:
-    """Handle .gz, .tar.gz or unzipped files"""
-    loc = ensure_path(loc)
-    if tarfile.is_tarfile(str(loc)):
-        return tarfile.open(str(loc), "r:gz")
-    elif loc.parts[-1].endswith("gz"):
-        return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
-    elif loc.parts[-1].endswith("zip"):
-        zip_file = zipfile.ZipFile(str(loc))
-        names = zip_file.namelist()
-        file_ = zip_file.open(names[0])
-        return (line.decode("utf8") for line in file_)
-    else:
-        return loc.open("r", encoding="utf8")
-
-
-def read_attrs_from_deprecated(
-    msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
-) -> List[Dict[str, Any]]:
-    if freqs_loc is not None:
-        with msg.loading("Counting frequencies..."):
-            probs, _ = read_freqs(freqs_loc)
-        msg.good("Counted frequencies")
-    else:
-        probs, _ = ({}, DEFAULT_OOV_PROB)  # noqa: F841
-    if clusters_loc:
-        with msg.loading("Reading clusters..."):
-            clusters = read_clusters(clusters_loc)
-        msg.good("Read clusters")
-    else:
-        clusters = {}
-    lex_attrs = []
-    sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
-    if len(sorted_probs):
-        for i, (word, prob) in tqdm(enumerate(sorted_probs)):
-            attrs = {"orth": word, "id": i, "prob": prob}
-            # Decode as a little-endian string, so that we can do & 15 to get
-            # the first 4 bits. See _parse_features.pyx
-            if word in clusters:
-                attrs["cluster"] = int(clusters[word][::-1], 2)
-            else:
-                attrs["cluster"] = 0
-            lex_attrs.append(attrs)
-    return lex_attrs
-
-
-def create_model(
-    lang: str,
-    lex_attrs: List[Dict[str, Any]],
-    name: Optional[str] = None,
-    base_model: Optional[Union[str, Path]] = None,
-) -> Language:
-    if base_model:
-        nlp = load_model(base_model)
-        # keep the tokenizer but remove any existing pipeline components due to
-        # potentially conflicting vectors
-        for pipe in nlp.pipe_names:
-            nlp.remove_pipe(pipe)
-    else:
-        lang_class = get_lang_class(lang)
-        nlp = lang_class()
-    for lexeme in nlp.vocab:
-        lexeme.rank = OOV_RANK
-    for attrs in lex_attrs:
-        if "settings" in attrs:
-            continue
-        lexeme = nlp.vocab[attrs["orth"]]
-        lexeme.set_attrs(**attrs)
-    if len(nlp.vocab):
-        oov_prob = min(lex.prob for lex in nlp.vocab) - 1
-    else:
-        oov_prob = DEFAULT_OOV_PROB
-    nlp.vocab.cfg.update({"oov_prob": oov_prob})
-    if name:
-        nlp.meta["name"] = name
-    return nlp
-
-
-def add_vectors(
-    msg: Printer,
-    nlp: Language,
-    vectors_loc: Optional[Path],
-    truncate_vectors: int,
-    prune_vectors: int,
-    name: Optional[str] = None,
-) -> None:
-    vectors_loc = ensure_path(vectors_loc)
-    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
-        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
-        for lex in nlp.vocab:
-            if lex.rank and lex.rank != OOV_RANK:
-                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
-    else:
-        if vectors_loc:
-            with msg.loading(f"Reading vectors from {vectors_loc}"):
-                vectors_data, vector_keys = read_vectors(
-                    msg, vectors_loc, truncate_vectors
-                )
-            msg.good(f"Loaded vectors from {vectors_loc}")
-        else:
-            vectors_data, vector_keys = (None, None)
-        if vector_keys is not None:
-            for word in vector_keys:
-                if word not in nlp.vocab:
-                    nlp.vocab[word]
-        if vectors_data is not None:
-            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
-    if name is None:
-        # TODO: Is this correct? Does this matter?
-        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
-    else:
-        nlp.vocab.vectors.name = name
-    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
-    if prune_vectors >= 1:
-        nlp.vocab.prune_vectors(prune_vectors)
-
-
-def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
-    f = open_file(vectors_loc)
-    f = ensure_shape(f)
-    shape = tuple(int(size) for size in next(f).split())
-    if truncate_vectors >= 1:
-        shape = (truncate_vectors, shape[1])
-    vectors_data = numpy.zeros(shape=shape, dtype="f")
-    vectors_keys = []
-    for i, line in enumerate(tqdm(f)):
-        line = line.rstrip()
-        pieces = line.rsplit(" ", vectors_data.shape[1])
-        word = pieces.pop(0)
-        if len(pieces) != vectors_data.shape[1]:
-            msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
-        vectors_data[i] = numpy.asarray(pieces, dtype="f")
-        vectors_keys.append(word)
-        if i == truncate_vectors - 1:
-            break
-    return vectors_data, vectors_keys
-
-
-def ensure_shape(lines):
-    """Ensure that the first line of the data is the vectors shape.
-
-    If it's not, we read in the data and output the shape as the first result,
-    so that the reader doesn't have to deal with the problem.
-    """
-    first_line = next(lines)
-    try:
-        shape = tuple(int(size) for size in first_line.split())
-    except ValueError:
-        shape = None
-    if shape is not None:
-        # All good, give the data
-        yield first_line
-        yield from lines
-    else:
-        # Figure out the shape, make it the first value, and then give the
-        # rest of the data.
-        width = len(first_line.split()) - 1
-        captured = [first_line] + list(lines)
-        length = len(captured)
-        yield f"{length} {width}"
-        yield from captured
-
-
-def read_freqs(
-    freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
-):
-    counts = PreshCounter()
-    total = 0
-    with freqs_loc.open() as f:
-        for i, line in enumerate(f):
-            freq, doc_freq, key = line.rstrip().split("\t", 2)
-            freq = int(freq)
-            counts.inc(i + 1, freq)
-            total += freq
-    counts.smooth()
-    log_total = math.log(total)
-    probs = {}
-    with freqs_loc.open() as f:
-        for line in tqdm(f):
-            freq, doc_freq, key = line.rstrip().split("\t", 2)
-            doc_freq = int(doc_freq)
-            freq = int(freq)
-            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
-                try:
-                    word = literal_eval(key)
-                except SyntaxError:
-                    # Take odd strings literally.
-                    word = literal_eval(f"'{key}'")
-                smooth_count = counts.smoother(int(freq))
-                probs[word] = math.log(smooth_count) - log_total
-    oov_prob = math.log(counts.smoother(0)) - log_total
-    return probs, oov_prob
-
-
-def read_clusters(clusters_loc: Path) -> dict:
-    clusters = {}
-    if ftfy is None:
-        warnings.warn(Warnings.W004)
-    with clusters_loc.open() as f:
-        for line in tqdm(f):
-            try:
-                cluster, word, freq = line.split()
-                if ftfy is not None:
-                    word = ftfy.fix_text(word)
-            except ValueError:
-                continue
-            # If the clusterer has only seen the word a few times, its
-            # cluster is unreliable.
-            if int(freq) >= 3:
-                clusters[word] = cluster
-            else:
-                clusters[word] = "0"
-    # Expand clusters with re-casing
-    for word, cluster in list(clusters.items()):
-        if word.lower() not in clusters:
-            clusters[word.lower()] = cluster
-        if word.title() not in clusters:
-            clusters[word.title()] = cluster
-        if word.upper() not in clusters:
-            clusters[word.upper()] = cluster
-    return clusters
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 5fc4ff035..bb1bba4d5 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -32,6 +32,7 @@ def train_cli(
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
+    dave_path: Optional[Path] = Opt(None, "--dave", "-D", help="etc etc"),
     # fmt: on
 ):
     """
@@ -52,9 +53,12 @@ def train_cli(
     verify_cli_args(config_path, output_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
+    if prepared is None:
+        prepare(config_path, output_path / "prepared", config_overrides=overrides)
     train(
         config_path,
         output_path=output_path,
+        dave_path=dave_path,
         config_overrides=overrides,
         use_gpu=use_gpu,
         resume_training=resume,
@@ -62,8 +66,7 @@ def train_cli(
 
 
 def train(
-    config_path: Path,
-    output_path: Optional[Path] = None,
+    output_path: Path,
     config_overrides: Dict[str, Any] = {},
     use_gpu: int = -1,
     resume_training: bool = False,
@@ -74,73 +77,14 @@ def train(
     else:
         msg.info("Using CPU")
     msg.info(f"Loading config and nlp from: {config_path}")
+    # TODO: The details of this will change
+    dave_path = output_path / "dave"
+    config_path = dave_path / "config.cfg"
     with show_validation_error(config_path):
-        config = util.load_config(
-            config_path, overrides=config_overrides, interpolate=True
-        )
-        # Keep a second un-interpolated config so we can preserve variables in
-        # the final nlp object we train and serialize
-        raw_config = util.load_config(config_path, overrides=config_overrides)
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    # Use original config here before it's resolved to functions
-    sourced_components = get_sourced_components(config)
-    with show_validation_error(config_path):
-        nlp, config = util.load_model_from_config(raw_config)
-    util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
-    if config["training"]["vectors"] is not None:
-        add_vectors(nlp, config["training"]["vectors"])
-    raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
-    T_cfg = config["training"]
-    optimizer = T_cfg["optimizer"]
-    train_corpus = dot_to_object(config, T_cfg["train_corpus"])
-    dev_corpus = dot_to_object(config, T_cfg["dev_corpus"])
-    batcher = T_cfg["batcher"]
-    train_logger = T_cfg["logger"]
-    before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T_cfg["frozen_components"]
-    # Sourced components that require resume_training
-    resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if resume_components:
-        with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
-            nlp.resume_training(sgd=optimizer)
-    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
-    # Verify the config after calling 'begin_training' to ensure labels are properly initialized
-    verify_config(nlp)
+        config = fill_config_etc_etc(config_path)
+        nlp = make_and_load_nlp_etc_etc(config, dave_path)
+        optimizer, train_corpus, dev_corpus, score_weights, T_cfg = resolve_more_things_etc_etc(config)
 
-    if tag_map:
-        # Replace tag map with provided mapping
-        nlp.vocab.morphology.load_tag_map(tag_map)
-    if morph_rules:
-        # Load morph rules
-        nlp.vocab.morphology.load_morph_exceptions(morph_rules)
-
-    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
-    if weights_data is not None:
-        tok2vec_component = config["pretraining"]["component"]
-        if tok2vec_component is None:
-            msg.fail(
-                f"To use pretrained tok2vec weights, [pretraining.component] "
-                f"needs to specify the component that should load them.",
-                exits=1,
-            )
-        layer = nlp.get_pipe(tok2vec_component).model
-        tok2vec_layer = config["pretraining"]["layer"]
-        if tok2vec_layer:
-            layer = layer.get_ref(tok2vec_layer)
-        layer.from_bytes(weights_data)
-        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
-
-    # Create iterator, which yields out info after each optimization step.
-    msg.info("Start training")
-    score_weights = T_cfg["score_weights"]
     training_step_iterator = train_while_improving(
         nlp,
         optimizer,
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 6f8c0aa00..a8f4a9497 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -48,6 +48,15 @@ max_length = 0
 # Limitation on number of training examples
 limit = 0
 
+[prepare]
+# The 'prepare' step is run before training or pretraining. Components and
+# the tokenizer can each define their own prepare step, giving them a chance
+# to gather resources like lookup-tables, build label sets, construct vocabularies,
+# etc. After 'prepare' is finished, the result will be saved out to disk, which
+# will then be read in at the start of training. You can call the prepare step
+# separately with the `spacy prepare` command, or you can let the train script
+# do it for you.
+
 # Training hyper-parameters and additional features.
 [training]
 seed = ${system.seed}

From a6548ead1764e4bcff4b19ebba6588780b93d334 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sun, 27 Sep 2020 22:20:14 +0200
Subject: [PATCH 187/516] Add _ as a symbol (#6153)

* Add _ to StringStore in Morphology

* Add _ as a symbol

Add `_` as a symbol instead of adding to the `StringStore`.
---
 spacy/morphology.pyx | 3 ++-
 spacy/symbols.pxd    | 1 +
 spacy/symbols.pyx    | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index fcfe216ba..cc0f61cea 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -29,7 +29,8 @@ cdef class Morphology:
     FEATURE_SEP = "|"
     FIELD_SEP = "="
     VALUE_SEP = ","
-    EMPTY_MORPH = "_" # not an empty string so that the PreshMap key is not 0
+    # not an empty string so that the PreshMap key is not 0
+    EMPTY_MORPH = symbols.NAMES[symbols._]
 
     def __init__(self, StringStore strings):
         self.mem = Pool()
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index e516f3ed9..bc15d9b80 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -466,3 +466,4 @@ cdef enum symbol_t:
     ENT_ID
 
     IDX
+    _
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index 92607e120..b0345c710 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -465,6 +465,7 @@ IDS = {
     "acl": acl,
     "LAW": LAW,
     "MORPH": MORPH,
+    "_": _,
 }
 
 

From 013b66de05ee31e5e05a440ab5b29173530929fa Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sun, 27 Sep 2020 22:20:45 +0200
Subject: [PATCH 188/516] Add tokenizer scoring to ja / ko / zh (#6152)

---
 spacy/lang/ja/__init__.py | 6 ++++++
 spacy/lang/ko/__init__.py | 6 ++++++
 spacy/lang/zh/__init__.py | 6 ++++++
 3 files changed, 18 insertions(+)

diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 117514c09..e7cc1ef3b 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -12,8 +12,10 @@ from .tag_bigram_map import TAG_BIGRAM_MAP
 from ...compat import copy_reg
 from ...errors import Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from ... import util
 
@@ -130,6 +132,10 @@ class JapaneseTokenizer(DummyTokenizer):
                     )
         return sub_tokens_list
 
+    def score(self, examples):
+        validate_examples(examples, "JapaneseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {"split_mode": self.split_mode}
 
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index 47a3887a6..dd07ef89c 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -7,7 +7,9 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...compat import copy_reg
+from ...scorer import Scorer
 from ...symbols import POS
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 
 
@@ -62,6 +64,10 @@ class KoreanTokenizer(DummyTokenizer):
                 lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}
 
+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
 
 class KoreanDefaults(Language.Defaults):
     config = Config().from_str(DEFAULT_CONFIG)
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 5d3bd2a96..fa9bb810d 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -8,7 +8,9 @@ from thinc.api import Config
 
 from ...errors import Warnings, Errors
 from ...language import Language
+from ...scorer import Scorer
 from ...tokens import Doc
+from ...training import validate_examples
 from ...util import DummyTokenizer, registry
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
@@ -136,6 +138,10 @@ class ChineseTokenizer(DummyTokenizer):
             warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
             warnings.warn(warn_msg)
 
+    def score(self, examples):
+        validate_examples(examples, "ChineseTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
     def _get_config(self) -> Dict[str, Any]:
         return {
             "segmenter": self.segmenter,

From 7e938ed63ee9ff7c1f37c0fa8f32f778d8c5fdf5 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 27 Sep 2020 22:21:31 +0200
Subject: [PATCH 189/516] Update config resolution to use new Thinc

---
 pyproject.toml                                |  2 +-
 requirements.txt                              |  2 +-
 setup.cfg                                     |  4 +--
 spacy/cli/debug_config.py                     |  3 +-
 spacy/cli/debug_data.py                       | 13 +++----
 spacy/cli/debug_model.py                      | 22 ++++++------
 spacy/cli/init_config.py                      |  6 ++--
 spacy/cli/pretrain.py                         | 22 ++++++------
 spacy/cli/train.py                            | 32 +++++++++--------
 spacy/language.py                             | 24 ++++++-------
 spacy/schemas.py                              | 23 +++++++++++--
 spacy/tests/doc/test_add_entities.py          |  4 +--
 spacy/tests/parser/test_add_label.py          |  6 ++--
 spacy/tests/parser/test_arc_eager_oracle.py   |  2 +-
 spacy/tests/parser/test_neural_parser.py      |  8 ++---
 spacy/tests/parser/test_preset_sbd.py         |  2 +-
 spacy/tests/pipeline/test_tok2vec.py          |  4 +--
 spacy/tests/regression/test_issue3001-3500.py |  2 +-
 spacy/tests/regression/test_issue3501-4000.py |  8 ++---
 .../tests/serialize/test_serialize_config.py  | 18 +++++-----
 .../serialize/test_serialize_pipeline.py      | 20 +++++------
 spacy/tests/test_util.py                      | 15 ++++----
 spacy/tests/training/test_readers.py          |  9 ++---
 spacy/util.py                                 | 34 +++++++++++++++----
 24 files changed, 163 insertions(+), 122 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 896ad339f..0b35f8519 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a36,<8.0.0a40",
+    "thinc>=8.0.0a40,<8.0.0a50",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index 2746ecc37..770e74506 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a36,<8.0.0a40
+thinc>=8.0.0a40,<8.0.0a50
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 33dabc91f..80e96122e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a36,<8.0.0a40
+    thinc>=8.0.0a40,<8.0.0a50
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a36,<8.0.0a40
+    thinc>=8.0.0a40,<8.0.0a50
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0
diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index c0c7de7ef..131fecf6d 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -51,9 +51,10 @@ def debug_config(
     msg.divider("Config validation")
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
-        nlp, resolved = util.load_model_from_config(config)
+        nlp = util.load_model_from_config(config)
         # Use the resolved config here in case user has one function returning
         # a dict of corpora etc.
+        resolved = util.resolve_training_config(nlp.config)
         check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
     msg.good("Config is valid")
     if show_vars:
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index d52f30b82..302bfd563 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -93,18 +93,19 @@ def debug_data(
         msg.fail("Config file not found", config_path, exists=1)
     with show_validation_error(config_path):
         cfg = util.load_config(config_path, overrides=config_overrides)
-        nlp, config = util.load_model_from_config(cfg)
+        nlp = util.load_model_from_config(cfg)
+        C = util.resolve_training_config(nlp.config)
     # Use original config here, not resolved version
     sourced_components = get_sourced_components(cfg)
-    frozen_components = config["training"]["frozen_components"]
+    frozen_components = C["training"]["frozen_components"]
     resume_components = [p for p in sourced_components if p not in frozen_components]
     pipeline = nlp.pipe_names
     factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
-    tag_map_path = util.ensure_path(config["training"]["tag_map"])
+    tag_map_path = util.ensure_path(C["training"]["tag_map"])
     tag_map = {}
     if tag_map_path is not None:
         tag_map = srsly.read_json(tag_map_path)
-    morph_rules_path = util.ensure_path(config["training"]["morph_rules"])
+    morph_rules_path = util.ensure_path(C["training"]["morph_rules"])
     morph_rules = {}
     if morph_rules_path is not None:
         morph_rules = srsly.read_json(morph_rules_path)
@@ -144,10 +145,10 @@ def debug_data(
 
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
-    frozen_components = config["training"]["frozen_components"]
+    frozen_components = C["training"]["frozen_components"]
 
     msg.divider("Training stats")
-    msg.text(f"Language: {config['nlp']['lang']}")
+    msg.text(f"Language: {C['nlp']['lang']}")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
     if resume_components:
         msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 7f8e1dabc..6f554ed2d 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -1,4 +1,3 @@
-import warnings
 from typing import Dict, Any, Optional, Iterable
 from pathlib import Path
 
@@ -57,14 +56,17 @@ def debug_model_cli(
     }
     config_overrides = parse_config_overrides(ctx.args)
     with show_validation_error(config_path):
-        config = util.load_config(
-            config_path, overrides=config_overrides, interpolate=True
+        raw_config = util.load_config(
+            config_path, overrides=config_overrides, interpolate=False
         )
-        allocator = config["training"]["gpu_allocator"]
-        if use_gpu >= 0 and allocator:
-            set_gpu_allocator(allocator)
-        nlp, config = util.load_model_from_config(config)
-    seed = config["training"]["seed"]
+    config = raw_config.iterpolate()
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    with show_validation_error(config_path):
+        nlp = util.load_model_from_config(raw_config)
+        C = util.resolve_training_config(nlp.config)
+    seed = C["training"]["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
@@ -75,7 +77,7 @@ def debug_model_cli(
             exits=1,
         )
     model = pipe.model
-    debug_model(config, nlp, model, print_settings=print_settings)
+    debug_model(C, nlp, model, print_settings=print_settings)
 
 
 def debug_model(
@@ -108,7 +110,7 @@ def debug_model(
                 _set_output_dim(nO=7, model=model)
                 nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
                 msg.info("Initialized the model with dummy data.")
-            except:
+            except Exception:
                 msg.fail(
                     "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
                     exits=1,
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index 5203c5dbb..9f73b17ae 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -88,10 +88,10 @@ def fill_config(
     msg = Printer(no_print=no_print)
     with show_validation_error(hint_fill=False):
         config = util.load_config(base_path)
-        nlp, _ = util.load_model_from_config(config, auto_fill=True, validate=False)
+        nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
     # Load a second time with validation to be extra sure that the produced
     # config result is a valid config
-    nlp, _ = util.load_model_from_config(nlp.config)
+    nlp = util.load_model_from_config(nlp.config)
     filled = nlp.config
     if pretraining:
         validate_config_for_pretrain(filled, msg)
@@ -169,7 +169,7 @@ def init_config(
         msg.text(f"- {label}: {value}")
     with show_validation_error(hint_fill=False):
         config = util.load_config_from_str(base_template)
-        nlp, _ = util.load_model_from_config(config, auto_fill=True)
+        nlp = util.load_model_from_config(config, auto_fill=True)
         config = nlp.config
         if pretraining:
             validate_config_for_pretrain(config, msg)
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 9e913396e..29e220b95 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -69,17 +69,18 @@ def pretrain_cli(
     msg.info(f"Loading config from: {config_path}")
 
     with show_validation_error(config_path):
-        config = util.load_config(
-            config_path, overrides=config_overrides, interpolate=True
+        raw_config = util.load_config(
+            config_path, overrides=config_overrides, interpolate=False
         )
+    config = raw_config.interpolate()
     if not config.get("pretraining"):
         # TODO: What's the solution here? How do we handle optional blocks?
         msg.fail("The [pretraining] block in your config is empty", exits=1)
     if not output_dir.exists():
         output_dir.mkdir()
         msg.good(f"Created output directory: {output_dir}")
-
-    config.to_disk(output_dir / "config.cfg")
+    # Save non-interpolated config
+    raw_config.to_disk(output_dir / "config.cfg")
     msg.good("Saved config file in the output directory")
 
     pretrain(
@@ -103,14 +104,13 @@ def pretrain(
     allocator = config["training"]["gpu_allocator"]
     if use_gpu >= 0 and allocator:
         set_gpu_allocator(allocator)
-
-    nlp, config = util.load_model_from_config(config)
-    P_cfg = config["pretraining"]
-    corpus = dot_to_object(config, P_cfg["corpus"])
+    nlp = util.load_model_from_config(config)
+    C = util.resolve_training_config(nlp.config)
+    P_cfg = C["pretraining"]
+    corpus = dot_to_object(C, P_cfg["corpus"])
     batcher = P_cfg["batcher"]
-    model = create_pretraining_model(nlp, config["pretraining"])
-    optimizer = config["pretraining"]["optimizer"]
-
+    model = create_pretraining_model(nlp, C["pretraining"])
+    optimizer = C["pretraining"]["optimizer"]
     # Load in pretrained weights to resume from
     if resume_path is not None:
         _resume_model(model, resume_path, epoch_resume)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 5fc4ff035..8aef11e02 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -75,12 +75,12 @@ def train(
         msg.info("Using CPU")
     msg.info(f"Loading config and nlp from: {config_path}")
     with show_validation_error(config_path):
-        config = util.load_config(
-            config_path, overrides=config_overrides, interpolate=True
-        )
-        # Keep a second un-interpolated config so we can preserve variables in
+        # Keep an un-interpolated config so we can preserve variables in
         # the final nlp object we train and serialize
-        raw_config = util.load_config(config_path, overrides=config_overrides)
+        raw_config = util.load_config(
+            config_path, overrides=config_overrides, interpolate=False
+        )
+    config = raw_config.interpolate()
     if config["training"]["seed"] is not None:
         fix_random_seed(config["training"]["seed"])
     allocator = config["training"]["gpu_allocator"]
@@ -89,15 +89,17 @@ def train(
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     with show_validation_error(config_path):
-        nlp, config = util.load_model_from_config(raw_config)
-    util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
-    if config["training"]["vectors"] is not None:
-        add_vectors(nlp, config["training"]["vectors"])
-    raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
-    T_cfg = config["training"]
+        nlp = util.load_model_from_config(raw_config)
+        # Resolve all training-relevant sections using the filled nlp config
+        C = util.resolve_training_config(nlp.config)
+    util.load_vocab_data_into_model(nlp, lookups=C["training"]["lookups"])
+    if C["training"]["vectors"] is not None:
+        add_vectors(nlp, C["training"]["vectors"])
+    raw_text, tag_map, morph_rules, weights_data = load_from_paths(C)
+    T_cfg = C["training"]
     optimizer = T_cfg["optimizer"]
-    train_corpus = dot_to_object(config, T_cfg["train_corpus"])
-    dev_corpus = dot_to_object(config, T_cfg["dev_corpus"])
+    train_corpus = dot_to_object(C, T_cfg["train_corpus"])
+    dev_corpus = dot_to_object(C, T_cfg["dev_corpus"])
     batcher = T_cfg["batcher"]
     train_logger = T_cfg["logger"]
     before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"])
@@ -124,7 +126,7 @@ def train(
 
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     if weights_data is not None:
-        tok2vec_component = config["pretraining"]["component"]
+        tok2vec_component = C["pretraining"]["component"]
         if tok2vec_component is None:
             msg.fail(
                 f"To use pretrained tok2vec weights, [pretraining.component] "
@@ -132,7 +134,7 @@ def train(
                 exits=1,
             )
         layer = nlp.get_pipe(tok2vec_component).model
-        tok2vec_layer = config["pretraining"]["layer"]
+        tok2vec_layer = C["pretraining"]["layer"]
         if tok2vec_layer:
             layer = layer.get_ref(tok2vec_layer)
         layer.from_bytes(weights_data)
diff --git a/spacy/language.py b/spacy/language.py
index a52391419..bf3911888 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -166,11 +166,10 @@ class Language:
         self._components = []
         self._disabled = set()
         self.max_length = max_length
-        self.resolved = {}
         # Create the default tokenizer from the default config
         if not create_tokenizer:
             tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
-            create_tokenizer = registry.make_from_config(tokenizer_cfg)["tokenizer"]
+            create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]
         self.tokenizer = create_tokenizer(self)
 
     def __init_subclass__(cls, **kwargs):
@@ -467,7 +466,7 @@ class Language:
             if "nlp" not in arg_names or "name" not in arg_names:
                 raise ValueError(Errors.E964.format(name=name))
             # Officially register the factory so we can later call
-            # registry.make_from_config and refer to it in the config as
+            # registry.resolve and refer to it in the config as
             # @factories = "spacy.Language.xyz". We use the class name here so
             # different classes can have different factories.
             registry.factories.register(internal_name, func=factory_func)
@@ -650,8 +649,9 @@ class Language:
         cfg = {factory_name: config}
         # We're calling the internal _fill here to avoid constructing the
         # registered functions twice
-        resolved, filled = registry.resolve(cfg, validate=validate)
-        filled = Config(filled[factory_name])
+        resolved = registry.resolve(cfg, validate=validate)
+        filled = registry.fill({"cfg": cfg[factory_name]}, validate=validate)["cfg"]
+        filled = Config(filled)
         filled["factory"] = factory_name
         filled.pop("@factories", None)
         # Remove the extra values we added because we don't want to keep passing
@@ -1518,15 +1518,14 @@ class Language:
         config = util.copy_config(config)
         orig_pipeline = config.pop("components", {})
         config["components"] = {}
-        resolved, filled = registry.resolve(
-            config, validate=validate, schema=ConfigSchema
-        )
+        filled = registry.fill(config, validate=validate, schema=ConfigSchema)
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
-        create_tokenizer = resolved["nlp"]["tokenizer"]
-        before_creation = resolved["nlp"]["before_creation"]
-        after_creation = resolved["nlp"]["after_creation"]
-        after_pipeline_creation = resolved["nlp"]["after_pipeline_creation"]
+        resolved_nlp = registry.resolve(filled["nlp"], validate=validate)
+        create_tokenizer = resolved_nlp["tokenizer"]
+        before_creation = resolved_nlp["before_creation"]
+        after_creation = resolved_nlp["after_creation"]
+        after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
         lang_cls = cls
         if before_creation is not None:
             lang_cls = before_creation(cls)
@@ -1587,7 +1586,6 @@ class Language:
         disabled_pipes = [*config["nlp"]["disabled"], *disable]
         nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
         nlp.config = filled if auto_fill else config
-        nlp.resolved = resolved
         if after_pipeline_creation is not None:
             nlp = after_pipeline_creation(nlp)
             if not isinstance(nlp, cls):
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 0c85dfe57..6db05bbd9 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -4,6 +4,7 @@ from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
 from pydantic import root_validator
+from thinc.config import Promise
 from collections import defaultdict
 from thinc.api import Optimizer
 
@@ -16,10 +17,12 @@ if TYPE_CHECKING:
     from .training import Example  # noqa: F401
 
 
+# fmt: off
 ItemT = TypeVar("ItemT")
-Batcher = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
-Reader = Callable[["Language", str], Iterable["Example"]]
-Logger = Callable[["Language"], Tuple[Callable[[Dict[str, Any]], None], Callable]]
+Batcher = Union[Callable[[Iterable[ItemT]], Iterable[List[ItemT]]], Promise]
+Reader = Union[Callable[["Language", str], Iterable["Example"]], Promise]
+Logger = Union[Callable[["Language"], Tuple[Callable[[Dict[str, Any]], None], Callable]], Promise]
+# fmt: on
 
 
 def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
@@ -292,6 +295,20 @@ class ConfigSchema(BaseModel):
         arbitrary_types_allowed = True
 
 
+class NlpSchema(BaseModel):
+    nlp: ConfigSchemaNlp
+
+
+class TrainingSchema(BaseModel):
+    training: ConfigSchemaTraining
+    pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
+    corpora: Dict[str, Reader]
+
+    class Config:
+        extra = "allow"
+        arbitrary_types_allowed = True
+
+
 # Project config Schema
 
 
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 615ab9e5b..86aa883bd 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -24,7 +24,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
         "update_with_oracle_cut_size": 100,
     }
     cfg = {"model": DEFAULT_NER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
     ner.begin_training(lambda: [_ner_example(ner)])
     ner(doc)
@@ -46,7 +46,7 @@ def test_ents_reset(en_vocab):
         "update_with_oracle_cut_size": 100,
     }
     cfg = {"model": DEFAULT_NER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
     ner.begin_training(lambda: [_ner_example(ner)])
     ner(doc)
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index 3d67e6ef6..cd376e0fc 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -23,7 +23,7 @@ def parser(vocab):
         "update_with_oracle_cut_size": 100,
     }
     cfg = {"model": DEFAULT_PARSER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     parser = DependencyParser(vocab, model, **config)
     return parser
 
@@ -82,7 +82,7 @@ def test_add_label_deserializes_correctly():
         "update_with_oracle_cut_size": 100,
     }
     cfg = {"model": DEFAULT_NER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     ner1 = EntityRecognizer(Vocab(), model, **config)
     ner1.add_label("C")
     ner1.add_label("B")
@@ -111,7 +111,7 @@ def test_add_label_get_label(pipe_cls, n_moves, model_config):
     splitting the move names.
     """
     labels = ["A", "B", "C"]
-    model = registry.make_from_config({"model": model_config}, validate=True)["model"]
+    model = registry.resolve({"model": model_config}, validate=True)["model"]
     config = {
         "learn_tokens": False,
         "min_action_freq": 30,
diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py
index 826fc1d87..84070db73 100644
--- a/spacy/tests/parser/test_arc_eager_oracle.py
+++ b/spacy/tests/parser/test_arc_eager_oracle.py
@@ -127,7 +127,7 @@ def test_get_oracle_actions():
         "update_with_oracle_cut_size": 100,
     }
     cfg = {"model": DEFAULT_PARSER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     parser = DependencyParser(doc.vocab, model, **config)
     parser.moves.add_action(0, "")
     parser.moves.add_action(1, "")
diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py
index 0747241d8..1bb5d4aa5 100644
--- a/spacy/tests/parser/test_neural_parser.py
+++ b/spacy/tests/parser/test_neural_parser.py
@@ -25,7 +25,7 @@ def arc_eager(vocab):
 @pytest.fixture
 def tok2vec():
     cfg = {"model": DEFAULT_TOK2VEC_MODEL}
-    tok2vec = registry.make_from_config(cfg, validate=True)["model"]
+    tok2vec = registry.resolve(cfg, validate=True)["model"]
     tok2vec.initialize()
     return tok2vec
 
@@ -38,14 +38,14 @@ def parser(vocab, arc_eager):
         "update_with_oracle_cut_size": 100,
     }
     cfg = {"model": DEFAULT_PARSER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     return Parser(vocab, model, moves=arc_eager, **config)
 
 
 @pytest.fixture
 def model(arc_eager, tok2vec, vocab):
     cfg = {"model": DEFAULT_PARSER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     model.attrs["resize_output"](model, arc_eager.n_moves)
     model.initialize()
     return model
@@ -72,7 +72,7 @@ def test_build_model(parser, vocab):
         "update_with_oracle_cut_size": 100,
     }
     cfg = {"model": DEFAULT_PARSER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     parser.model = Parser(vocab, model=model, moves=parser.moves, **config).model
     assert parser.model is not None
 
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index 1de05be1b..e8dfa68c7 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -28,7 +28,7 @@ def parser(vocab):
         "update_with_oracle_cut_size": 100,
     }
     cfg = {"model": DEFAULT_PARSER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     parser = DependencyParser(vocab, model, **config)
     parser.cfg["token_vector_width"] = 4
     parser.cfg["hidden_width"] = 32
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 985314217..558b9079c 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -139,7 +139,7 @@ TRAIN_DATA = [
 
 def test_tok2vec_listener():
     orig_config = Config().from_str(cfg_string)
-    nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
     assert nlp.pipe_names == ["tok2vec", "tagger"]
     tagger = nlp.get_pipe("tagger")
     tok2vec = nlp.get_pipe("tok2vec")
@@ -173,7 +173,7 @@ def test_tok2vec_listener():
 
 def test_tok2vec_listener_callback():
     orig_config = Config().from_str(cfg_string)
-    nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
     assert nlp.pipe_names == ["tok2vec", "tagger"]
     tagger = nlp.get_pipe("tagger")
     tok2vec = nlp.get_pipe("tok2vec")
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index a64dc53e4..56ef23dbf 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -195,7 +195,7 @@ def test_issue3345():
         "update_with_oracle_cut_size": 100,
     }
     cfg = {"model": DEFAULT_NER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     ner = EntityRecognizer(doc.vocab, model, **config)
     # Add the OUT action. I wouldn't have thought this would be necessary...
     ner.moves.add_action(5, "")
diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py
index a79be6638..304e654c3 100644
--- a/spacy/tests/regression/test_issue3501-4000.py
+++ b/spacy/tests/regression/test_issue3501-4000.py
@@ -264,9 +264,7 @@ def test_issue3830_no_subtok():
         "min_action_freq": 30,
         "update_with_oracle_cut_size": 100,
     }
-    model = registry.make_from_config({"model": DEFAULT_PARSER_MODEL}, validate=True)[
-        "model"
-    ]
+    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
     parser = DependencyParser(Vocab(), model, **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
@@ -281,9 +279,7 @@ def test_issue3830_with_subtok():
         "min_action_freq": 30,
         "update_with_oracle_cut_size": 100,
     }
-    model = registry.make_from_config({"model": DEFAULT_PARSER_MODEL}, validate=True)[
-        "model"
-    ]
+    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
     parser = DependencyParser(Vocab(), model, **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 1a5be4bec..eb5f15007 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -108,8 +108,8 @@ def my_parser():
 def test_create_nlp_from_config():
     config = Config().from_str(nlp_config_string)
     with pytest.raises(ConfigValidationError):
-        nlp, _ = load_model_from_config(config, auto_fill=False)
-    nlp, resolved = load_model_from_config(config, auto_fill=True)
+        load_model_from_config(config, auto_fill=False)
+    nlp = load_model_from_config(config, auto_fill=True)
     assert nlp.config["training"]["batcher"]["size"] == 666
     assert len(nlp.config["training"]) > 1
     assert nlp.pipe_names == ["tok2vec", "tagger"]
@@ -136,7 +136,7 @@ def test_create_nlp_from_config_multiple_instances():
         "tagger2": config["components"]["tagger"],
     }
     config["nlp"]["pipeline"] = list(config["components"].keys())
-    nlp, _ = load_model_from_config(config, auto_fill=True)
+    nlp = load_model_from_config(config, auto_fill=True)
     assert nlp.pipe_names == ["t2v", "tagger1", "tagger2"]
     assert nlp.get_pipe_meta("t2v").factory == "tok2vec"
     assert nlp.get_pipe_meta("tagger1").factory == "tagger"
@@ -150,7 +150,7 @@ def test_create_nlp_from_config_multiple_instances():
 def test_serialize_nlp():
     """ Create a custom nlp pipeline from config and ensure it serializes it correctly """
     nlp_config = Config().from_str(nlp_config_string)
-    nlp, _ = load_model_from_config(nlp_config, auto_fill=True)
+    nlp = load_model_from_config(nlp_config, auto_fill=True)
     nlp.get_pipe("tagger").add_label("A")
     nlp.begin_training()
     assert "tok2vec" in nlp.pipe_names
@@ -209,7 +209,7 @@ def test_config_nlp_roundtrip():
     nlp = English()
     nlp.add_pipe("entity_ruler")
     nlp.add_pipe("ner")
-    new_nlp, new_config = load_model_from_config(nlp.config, auto_fill=False)
+    new_nlp = load_model_from_config(nlp.config, auto_fill=False)
     assert new_nlp.config == nlp.config
     assert new_nlp.pipe_names == nlp.pipe_names
     assert new_nlp._pipe_configs == nlp._pipe_configs
@@ -280,12 +280,12 @@ def test_config_overrides():
     overrides_dot = {"nlp.lang": "de", "nlp.pipeline": ["tagger"]}
     # load_model from config with overrides passed directly to Config
     config = Config().from_str(nlp_config_string, overrides=overrides_dot)
-    nlp, _ = load_model_from_config(config, auto_fill=True)
+    nlp = load_model_from_config(config, auto_fill=True)
     assert isinstance(nlp, German)
     assert nlp.pipe_names == ["tagger"]
     # Serialized roundtrip with config passed in
     base_config = Config().from_str(nlp_config_string)
-    base_nlp, _ = load_model_from_config(base_config, auto_fill=True)
+    base_nlp = load_model_from_config(base_config, auto_fill=True)
     assert isinstance(base_nlp, English)
     assert base_nlp.pipe_names == ["tok2vec", "tagger"]
     with make_tempdir() as d:
@@ -328,7 +328,7 @@ def test_config_optional_sections():
     config = Config().from_str(nlp_config_string)
     config = DEFAULT_CONFIG.merge(config)
     assert "pretraining" not in config
-    filled = registry.fill_config(config, schema=ConfigSchema, validate=False)
+    filled = registry.fill(config, schema=ConfigSchema, validate=False)
     # Make sure that optional "pretraining" block doesn't default to None,
     # which would (rightly) cause error because it'd result in a top-level
     # key that's not a section (dict). Note that the following roundtrip is
@@ -341,7 +341,7 @@ def test_config_auto_fill_extra_fields():
     config = Config({"nlp": {"lang": "en"}, "training": {}})
     assert load_model_from_config(config, auto_fill=True)
     config = Config({"nlp": {"lang": "en"}, "training": {"extra": "hello"}})
-    nlp, _ = load_model_from_config(config, auto_fill=True, validate=False)
+    nlp = load_model_from_config(config, auto_fill=True, validate=False)
     assert "extra" not in nlp.config["training"]
     # Make sure the config generated is valid
     load_model_from_config(nlp.config)
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index d1c4553be..1c605fea8 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -23,7 +23,7 @@ def parser(en_vocab):
         "update_with_oracle_cut_size": 100,
     }
     cfg = {"model": DEFAULT_PARSER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     parser = DependencyParser(en_vocab, model, **config)
     parser.add_label("nsubj")
     return parser
@@ -37,7 +37,7 @@ def blank_parser(en_vocab):
         "update_with_oracle_cut_size": 100,
     }
     cfg = {"model": DEFAULT_PARSER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     parser = DependencyParser(en_vocab, model, **config)
     return parser
 
@@ -45,7 +45,7 @@ def blank_parser(en_vocab):
 @pytest.fixture
 def taggers(en_vocab):
     cfg = {"model": DEFAULT_TAGGER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     tagger1 = Tagger(en_vocab, model)
     tagger2 = Tagger(en_vocab, model)
     return tagger1, tagger2
@@ -59,7 +59,7 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
         "update_with_oracle_cut_size": 100,
     }
     cfg = {"model": DEFAULT_PARSER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     parser = Parser(en_vocab, model, **config)
     new_parser = Parser(en_vocab, model, **config)
     new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"]))
@@ -77,7 +77,7 @@ def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
         "update_with_oracle_cut_size": 100,
     }
     cfg = {"model": DEFAULT_PARSER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     parser = Parser(en_vocab, model, **config)
     with make_tempdir() as d:
         file_path = d / "parser"
@@ -111,7 +111,7 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
     tagger1 = tagger1.from_bytes(tagger1_b)
     assert tagger1.to_bytes() == tagger1_b
     cfg = {"model": DEFAULT_TAGGER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     new_tagger1 = Tagger(en_vocab, model).from_bytes(tagger1_b)
     new_tagger1_b = new_tagger1.to_bytes()
     assert len(new_tagger1_b) == len(tagger1_b)
@@ -126,7 +126,7 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
         tagger1.to_disk(file_path1)
         tagger2.to_disk(file_path2)
         cfg = {"model": DEFAULT_TAGGER_MODEL}
-        model = registry.make_from_config(cfg, validate=True)["model"]
+        model = registry.resolve(cfg, validate=True)["model"]
         tagger1_d = Tagger(en_vocab, model).from_disk(file_path1)
         tagger2_d = Tagger(en_vocab, model).from_disk(file_path2)
         assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
@@ -135,7 +135,7 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
 def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     cfg = {"model": DEFAULT_TEXTCAT_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     textcat = TextCategorizer(
         en_vocab,
         model,
@@ -149,7 +149,7 @@ def test_serialize_textcat_empty(en_vocab):
 @pytest.mark.parametrize("Parser", test_parsers)
 def test_serialize_pipe_exclude(en_vocab, Parser):
     cfg = {"model": DEFAULT_PARSER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     config = {
         "learn_tokens": False,
         "min_action_freq": 0,
@@ -176,7 +176,7 @@ def test_serialize_pipe_exclude(en_vocab, Parser):
 
 def test_serialize_sentencerecognizer(en_vocab):
     cfg = {"model": DEFAULT_SENTER_MODEL}
-    model = registry.make_from_config(cfg, validate=True)["model"]
+    model = registry.resolve(cfg, validate=True)["model"]
     sr = SentenceRecognizer(en_vocab, model)
     sr_b = sr.to_bytes()
     sr_d = SentenceRecognizer(en_vocab, model).from_bytes(sr_b)
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
index 1668991cd..0647b8556 100644
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@@ -82,10 +82,10 @@ def test_util_dot_section():
     no_output_layer = false
     """
     nlp_config = Config().from_str(cfg_string)
-    en_nlp, en_config = util.load_model_from_config(nlp_config, auto_fill=True)
+    en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
     default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
     default_config["nlp"]["lang"] = "nl"
-    nl_nlp, nl_config = util.load_model_from_config(default_config, auto_fill=True)
+    nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
     # Test that creation went OK
     assert isinstance(en_nlp, English)
     assert isinstance(nl_nlp, Dutch)
@@ -94,14 +94,15 @@ def test_util_dot_section():
     # not exclusive_classes
     assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
     # Test that default values got overwritten
-    assert en_config["nlp"]["pipeline"] == ["textcat"]
-    assert nl_config["nlp"]["pipeline"] == []  # default value []
+    assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
+    assert nl_nlp.config["nlp"]["pipeline"] == []  # default value []
     # Test proper functioning of 'dot_to_object'
     with pytest.raises(KeyError):
-        dot_to_object(en_config, "nlp.pipeline.tagger")
+        dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
-        dot_to_object(en_config, "nlp.unknownattribute")
-    assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
+        dot_to_object(en_nlp.config, "nlp.unknownattribute")
+    resolved = util.resolve_training_config(nl_nlp.config)
+    assert isinstance(dot_to_object(resolved, "training.optimizer"), Optimizer)
 
 
 def test_simple_frozen_list():
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index d20a032e8..c06c9d282 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -3,6 +3,7 @@ import pytest
 from thinc.api import Config
 from spacy import Language
 from spacy.util import load_model_from_config, registry, dot_to_object
+from spacy.util import resolve_training_config
 from spacy.training import Example
 
 
@@ -37,8 +38,8 @@ def test_readers():
         return {"train": reader, "dev": reader, "extra": reader, "something": reader}
 
     config = Config().from_str(config_string)
-    nlp, resolved = load_model_from_config(config, auto_fill=True)
-
+    nlp = load_model_from_config(config, auto_fill=True)
+    resolved = resolve_training_config(nlp.config)
     train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
     assert isinstance(train_corpus, Callable)
     optimizer = resolved["training"]["optimizer"]
@@ -87,8 +88,8 @@ def test_cat_readers(reader, additional_config):
     config = Config().from_str(nlp_config_string)
     config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
-    nlp, resolved = load_model_from_config(config, auto_fill=True)
-
+    nlp = load_model_from_config(config, auto_fill=True)
+    resolved = resolve_training_config(nlp.config)
     train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
     optimizer = resolved["training"]["optimizer"]
     # simulate a training loop
diff --git a/spacy/util.py b/spacy/util.py
index 378ec2823..dd2115705 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -86,7 +86,7 @@ class registry(thinc.registry):
     # spacy_factories entry point. This registry only exists so we can easily
     # load them via the entry points. The "true" factories are added via the
     # Language.factory decorator (in the spaCy code base and user code) and those
-    # are the factories used to initialize components via registry.make_from_config.
+    # are the factories used to initialize components via registry.resolve.
     _entry_point_factories = catalogue.create("spacy", "factories", entry_points=True)
     factories = catalogue.create("spacy", "internal_factories")
     # This is mostly used to get a list of all installed models in the current
@@ -351,9 +351,7 @@ def load_model_from_path(
         meta = get_model_meta(model_path)
     config_path = model_path / "config.cfg"
     config = load_config(config_path, overrides=dict_to_dot(config))
-    nlp, _ = load_model_from_config(
-        config, vocab=vocab, disable=disable, exclude=exclude
-    )
+    nlp = load_model_from_config(config, vocab=vocab, disable=disable, exclude=exclude)
     return nlp.from_disk(model_path, exclude=exclude)
 
 
@@ -365,7 +363,7 @@ def load_model_from_config(
     exclude: Iterable[str] = SimpleFrozenList(),
     auto_fill: bool = False,
     validate: bool = True,
-) -> Tuple["Language", Config]:
+) -> "Language":
     """Create an nlp object from a config. Expects the full config file including
     a section "nlp" containing the settings for the nlp object.
 
@@ -398,7 +396,31 @@ def load_model_from_config(
         auto_fill=auto_fill,
         validate=validate,
     )
-    return nlp, nlp.resolved
+    return nlp
+
+
+def resolve_training_config(
+    config: Config,
+    exclude: Iterable[str] = ("nlp", "components"),
+    validate: bool = True,
+) -> Dict[str, Any]:
+    """Resolve the config sections relevant for training and create all objects.
+    Mostly used in the CLI to separate training config (not resolved by default
+    because not runtime-relevant – an nlp object should load fine even if its
+    [training] block refers to functions that are not available etc.).
+
+    config (Config): The config to resolve.
+    exclude (Iterable[str]): The config blocks to exclude. Those blocks won't
+        be available in the final resolved config.
+    validate (bool): Whether to validate the config.
+    RETURNS (Dict[str, Any]): The resolved config.
+    """
+    config = config.copy()
+    excluded = {}
+    for key in exclude:
+        if key in config:
+            excluded.pop(key, None)
+    return registry.resolve(config, validate=validate)
 
 
 def load_model_from_init_py(

From d7ad65a9bbfd09395de933ec38cac2d258e1a94b Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 27 Sep 2020 22:31:57 +0200
Subject: [PATCH 190/516] Fix handling of error description [ci skip]

---
 spacy/cli/_util.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 506380b0b..68cb572ea 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -243,6 +243,8 @@ def show_validation_error(
         yield
     except ConfigValidationError as e:
         title = title if title is not None else e.title
+        if e.desc:
+            desc = f"{e.desc}" if not desc else f"{e.desc}\n\n{desc}"
         # Re-generate a new error object with overrides
         err = e.from_error(e, title="", desc=desc, show_config=show_config)
         msg.fail(title)

From 5c53a76021775b2ed4cb6904c132f6e7780c7dc4 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 27 Sep 2020 22:39:04 +0200
Subject: [PATCH 191/516] Improve CLI error handling [ci skip]

---
 spacy/cli/_util.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 68cb572ea..f4a31d68b 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -258,8 +258,10 @@ def show_validation_error(
             )
             print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
         sys.exit(1)
-    except InterpolationError as e:
-        msg.fail("Config validation error", e, exits=1)
+    except Exception as e:
+        msg.fail("Error while loading the config")
+        print(e)
+        sys.exit(1)
 
 
 def import_code(code_path: Optional[Union[Path, str]]) -> None:

From 47c6a461e5078b24d0313b933e45d636bbda8b88 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 27 Sep 2020 22:41:00 +0200
Subject: [PATCH 192/516] Revert except all in CLI error handling [ci skip]

---
 spacy/cli/_util.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index f4a31d68b..68cb572ea 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -258,10 +258,8 @@ def show_validation_error(
             )
             print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
         sys.exit(1)
-    except Exception as e:
-        msg.fail("Error while loading the config")
-        print(e)
-        sys.exit(1)
+    except InterpolationError as e:
+        msg.fail("Config validation error", e, exits=1)
 
 
 def import_code(code_path: Optional[Union[Path, str]]) -> None:

From 658fad428afdba327edb322cb40720f298b2262d Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 27 Sep 2020 22:50:36 +0200
Subject: [PATCH 193/516] Fix base schema integration

---
 spacy/language.py | 11 ++++++++---
 spacy/schemas.py  |  4 ----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index bf3911888..c1d2df026 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -27,7 +27,7 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema
+from .schemas import ConfigSchema, ConfigSchemaNlp
 from .git_info import GIT_VERSION
 from . import util
 from . import about
@@ -1518,10 +1518,15 @@ class Language:
         config = util.copy_config(config)
         orig_pipeline = config.pop("components", {})
         config["components"] = {}
-        filled = registry.fill(config, validate=validate, schema=ConfigSchema)
+        if auto_fill:
+            filled = registry.fill(config, validate=validate, schema=ConfigSchema)
+        else:
+            filled = config
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
-        resolved_nlp = registry.resolve(filled["nlp"], validate=validate)
+        resolved_nlp = registry.resolve(
+            filled["nlp"], validate=validate, schema=ConfigSchemaNlp
+        )
         create_tokenizer = resolved_nlp["tokenizer"]
         before_creation = resolved_nlp["before_creation"]
         after_creation = resolved_nlp["after_creation"]
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 6db05bbd9..7951b851b 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -295,10 +295,6 @@ class ConfigSchema(BaseModel):
         arbitrary_types_allowed = True
 
 
-class NlpSchema(BaseModel):
-    nlp: ConfigSchemaNlp
-
-
 class TrainingSchema(BaseModel):
     training: ConfigSchemaTraining
     pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}

From c0c842ae5b4775558e32ee4b4851a4be7c8d2dfb Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 27 Sep 2020 23:24:40 +0200
Subject: [PATCH 194/516] Update Thinc version

---
 pyproject.toml   | 2 +-
 requirements.txt | 2 +-
 setup.cfg        | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0b35f8519..6d3a29fe9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a40,<8.0.0a50",
+    "thinc>=8.0.0a41,<8.0.0a50",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index 770e74506..d696cd44b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a40,<8.0.0a50
+thinc>=8.0.0a41,<8.0.0a50
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 80e96122e..b55c0d376 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a40,<8.0.0a50
+    thinc>=8.0.0a41,<8.0.0a50
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a40,<8.0.0a50
+    thinc>=8.0.0a41,<8.0.0a50
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0

From 9016d23cc5a7ebae2bf1a0b90de6a69d31b1e416 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 27 Sep 2020 23:34:03 +0200
Subject: [PATCH 195/516] Fix exclude and add test

---
 spacy/tests/test_misc.py | 14 ++++++++++++++
 spacy/util.py            |  3 +--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index e6ef45f90..4e079d29e 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -7,6 +7,7 @@ from spacy import util
 from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
+from thinc.api import Optimizer
 
 
 @pytest.fixture
@@ -157,3 +158,16 @@ def test_dot_to_dict(dot_notation, expected):
     result = util.dot_to_dict(dot_notation)
     assert result == expected
     assert util.dict_to_dot(result) == dot_notation
+
+
+def test_resolve_training_config():
+    config = {
+        "nlp": {"lang": "en", "disabled": []},
+        "training": {"dropout": 0.1, "optimizer": {"@optimizers": "Adam.v1"}},
+        "corpora": {},
+    }
+    resolved = util.resolve_training_config(config)
+    assert resolved["training"]["dropout"] == 0.1
+    assert isinstance(resolved["training"]["optimizer"], Optimizer)
+    assert resolved["corpora"] == {}
+    assert "nlp" not in resolved
diff --git a/spacy/util.py b/spacy/util.py
index dd2115705..01232f5c5 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -416,10 +416,9 @@ def resolve_training_config(
     RETURNS (Dict[str, Any]): The resolved config.
     """
     config = config.copy()
-    excluded = {}
     for key in exclude:
         if key in config:
-            excluded.pop(key, None)
+            config.pop(key)
     return registry.resolve(config, validate=validate)
 
 

From b5556093e251e4cfd31efda5f828fff98ba7f438 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 27 Sep 2020 23:59:44 +0200
Subject: [PATCH 196/516] Start updating train script

---
 spacy/cli/train.py | 55 +++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index bb1bba4d5..ab71dac26 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -16,6 +16,7 @@ from ._util import import_code, get_sourced_components
 from ..language import Language
 from .. import util
 from ..training.example import Example
+from ..training.initialize import must_initialize, init_pipeline
 from ..errors import Errors
 from ..util import dot_to_object
 
@@ -31,8 +32,6 @@ def train_cli(
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
-    resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
-    dave_path: Optional[Path] = Opt(None, "--dave", "-D", help="etc etc"),
     # fmt: on
 ):
     """
@@ -53,38 +52,37 @@ def train_cli(
     verify_cli_args(config_path, output_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
-    if prepared is None:
-        prepare(config_path, output_path / "prepared", config_overrides=overrides)
-    train(
-        config_path,
-        output_path=output_path,
-        dave_path=dave_path,
-        config_overrides=overrides,
-        use_gpu=use_gpu,
-        resume_training=resume,
-    )
-
-
-def train(
-    output_path: Path,
-    config_overrides: Dict[str, Any] = {},
-    use_gpu: int = -1,
-    resume_training: bool = False,
-) -> None:
     if use_gpu >= 0:
         msg.info(f"Using GPU: {use_gpu}")
         require_gpu(use_gpu)
     else:
         msg.info("Using CPU")
-    msg.info(f"Loading config and nlp from: {config_path}")
-    # TODO: The details of this will change
-    dave_path = output_path / "dave"
-    config_path = dave_path / "config.cfg"
-    with show_validation_error(config_path):
-        config = fill_config_etc_etc(config_path)
-        nlp = make_and_load_nlp_etc_etc(config, dave_path)
-        optimizer, train_corpus, dev_corpus, score_weights, T_cfg = resolve_more_things_etc_etc(config)
+    config = util.load_config(
+        config_path, overrides=config_overrides, interpolate=True
+    )
+    if output_path is None:
+        nlp = init_pipeline(config)
+    else:
+        init_path = output_path / "model-initial" 
+        if must_reinitialize(config, init_path):
+            nlp = init_pipeline(config)
+            nlp.to_disk(init_path)
+        else:
+            nlp = spacy.load(output_path / "model-initial")
+    msg.info("Start training")
+    train(nlp, config, output_path)
 
+
+def train(nlp: Language, output_path: Optional[Path]=None) -> None:
+    # Create iterator, which yields out info after each optimization step.
+    config = nlp.config
+    T_cfg = config["training"]
+    score_weights = T_cfg["score_weights"]
+    optimizer = T_cfg["optimizer"]
+    train_corpus = dot_to_object(config, T_cfg["train_corpus"])
+    dev_corpus = dot_to_object(config, T_cfg["dev_corpus"])
+    batcher = T_cfg["batcher"]
+ 
     training_step_iterator = train_while_improving(
         nlp,
         optimizer,
@@ -142,6 +140,7 @@ def train(
             msg.good(f"Saved pipeline to output directory {final_model_path}")
 
 
+
 def add_vectors(nlp: Language, vectors: str) -> None:
     title = f"Config validation error for vectors {vectors}"
     desc = (

From 13b1605ee6fddc527f703ed86715ef4f4cb24a50 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 28 Sep 2020 01:08:49 +0200
Subject: [PATCH 197/516] Add init script

---
 spacy/training/initialize.py | 378 +++++++++++++++++++++++++++++++++++
 1 file changed, 378 insertions(+)
 create mode 100644 spacy/training/initialize.py

diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
new file mode 100644
index 000000000..07bbced8d
--- /dev/null
+++ b/spacy/training/initialize.py
@@ -0,0 +1,378 @@
+from pathlib import Path
+from typing import Dict
+from ._util import app, init_cli, Arg, Opt
+from ..vectors import Vectors
+from ..errors import Errors, Warnings
+from ..language import Language
+from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
+
+try:
+    import ftfy
+except ImportError:
+    ftfy = None
+
+
+def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool:
+    config = util.load_config(config_path, overrides=overrides)
+    if not init_path.exists():
+        return True
+    elif not (init_path / "config.cfg").exists():
+        return True
+    else:
+        init_cfg = util.load_config(init_path / "config.cfg", interpolate=True)
+        if config.to_str() != init_cfg.to_str():
+            return True
+        else:
+            return False
+
+
+def init_pipeline(config: Config, use_gpu: int=-1):
+    raw_config = config
+    config = raw_config.interpolate()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    # Use original config here before it's resolved to functions
+    sourced_components = get_sourced_components(config)
+    with show_validation_error(config_path):
+        nlp = util.load_model_from_config(raw_config)
+        # Resolve all training-relevant sections using the filled nlp config
+        T = registry.resolve(
+            config["training"],
+            schema=TrainingSchema,
+            validate=validate,
+        )
+        # TODO: It might not be 'corpora' 
+        corpora = registry.resolve(config["corpora"], validate=True)
+        raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
+    util.load_vocab_data_into_model(nlp, lookups=T["lookups"])
+    if T["vectors"] is not None:
+        add_vectors(nlp, T["vectors"])
+    score_weights = T["score_weights"]
+    optimizer = T["optimizer"]
+    train_corpus = dot_to_object({"corpora": corpora}, T["train_corpus"])
+    dev_corpus = dot_to_object({"corpora": corpora}, T["dev_corpus"])
+    batcher = T["batcher"]
+    train_logger = T["logger"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Sourced components that require resume_training
+    resume_components = [p for p in sourced_components if p not in frozen_components]
+    msg.info(f"Pipeline: {nlp.pipe_names}")
+    if resume_components:
+        with nlp.select_pipes(enable=resume_components):
+            msg.info(f"Resuming training for: {resume_components}")
+            nlp.resume_training(sgd=optimizer)
+    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
+        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    # Verify the config after calling 'begin_training' to ensure labels
+    # are properly initialized
+    verify_config(nlp)
+
+    if tag_map:
+        # Replace tag map with provided mapping
+        nlp.vocab.morphology.load_tag_map(tag_map)
+    if morph_rules:
+        # Load morph rules
+        nlp.vocab.morphology.load_morph_exceptions(morph_rules)
+
+    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
+    if weights_data is not None:
+        tok2vec_component = C["pretraining"]["component"]
+        if tok2vec_component is None:
+            msg.fail(
+                f"To use pretrained tok2vec weights, [pretraining.component] "
+                f"needs to specify the component that should load them.",
+                exits=1,
+            )
+        layer = nlp.get_pipe(tok2vec_component).model
+        tok2vec_layer = C["pretraining"]["layer"]
+        if tok2vec_layer:
+            layer = layer.get_ref(tok2vec_layer)
+        layer.from_bytes(weights_data)
+        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
+    return nlp
+ 
+
+def init_vocab(
+    lang: str,
+    output_dir: Path,
+    freqs_loc: Optional[Path] = None,
+    clusters_loc: Optional[Path] = None,
+    jsonl_loc: Optional[Path] = None,
+    vectors_loc: Optional[Path] = None,
+    prune_vectors: int = -1,
+    truncate_vectors: int = 0,
+    vectors_name: Optional[str] = None,
+    model_name: Optional[str] = None,
+    base_model: Optional[str] = None,
+    silent: bool = True,
+) -> Language:
+    msg = Printer(no_print=silent, pretty=not silent)
+    if jsonl_loc is not None:
+        if freqs_loc is not None or clusters_loc is not None:
+            settings = ["-j"]
+            if freqs_loc:
+                settings.append("-f")
+            if clusters_loc:
+                settings.append("-c")
+            msg.warn(
+                "Incompatible arguments",
+                "The -f and -c arguments are deprecated, and not compatible "
+                "with the -j argument, which should specify the same "
+                "information. Either merge the frequencies and clusters data "
+                "into the JSONL-formatted file (recommended), or use only the "
+                "-f and -c files, without the other lexical attributes.",
+            )
+        jsonl_loc = ensure_path(jsonl_loc)
+        lex_attrs = srsly.read_jsonl(jsonl_loc)
+    else:
+        clusters_loc = ensure_path(clusters_loc)
+        freqs_loc = ensure_path(freqs_loc)
+        if freqs_loc is not None and not freqs_loc.exists():
+            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
+        lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
+
+    with msg.loading("Creating blank pipeline..."):
+        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
+
+    msg.good("Successfully created blank pipeline")
+    if vectors_loc is not None:
+        add_vectors(
+            msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
+        )
+    vec_added = len(nlp.vocab.vectors)
+    lex_added = len(nlp.vocab)
+    msg.good(
+        "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
+    )
+    if not output_dir.exists():
+        output_dir.mkdir()
+    nlp.to_disk(output_dir)
+    return nlp
+
+
+def open_file(loc: Union[str, Path]) -> IO:
+    """Handle .gz, .tar.gz or unzipped files"""
+    loc = ensure_path(loc)
+    if tarfile.is_tarfile(str(loc)):
+        return tarfile.open(str(loc), "r:gz")
+    elif loc.parts[-1].endswith("gz"):
+        return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
+    elif loc.parts[-1].endswith("zip"):
+        zip_file = zipfile.ZipFile(str(loc))
+        names = zip_file.namelist()
+        file_ = zip_file.open(names[0])
+        return (line.decode("utf8") for line in file_)
+    else:
+        return loc.open("r", encoding="utf8")
+
+
+def read_attrs_from_deprecated(
+    msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
+) -> List[Dict[str, Any]]:
+    if freqs_loc is not None:
+        with msg.loading("Counting frequencies..."):
+            probs, _ = read_freqs(freqs_loc)
+        msg.good("Counted frequencies")
+    else:
+        probs, _ = ({}, DEFAULT_OOV_PROB)  # noqa: F841
+    if clusters_loc:
+        with msg.loading("Reading clusters..."):
+            clusters = read_clusters(clusters_loc)
+        msg.good("Read clusters")
+    else:
+        clusters = {}
+    lex_attrs = []
+    sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
+    if len(sorted_probs):
+        for i, (word, prob) in tqdm(enumerate(sorted_probs)):
+            attrs = {"orth": word, "id": i, "prob": prob}
+            # Decode as a little-endian string, so that we can do & 15 to get
+            # the first 4 bits. See _parse_features.pyx
+            if word in clusters:
+                attrs["cluster"] = int(clusters[word][::-1], 2)
+            else:
+                attrs["cluster"] = 0
+            lex_attrs.append(attrs)
+    return lex_attrs
+
+
+def create_model(
+    lang: str,
+    lex_attrs: List[Dict[str, Any]],
+    name: Optional[str] = None,
+    base_model: Optional[Union[str, Path]] = None,
+) -> Language:
+    if base_model:
+        nlp = load_model(base_model)
+        # keep the tokenizer but remove any existing pipeline components due to
+        # potentially conflicting vectors
+        for pipe in nlp.pipe_names:
+            nlp.remove_pipe(pipe)
+    else:
+        lang_class = get_lang_class(lang)
+        nlp = lang_class()
+    for lexeme in nlp.vocab:
+        lexeme.rank = OOV_RANK
+    for attrs in lex_attrs:
+        if "settings" in attrs:
+            continue
+        lexeme = nlp.vocab[attrs["orth"]]
+        lexeme.set_attrs(**attrs)
+    if len(nlp.vocab):
+        oov_prob = min(lex.prob for lex in nlp.vocab) - 1
+    else:
+        oov_prob = DEFAULT_OOV_PROB
+    nlp.vocab.cfg.update({"oov_prob": oov_prob})
+    if name:
+        nlp.meta["name"] = name
+    return nlp
+
+
+def add_vectors(
+    msg: Printer,
+    nlp: Language,
+    vectors_loc: Optional[Path],
+    truncate_vectors: int,
+    prune_vectors: int,
+    name: Optional[str] = None,
+) -> None:
+    vectors_loc = ensure_path(vectors_loc)
+    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
+        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
+        for lex in nlp.vocab:
+            if lex.rank and lex.rank != OOV_RANK:
+                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
+    else:
+        if vectors_loc:
+            with msg.loading(f"Reading vectors from {vectors_loc}"):
+                vectors_data, vector_keys = read_vectors(
+                    msg, vectors_loc, truncate_vectors
+                )
+            msg.good(f"Loaded vectors from {vectors_loc}")
+        else:
+            vectors_data, vector_keys = (None, None)
+        if vector_keys is not None:
+            for word in vector_keys:
+                if word not in nlp.vocab:
+                    nlp.vocab[word]
+        if vectors_data is not None:
+            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
+    if name is None:
+        # TODO: Is this correct? Does this matter?
+        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
+    else:
+        nlp.vocab.vectors.name = name
+    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
+    if prune_vectors >= 1:
+        nlp.vocab.prune_vectors(prune_vectors)
+
+
+def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
+    f = open_file(vectors_loc)
+    f = ensure_shape(f)
+    shape = tuple(int(size) for size in next(f).split())
+    if truncate_vectors >= 1:
+        shape = (truncate_vectors, shape[1])
+    vectors_data = numpy.zeros(shape=shape, dtype="f")
+    vectors_keys = []
+    for i, line in enumerate(tqdm(f)):
+        line = line.rstrip()
+        pieces = line.rsplit(" ", vectors_data.shape[1])
+        word = pieces.pop(0)
+        if len(pieces) != vectors_data.shape[1]:
+            msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
+        vectors_data[i] = numpy.asarray(pieces, dtype="f")
+        vectors_keys.append(word)
+        if i == truncate_vectors - 1:
+            break
+    return vectors_data, vectors_keys
+
+
+def ensure_shape(lines):
+    """Ensure that the first line of the data is the vectors shape.
+
+    If it's not, we read in the data and output the shape as the first result,
+    so that the reader doesn't have to deal with the problem.
+    """
+    first_line = next(lines)
+    try:
+        shape = tuple(int(size) for size in first_line.split())
+    except ValueError:
+        shape = None
+    if shape is not None:
+        # All good, give the data
+        yield first_line
+        yield from lines
+    else:
+        # Figure out the shape, make it the first value, and then give the
+        # rest of the data.
+        width = len(first_line.split()) - 1
+        captured = [first_line] + list(lines)
+        length = len(captured)
+        yield f"{length} {width}"
+        yield from captured
+
+
+def read_freqs(
+    freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
+):
+    counts = PreshCounter()
+    total = 0
+    with freqs_loc.open() as f:
+        for i, line in enumerate(f):
+            freq, doc_freq, key = line.rstrip().split("\t", 2)
+            freq = int(freq)
+            counts.inc(i + 1, freq)
+            total += freq
+    counts.smooth()
+    log_total = math.log(total)
+    probs = {}
+    with freqs_loc.open() as f:
+        for line in tqdm(f):
+            freq, doc_freq, key = line.rstrip().split("\t", 2)
+            doc_freq = int(doc_freq)
+            freq = int(freq)
+            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
+                try:
+                    word = literal_eval(key)
+                except SyntaxError:
+                    # Take odd strings literally.
+                    word = literal_eval(f"'{key}'")
+                smooth_count = counts.smoother(int(freq))
+                probs[word] = math.log(smooth_count) - log_total
+    oov_prob = math.log(counts.smoother(0)) - log_total
+    return probs, oov_prob
+
+
+def read_clusters(clusters_loc: Path) -> dict:
+    clusters = {}
+    if ftfy is None:
+        warnings.warn(Warnings.W004)
+    with clusters_loc.open() as f:
+        for line in tqdm(f):
+            try:
+                cluster, word, freq = line.split()
+                if ftfy is not None:
+                    word = ftfy.fix_text(word)
+            except ValueError:
+                continue
+            # If the clusterer has only seen the word a few times, its
+            # cluster is unreliable.
+            if int(freq) >= 3:
+                clusters[word] = cluster
+            else:
+                clusters[word] = "0"
+    # Expand clusters with re-casing
+    for word, cluster in list(clusters.items()):
+        if word.lower() not in clusters:
+            clusters[word.lower()] = cluster
+        if word.title() not in clusters:
+            clusters[word.title()] = cluster
+        if word.upper() not in clusters:
+            clusters[word.upper()] = cluster
+    return clusters

From a976da168c74227281bbdc7b2aa4ab93a0f2afba Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 28 Sep 2020 03:03:27 +0200
Subject: [PATCH 198/516] Support data augmentation in Corpus (#6155)

* Support data augmentation in Corpus

* Note initial docs for data augmentation

* Add augmenter to quickstart

* Fix flake8

* Format

* Fix test

* Update spacy/tests/training/test_training.py

* Improve data augmentation arguments

* Update templates

* Move randomization out into caller

* Refactor

* Update spacy/training/augment.py

* Update spacy/tests/training/test_training.py

* Fix augment

* Fix test
---
 spacy/cli/templates/quickstart_training.jinja |  1 +
 spacy/default_config.cfg                      |  5 ++
 spacy/tests/training/test_training.py         |  7 +-
 spacy/training/__init__.py                    |  1 +
 spacy/training/augment.py                     | 64 ++++++++++++-------
 spacy/training/corpus.py                      | 24 ++++++-
 spacy/util.py                                 |  1 +
 website/docs/api/corpus.md                    |  1 +
 website/docs/usage/training.md                | 11 ++++
 9 files changed, 86 insertions(+), 29 deletions(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 9a8b9d1d7..56faeebfa 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -270,6 +270,7 @@ factory = "{{ pipe }}"
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
 max_length = {{ 500 if hardware == "gpu" else 2000 }}
+augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
 
 [corpora.dev]
 @readers = "spacy.Corpus.v1"
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 6f8c0aa00..63a0742e3 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -35,6 +35,11 @@ gold_preproc = false
 max_length = 0
 # Limitation on number of training examples
 limit = 0
+# Apply some simple data augmentation, where we replace tokens with variations.
+# This is especially useful for punctuation and case replacement, to help
+# generalize beyond corpora that don't have smart-quotes, or only have smart
+# quotes, etc.
+augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
 
 [corpora.dev]
 @readers = "spacy.Corpus.v1"
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index a04e6aadd..5311fae1e 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -4,7 +4,7 @@ from spacy.training import biluo_tags_to_spans, iob_to_biluo
 from spacy.training import Corpus, docs_to_json
 from spacy.training.example import Example
 from spacy.training.converters import json_to_docs
-from spacy.training.augment import make_orth_variants_example
+from spacy.training.augment import create_orth_variants_augmenter
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
 from spacy.util import get_words_and_spaces, minibatch
@@ -496,9 +496,8 @@ def test_make_orth_variants(doc):
         output_file = tmpdir / "roundtrip.spacy"
         DocBin(docs=[doc]).to_disk(output_file)
         # due to randomness, test only that this runs with no errors for now
-        reader = Corpus(output_file)
-        train_example = next(reader(nlp))
-    make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
+        reader = Corpus(output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5))
+        train_examples = list(reader(nlp))
 
 
 @pytest.mark.skip("Outdated")
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 9172dde25..f71a5f521 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,6 +1,7 @@
 from .corpus import Corpus  # noqa: F401
 from .example import Example, validate_examples  # noqa: F401
 from .align import Alignment  # noqa: F401
+from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
 from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401
 from .iob_utils import biluo_tags_to_spans, tags_to_entities  # noqa: F401
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 4a01c8589..4d487ce93 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -1,30 +1,50 @@
+from typing import Callable
 import random
 import itertools
+import copy
+from functools import partial
+from ..util import registry
 
 
-def make_orth_variants_example(nlp, example, orth_variant_level=0.0):  # TODO: naming
-    raw_text = example.text
-    orig_dict = example.to_dict()
-    variant_text, variant_token_annot = make_orth_variants(
-        nlp, raw_text, orig_dict["token_annotation"], orth_variant_level
-    )
-    doc = nlp.make_doc(variant_text)
-    orig_dict["token_annotation"] = variant_token_annot
-    return example.from_dict(doc, orig_dict)
+@registry.augmenters("spacy.dont_augment.v1")
+def create_null_augmenter():
+    return dont_augment
 
 
-def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
-    if random.random() >= orth_variant_level:
-        return raw_text, orig_token_dict
-    if not orig_token_dict:
-        return raw_text, orig_token_dict
-    raw = raw_text
-    token_dict = orig_token_dict
-    lower = False
-    if random.random() >= 0.5:
-        lower = True
-        if raw is not None:
-            raw = raw.lower()
+@registry.augmenters("spacy.orth_variants.v1")
+def create_orth_variants_augmenter(level: float, lower: float) -> Callable:
+    """Create a data augmentation callback that uses orth-variant replacement.
+    The callback can be added to a corpus or other data iterator during training.
+    """
+    return partial(orth_variants_augmenter, level=level, lower=lower)
+
+
+def dont_augment(nlp, example):
+    yield example
+
+
+def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.0):
+    if random.random() >= level:
+        yield example
+    else:
+        raw_text = example.text
+        orig_dict = example.to_dict()
+        if not orig_dict["token_annotation"]:
+            yield example
+        else:
+            variant_text, variant_token_annot = make_orth_variants(
+                nlp,
+                raw_text,
+                orig_dict["token_annotation"],
+                lower=raw_text is not None and random.random() < lower
+            )
+            doc = nlp.make_doc(variant_text)
+            orig_dict["token_annotation"] = variant_token_annot
+            yield example.from_dict(doc, orig_dict)
+
+
+def make_orth_variants(nlp, raw, token_dict, *, lower: bool=False):
+    orig_token_dict = copy.deepcopy(token_dict)
     orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
@@ -103,7 +123,7 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
             # something went wrong, abort
             # (add a warning message?)
             if not match_found:
-                return raw_text, orig_token_dict
+                return raw, orig_token_dict
             # add following whitespace
             while raw_idx < len(raw) and raw[raw_idx].isspace():
                 variant_raw += raw[raw_idx]
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 12bda486e..90eb62474 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -1,9 +1,11 @@
 import warnings
 from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
+from typing import Optional
 from pathlib import Path
 import srsly
 
 from .. import util
+from .augment import dont_augment
 from .example import Example
 from ..errors import Warnings
 from ..tokens import DocBin, Doc
@@ -18,9 +20,19 @@ FILE_TYPE = ".spacy"
 
 @util.registry.readers("spacy.Corpus.v1")
 def create_docbin_reader(
-    path: Path, gold_preproc: bool, max_length: int = 0, limit: int = 0
+    path: Path,
+    gold_preproc: bool,
+    max_length: int = 0,
+    limit: int = 0,
+    augmenter: Optional[Callable] = None,
 ) -> Callable[["Language"], Iterable[Example]]:
-    return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit)
+    return Corpus(
+        path,
+        gold_preproc=gold_preproc,
+        max_length=max_length,
+        limit=limit,
+        augmenter=augmenter,
+    )
 
 
 @util.registry.readers("spacy.JsonlReader.v1")
@@ -70,6 +82,8 @@ class Corpus:
         0, which indicates no limit.
     limit (int): Limit corpus to a subset of examples, e.g. for debugging.
         Defaults to 0, which indicates no limit.
+    augmenter (Callable[[Language, Example], Iterable[Example]]): Optional data
+        augmentation function, to extrapolate additional examples from your annotations.
 
     DOCS: https://nightly.spacy.io/api/corpus
     """
@@ -81,11 +95,13 @@ class Corpus:
         limit: int = 0,
         gold_preproc: bool = False,
         max_length: int = 0,
+        augmenter: Optional[Callable] = None,
     ) -> None:
         self.path = util.ensure_path(path)
         self.gold_preproc = gold_preproc
         self.max_length = max_length
         self.limit = limit
+        self.augmenter = augmenter if augmenter is not None else dont_augment
 
     def __call__(self, nlp: "Language") -> Iterator[Example]:
         """Yield examples from the data.
@@ -100,7 +116,9 @@ class Corpus:
             examples = self.make_examples_gold_preproc(nlp, ref_docs)
         else:
             examples = self.make_examples(nlp, ref_docs)
-        yield from examples
+        for real_eg in examples:
+            for augmented_eg in self.augmenter(nlp, real_eg):
+                yield augmented_eg
 
     def _make_example(
         self, nlp: "Language", reference: Doc, gold_preproc: bool
diff --git a/spacy/util.py b/spacy/util.py
index 01232f5c5..1cc7abf57 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -81,6 +81,7 @@ class registry(thinc.registry):
     callbacks = catalogue.create("spacy", "callbacks")
     batchers = catalogue.create("spacy", "batchers", entry_points=True)
     readers = catalogue.create("spacy", "readers", entry_points=True)
+    augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
     loggers = catalogue.create("spacy", "loggers", entry_points=True)
     # These are factories registered via third-party packages and the
     # spacy_factories entry point. This registry only exists so we can easily
diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md
index 2b308d618..e7d6773e6 100644
--- a/website/docs/api/corpus.md
+++ b/website/docs/api/corpus.md
@@ -74,6 +74,7 @@ train/test skew.
 |  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~                     |
 | `max_length`    | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
 | `limit`         | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                                     |
+| `augmenter`     | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~                                                           |
 
 ## Corpus.\_\_call\_\_ {#call tag="method"}
 
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 54be6b367..eb02b135a 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -6,6 +6,7 @@ menu:
   - ['Introduction', 'basics']
   - ['Quickstart', 'quickstart']
   - ['Config System', 'config']
+  <!-- - ['Data Utilities', 'data'] -->
   - ['Custom Functions', 'custom-functions']
   - ['Parallel Training', 'parallel-training']
   - ['Internal API', 'api']
@@ -505,6 +506,16 @@ still look good.
 
 </Accordion>
 
+<!--
+## Data Utilities {#data-utilities}
+
+* spacy convert
+* The [corpora] block
+* Custom corpus class
+* Minibatching
+* Data augmentation
+-->
+
 ## Custom Functions {#custom-functions}
 
 Registered functions in the training config files can refer to built-in

From a023cf3ecc6e0b433250b56101c40e67eb58f735 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 28 Sep 2020 03:06:12 +0200
Subject: [PATCH 199/516] Add (untested) resolve_dot_names util

---
 spacy/util.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/spacy/util.py b/spacy/util.py
index 01232f5c5..fb3381f55 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -422,6 +422,28 @@ def resolve_training_config(
     return registry.resolve(config, validate=validate)
 
 
+def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> List[Optional[Callable]]:
+    """Resolve one or more "dot notation" names, e.g. corpora.train. 
+    The paths could point anywhere into the config, so we don't know which
+    top-level section we'll be looking within.
+    
+    We resolve the whole top-level section, although we could resolve less --
+    we could find the lowest part of the tree.
+    """
+    resolved = {}
+    output = []
+    for name in dot_names:
+        if name is None:
+            output.append(name)
+        else:
+            section = name.split(".")[0]
+            # We want to avoid resolving the same thing twice.
+            if section not in resolved:
+                resolved[section] = registry.resolve(config[section], schema=None)
+            output.append(dot_to_object(resolved, name))
+    return output
+
+
 def load_model_from_init_py(
     init_file: Union[Path, str],
     *,

From 3a0a3b8db684eb4cc67551814e7f8f8be1675362 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 28 Sep 2020 03:06:33 +0200
Subject: [PATCH 200/516] Dont hard-code for 'corpora' name

---
 spacy/cli/train.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index e27a499a7..e7b36a38f 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -77,12 +77,10 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None:
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
+    train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
     optimizer T["optimizer"]
     score_weights = T["score_weights"]
-    # TODO: This might not be called corpora
-    corpora = registry.resolve(config["corpora"], schema=ConfigSchemaCorpora)
-    train_corpus = dot_to_object({"corpora": corpora}, T["train_corpus"])
-    dev_corpus = dot_to_object({"corpora": corpora}, T["dev_corpus"])
     batcher = T["batcher"]
     train_logger = T["logger"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
@@ -101,7 +99,7 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None:
         patience=T["patience"],
         max_steps=T["max_steps"],
         eval_frequency=T["eval_frequency"],
-        raw_text=None,
+        raw_text=raw_text,
         exclude=frozen_components,
     )
     msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")

From ed2aff2db346d7be9d94e73e0e2e2921cf966ccf Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 28 Sep 2020 03:12:31 +0200
Subject: [PATCH 201/516] Remove unused train code

---
 spacy/cli/train.py | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index e7b36a38f..468de583b 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -18,7 +18,7 @@ from .. import util
 from ..training.example import Example
 from ..training.initialize import must_initialize, init_pipeline
 from ..errors import Errors
-from ..util import dot_to_object
+from ..util import resolve_dot_names
 
 
 @app.command(
@@ -363,27 +363,6 @@ def update_meta(
         nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 
 
-def load_from_paths(
-    config: Config,
-) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
-    # TODO: separate checks from loading
-    raw_text = util.ensure_path(config["training"]["raw_text"])
-    if raw_text is not None:
-        if not raw_text.exists():
-            msg.fail("Can't find raw text", raw_text, exits=1)
-        raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
-    tag_map = {}
-    morph_rules = {}
-    weights_data = None
-    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
-    if init_tok2vec is not None:
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    return raw_text, tag_map, morph_rules, weights_data
-
-
 def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
     # Make sure all files and paths exists if they are needed
     if not config_path or not config_path.exists():

From b886f53c31204b3c71c5a5b42435e7de85ee7fbc Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 28 Sep 2020 03:42:47 +0200
Subject: [PATCH 202/516] init-pipeline runs (maybe doesnt work)

---
 spacy/cli/__init__.py   |  3 ++-
 spacy/cli/init_model.py | 13 +++++++------
 spacy/cli/train.py      |  3 +--
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 92cb76971..5569e630d 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -15,7 +15,8 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
-from .init_model import init_model  # noqa: F401
+#from .init_model import init_model  # noqa: F401
+from .init_pipeline import init_pipeline  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 6decb6172..4194f1bd0 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -12,16 +12,17 @@ import srsly
 import warnings
 from wasabi import msg, Printer
 import typer
+from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
 
 DEFAULT_OOV_PROB = -20
 
 
-@init_cli.command("vocab")
-@app.command(
-    "init-model",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-    hidden=True,  # hide this from main CLI help but still allow it to work with warning
-)
+#@init_cli.command("vocab")
+#@app.command(
+#    "init-model",
+#    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+#    hidden=True,  # hide this from main CLI help but still allow it to work with warning
+#)
 def init_model_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 468de583b..8a360ad44 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -16,7 +16,6 @@ from ._util import import_code, get_sourced_components
 from ..language import Language
 from .. import util
 from ..training.example import Example
-from ..training.initialize import must_initialize, init_pipeline
 from ..errors import Errors
 from ..util import resolve_dot_names
 
@@ -79,7 +78,7 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None:
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
     train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
-    optimizer T["optimizer"]
+    optimizer = T["optimizer"]
     score_weights = T["score_weights"]
     batcher = T["batcher"]
     train_logger = T["logger"]

From 65448b2e34ab55291a52caaa950e9c427f85902c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 28 Sep 2020 03:42:58 +0200
Subject: [PATCH 203/516] Remove schema=None until Optional

---
 spacy/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/util.py b/spacy/util.py
index fb3381f55..90ae9cf20 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -439,7 +439,7 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> List[Op
             section = name.split(".")[0]
             # We want to avoid resolving the same thing twice.
             if section not in resolved:
-                resolved[section] = registry.resolve(config[section], schema=None)
+                resolved[section] = registry.resolve(config[section])
             output.append(dot_to_object(resolved, name))
     return output
 

From 44bad1474c7be6b6fce31aa7a69352b7288135ce Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 28 Sep 2020 09:47:34 +0200
Subject: [PATCH 204/516] Add init_pipeline file

---
 spacy/cli/init_pipeline.py | 111 +++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 spacy/cli/init_pipeline.py

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
new file mode 100644
index 000000000..34b396a50
--- /dev/null
+++ b/spacy/cli/init_pipeline.py
@@ -0,0 +1,111 @@
+from typing import Optional, Dict, Any, Tuple, Union, Callable, List
+import logging
+import srsly
+from pathlib import Path
+from wasabi import msg
+import typer
+from thinc.api import Config, fix_random_seed
+
+from .train import create_before_to_disk_callback
+from .. import util
+from ..util import registry
+from ..schemas import ConfigSchemaTraining
+from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, get_sourced_components
+from ..util import resolve_dot_names
+
+
+@init_cli.command(
+    "pipeline",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def init_pipeline_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    output_path: Path = Arg(..., help="Output directory for the prepared data"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    # fmt: on
+):
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    config = util.load_config(config_path, overrides=overrides)
+    with show_validation_error(config_path):
+        nlp = init_pipeline(config)
+    nlp.to_disk(output_path)
+
+
+def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool:
+    config = util.load_config(config_path, overrides=overrides)
+    if not init_path.exists():
+        return True
+    elif not (init_path / "config.cfg").exists():
+        return True
+    else:
+        init_cfg = util.load_config(init_path / "config.cfg", interpolate=True)
+        if config.to_str() != init_cfg.to_str():
+            return True
+        else:
+            return False
+
+
+def init_pipeline(config: Config, use_gpu=-1):
+    raw_config = config
+    config = raw_config.interpolate()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    # Use original config here before it's resolved to functions
+    sourced_components = get_sourced_components(config)
+    nlp = util.load_model_from_config(raw_config)
+    # Resolve all training-relevant sections using the filled nlp config
+    T = registry.resolve(
+        config["training"],
+        schema=ConfigSchemaTraining,
+        validate=True,
+    )
+    dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
+    train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
+    util.load_vocab_data_into_model(nlp, lookups=T["lookups"])
+    if T["vectors"] is not None:
+        add_vectors(nlp, T["vectors"])
+    score_weights = T["score_weights"]
+    optimizer = T["optimizer"]
+    batcher = T["batcher"]
+    train_logger = T["logger"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Sourced components that require resume_training
+    resume_components = [p for p in sourced_components if p not in frozen_components]
+    msg.info(f"Pipeline: {nlp.pipe_names}")
+    if resume_components:
+        with nlp.select_pipes(enable=resume_components):
+            msg.info(f"Resuming training for: {resume_components}")
+            nlp.resume_training(sgd=optimizer)
+    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
+        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    # Verify the config after calling 'begin_training' to ensure labels
+    # are properly initialized
+    verify_config(nlp)
+
+    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
+    if weights_data is not None:
+        tok2vec_component = C["pretraining"]["component"]
+        if tok2vec_component is None:
+            msg.fail(
+                f"To use pretrained tok2vec weights, [pretraining.component] "
+                f"needs to specify the component that should load them.",
+                exits=1,
+            )
+        layer = nlp.get_pipe(tok2vec_component).model
+        tok2vec_layer = C["pretraining"]["layer"]
+        if tok2vec_layer:
+            layer = layer.get_ref(tok2vec_layer)
+        layer.from_bytes(weights_data)
+        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
+    return nlp

From 09d42d4bf0e4fd08229b372a1e81bc486ee1a699 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 28 Sep 2020 09:49:59 +0200
Subject: [PATCH 205/516] Add pickle5 to Makefile

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index d44063f83..a180063b9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
 endif
 
 ifndef PYVER

From 553bfea6418e76c28b8786de35df7a3df0e0b56a Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 10:53:17 +0200
Subject: [PATCH 206/516] Fix commands

---
 spacy/cli/_util.py         |  22 ++++++++
 spacy/cli/init_model.py    |  68 -----------------------
 spacy/cli/init_pipeline.py | 110 +++++++++++++++++++++++++++++--------
 spacy/cli/train.py         |  84 +++++++++-------------------
 4 files changed, 134 insertions(+), 150 deletions(-)
 delete mode 100644 spacy/cli/init_model.py

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 68cb572ea..6eafee4df 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -16,6 +16,7 @@ import os
 
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import ensure_path
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -458,3 +459,24 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
             p = int(p)
         result.append(p)
     return result
+
+
+def load_from_paths(
+    config: Config,
+) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
+    # TODO: separate checks from loading
+    raw_text = ensure_path(config["training"]["raw_text"])
+    if raw_text is not None:
+        if not raw_text.exists():
+            msg.fail("Can't find raw text", raw_text, exits=1)
+        raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
+    tag_map = {}
+    morph_rules = {}
+    weights_data = None
+    init_tok2vec = ensure_path(config["training"]["init_tok2vec"])
+    if init_tok2vec is not None:
+        if not init_tok2vec.exists():
+            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
+        with init_tok2vec.open("rb") as file_:
+            weights_data = file_.read()
+    return raw_text, tag_map, morph_rules, weights_data
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
deleted file mode 100644
index 4194f1bd0..000000000
--- a/spacy/cli/init_model.py
+++ /dev/null
@@ -1,68 +0,0 @@
-from typing import Optional, List, Dict, Any, Union, IO
-import math
-from tqdm import tqdm
-import numpy
-from ast import literal_eval
-from pathlib import Path
-from preshed.counter import PreshCounter
-import tarfile
-import gzip
-import zipfile
-import srsly
-import warnings
-from wasabi import msg, Printer
-import typer
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-
-DEFAULT_OOV_PROB = -20
-
-
-#@init_cli.command("vocab")
-#@app.command(
-#    "init-model",
-#    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-#    hidden=True,  # hide this from main CLI help but still allow it to work with warning
-#)
-def init_model_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    lang: str = Arg(..., help="Pipeline language"),
-    output_dir: Path = Arg(..., help="Pipeline output directory"),
-    freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
-    clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
-    jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
-    vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
-    prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
-    truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
-    vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
-    model_name: Optional[str] = Opt(None, "--meta-name", "-mn", help="Optional name of the package for the pipeline meta"),
-    base_model: Optional[str] = Opt(None, "--base", "-b", help="Name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers)")
-    # fmt: on
-):
-    """
-    Create a new blank pipeline directory with vocab and vectors from raw data.
-    If vectors are provided in Word2Vec format, they can be either a .txt or
-    zipped as a .zip or .tar.gz.
-
-    DOCS: https://nightly.spacy.io/api/cli#init-vocab
-    """
-    if ctx.command.name == "init-model":
-        msg.warn(
-            "The init-model command is now called 'init vocab'. You can run "
-            "'python -m spacy init --help' for an overview of the other "
-            "available initialization commands."
-        )
-    init_vocab(
-        lang,
-        output_dir,
-        freqs_loc=freqs_loc,
-        clusters_loc=clusters_loc,
-        jsonl_loc=jsonl_loc,
-        vectors_loc=vectors_loc,
-        prune_vectors=prune_vectors,
-        truncate_vectors=truncate_vectors,
-        vectors_name=vectors_name,
-        model_name=model_name,
-        base_model=base_model,
-        silent=False,
-    )
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 34b396a50..ca70b51d1 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -1,18 +1,17 @@
-from typing import Optional, Dict, Any, Tuple, Union, Callable, List
+from typing import Optional, Dict, Callable, Any
 import logging
-import srsly
 from pathlib import Path
 from wasabi import msg
 import typer
-from thinc.api import Config, fix_random_seed
+from thinc.api import Config, fix_random_seed, set_gpu_allocator
 
-from .train import create_before_to_disk_callback
 from .. import util
-from ..util import registry
-from ..schemas import ConfigSchemaTraining
+from ..util import registry, resolve_dot_names
+from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
+from ..language import Language
+from ..errors import Errors
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
-from ..util import resolve_dot_names
+from ._util import import_code, get_sourced_components, load_from_paths
 
 
 @init_cli.command(
@@ -31,10 +30,12 @@ def init_pipeline_cli(
     util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
-    config = util.load_config(config_path, overrides=overrides)
     with show_validation_error(config_path):
-        nlp = init_pipeline(config)
+        config = util.load_config(config_path, overrides=overrides)
+    nlp = init_pipeline(config)
     nlp.to_disk(output_path)
+    # TODO: add more instructions
+    msg.good(f"Saved initialized pipeline to {output_path}")
 
 
 def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool:
@@ -51,7 +52,7 @@ def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool
             return False
 
 
-def init_pipeline(config: Config, use_gpu=-1):
+def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     raw_config = config
     config = raw_config.interpolate()
     if config["training"]["seed"] is not None:
@@ -61,22 +62,19 @@ def init_pipeline(config: Config, use_gpu=-1):
         set_gpu_allocator(allocator)
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
-    nlp = util.load_model_from_config(raw_config)
+    with show_validation_error():
+        nlp = util.load_model_from_config(raw_config)
+    msg.good("Set up nlp object from config")
     # Resolve all training-relevant sections using the filled nlp config
-    T = registry.resolve(
-        config["training"],
-        schema=ConfigSchemaTraining,
-        validate=True,
-    )
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
     train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
     util.load_vocab_data_into_model(nlp, lookups=T["lookups"])
+    msg.good("Created vocabulary")
     if T["vectors"] is not None:
         add_vectors(nlp, T["vectors"])
-    score_weights = T["score_weights"]
+        msg.good(f"Added vectors: {T['vectors']}")
     optimizer = T["optimizer"]
-    batcher = T["batcher"]
-    train_logger = T["logger"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
@@ -89,13 +87,23 @@ def init_pipeline(config: Config, use_gpu=-1):
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+        msg.good(f"Initialized pipeline components")
     # Verify the config after calling 'begin_training' to ensure labels
     # are properly initialized
     verify_config(nlp)
+    if "pretraining" in config and config["pretraining"]:
+        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
+        add_tok2vec_weights({"training": T, "pretraining": P}, nlp)
+    # TODO: this should be handled better?
+    nlp = before_to_disk(nlp)
+    return nlp
 
+
+def add_tok2vec_weights(config: Config, nlp: Language) -> None:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
+    weights_data = load_from_paths(config)
     if weights_data is not None:
-        tok2vec_component = C["pretraining"]["component"]
+        tok2vec_component = config["pretraining"]["component"]
         if tok2vec_component is None:
             msg.fail(
                 f"To use pretrained tok2vec weights, [pretraining.component] "
@@ -103,9 +111,63 @@ def init_pipeline(config: Config, use_gpu=-1):
                 exits=1,
             )
         layer = nlp.get_pipe(tok2vec_component).model
-        tok2vec_layer = C["pretraining"]["layer"]
+        tok2vec_layer = config["pretraining"]["layer"]
         if tok2vec_layer:
             layer = layer.get_ref(tok2vec_layer)
         layer.from_bytes(weights_data)
-        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
-    return nlp
+        msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'")
+
+
+def add_vectors(nlp: Language, vectors: str) -> None:
+    title = f"Config validation error for vectors {vectors}"
+    desc = (
+        "This typically means that there's a problem in the config.cfg included "
+        "with the packaged vectors. Make sure that the vectors package you're "
+        "loading is compatible with the current version of spaCy."
+    )
+    with show_validation_error(
+        title=title, desc=desc, hint_fill=False, show_config=False
+    ):
+        util.load_vectors_into_model(nlp, vectors)
+
+
+def verify_config(nlp: Language) -> None:
+    """Perform additional checks based on the config, loaded nlp object and training data."""
+    # TODO: maybe we should validate based on the actual components, the list
+    # in config["nlp"]["pipeline"] instead?
+    for pipe_config in nlp.config["components"].values():
+        # We can't assume that the component name == the factory
+        factory = pipe_config["factory"]
+        if factory == "textcat":
+            verify_textcat_config(nlp, pipe_config)
+
+
+def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
+    # if 'positive_label' is provided: double check whether it's in the data and
+    # the task is binary
+    if pipe_config.get("positive_label"):
+        textcat_labels = nlp.get_pipe("textcat").labels
+        pos_label = pipe_config.get("positive_label")
+        if pos_label not in textcat_labels:
+            raise ValueError(
+                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
+            )
+        if len(list(textcat_labels)) != 2:
+            raise ValueError(
+                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
+            )
+
+
+def create_before_to_disk_callback(
+    callback: Optional[Callable[[Language], Language]]
+) -> Callable[[Language], Language]:
+    def before_to_disk(nlp: Language) -> Language:
+        if not callback:
+            return nlp
+        modified_nlp = callback(nlp)
+        if not isinstance(modified_nlp, Language):
+            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
+            raise ValueError(err)
+        return modified_nlp
+
+    return before_to_disk
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 8a360ad44..3476d5966 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,6 +1,5 @@
 from typing import Optional, Dict, Any, Tuple, Union, Callable, List
 from timeit import default_timer as timer
-import srsly
 import tqdm
 from pathlib import Path
 from wasabi import msg
@@ -11,13 +10,17 @@ import random
 import typer
 import logging
 
+from .init_pipeline import init_pipeline, must_initialize
+from .init_pipeline import create_before_to_disk_callback
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
+from ._util import import_code
+from ._util import load_from_paths  # noqa: F401 (needed for Ray extension for now)
 from ..language import Language
 from .. import util
 from ..training.example import Example
 from ..errors import Errors
-from ..util import resolve_dot_names
+from ..util import resolve_dot_names, registry
+from ..schemas import ConfigSchemaTraining
 
 
 @app.command(
@@ -56,25 +59,35 @@ def train_cli(
         require_gpu(use_gpu)
     else:
         msg.info("Using CPU")
-    config = util.load_config(
-        config_path, overrides=config_overrides, interpolate=False
-    )
+    config = util.load_config(config_path, overrides=overrides, interpolate=False)
+    msg.divider("Initializing pipeline")
+    # TODO: add warnings / --initialize (?) argument
     if output_path is None:
         nlp = init_pipeline(config)
     else:
-        init_path = output_path / "model-initial" 
-        if must_reinitialize(config, init_path):
+        init_path = output_path / "model-initial"
+        if must_initialize(config, init_path):
             nlp = init_pipeline(config)
             nlp.to_disk(init_path)
+            msg.good(f"Saved initialized pipeline to {init_path}")
         else:
-            nlp = spacy.load(output_path / "model-initial")
-    msg.info("Start training")
-    train(nlp, config, output_path)
+            nlp = util.load_model(init_path)
+            msg.good(f"Loaded initialized pipeline from {init_path}")
+    msg.divider("Training pipeline")
+    train(nlp, output_path, use_gpu=use_gpu)
 
 
-def train(nlp: Language, output_path: Optional[Path]=None) -> None:
+def train(
+    nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1
+) -> None:
+    # TODO: random seed, GPU allocator
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
     train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
@@ -85,9 +98,7 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None:
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
- 
     # Create iterator, which yields out info after each optimization step.
-    msg.info("Start training")
     training_step_iterator = train_while_improving(
         nlp,
         optimizer,
@@ -101,7 +112,7 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None:
         raw_text=raw_text,
         exclude=frozen_components,
     )
-    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
+    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
     with nlp.select_pipes(disable=frozen_components):
         print_row, finalize_logger = train_logger(nlp)
 
@@ -145,7 +156,6 @@ def train(nlp: Language, output_path: Optional[Path]=None) -> None:
             msg.good(f"Saved pipeline to output directory {final_model_path}")
 
 
-
 def add_vectors(nlp: Language, vectors: str) -> None:
     title = f"Config validation error for vectors {vectors}"
     desc = (
@@ -199,21 +209,6 @@ def create_evaluation_callback(
     return evaluate
 
 
-def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
-    def before_to_disk(nlp: Language) -> Language:
-        if not callback:
-            return nlp
-        modified_nlp = callback(nlp)
-        if not isinstance(modified_nlp, Language):
-            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
-            raise ValueError(err)
-        return modified_nlp
-
-    return before_to_disk
-
-
 def train_while_improving(
     nlp: Language,
     optimizer: Optimizer,
@@ -370,30 +365,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No
         if not output_path.exists():
             output_path.mkdir()
             msg.good(f"Created output directory: {output_path}")
-
-
-def verify_config(nlp: Language) -> None:
-    """Perform additional checks based on the config, loaded nlp object and training data."""
-    # TODO: maybe we should validate based on the actual components, the list
-    # in config["nlp"]["pipeline"] instead?
-    for pipe_config in nlp.config["components"].values():
-        # We can't assume that the component name == the factory
-        factory = pipe_config["factory"]
-        if factory == "textcat":
-            verify_textcat_config(nlp, pipe_config)
-
-
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
-    # if 'positive_label' is provided: double check whether it's in the data and
-    # the task is binary
-    if pipe_config.get("positive_label"):
-        textcat_labels = nlp.get_pipe("textcat").labels
-        pos_label = pipe_config.get("positive_label")
-        if pos_label not in textcat_labels:
-            raise ValueError(
-                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
-            )
-        if len(list(textcat_labels)) != 2:
-            raise ValueError(
-                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
-            )

From 2fdb7285a02be4148610aaadd77861a2170dcbd5 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 11:06:07 +0200
Subject: [PATCH 207/516] Update CLI

---
 spacy/cli/train.py | 44 ++++++++++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 3476d5966..7a83646ef 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -32,8 +32,9 @@ def train_cli(
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    init_path: Optional[Path] = Opt(None, "--init", "-i", help="Path to already initialized pipeline directory, e.g. created with 'spacy init pipeline' (will speed up training)"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
-    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
     """
@@ -61,26 +62,38 @@ def train_cli(
         msg.info("Using CPU")
     config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
-    # TODO: add warnings / --initialize (?) argument
-    if output_path is None:
-        nlp = init_pipeline(config)
-    else:
-        init_path = output_path / "model-initial"
-        if must_initialize(config, init_path):
-            nlp = init_pipeline(config)
-            nlp.to_disk(init_path)
-            msg.good(f"Saved initialized pipeline to {init_path}")
-        else:
-            nlp = util.load_model(init_path)
-            msg.good(f"Loaded initialized pipeline from {init_path}")
+    nlp = init_nlp(config, output_path, init_path)
     msg.divider("Training pipeline")
     train(nlp, output_path, use_gpu=use_gpu)
 
 
+def init_nlp(
+    config: Config, output_path: Optional[Path], init_path: Optional[Path]
+) -> None:
+
+    if init_path is not None:
+        nlp = util.load_model(init_path)
+        # TODO: how to handle provided pipeline that needs to be reinitialized?
+        msg.good(f"Loaded initialized pipeline from {init_path}")
+        return nlp
+    if output_path is not None:
+        output_init_path = output_path / "model-initial"
+        if must_initialize(config, output_init_path):
+            msg.warn("TODO:")
+            nlp = init_pipeline(config)
+            nlp.to_disk(init_path)
+            msg.good(f"Saved initialized pipeline to {output_init_path}")
+        else:
+            nlp = util.load_model(output_init_path)
+            msg.good(f"Loaded initialized pipeline from {output_init_path}")
+        return nlp
+    msg.warn("TODO:")
+    return init_pipeline(config)
+
+
 def train(
     nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1
 ) -> None:
-    # TODO: random seed, GPU allocator
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
     if config["training"]["seed"] is not None:
@@ -112,6 +125,9 @@ def train(
         raw_text=raw_text,
         exclude=frozen_components,
     )
+    msg.info(f"Pipeline: {nlp.pipe_names}")
+    if frozen_components:
+        msg.info(f"Frozen components: {frozen_components}")
     msg.info(f"Initial learn rate: {optimizer.learn_rate}")
     with nlp.select_pipes(disable=frozen_components):
         print_row, finalize_logger = train_logger(nlp)

From 8b74fd19df8f7af566f6e657376e9f13bc189f36 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 11:13:38 +0200
Subject: [PATCH 208/516] init pipeline -> init nlp

---
 spacy/cli/init_pipeline.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index ca70b51d1..2dc7a741e 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -15,8 +15,7 @@ from ._util import import_code, get_sourced_components, load_from_paths
 
 
 @init_cli.command(
-    "pipeline",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    "nlp", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
 def init_pipeline_cli(
     # fmt: off

From d5155376fd7d913734507f0647ddd4d33c625bbe Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 11:30:18 +0200
Subject: [PATCH 209/516] Update vocab init

---
 spacy/cli/init_pipeline.py   |  38 +++-
 spacy/training/initialize.py | 378 -----------------------------------
 spacy/util.py                |  16 +-
 3 files changed, 41 insertions(+), 391 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 2dc7a741e..8ff47d4a8 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -4,16 +4,21 @@ from pathlib import Path
 from wasabi import msg
 import typer
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
+import srsly
 
 from .. import util
-from ..util import registry, resolve_dot_names
+from ..util import registry, resolve_dot_names, OOV_RANK
 from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
 from ..language import Language
+from ..lookups import Lookups
 from ..errors import Errors
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, get_sourced_components, load_from_paths
 
 
+DEFAULT_OOV_PROB = -20
+
+
 @init_cli.command(
     "nlp", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
@@ -68,7 +73,8 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
     train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
-    util.load_vocab_data_into_model(nlp, lookups=T["lookups"])
+    # TODO: move lookups to [initialize], add vocab data
+    init_vocab(nlp, lookups=T["lookups"])
     msg.good("Created vocabulary")
     if T["vectors"] is not None:
         add_vectors(nlp, T["vectors"])
@@ -98,6 +104,33 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     return nlp
 
 
+def init_vocab(
+    nlp: Language,
+    *,
+    vocab_data: Optional[Path] = None,
+    lookups: Optional[Lookups] = None,
+) -> Language:
+    if lookups:
+        nlp.vocab.lookups = lookups
+        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
+    data_path = util.ensure_path(vocab_data)
+    if data_path is not None:
+        lex_attrs = srsly.read_jsonl(data_path)
+        for lexeme in nlp.vocab:
+            lexeme.rank = OOV_RANK
+        for attrs in lex_attrs:
+            if "settings" in attrs:
+                continue
+            lexeme = nlp.vocab[attrs["orth"]]
+            lexeme.set_attrs(**attrs)
+        if len(nlp.vocab):
+            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
+        else:
+            oov_prob = DEFAULT_OOV_PROB
+        nlp.vocab.cfg.update({"oov_prob": oov_prob})
+        msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+
+
 def add_tok2vec_weights(config: Config, nlp: Language) -> None:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     weights_data = load_from_paths(config)
@@ -128,6 +161,7 @@ def add_vectors(nlp: Language, vectors: str) -> None:
         title=title, desc=desc, hint_fill=False, show_config=False
     ):
         util.load_vectors_into_model(nlp, vectors)
+        msg(f"Added {len(nlp.vocab.vectors)} vectors from {vectors}")
 
 
 def verify_config(nlp: Language) -> None:
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 07bbced8d..e69de29bb 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,378 +0,0 @@
-from pathlib import Path
-from typing import Dict
-from ._util import app, init_cli, Arg, Opt
-from ..vectors import Vectors
-from ..errors import Errors, Warnings
-from ..language import Language
-from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
-
-try:
-    import ftfy
-except ImportError:
-    ftfy = None
-
-
-def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool:
-    config = util.load_config(config_path, overrides=overrides)
-    if not init_path.exists():
-        return True
-    elif not (init_path / "config.cfg").exists():
-        return True
-    else:
-        init_cfg = util.load_config(init_path / "config.cfg", interpolate=True)
-        if config.to_str() != init_cfg.to_str():
-            return True
-        else:
-            return False
-
-
-def init_pipeline(config: Config, use_gpu: int=-1):
-    raw_config = config
-    config = raw_config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    # Use original config here before it's resolved to functions
-    sourced_components = get_sourced_components(config)
-    with show_validation_error(config_path):
-        nlp = util.load_model_from_config(raw_config)
-        # Resolve all training-relevant sections using the filled nlp config
-        T = registry.resolve(
-            config["training"],
-            schema=TrainingSchema,
-            validate=validate,
-        )
-        # TODO: It might not be 'corpora' 
-        corpora = registry.resolve(config["corpora"], validate=True)
-        raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
-    util.load_vocab_data_into_model(nlp, lookups=T["lookups"])
-    if T["vectors"] is not None:
-        add_vectors(nlp, T["vectors"])
-    score_weights = T["score_weights"]
-    optimizer = T["optimizer"]
-    train_corpus = dot_to_object({"corpora": corpora}, T["train_corpus"])
-    dev_corpus = dot_to_object({"corpora": corpora}, T["dev_corpus"])
-    batcher = T["batcher"]
-    train_logger = T["logger"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T["frozen_components"]
-    # Sourced components that require resume_training
-    resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if resume_components:
-        with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
-            nlp.resume_training(sgd=optimizer)
-    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
-    # Verify the config after calling 'begin_training' to ensure labels
-    # are properly initialized
-    verify_config(nlp)
-
-    if tag_map:
-        # Replace tag map with provided mapping
-        nlp.vocab.morphology.load_tag_map(tag_map)
-    if morph_rules:
-        # Load morph rules
-        nlp.vocab.morphology.load_morph_exceptions(morph_rules)
-
-    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
-    if weights_data is not None:
-        tok2vec_component = C["pretraining"]["component"]
-        if tok2vec_component is None:
-            msg.fail(
-                f"To use pretrained tok2vec weights, [pretraining.component] "
-                f"needs to specify the component that should load them.",
-                exits=1,
-            )
-        layer = nlp.get_pipe(tok2vec_component).model
-        tok2vec_layer = C["pretraining"]["layer"]
-        if tok2vec_layer:
-            layer = layer.get_ref(tok2vec_layer)
-        layer.from_bytes(weights_data)
-        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
-    return nlp
- 
-
-def init_vocab(
-    lang: str,
-    output_dir: Path,
-    freqs_loc: Optional[Path] = None,
-    clusters_loc: Optional[Path] = None,
-    jsonl_loc: Optional[Path] = None,
-    vectors_loc: Optional[Path] = None,
-    prune_vectors: int = -1,
-    truncate_vectors: int = 0,
-    vectors_name: Optional[str] = None,
-    model_name: Optional[str] = None,
-    base_model: Optional[str] = None,
-    silent: bool = True,
-) -> Language:
-    msg = Printer(no_print=silent, pretty=not silent)
-    if jsonl_loc is not None:
-        if freqs_loc is not None or clusters_loc is not None:
-            settings = ["-j"]
-            if freqs_loc:
-                settings.append("-f")
-            if clusters_loc:
-                settings.append("-c")
-            msg.warn(
-                "Incompatible arguments",
-                "The -f and -c arguments are deprecated, and not compatible "
-                "with the -j argument, which should specify the same "
-                "information. Either merge the frequencies and clusters data "
-                "into the JSONL-formatted file (recommended), or use only the "
-                "-f and -c files, without the other lexical attributes.",
-            )
-        jsonl_loc = ensure_path(jsonl_loc)
-        lex_attrs = srsly.read_jsonl(jsonl_loc)
-    else:
-        clusters_loc = ensure_path(clusters_loc)
-        freqs_loc = ensure_path(freqs_loc)
-        if freqs_loc is not None and not freqs_loc.exists():
-            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
-        lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
-
-    with msg.loading("Creating blank pipeline..."):
-        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
-
-    msg.good("Successfully created blank pipeline")
-    if vectors_loc is not None:
-        add_vectors(
-            msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
-        )
-    vec_added = len(nlp.vocab.vectors)
-    lex_added = len(nlp.vocab)
-    msg.good(
-        "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
-    )
-    if not output_dir.exists():
-        output_dir.mkdir()
-    nlp.to_disk(output_dir)
-    return nlp
-
-
-def open_file(loc: Union[str, Path]) -> IO:
-    """Handle .gz, .tar.gz or unzipped files"""
-    loc = ensure_path(loc)
-    if tarfile.is_tarfile(str(loc)):
-        return tarfile.open(str(loc), "r:gz")
-    elif loc.parts[-1].endswith("gz"):
-        return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
-    elif loc.parts[-1].endswith("zip"):
-        zip_file = zipfile.ZipFile(str(loc))
-        names = zip_file.namelist()
-        file_ = zip_file.open(names[0])
-        return (line.decode("utf8") for line in file_)
-    else:
-        return loc.open("r", encoding="utf8")
-
-
-def read_attrs_from_deprecated(
-    msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
-) -> List[Dict[str, Any]]:
-    if freqs_loc is not None:
-        with msg.loading("Counting frequencies..."):
-            probs, _ = read_freqs(freqs_loc)
-        msg.good("Counted frequencies")
-    else:
-        probs, _ = ({}, DEFAULT_OOV_PROB)  # noqa: F841
-    if clusters_loc:
-        with msg.loading("Reading clusters..."):
-            clusters = read_clusters(clusters_loc)
-        msg.good("Read clusters")
-    else:
-        clusters = {}
-    lex_attrs = []
-    sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
-    if len(sorted_probs):
-        for i, (word, prob) in tqdm(enumerate(sorted_probs)):
-            attrs = {"orth": word, "id": i, "prob": prob}
-            # Decode as a little-endian string, so that we can do & 15 to get
-            # the first 4 bits. See _parse_features.pyx
-            if word in clusters:
-                attrs["cluster"] = int(clusters[word][::-1], 2)
-            else:
-                attrs["cluster"] = 0
-            lex_attrs.append(attrs)
-    return lex_attrs
-
-
-def create_model(
-    lang: str,
-    lex_attrs: List[Dict[str, Any]],
-    name: Optional[str] = None,
-    base_model: Optional[Union[str, Path]] = None,
-) -> Language:
-    if base_model:
-        nlp = load_model(base_model)
-        # keep the tokenizer but remove any existing pipeline components due to
-        # potentially conflicting vectors
-        for pipe in nlp.pipe_names:
-            nlp.remove_pipe(pipe)
-    else:
-        lang_class = get_lang_class(lang)
-        nlp = lang_class()
-    for lexeme in nlp.vocab:
-        lexeme.rank = OOV_RANK
-    for attrs in lex_attrs:
-        if "settings" in attrs:
-            continue
-        lexeme = nlp.vocab[attrs["orth"]]
-        lexeme.set_attrs(**attrs)
-    if len(nlp.vocab):
-        oov_prob = min(lex.prob for lex in nlp.vocab) - 1
-    else:
-        oov_prob = DEFAULT_OOV_PROB
-    nlp.vocab.cfg.update({"oov_prob": oov_prob})
-    if name:
-        nlp.meta["name"] = name
-    return nlp
-
-
-def add_vectors(
-    msg: Printer,
-    nlp: Language,
-    vectors_loc: Optional[Path],
-    truncate_vectors: int,
-    prune_vectors: int,
-    name: Optional[str] = None,
-) -> None:
-    vectors_loc = ensure_path(vectors_loc)
-    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
-        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
-        for lex in nlp.vocab:
-            if lex.rank and lex.rank != OOV_RANK:
-                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
-    else:
-        if vectors_loc:
-            with msg.loading(f"Reading vectors from {vectors_loc}"):
-                vectors_data, vector_keys = read_vectors(
-                    msg, vectors_loc, truncate_vectors
-                )
-            msg.good(f"Loaded vectors from {vectors_loc}")
-        else:
-            vectors_data, vector_keys = (None, None)
-        if vector_keys is not None:
-            for word in vector_keys:
-                if word not in nlp.vocab:
-                    nlp.vocab[word]
-        if vectors_data is not None:
-            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
-    if name is None:
-        # TODO: Is this correct? Does this matter?
-        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
-    else:
-        nlp.vocab.vectors.name = name
-    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
-    if prune_vectors >= 1:
-        nlp.vocab.prune_vectors(prune_vectors)
-
-
-def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
-    f = open_file(vectors_loc)
-    f = ensure_shape(f)
-    shape = tuple(int(size) for size in next(f).split())
-    if truncate_vectors >= 1:
-        shape = (truncate_vectors, shape[1])
-    vectors_data = numpy.zeros(shape=shape, dtype="f")
-    vectors_keys = []
-    for i, line in enumerate(tqdm(f)):
-        line = line.rstrip()
-        pieces = line.rsplit(" ", vectors_data.shape[1])
-        word = pieces.pop(0)
-        if len(pieces) != vectors_data.shape[1]:
-            msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
-        vectors_data[i] = numpy.asarray(pieces, dtype="f")
-        vectors_keys.append(word)
-        if i == truncate_vectors - 1:
-            break
-    return vectors_data, vectors_keys
-
-
-def ensure_shape(lines):
-    """Ensure that the first line of the data is the vectors shape.
-
-    If it's not, we read in the data and output the shape as the first result,
-    so that the reader doesn't have to deal with the problem.
-    """
-    first_line = next(lines)
-    try:
-        shape = tuple(int(size) for size in first_line.split())
-    except ValueError:
-        shape = None
-    if shape is not None:
-        # All good, give the data
-        yield first_line
-        yield from lines
-    else:
-        # Figure out the shape, make it the first value, and then give the
-        # rest of the data.
-        width = len(first_line.split()) - 1
-        captured = [first_line] + list(lines)
-        length = len(captured)
-        yield f"{length} {width}"
-        yield from captured
-
-
-def read_freqs(
-    freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
-):
-    counts = PreshCounter()
-    total = 0
-    with freqs_loc.open() as f:
-        for i, line in enumerate(f):
-            freq, doc_freq, key = line.rstrip().split("\t", 2)
-            freq = int(freq)
-            counts.inc(i + 1, freq)
-            total += freq
-    counts.smooth()
-    log_total = math.log(total)
-    probs = {}
-    with freqs_loc.open() as f:
-        for line in tqdm(f):
-            freq, doc_freq, key = line.rstrip().split("\t", 2)
-            doc_freq = int(doc_freq)
-            freq = int(freq)
-            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
-                try:
-                    word = literal_eval(key)
-                except SyntaxError:
-                    # Take odd strings literally.
-                    word = literal_eval(f"'{key}'")
-                smooth_count = counts.smoother(int(freq))
-                probs[word] = math.log(smooth_count) - log_total
-    oov_prob = math.log(counts.smoother(0)) - log_total
-    return probs, oov_prob
-
-
-def read_clusters(clusters_loc: Path) -> dict:
-    clusters = {}
-    if ftfy is None:
-        warnings.warn(Warnings.W004)
-    with clusters_loc.open() as f:
-        for line in tqdm(f):
-            try:
-                cluster, word, freq = line.split()
-                if ftfy is not None:
-                    word = ftfy.fix_text(word)
-            except ValueError:
-                continue
-            # If the clusterer has only seen the word a few times, its
-            # cluster is unreliable.
-            if int(freq) >= 3:
-                clusters[word] = cluster
-            else:
-                clusters[word] = "0"
-    # Expand clusters with re-casing
-    for word, cluster in list(clusters.items()):
-        if word.lower() not in clusters:
-            clusters[word.lower()] = cluster
-        if word.title() not in clusters:
-            clusters[word.title()] = cluster
-        if word.upper() not in clusters:
-            clusters[word.upper()] = cluster
-    return clusters
diff --git a/spacy/util.py b/spacy/util.py
index 90ae9cf20..de1fd7f81 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -253,14 +253,6 @@ def load_vectors_into_model(
                 nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
 
 
-def load_vocab_data_into_model(
-    nlp: "Language", *, lookups: Optional["Lookups"] = None
-) -> None:
-    """Load vocab data."""
-    if lookups:
-        nlp.vocab.lookups = lookups
-
-
 def load_model(
     name: Union[str, Path],
     *,
@@ -422,11 +414,13 @@ def resolve_training_config(
     return registry.resolve(config, validate=validate)
 
 
-def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> List[Optional[Callable]]:
-    """Resolve one or more "dot notation" names, e.g. corpora.train. 
+def resolve_dot_names(
+    config: Config, dot_names: List[Optional[str]]
+) -> List[Optional[Callable]]:
+    """Resolve one or more "dot notation" names, e.g. corpora.train.
     The paths could point anywhere into the config, so we don't know which
     top-level section we'll be looking within.
-    
+
     We resolve the whole top-level section, although we could resolve less --
     we could find the lowest part of the tree.
     """

From e44a7519cdac903a64b0dec5e98b8b828952d4b9 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 11:56:14 +0200
Subject: [PATCH 210/516] Update CLI and add [initialize] block

---
 setup.cfg                  |  2 +-
 spacy/cli/_util.py         | 21 ----------------
 spacy/cli/init_pipeline.py | 49 +++++++++++++++++++++++++-------------
 spacy/cli/train.py         | 24 ++++++++++++++++++-
 spacy/default_config.cfg   | 12 ++++++++++
 spacy/schemas.py           | 35 +++++++++++++++++++--------
 spacy/util.py              |  2 +-
 7 files changed, 94 insertions(+), 51 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index b55c0d376..9ce361bc1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -98,7 +98,7 @@ universal = false
 formats = gztar
 
 [flake8]
-ignore = E203, E266, E501, E731, W503
+ignore = E203, E266, E501, E731, W503, E741
 max-line-length = 80
 select = B,C,E,F,W,T4,B9
 exclude =
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 6eafee4df..7ff2c6199 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -459,24 +459,3 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
             p = int(p)
         result.append(p)
     return result
-
-
-def load_from_paths(
-    config: Config,
-) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
-    # TODO: separate checks from loading
-    raw_text = ensure_path(config["training"]["raw_text"])
-    if raw_text is not None:
-        if not raw_text.exists():
-            msg.fail("Can't find raw text", raw_text, exits=1)
-        raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
-    tag_map = {}
-    morph_rules = {}
-    weights_data = None
-    init_tok2vec = ensure_path(config["training"]["init_tok2vec"])
-    if init_tok2vec is not None:
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    return raw_text, tag_map, morph_rules, weights_data
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 8ff47d4a8..5ca565d88 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -8,12 +8,12 @@ import srsly
 
 from .. import util
 from ..util import registry, resolve_dot_names, OOV_RANK
-from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
+from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain, ConfigSchemaInit
 from ..language import Language
 from ..lookups import Lookups
 from ..errors import Errors
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components, load_from_paths
+from ._util import import_code, get_sourced_components
 
 
 DEFAULT_OOV_PROB = -20
@@ -67,14 +67,15 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     with show_validation_error():
-        nlp = util.load_model_from_config(raw_config)
+        nlp = util.load_model_from_config(raw_config, auto_fill=True)
     msg.good("Set up nlp object from config")
+    config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
     train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
-    # TODO: move lookups to [initialize], add vocab data
-    init_vocab(nlp, lookups=T["lookups"])
+    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+    init_vocab(nlp, data=I["vocab"]["data"], lookups=I["vocab"]["lookups"])
     msg.good("Created vocabulary")
     if T["vectors"] is not None:
         add_vectors(nlp, T["vectors"])
@@ -98,22 +99,19 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     verify_config(nlp)
     if "pretraining" in config and config["pretraining"]:
         P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        add_tok2vec_weights({"training": T, "pretraining": P}, nlp)
+        add_tok2vec_weights(nlp, P, I)
     # TODO: this should be handled better?
     nlp = before_to_disk(nlp)
     return nlp
 
 
 def init_vocab(
-    nlp: Language,
-    *,
-    vocab_data: Optional[Path] = None,
-    lookups: Optional[Lookups] = None,
+    nlp: Language, *, data: Optional[Path] = None, lookups: Optional[Lookups] = None,
 ) -> Language:
     if lookups:
         nlp.vocab.lookups = lookups
         msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
-    data_path = util.ensure_path(vocab_data)
+    data_path = util.ensure_path(data)
     if data_path is not None:
         lex_attrs = srsly.read_jsonl(data_path)
         for lexeme in nlp.vocab:
@@ -131,11 +129,29 @@ def init_vocab(
         msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
 
 
-def add_tok2vec_weights(config: Config, nlp: Language) -> None:
+def add_tok2vec_weights(
+    nlp: Language, pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
+) -> None:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
-    weights_data = load_from_paths(config)
+    P = pretrain_config
+    I = init_config
+    raw_text = util.ensure_path(I["vocab"]["raw_text"])
+    if raw_text is not None:
+        if not raw_text.exists():
+            msg.fail("Can't find raw text", raw_text, exits=1)
+        raw_text = list(srsly.read_jsonl(raw_text))
+    weights_data = None
+    init_tok2vec = util.ensure_path(I["vocab"]["init_tok2vec"])
+    if init_tok2vec is not None:
+        if P["objective"].get("type") == "vectors" and not I["vectors"]:
+            err = "Need initialize.vectors if pretraining.objective.type is vectors"
+            msg.fail(err, exits=1)
+        if not init_tok2vec.exists():
+            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
+        with init_tok2vec.open("rb") as file_:
+            weights_data = file_.read()
     if weights_data is not None:
-        tok2vec_component = config["pretraining"]["component"]
+        tok2vec_component = P["component"]
         if tok2vec_component is None:
             msg.fail(
                 f"To use pretrained tok2vec weights, [pretraining.component] "
@@ -143,9 +159,8 @@ def add_tok2vec_weights(config: Config, nlp: Language) -> None:
                 exits=1,
             )
         layer = nlp.get_pipe(tok2vec_component).model
-        tok2vec_layer = config["pretraining"]["layer"]
-        if tok2vec_layer:
-            layer = layer.get_ref(tok2vec_layer)
+        if P["layer"]:
+            layer = layer.get_ref(P["layer"])
         layer.from_bytes(weights_data)
         msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'")
 
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 7a83646ef..d69b3bd36 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -14,7 +14,6 @@ from .init_pipeline import init_pipeline, must_initialize
 from .init_pipeline import create_before_to_disk_callback
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code
-from ._util import load_from_paths  # noqa: F401 (needed for Ray extension for now)
 from ..language import Language
 from .. import util
 from ..training.example import Example
@@ -381,3 +380,26 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No
         if not output_path.exists():
             output_path.mkdir()
             msg.good(f"Created output directory: {output_path}")
+
+
+# TODO: this is currently imported by the ray extension and not used otherwise
+def load_from_paths(
+    config: Config,
+) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
+    import srsly
+    # TODO: separate checks from loading
+    raw_text = util.ensure_path(config["training"]["raw_text"])
+    if raw_text is not None:
+        if not raw_text.exists():
+            msg.fail("Can't find raw text", raw_text, exits=1)
+        raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
+    tag_map = {}
+    morph_rules = {}
+    weights_data = None
+    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
+    if init_tok2vec is not None:
+        if not init_tok2vec.exists():
+            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
+        with init_tok2vec.open("rb") as file_:
+            weights_data = file_.read()
+    return raw_text, tag_map, morph_rules, weights_data
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index a8f4a9497..800a2b4a3 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -108,3 +108,15 @@ grad_clip = 1.0
 use_averages = false
 eps = 1e-8
 learn_rate = 0.001
+
+[initialize]
+tokenizer = {}
+components = {}
+
+[initialize.vocab]
+data = null
+lookups = null
+vectors = null
+# Extra resources for transfer-learning or pseudo-rehearsal
+init_tok2vec = ${paths.init_tok2vec}
+raw_text = ${paths.raw}
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 7951b851b..6553892d3 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -273,22 +273,37 @@ class ConfigSchemaPretrain(BaseModel):
         arbitrary_types_allowed = True
 
 
+class ConfigSchemaInitVocab(BaseModel):
+    # fmt: off
+    data: Optional[str] = Field(..., title="Path to JSON-formatted vocabulary file")
+    lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
+    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
+    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
+    # fmt: on
+
+    class Config:
+        extra = "forbid"
+        arbitrary_types_allowed = True
+
+
+class ConfigSchemaInit(BaseModel):
+    vocab: ConfigSchemaInitVocab
+    tokenizer: Any
+    components: Dict[str, Any]
+
+    class Config:
+        extra = "forbid"
+        arbitrary_types_allowed = True
+
+
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
     pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
     components: Dict[str, Dict[str, Any]]
     corpora: Dict[str, Reader]
-
-    @root_validator(allow_reuse=True)
-    def validate_config(cls, values):
-        """Perform additional validation for settings with dependencies."""
-        pt = values.get("pretraining")
-        if pt and not isinstance(pt, ConfigSchemaPretrainEmpty):
-            if pt.objective.get("type") == "vectors" and not values["nlp"].vectors:
-                err = "Need nlp.vectors if pretraining.objective.type is vectors"
-                raise ValueError(err)
-        return values
+    initialize: ConfigSchemaInit
 
     class Config:
         extra = "allow"
diff --git a/spacy/util.py b/spacy/util.py
index de1fd7f81..cab7af8fb 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -61,7 +61,7 @@ LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta",
 # Default order of sections in the config.cfg. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
 # fmt: off
-CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"]
+CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
 # fmt: on
 
 

From 9f6ad06452cd389d68cc63f5ae9a88a9943d2d72 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 28 Sep 2020 12:00:23 +0200
Subject: [PATCH 211/516] Upd default config

---
 spacy/default_config.cfg | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 800a2b4a3..0ab27f499 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -48,15 +48,6 @@ max_length = 0
 # Limitation on number of training examples
 limit = 0
 
-[prepare]
-# The 'prepare' step is run before training or pretraining. Components and
-# the tokenizer can each define their own prepare step, giving them a chance
-# to gather resources like lookup-tables, build label sets, construct vocabularies,
-# etc. After 'prepare' is finished, the result will be saved out to disk, which
-# will then be read in at the start of training. You can call the prepare step
-# separately with the `spacy prepare` command, or you can let the train script
-# do it for you.
-
 # Training hyper-parameters and additional features.
 [training]
 seed = ${system.seed}
@@ -109,6 +100,13 @@ use_averages = false
 eps = 1e-8
 learn_rate = 0.001
 
+# The 'initialize' step is run before training or pretraining. Components and
+# the tokenizer can each define their own prepare step, giving them a chance
+# to gather resources like lookup-tables, build label sets, construct vocabularies,
+# etc. After 'prepare' is finished, the result will be saved out to disk, which
+# will then be read in at the start of training. You can call the prepare step
+# separately with the `spacy prepare` command, or you can let the train script
+# do it for you.
 [initialize]
 tokenizer = {}
 components = {}

From 1590de11b1e794ac4c48b21e56c81b164de57ee7 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 12:05:23 +0200
Subject: [PATCH 212/516] Update config

---
 spacy/cli/init_pipeline.py                    | 26 ++++++++-----------
 spacy/cli/templates/quickstart_training.jinja | 14 ++++++----
 spacy/default_config.cfg                      | 11 +++-----
 spacy/default_config_pretraining.cfg          |  2 +-
 spacy/schemas.py                              | 10 ++-----
 5 files changed, 26 insertions(+), 37 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 5ca565d88..78d828719 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -72,14 +72,15 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
-    train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    init_vocab(nlp, data=I["vocab"]["data"], lookups=I["vocab"]["lookups"])
+    V = I["vocab"]
+    init_vocab(nlp, data=V["data"], lookups=V["lookups"])
     msg.good("Created vocabulary")
-    if T["vectors"] is not None:
-        add_vectors(nlp, T["vectors"])
-        msg.good(f"Added vectors: {T['vectors']}")
+    if V["vectors"] is not None:
+        add_vectors(nlp, V["vectors"])
+        msg.good(f"Added vectors: {V['vectors']}")
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
@@ -130,20 +131,15 @@ def init_vocab(
 
 
 def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
+    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 ) -> None:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
-    I = init_config
-    raw_text = util.ensure_path(I["vocab"]["raw_text"])
-    if raw_text is not None:
-        if not raw_text.exists():
-            msg.fail("Can't find raw text", raw_text, exits=1)
-        raw_text = list(srsly.read_jsonl(raw_text))
+    V = vocab_config
     weights_data = None
-    init_tok2vec = util.ensure_path(I["vocab"]["init_tok2vec"])
+    init_tok2vec = util.ensure_path(V["init_tok2vec"])
     if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not I["vectors"]:
+        if P["objective"].get("type") == "vectors" and not V["vectors"]:
             err = "Need initialize.vectors if pretraining.objective.type is vectors"
             msg.fail(err, exits=1)
         if not init_tok2vec.exists():
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 9a8b9d1d7..5e990611e 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -277,11 +277,6 @@ path = ${paths.dev}
 max_length = 0
 
 [training]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
 {% endif -%}
@@ -317,3 +312,12 @@ start = 100
 stop = 1000
 compound = 1.001
 {% endif %}
+
+[initialize]
+
+[initialize.vocab]
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 0ab27f499..083b6a702 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -1,8 +1,9 @@
 [paths]
 train = ""
 dev = ""
-raw = null
+raw_text = null
 init_tok2vec = null
+vocab_data = null
 
 [system]
 seed = 0
@@ -54,11 +55,6 @@ seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
-# Extra resources for transfer-learning or pseudo-rehearsal
-init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
-vectors = null
-lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0
@@ -112,9 +108,8 @@ tokenizer = {}
 components = {}
 
 [initialize.vocab]
-data = null
+data = ${paths.vocab_data}
 lookups = null
 vectors = null
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg
index bbd595308..122a7803a 100644
--- a/spacy/default_config_pretraining.cfg
+++ b/spacy/default_config_pretraining.cfg
@@ -32,7 +32,7 @@ learn_rate = 0.001
 
 [corpora.pretrain]
 @readers = "spacy.JsonlReader.v1"
-path = ${paths.raw}
+path = ${paths.raw_text}
 min_length = 5
 max_length = 500
 limit = 0
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 6553892d3..b98498b8b 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -3,7 +3,6 @@ from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
-from pydantic import root_validator
 from thinc.config import Promise
 from collections import defaultdict
 from thinc.api import Optimizer
@@ -205,8 +204,6 @@ class ModelMetaSchema(BaseModel):
 
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
-    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
-    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -219,8 +216,6 @@ class ConfigSchemaTraining(BaseModel):
     gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
     score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
-    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     optimizer: Optimizer = Field(..., title="The optimizer to use")
     logger: Logger = Field(..., title="The logger to track training progress")
     frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
@@ -275,11 +270,10 @@ class ConfigSchemaPretrain(BaseModel):
 
 class ConfigSchemaInitVocab(BaseModel):
     # fmt: off
-    data: Optional[str] = Field(..., title="Path to JSON-formatted vocabulary file")
+    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
     lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     # fmt: on
 
     class Config:
@@ -290,7 +284,7 @@ class ConfigSchemaInitVocab(BaseModel):
 class ConfigSchemaInit(BaseModel):
     vocab: ConfigSchemaInitVocab
     tokenizer: Any
-    components: Dict[str, Any]
+    components: Dict[StrictStr, Any]
 
     class Config:
         extra = "forbid"

From a5f2cc05090a3fde472b7a61958cc08c86099a8f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 12:30:13 +0200
Subject: [PATCH 213/516] Tidy up and remove raw text (rehearsal) for now

---
 spacy/cli/init_pipeline.py | 14 --------
 spacy/cli/train.py         | 67 ++++++++++++++++++--------------------
 spacy/default_config.cfg   |  1 -
 3 files changed, 31 insertions(+), 51 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 78d828719..a2fd4c838 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -42,20 +42,6 @@ def init_pipeline_cli(
     msg.good(f"Saved initialized pipeline to {output_path}")
 
 
-def must_initialize(init_path: Path, config_path: Path, overrides: Dict) -> bool:
-    config = util.load_config(config_path, overrides=overrides)
-    if not init_path.exists():
-        return True
-    elif not (init_path / "config.cfg").exists():
-        return True
-    else:
-        init_cfg = util.load_config(init_path / "config.cfg", interpolate=True)
-        if config.to_str() != init_cfg.to_str():
-            return True
-        else:
-            return False
-
-
 def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     raw_config = config
     config = raw_config.interpolate()
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index d69b3bd36..e179a1e3d 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -10,13 +10,12 @@ import random
 import typer
 import logging
 
-from .init_pipeline import init_pipeline, must_initialize
+from .init_pipeline import init_pipeline
 from .init_pipeline import create_before_to_disk_callback
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code
 from ..language import Language
 from .. import util
-from ..training.example import Example
 from ..errors import Errors
 from ..util import resolve_dot_names, registry
 from ..schemas import ConfigSchemaTraining
@@ -69,24 +68,39 @@ def train_cli(
 def init_nlp(
     config: Config, output_path: Optional[Path], init_path: Optional[Path]
 ) -> None:
-
     if init_path is not None:
         nlp = util.load_model(init_path)
-        # TODO: how to handle provided pipeline that needs to be reinitialized?
+        if must_reinitialize(config, nlp.config):
+            msg.fail(
+                f"Config has changed: can't use initialized pipeline from "
+                f"{init_path}. Please re-run 'spacy init nlp'.",
+                exits=1,
+            )
         msg.good(f"Loaded initialized pipeline from {init_path}")
         return nlp
     if output_path is not None:
         output_init_path = output_path / "model-initial"
-        if must_initialize(config, output_init_path):
-            msg.warn("TODO:")
+        if not output_init_path.exists():
+            msg.info(f"Initializing the pipeline in {output_init_path}")
             nlp = init_pipeline(config)
-            nlp.to_disk(init_path)
+            nlp.to_disk(output_init_path)
             msg.good(f"Saved initialized pipeline to {output_init_path}")
         else:
             nlp = util.load_model(output_init_path)
-            msg.good(f"Loaded initialized pipeline from {output_init_path}")
+            if must_reinitialize(config, nlp.config):
+                msg.warn("Config has changed: need to re-initialize pipeline")
+                nlp = init_pipeline(config)
+                nlp.to_disk(output_init_path)
+                msg.good(f"Re-initialized pipeline in {output_init_path}")
+            else:
+                msg.good(f"Loaded initialized pipeline from {output_init_path}")
         return nlp
-    msg.warn("TODO:")
+    msg.warn(
+        "Not saving initialized model: no output directory specified. "
+        "To speed up training, spaCy can save the initialized nlp object with "
+        "the vocabulary, vectors and label scheme. To take advantage of this, "
+        "provide an output directory or use the 'spacy init nlp' command."
+    )
     return init_pipeline(config)
 
 
@@ -101,8 +115,8 @@ def train(
     if use_gpu >= 0 and allocator:
         set_gpu_allocator(allocator)
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
-    train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     optimizer = T["optimizer"]
     score_weights = T["score_weights"]
     batcher = T["batcher"]
@@ -121,7 +135,6 @@ def train(
         patience=T["patience"],
         max_steps=T["max_steps"],
         eval_frequency=T["eval_frequency"],
-        raw_text=raw_text,
         exclude=frozen_components,
     )
     msg.info(f"Pipeline: {nlp.pipe_names}")
@@ -171,6 +184,11 @@ def train(
             msg.good(f"Saved pipeline to output directory {final_model_path}")
 
 
+def must_reinitialize(train_config: Config, init_config: Config) -> bool:
+    # TODO: do this better and more fine-grained
+    return train_config.interpolate().to_str() == init_config.interpolate().to_str()
+
+
 def add_vectors(nlp: Language, vectors: str) -> None:
     title = f"Config validation error for vectors {vectors}"
     desc = (
@@ -235,7 +253,6 @@ def train_while_improving(
     accumulate_gradient: int,
     patience: int,
     max_steps: int,
-    raw_text: List[Dict[str, str]],
     exclude: List[str],
 ):
     """Train until an evaluation stops improving. Works as a generator,
@@ -282,27 +299,14 @@ def train_while_improving(
         dropouts = dropout
     results = []
     losses = {}
-    if raw_text:
-        random.shuffle(raw_text)
-        raw_examples = [
-            Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
-        ]
-        raw_batches = util.minibatch(raw_examples, size=8)
-
     words_seen = 0
     start_time = timer()
     for step, (epoch, batch) in enumerate(train_data):
         dropout = next(dropouts)
         for subbatch in subdivide_batch(batch, accumulate_gradient):
-
             nlp.update(
                 subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
             )
-            if raw_text:
-                # If raw text is available, perform 'rehearsal' updates,
-                # which use unlabelled data to reduce overfitting.
-                raw_batch = list(next(raw_batches))
-                nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude)
         # TODO: refactor this so we don't have to run it separately in here
         for name, proc in nlp.pipeline:
             if (
@@ -386,15 +390,6 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No
 def load_from_paths(
     config: Config,
 ) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
-    import srsly
-    # TODO: separate checks from loading
-    raw_text = util.ensure_path(config["training"]["raw_text"])
-    if raw_text is not None:
-        if not raw_text.exists():
-            msg.fail("Can't find raw text", raw_text, exits=1)
-        raw_text = list(srsly.read_jsonl(config["training"]["raw_text"]))
-    tag_map = {}
-    morph_rules = {}
     weights_data = None
     init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
     if init_tok2vec is not None:
@@ -402,4 +397,4 @@ def load_from_paths(
             msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
         with init_tok2vec.open("rb") as file_:
             weights_data = file_.read()
-    return raw_text, tag_map, morph_rules, weights_data
+    return None, {}, {}, weights_data
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 083b6a702..86293fd40 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -1,7 +1,6 @@
 [paths]
 train = ""
 dev = ""
-raw_text = null
 init_tok2vec = null
 vocab_data = null
 

From f49288ab81d9d2b2095eb5513b6fc79fcc68cac1 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 12:31:54 +0200
Subject: [PATCH 214/516] Update default_config_pretraining.cfg

---
 spacy/default_config_pretraining.cfg | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg
index 122a7803a..4011159a4 100644
--- a/spacy/default_config_pretraining.cfg
+++ b/spacy/default_config_pretraining.cfg
@@ -1,3 +1,6 @@
+[paths]
+raw_text = null
+
 [pretraining]
 max_epochs = 1000
 dropout = 0.2

From c22ecc66bbed5a98242d0b8b45c145f6abc5598f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 12:46:28 +0200
Subject: [PATCH 215/516] Don't support init path for now

---
 spacy/cli/init_pipeline.py |  5 +++--
 spacy/cli/train.py         | 37 ++++++++++++-------------------------
 2 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index a2fd4c838..e64683fe1 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -20,7 +20,9 @@ DEFAULT_OOV_PROB = -20
 
 
 @init_cli.command(
-    "nlp", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    "nlp",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    hidden=True,
 )
 def init_pipeline_cli(
     # fmt: off
@@ -38,7 +40,6 @@ def init_pipeline_cli(
         config = util.load_config(config_path, overrides=overrides)
     nlp = init_pipeline(config)
     nlp.to_disk(output_path)
-    # TODO: add more instructions
     msg.good(f"Saved initialized pipeline to {output_path}")
 
 
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index e179a1e3d..afaf230d1 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -30,7 +30,6 @@ def train_cli(
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
-    init_path: Optional[Path] = Opt(None, "--init", "-i", help="Path to already initialized pipeline directory, e.g. created with 'spacy init pipeline' (will speed up training)"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
@@ -60,46 +59,34 @@ def train_cli(
         msg.info("Using CPU")
     config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
-    nlp = init_nlp(config, output_path, init_path)
+    nlp = init_nlp(config, output_path)
     msg.divider("Training pipeline")
     train(nlp, output_path, use_gpu=use_gpu)
 
 
-def init_nlp(
-    config: Config, output_path: Optional[Path], init_path: Optional[Path]
-) -> None:
-    if init_path is not None:
-        nlp = util.load_model(init_path)
-        if must_reinitialize(config, nlp.config):
-            msg.fail(
-                f"Config has changed: can't use initialized pipeline from "
-                f"{init_path}. Please re-run 'spacy init nlp'.",
-                exits=1,
-            )
-        msg.good(f"Loaded initialized pipeline from {init_path}")
-        return nlp
+def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
     if output_path is not None:
-        output_init_path = output_path / "model-initial"
-        if not output_init_path.exists():
-            msg.info(f"Initializing the pipeline in {output_init_path}")
+        init_path = output_path / "model-initial"
+        if not init_path.exists():
+            msg.info(f"Initializing the pipeline in {init_path}")
             nlp = init_pipeline(config)
-            nlp.to_disk(output_init_path)
-            msg.good(f"Saved initialized pipeline to {output_init_path}")
+            nlp.to_disk(init_path)
+            msg.good(f"Saved initialized pipeline to {init_path}")
         else:
-            nlp = util.load_model(output_init_path)
+            nlp = util.load_model(init_path)
             if must_reinitialize(config, nlp.config):
                 msg.warn("Config has changed: need to re-initialize pipeline")
                 nlp = init_pipeline(config)
-                nlp.to_disk(output_init_path)
-                msg.good(f"Re-initialized pipeline in {output_init_path}")
+                nlp.to_disk(init_path)
+                msg.good(f"Re-initialized pipeline in {init_path}")
             else:
-                msg.good(f"Loaded initialized pipeline from {output_init_path}")
+                msg.good(f"Loaded initialized pipeline from {init_path}")
         return nlp
     msg.warn(
         "Not saving initialized model: no output directory specified. "
         "To speed up training, spaCy can save the initialized nlp object with "
         "the vocabulary, vectors and label scheme. To take advantage of this, "
-        "provide an output directory or use the 'spacy init nlp' command."
+        "provide an output directory."
     )
     return init_pipeline(config)
 

From a62337b3f381b061b2ec27e6d9e9ba718276131b Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 12:53:06 +0200
Subject: [PATCH 216/516] Tidy up vocab init

---
 spacy/cli/init_pipeline.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index e64683fe1..28e314d0a 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -63,11 +63,7 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
     V = I["vocab"]
-    init_vocab(nlp, data=V["data"], lookups=V["lookups"])
-    msg.good("Created vocabulary")
-    if V["vectors"] is not None:
-        add_vectors(nlp, V["vectors"])
-        msg.good(f"Added vectors: {V['vectors']}")
+    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
@@ -94,7 +90,11 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
 
 
 def init_vocab(
-    nlp: Language, *, data: Optional[Path] = None, lookups: Optional[Lookups] = None,
+    nlp: Language,
+    *,
+    data: Optional[Path] = None,
+    lookups: Optional[Lookups] = None,
+    vectors: Optional[str] = None,
 ) -> Language:
     if lookups:
         nlp.vocab.lookups = lookups
@@ -115,6 +115,10 @@ def init_vocab(
             oov_prob = DEFAULT_OOV_PROB
         nlp.vocab.cfg.update({"oov_prob": oov_prob})
         msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    msg.good("Created vocabulary")
+    if vectors is not None:
+        add_vectors(nlp, vectors)
+        msg.good(f"Added vectors: {V['vectors']}")
 
 
 def add_tok2vec_weights(

From a89e0ff7cb6cb120652ca7994e078778d2b8804a Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 12:55:21 +0200
Subject: [PATCH 217/516] Fix typo

---
 spacy/cli/init_pipeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 28e314d0a..0c4b6ec70 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -118,7 +118,7 @@ def init_vocab(
     msg.good("Created vocabulary")
     if vectors is not None:
         add_vectors(nlp, vectors)
-        msg.good(f"Added vectors: {V['vectors']}")
+        msg.good(f"Added vectors: {vectors}")
 
 
 def add_tok2vec_weights(

From 822ea4ef619072a94ce565bf78add9f9ea9d2866 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 15:09:59 +0200
Subject: [PATCH 218/516] Refactor CLI

---
 spacy/cli/__init__.py                |   3 +-
 spacy/cli/_util.py                   |  35 +--
 spacy/cli/debug_config.py            |  28 +--
 spacy/cli/debug_data.py              |  27 +--
 spacy/cli/debug_model.py             |  18 +-
 spacy/cli/evaluate.py                |   7 +-
 spacy/cli/init_pipeline.py           | 185 +--------------
 spacy/cli/pretrain.py                | 269 +---------------------
 spacy/cli/train.py                   | 330 ++-------------------------
 spacy/tests/pipeline/test_textcat.py |   4 +-
 spacy/tests/test_cli.py              |  13 --
 spacy/tests/test_misc.py             |  14 --
 spacy/tests/test_util.py             |  31 ++-
 spacy/tests/training/test_readers.py |  22 +-
 spacy/training/initialize.py         | 205 +++++++++++++++++
 spacy/training/loop.py               | 301 ++++++++++++++++++++++++
 spacy/training/pretrain.py           | 267 ++++++++++++++++++++++
 spacy/util.py                        |  71 +++---
 18 files changed, 917 insertions(+), 913 deletions(-)
 create mode 100644 spacy/training/loop.py
 create mode 100644 spacy/training/pretrain.py

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 5569e630d..7368bcef3 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -15,8 +15,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
-#from .init_model import init_model  # noqa: F401
-from .init_pipeline import init_pipeline  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 7ff2c6199..c41905970 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -10,13 +10,12 @@ from click import NoSuchOption
 from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config, ConfigValidationError, require_gpu
 from configparser import InterpolationError
 import os
 
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import ensure_path
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -276,18 +275,6 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
             msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
 
 
-def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
-    """RETURNS (List[str]): All sourced components in the original config,
-        e.g. {"source": "en_core_web_sm"}. If the config contains a key
-        "factory", we assume it refers to a component factory.
-    """
-    return [
-        name
-        for name, cfg in config.get("components", {}).items()
-        if "factory" not in cfg and "source" in cfg
-    ]
-
-
 def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
     """Upload a file.
 
@@ -459,3 +446,23 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
             p = int(p)
         result.append(p)
     return result
+
+
+class CliLogger:
+    """Helper mocking up the most commonly used logger methods. Can be passed
+    into functions like train() to make them output pretty-printed messages
+    on the CLI and regular logging if used from within Python.
+    """
+
+    debug = msg.text
+    info = msg.info
+    warn = msg.info
+    error = msg.fail
+
+
+def setup_gpu(use_gpu: int):
+    if use_gpu >= 0:
+        msg.info(f"Using GPU: {use_gpu}")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index 131fecf6d..d1dcc45b9 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -1,7 +1,7 @@
 from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config
 from thinc.config import VARIABLE_RE
 import typer
 
@@ -52,10 +52,8 @@ def debug_config(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
         nlp = util.load_model_from_config(config)
-        # Use the resolved config here in case user has one function returning
-        # a dict of corpora etc.
-        resolved = util.resolve_training_config(nlp.config)
-        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
+        dot_names = ["training.dev_corpus", "training.train_corpus"]
+        util.resolve_dot_names(nlp.config, dot_names)
     msg.good("Config is valid")
     if show_vars:
         variables = get_variables(config)
@@ -97,23 +95,3 @@ def get_variables(config: Config) -> Dict[str, Any]:
         value = util.dot_to_object(config, path)
         result[variable] = repr(value)
     return result
-
-
-def check_section_refs(config: Config, fields: List[str]) -> None:
-    """Validate fields in the config that refer to other sections or values
-    (e.g. in the corpora) and make sure that those references exist.
-    """
-    errors = []
-    for field in fields:
-        # If the field doesn't exist in the config, we ignore it
-        try:
-            value = util.dot_to_object(config, field)
-        except KeyError:
-            continue
-        try:
-            util.dot_to_object(config, value)
-        except KeyError:
-            msg = f"not a valid section reference: {value}"
-            errors.append({"loc": field.split("."), "msg": msg})
-    if errors:
-        raise ConfigValidationError(config=config, errors=errors)
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 302bfd563..f0e76be2b 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -7,10 +7,13 @@ from wasabi import Printer, MESSAGES, msg
 import typer
 
 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, get_sourced_components
+from ._util import import_code, debug_cli
 from ..training import Corpus, Example
+from ..training.initialize import get_sourced_components
+from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
+from ..util import registry
 from .. import util
 
 
@@ -94,26 +97,13 @@ def debug_data(
     with show_validation_error(config_path):
         cfg = util.load_config(config_path, overrides=config_overrides)
         nlp = util.load_model_from_config(cfg)
-        C = util.resolve_training_config(nlp.config)
+        T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
     # Use original config here, not resolved version
     sourced_components = get_sourced_components(cfg)
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]
     resume_components = [p for p in sourced_components if p not in frozen_components]
     pipeline = nlp.pipe_names
     factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
-    tag_map_path = util.ensure_path(C["training"]["tag_map"])
-    tag_map = {}
-    if tag_map_path is not None:
-        tag_map = srsly.read_json(tag_map_path)
-    morph_rules_path = util.ensure_path(C["training"]["morph_rules"])
-    morph_rules = {}
-    if morph_rules_path is not None:
-        morph_rules = srsly.read_json(morph_rules_path)
-    # Replace tag map with provided mapping
-    nlp.vocab.morphology.load_tag_map(tag_map)
-    # Load morph rules
-    nlp.vocab.morphology.load_morph_exceptions(morph_rules)
-
     msg.divider("Data file validation")
 
     # Create the gold corpus to be able to better analyze data
@@ -145,10 +135,10 @@ def debug_data(
 
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
-    frozen_components = C["training"]["frozen_components"]
+    frozen_components = T["frozen_components"]
 
     msg.divider("Training stats")
-    msg.text(f"Language: {C['nlp']['lang']}")
+    msg.text(f"Language: {nlp.lang}")
     msg.text(f"Training pipeline: {', '.join(pipeline)}")
     if resume_components:
         msg.text(f"Components from other pipelines: {', '.join(resume_components)}")
@@ -355,6 +345,7 @@ def debug_data(
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
+        # TODO: does this need to be updated?
         tag_map = nlp.vocab.morphology.tag_map
         msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
         labels_with_counts = _format_labels(
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 6f554ed2d..f8fc687fa 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -4,12 +4,14 @@ from pathlib import Path
 from spacy.training import Example
 from spacy.util import dot_to_object
 from wasabi import msg
-from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+from thinc.api import fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation, set_gpu_allocator
 import typer
 
 from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list
+from ._util import parse_config_overrides, string_to_list, setup_gpu
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util
 
 
@@ -37,11 +39,7 @@ def debug_model_cli(
 
     DOCS: https://nightly.spacy.io/api/cli#debug-model
     """
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     layers = string_to_list(layers, intify=True)
     print_settings = {
         "dimensions": dimensions,
@@ -65,8 +63,8 @@ def debug_model_cli(
         set_gpu_allocator(allocator)
     with show_validation_error(config_path):
         nlp = util.load_model_from_config(raw_config)
-        C = util.resolve_training_config(nlp.config)
-    seed = C["training"]["seed"]
+        T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    seed = T["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
@@ -77,7 +75,7 @@ def debug_model_cli(
             exits=1,
         )
     model = pipe.model
-    debug_model(C, nlp, model, print_settings=print_settings)
+    debug_model(T, nlp, model, print_settings=print_settings)
 
 
 def debug_model(
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index f9954d9ad..4c1eeb9e8 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -3,11 +3,11 @@ from wasabi import Printer
 from pathlib import Path
 import re
 import srsly
-from thinc.api import require_gpu, fix_random_seed
+from thinc.api import fix_random_seed
 
 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, setup_gpu
 from ..scorer import Scorer
 from .. import util
 from .. import displacy
@@ -61,8 +61,7 @@ def evaluate(
 ) -> Scorer:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
-    if use_gpu >= 0:
-        require_gpu(use_gpu)
+    setup_gpu(use_gpu)
     data_path = util.ensure_path(data_path)
     output_path = util.ensure_path(output)
     displacy_path = util.ensure_path(displacy_path)
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 0c4b6ec70..de1dc8a46 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -1,22 +1,13 @@
-from typing import Optional, Dict, Callable, Any
+from typing import Optional
 import logging
 from pathlib import Path
 from wasabi import msg
 import typer
-from thinc.api import Config, fix_random_seed, set_gpu_allocator
-import srsly
 
 from .. import util
-from ..util import registry, resolve_dot_names, OOV_RANK
-from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain, ConfigSchemaInit
-from ..language import Language
-from ..lookups import Lookups
-from ..errors import Errors
+from ..training.initialize import init_nlp
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, get_sourced_components
-
-
-DEFAULT_OOV_PROB = -20
+from ._util import import_code, CliLogger, setup_gpu
 
 
 @init_cli.command(
@@ -31,178 +22,16 @@ def init_pipeline_cli(
     output_path: Path = Arg(..., help="Output directory for the prepared data"),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
     util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
+    setup_gpu(use_gpu)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
-    nlp = init_pipeline(config)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_succcess=msg.good)
     nlp.to_disk(output_path)
     msg.good(f"Saved initialized pipeline to {output_path}")
-
-
-def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
-    raw_config = config
-    config = raw_config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    # Use original config here before it's resolved to functions
-    sourced_components = get_sourced_components(config)
-    with show_validation_error():
-        nlp = util.load_model_from_config(raw_config, auto_fill=True)
-    msg.good("Set up nlp object from config")
-    config = nlp.config.interpolate()
-    # Resolve all training-relevant sections using the filled nlp config
-    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"]]
-    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    V = I["vocab"]
-    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
-    optimizer = T["optimizer"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T["frozen_components"]
-    # Sourced components that require resume_training
-    resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if resume_components:
-        with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
-            nlp.resume_training(sgd=optimizer)
-    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
-        msg.good(f"Initialized pipeline components")
-    # Verify the config after calling 'begin_training' to ensure labels
-    # are properly initialized
-    verify_config(nlp)
-    if "pretraining" in config and config["pretraining"]:
-        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        add_tok2vec_weights(nlp, P, I)
-    # TODO: this should be handled better?
-    nlp = before_to_disk(nlp)
-    return nlp
-
-
-def init_vocab(
-    nlp: Language,
-    *,
-    data: Optional[Path] = None,
-    lookups: Optional[Lookups] = None,
-    vectors: Optional[str] = None,
-) -> Language:
-    if lookups:
-        nlp.vocab.lookups = lookups
-        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
-    data_path = util.ensure_path(data)
-    if data_path is not None:
-        lex_attrs = srsly.read_jsonl(data_path)
-        for lexeme in nlp.vocab:
-            lexeme.rank = OOV_RANK
-        for attrs in lex_attrs:
-            if "settings" in attrs:
-                continue
-            lexeme = nlp.vocab[attrs["orth"]]
-            lexeme.set_attrs(**attrs)
-        if len(nlp.vocab):
-            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
-        else:
-            oov_prob = DEFAULT_OOV_PROB
-        nlp.vocab.cfg.update({"oov_prob": oov_prob})
-        msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    msg.good("Created vocabulary")
-    if vectors is not None:
-        add_vectors(nlp, vectors)
-        msg.good(f"Added vectors: {vectors}")
-
-
-def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
-) -> None:
-    # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
-    P = pretrain_config
-    V = vocab_config
-    weights_data = None
-    init_tok2vec = util.ensure_path(V["init_tok2vec"])
-    if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not V["vectors"]:
-            err = "Need initialize.vectors if pretraining.objective.type is vectors"
-            msg.fail(err, exits=1)
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    if weights_data is not None:
-        tok2vec_component = P["component"]
-        if tok2vec_component is None:
-            msg.fail(
-                f"To use pretrained tok2vec weights, [pretraining.component] "
-                f"needs to specify the component that should load them.",
-                exits=1,
-            )
-        layer = nlp.get_pipe(tok2vec_component).model
-        if P["layer"]:
-            layer = layer.get_ref(P["layer"])
-        layer.from_bytes(weights_data)
-        msg.good(f"Loaded pretrained weights into component '{tok2vec_component}'")
-
-
-def add_vectors(nlp: Language, vectors: str) -> None:
-    title = f"Config validation error for vectors {vectors}"
-    desc = (
-        "This typically means that there's a problem in the config.cfg included "
-        "with the packaged vectors. Make sure that the vectors package you're "
-        "loading is compatible with the current version of spaCy."
-    )
-    with show_validation_error(
-        title=title, desc=desc, hint_fill=False, show_config=False
-    ):
-        util.load_vectors_into_model(nlp, vectors)
-        msg(f"Added {len(nlp.vocab.vectors)} vectors from {vectors}")
-
-
-def verify_config(nlp: Language) -> None:
-    """Perform additional checks based on the config, loaded nlp object and training data."""
-    # TODO: maybe we should validate based on the actual components, the list
-    # in config["nlp"]["pipeline"] instead?
-    for pipe_config in nlp.config["components"].values():
-        # We can't assume that the component name == the factory
-        factory = pipe_config["factory"]
-        if factory == "textcat":
-            verify_textcat_config(nlp, pipe_config)
-
-
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
-    # if 'positive_label' is provided: double check whether it's in the data and
-    # the task is binary
-    if pipe_config.get("positive_label"):
-        textcat_labels = nlp.get_pipe("textcat").labels
-        pos_label = pipe_config.get("positive_label")
-        if pos_label not in textcat_labels:
-            raise ValueError(
-                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
-            )
-        if len(list(textcat_labels)) != 2:
-            raise ValueError(
-                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
-            )
-
-
-def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
-    def before_to_disk(nlp: Language) -> Language:
-        if not callback:
-            return nlp
-        modified_nlp = callback(nlp)
-        if not isinstance(modified_nlp, Language):
-            err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
-            raise ValueError(err)
-        return modified_nlp
-
-    return before_to_disk
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 29e220b95..6494486a9 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -1,25 +1,13 @@
 from typing import Optional
-import numpy
-import time
-import re
-from collections import Counter
 from pathlib import Path
-from thinc.api import require_gpu, set_gpu_allocator
-from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
-from thinc.api import Config, CosineDistance, L2Distance
 from wasabi import msg
-import srsly
-from functools import partial
 import typer
+import re
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
-from ..tokens import Doc
-from ..attrs import ID
-from .. import util
-from ..util import dot_to_object
+from ._util import import_code, setup_gpu, CliLogger
+from ..training.pretrain import pretrain
+from ..util import load_config
 
 
 @app.command(
@@ -61,15 +49,11 @@ def pretrain_cli(
     config_overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
-    if use_gpu >= 0:
-        msg.info("Using GPU")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
+    setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")
 
     with show_validation_error(config_path):
-        raw_config = util.load_config(
+        raw_config = load_config(
             config_path, overrides=config_overrides, interpolate=False
         )
     config = raw_config.interpolate()
@@ -89,250 +73,11 @@ def pretrain_cli(
         resume_path=resume_path,
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
+        logger=CliLogger,
     )
-
-
-def pretrain(
-    config: Config,
-    output_dir: Path,
-    resume_path: Optional[Path] = None,
-    epoch_resume: Optional[int] = None,
-    use_gpu: int = -1,
-):
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    nlp = util.load_model_from_config(config)
-    C = util.resolve_training_config(nlp.config)
-    P_cfg = C["pretraining"]
-    corpus = dot_to_object(C, P_cfg["corpus"])
-    batcher = P_cfg["batcher"]
-    model = create_pretraining_model(nlp, C["pretraining"])
-    optimizer = C["pretraining"]["optimizer"]
-    # Load in pretrained weights to resume from
-    if resume_path is not None:
-        _resume_model(model, resume_path, epoch_resume)
-    else:
-        # Without '--resume-path' the '--epoch-resume' argument is ignored
-        epoch_resume = 0
-
-    tracker = ProgressTracker(frequency=10000)
-    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
-    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
-    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
-
-    def _save_model(epoch, is_temp=False):
-        is_temp_str = ".temp" if is_temp else ""
-        with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
-                file_.write(model.get_ref("tok2vec").to_bytes())
-            log = {
-                "nr_word": tracker.nr_word,
-                "loss": tracker.loss,
-                "epoch_loss": tracker.epoch_loss,
-                "epoch": epoch,
-            }
-            with (output_dir / "log.jsonl").open("a") as file_:
-                file_.write(srsly.json_dumps(log) + "\n")
-
-    objective = create_objective(P_cfg["objective"])
-    # TODO: I think we probably want this to look more like the
-    # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P_cfg["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
-                _save_model(epoch, is_temp=True)
-        _save_model(epoch)
-        tracker.epoch_loss = 0.0
     msg.good("Successfully finished pretrain")
 
 
-def ensure_docs(examples_or_docs):
-    docs = []
-    for eg_or_doc in examples_or_docs:
-        if isinstance(eg_or_doc, Doc):
-            docs.append(eg_or_doc)
-        else:
-            docs.append(eg_or_doc.reference)
-    return docs
-
-
-def _resume_model(model, resume_path, epoch_resume):
-    msg.info(f"Resume training tok2vec from: {resume_path}")
-    with resume_path.open("rb") as file_:
-        weights_data = file_.read()
-        model.get_ref("tok2vec").from_bytes(weights_data)
-    # Parse the epoch number from the given weight file
-    model_name = re.search(r"model\d+\.bin", str(resume_path))
-    if model_name:
-        # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
-        epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-    else:
-        msg.info(f"Resuming from epoch: {epoch_resume}")
-
-
-def make_update(model, docs, optimizer, objective_func):
-    """Perform an update over a single batch of documents.
-
-    docs (iterable): A batch of `Doc` objects.
-    optimizer (callable): An optimizer.
-    RETURNS loss: A float for the loss.
-    """
-    predictions, backprop = model.begin_update(docs)
-    loss, gradients = objective_func(model.ops, docs, predictions)
-    backprop(gradients)
-    model.finish_update(optimizer)
-    # Don't want to return a cupy object here
-    # The gradients are modified in-place by the BERT MLM,
-    # so we get an accurate loss
-    return float(loss)
-
-
-def create_objective(config):
-    """Create the objective for pretraining.
-
-    We'd like to replace this with a registry function but it's tricky because
-    we're also making a model choice based on this. For now we hard-code support
-    for two types (characters, vectors). For characters you can specify
-    n_characters, for vectors you can specify the loss.
-
-    Bleh.
-    """
-    objective_type = config["type"]
-    if objective_type == "characters":
-        return partial(get_characters_loss, nr_char=config["n_characters"])
-    elif objective_type == "vectors":
-        if config["loss"] == "cosine":
-            return partial(
-                get_vectors_loss,
-                distance=CosineDistance(normalize=True, ignore_zeros=True),
-            )
-        elif config["loss"] == "L2":
-            return partial(
-                get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
-            )
-        else:
-            raise ValueError("Unexpected loss type", config["loss"])
-    else:
-        raise ValueError("Unexpected objective_type", objective_type)
-
-
-def get_vectors_loss(ops, docs, prediction, distance):
-    """Compute a loss based on a distance between the documents' vectors and
-    the prediction.
-    """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    d_target, loss = distance(prediction, target)
-    return loss, d_target
-
-
-def get_characters_loss(ops, docs, prediction, nr_char):
-    """Compute a loss based on a number of characters predicted from the docs."""
-    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
-    target_ids = target_ids.reshape((-1,))
-    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
-    target = target.reshape((-1, 256 * nr_char))
-    diff = prediction - target
-    loss = (diff ** 2).sum()
-    d_target = diff / float(prediction.shape[0])
-    return loss, d_target
-
-
-def create_pretraining_model(nlp, pretrain_config):
-    """Define a network for the pretraining. We simply add an output layer onto
-    the tok2vec input model. The tok2vec input model needs to be a model that
-    takes a batch of Doc objects (as a list), and returns a list of arrays.
-    Each array in the output needs to have one row per token in the doc.
-    The actual tok2vec layer is stored as a reference, and only this bit will be
-    serialized to file and read back in when calling the 'train' command.
-    """
-    component = nlp.get_pipe(pretrain_config["component"])
-    if pretrain_config.get("layer"):
-        tok2vec = component.model.get_ref(pretrain_config["layer"])
-    else:
-        tok2vec = component.model
-
-    # TODO
-    maxout_pieces = 3
-    hidden_size = 300
-    if pretrain_config["objective"]["type"] == "vectors":
-        model = build_cloze_multi_task_model(
-            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
-        )
-    elif pretrain_config["objective"]["type"] == "characters":
-        model = build_cloze_characters_multi_task_model(
-            nlp.vocab,
-            tok2vec,
-            hidden_size=hidden_size,
-            maxout_pieces=maxout_pieces,
-            nr_char=pretrain_config["objective"]["n_characters"],
-        )
-    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
-    set_dropout_rate(model, pretrain_config["dropout"])
-    return model
-
-
-class ProgressTracker:
-    def __init__(self, frequency=1000000):
-        self.loss = 0.0
-        self.prev_loss = 0.0
-        self.nr_word = 0
-        self.words_per_epoch = Counter()
-        self.frequency = frequency
-        self.last_time = time.time()
-        self.last_update = 0
-        self.epoch_loss = 0.0
-
-    def update(self, epoch, loss, docs):
-        self.loss += loss
-        self.epoch_loss += loss
-        words_in_batch = sum(len(doc) for doc in docs)
-        self.words_per_epoch[epoch] += words_in_batch
-        self.nr_word += words_in_batch
-        words_since_update = self.nr_word - self.last_update
-        if words_since_update >= self.frequency:
-            wps = words_since_update / (time.time() - self.last_time)
-            self.last_update = self.nr_word
-            self.last_time = time.time()
-            loss_per_word = self.loss - self.prev_loss
-            status = (
-                epoch,
-                self.nr_word,
-                _smart_round(self.loss, width=10),
-                _smart_round(loss_per_word, width=6),
-                int(wps),
-            )
-            self.prev_loss = float(self.loss)
-            return status
-        else:
-            return None
-
-
-def _smart_round(figure, width=10, max_decimal=4):
-    """Round large numbers as integers, smaller numbers as decimals."""
-    n_digits = len(str(int(figure)))
-    n_decimal = width - (n_digits + 1)
-    if n_decimal <= 1:
-        return str(int(figure))
-    else:
-        n_decimal = min(n_decimal, max_decimal)
-        format_str = "%." + str(n_decimal) + "f"
-        return format_str % figure
-
-
 def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
     if not config_path or not config_path.exists():
         msg.fail("Config file not found", config_path, exits=1)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index afaf230d1..aa0e71b5a 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,24 +1,16 @@
-from typing import Optional, Dict, Any, Tuple, Union, Callable, List
-from timeit import default_timer as timer
-import tqdm
+from typing import Optional
 from pathlib import Path
 from wasabi import msg
-import thinc
-import thinc.schedules
-from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
-import random
+from thinc.api import Config
 import typer
 import logging
 
-from .init_pipeline import init_pipeline
-from .init_pipeline import create_before_to_disk_callback
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
+from ._util import import_code, CliLogger, setup_gpu
 from ..language import Language
+from ..training.loop import train
+from ..training.initialize import init_nlp, must_reinitialize
 from .. import util
-from ..errors import Errors
-from ..util import resolve_dot_names, registry
-from ..schemas import ConfigSchemaTraining
 
 
 @app.command(
@@ -52,31 +44,33 @@ def train_cli(
     verify_cli_args(config_path, output_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
-    if use_gpu >= 0:
-        msg.info(f"Using GPU: {use_gpu}")
-        require_gpu(use_gpu)
-    else:
-        msg.info("Using CPU")
-    config = util.load_config(config_path, overrides=overrides, interpolate=False)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
-    nlp = init_nlp(config, output_path)
+    nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
     msg.divider("Training pipeline")
-    train(nlp, output_path, use_gpu=use_gpu)
+    final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger)
+    if final_path:
+        msg.good(f"Saved pipeline to output directory", final_path)
 
 
-def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
+def init_pipeline(
+    config: Config, output_path: Optional[Path], *, use_gpu: int = -1
+) -> Language:
+    init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good}
     if output_path is not None:
         init_path = output_path / "model-initial"
         if not init_path.exists():
             msg.info(f"Initializing the pipeline in {init_path}")
-            nlp = init_pipeline(config)
+            nlp = init_nlp(config, **init_kwargs)
             nlp.to_disk(init_path)
             msg.good(f"Saved initialized pipeline to {init_path}")
         else:
             nlp = util.load_model(init_path)
             if must_reinitialize(config, nlp.config):
                 msg.warn("Config has changed: need to re-initialize pipeline")
-                nlp = init_pipeline(config)
+                nlp = init_nlp(config, **init_kwargs)
                 nlp.to_disk(init_path)
                 msg.good(f"Re-initialized pipeline in {init_path}")
             else:
@@ -88,279 +82,7 @@ def init_nlp(config: Config, output_path: Optional[Path]) -> Language:
         "the vocabulary, vectors and label scheme. To take advantage of this, "
         "provide an output directory."
     )
-    return init_pipeline(config)
-
-
-def train(
-    nlp: Language, output_path: Optional[Path] = None, *, use_gpu: int = -1
-) -> None:
-    # Create iterator, which yields out info after each optimization step.
-    config = nlp.config.interpolate()
-    if config["training"]["seed"] is not None:
-        fix_random_seed(config["training"]["seed"])
-    allocator = config["training"]["gpu_allocator"]
-    if use_gpu >= 0 and allocator:
-        set_gpu_allocator(allocator)
-    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"]]
-    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    optimizer = T["optimizer"]
-    score_weights = T["score_weights"]
-    batcher = T["batcher"]
-    train_logger = T["logger"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
-    # Components that shouldn't be updated during training
-    frozen_components = T["frozen_components"]
-    # Create iterator, which yields out info after each optimization step.
-    training_step_iterator = train_while_improving(
-        nlp,
-        optimizer,
-        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
-        create_evaluation_callback(nlp, dev_corpus, score_weights),
-        dropout=T["dropout"],
-        accumulate_gradient=T["accumulate_gradient"],
-        patience=T["patience"],
-        max_steps=T["max_steps"],
-        eval_frequency=T["eval_frequency"],
-        exclude=frozen_components,
-    )
-    msg.info(f"Pipeline: {nlp.pipe_names}")
-    if frozen_components:
-        msg.info(f"Frozen components: {frozen_components}")
-    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
-    with nlp.select_pipes(disable=frozen_components):
-        print_row, finalize_logger = train_logger(nlp)
-
-    try:
-        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-        progress.set_description(f"Epoch 1")
-        for batch, info, is_best_checkpoint in training_step_iterator:
-            progress.update(1)
-            if is_best_checkpoint is not None:
-                progress.close()
-                print_row(info)
-                if is_best_checkpoint and output_path is not None:
-                    with nlp.select_pipes(disable=frozen_components):
-                        update_meta(T, nlp, info)
-                    with nlp.use_params(optimizer.averages):
-                        nlp = before_to_disk(nlp)
-                        nlp.to_disk(output_path / "model-best")
-                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-                progress.set_description(f"Epoch {info['epoch']}")
-    except Exception as e:
-        finalize_logger()
-        if output_path is not None:
-            # We don't want to swallow the traceback if we don't have a
-            # specific error.
-            msg.warn(
-                f"Aborting and saving the final best model. "
-                f"Encountered exception: {str(e)}"
-            )
-            nlp = before_to_disk(nlp)
-            nlp.to_disk(output_path / "model-final")
-        raise e
-    finally:
-        finalize_logger()
-        if output_path is not None:
-            final_model_path = output_path / "model-final"
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    nlp.to_disk(final_model_path)
-            else:
-                nlp.to_disk(final_model_path)
-            msg.good(f"Saved pipeline to output directory {final_model_path}")
-
-
-def must_reinitialize(train_config: Config, init_config: Config) -> bool:
-    # TODO: do this better and more fine-grained
-    return train_config.interpolate().to_str() == init_config.interpolate().to_str()
-
-
-def add_vectors(nlp: Language, vectors: str) -> None:
-    title = f"Config validation error for vectors {vectors}"
-    desc = (
-        "This typically means that there's a problem in the config.cfg included "
-        "with the packaged vectors. Make sure that the vectors package you're "
-        "loading is compatible with the current version of spaCy."
-    )
-    with show_validation_error(
-        title=title, desc=desc, hint_fill=False, show_config=False
-    ):
-        util.load_vectors_into_model(nlp, vectors)
-
-
-def create_train_batches(iterator, batcher, max_epochs: int):
-    epoch = 0
-    examples = list(iterator)
-    if not examples:
-        # Raise error if no data
-        raise ValueError(Errors.E986)
-    while max_epochs < 1 or epoch != max_epochs:
-        random.shuffle(examples)
-        for batch in batcher(examples):
-            yield epoch, batch
-        epoch += 1
-
-
-def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
-) -> Callable[[], Tuple[float, Dict[str, float]]]:
-    weights = {key: value for key, value in weights.items() if value is not None}
-
-    def evaluate() -> Tuple[float, Dict[str, float]]:
-        dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
-        # Calculate a weighted sum based on score_weights for the main score.
-        # We can only consider scores that are ints/floats, not dicts like
-        # entity scores per type etc.
-        for key, value in scores.items():
-            if key in weights and not isinstance(value, (int, float)):
-                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
-        try:
-            weighted_score = sum(
-                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
-            )
-        except KeyError as e:
-            keys = list(scores.keys())
-            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
-            raise KeyError(err) from None
-        return weighted_score, scores
-
-    return evaluate
-
-
-def train_while_improving(
-    nlp: Language,
-    optimizer: Optimizer,
-    train_data,
-    evaluate,
-    *,
-    dropout: float,
-    eval_frequency: int,
-    accumulate_gradient: int,
-    patience: int,
-    max_steps: int,
-    exclude: List[str],
-):
-    """Train until an evaluation stops improving. Works as a generator,
-    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
-    where info is a dict, and is_best_checkpoint is in [True, False, None] --
-    None indicating that the iteration was not evaluated as a checkpoint.
-    The evaluation is conducted by calling the evaluate callback.
-
-    Positional arguments:
-        nlp: The spaCy pipeline to evaluate.
-        optimizer: The optimizer callable.
-        train_data (Iterable[Batch]): A generator of batches, with the training
-            data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
-            data iterable needs to take care of iterating over the epochs and
-            shuffling.
-        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
-            The callback should take no arguments and return a tuple
-            `(main_score, other_scores)`. The main_score should be a float where
-            higher is better. other_scores can be any object.
-
-    Every iteration, the function yields out a tuple with:
-
-    * batch: A list of Example objects.
-    * info: A dict with various information about the last update (see below).
-    * is_best_checkpoint: A value in None, False, True, indicating whether this
-        was the best evaluation so far. You should use this to save the model
-        checkpoints during training. If None, evaluation was not conducted on
-        that iteration. False means evaluation was conducted, but a previous
-        evaluation was better.
-
-    The info dict provides the following information:
-
-        epoch (int): How many passes over the data have been completed.
-        step (int): How many steps have been completed.
-        score (float): The main score from the last evaluation.
-        other_scores: : The other scores from the last evaluation.
-        losses: The accumulated losses throughout training.
-        checkpoints: A list of previous results, where each result is a
-            (score, step, epoch) tuple.
-    """
-    if isinstance(dropout, float):
-        dropouts = thinc.schedules.constant(dropout)
-    else:
-        dropouts = dropout
-    results = []
-    losses = {}
-    words_seen = 0
-    start_time = timer()
-    for step, (epoch, batch) in enumerate(train_data):
-        dropout = next(dropouts)
-        for subbatch in subdivide_batch(batch, accumulate_gradient):
-            nlp.update(
-                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
-            )
-        # TODO: refactor this so we don't have to run it separately in here
-        for name, proc in nlp.pipeline:
-            if (
-                name not in exclude
-                and hasattr(proc, "model")
-                and proc.model not in (True, False, None)
-            ):
-                proc.model.finish_update(optimizer)
-        optimizer.step_schedules()
-        if not (step % eval_frequency):
-            if optimizer.averages:
-                with nlp.use_params(optimizer.averages):
-                    score, other_scores = evaluate()
-            else:
-                score, other_scores = evaluate()
-            results.append((score, step))
-            is_best_checkpoint = score == max(results)[0]
-        else:
-            score, other_scores = (None, None)
-            is_best_checkpoint = None
-        words_seen += sum(len(eg) for eg in batch)
-        info = {
-            "epoch": epoch,
-            "step": step,
-            "score": score,
-            "other_scores": other_scores,
-            "losses": losses,
-            "checkpoints": results,
-            "seconds": int(timer() - start_time),
-            "words": words_seen,
-        }
-        yield batch, info, is_best_checkpoint
-        if is_best_checkpoint is not None:
-            losses = {}
-        # Stop if no improvement in `patience` updates (if specified)
-        best_score, best_step = max(results)
-        if patience and (step - best_step) >= patience:
-            break
-        # Stop if we've exhausted our max steps (if specified)
-        if max_steps and step >= max_steps:
-            break
-
-
-def subdivide_batch(batch, accumulate_gradient):
-    batch = list(batch)
-    batch.sort(key=lambda eg: len(eg.predicted))
-    sub_len = len(batch) // accumulate_gradient
-    start = 0
-    for i in range(accumulate_gradient):
-        subbatch = batch[start : start + sub_len]
-        if subbatch:
-            yield subbatch
-        start += len(subbatch)
-    subbatch = batch[start:]
-    if subbatch:
-        yield subbatch
-
-
-def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
-) -> None:
-    nlp.meta["performance"] = {}
-    for metric in training["score_weights"]:
-        if metric is not None:
-            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
-    for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
+    return init_nlp(config, **init_kwargs)
 
 
 def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
@@ -371,17 +93,3 @@ def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> No
         if not output_path.exists():
             output_path.mkdir()
             msg.good(f"Created output directory: {output_path}")
-
-
-# TODO: this is currently imported by the ray extension and not used otherwise
-def load_from_paths(
-    config: Config,
-) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
-    weights_data = None
-    init_tok2vec = util.ensure_path(config["training"]["init_tok2vec"])
-    if init_tok2vec is not None:
-        if not init_tok2vec.exists():
-            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
-        with init_tok2vec.open("rb") as file_:
-            weights_data = file_.read()
-    return None, {}, {}, weights_data
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 232b53e1d..02e189834 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -9,10 +9,10 @@ from spacy.pipeline import TextCategorizer
 from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
+from spacy.training import Example
+from spacy.training.initialize import verify_textcat_config
 
 from ..util import make_tempdir
-from ...cli.train import verify_textcat_config
-from ...training import Example
 
 
 TRAIN_DATA = [
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index caf4ea890..ee103208c 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -7,7 +7,6 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
-from spacy.cli.debug_config import check_section_refs
 from thinc.api import ConfigValidationError, Config
 import srsly
 import os
@@ -414,15 +413,3 @@ def test_string_to_list(value):
 def test_string_to_list_intify(value):
     assert string_to_list(value, intify=False) == ["1", "2", "3"]
     assert string_to_list(value, intify=True) == [1, 2, 3]
-
-
-def test_check_section_refs():
-    config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
-    config = Config(config)
-    # Valid section reference
-    check_section_refs(config, ["a.b.c"])
-    # Section that doesn't exist in this config
-    check_section_refs(config, ["x.y.z"])
-    # Invalid section reference
-    with pytest.raises(ConfigValidationError):
-        check_section_refs(config, ["a.b.c", "f.g"])
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 4e079d29e..e6ef45f90 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -7,7 +7,6 @@ from spacy import util
 from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from thinc.api import Optimizer
 
 
 @pytest.fixture
@@ -158,16 +157,3 @@ def test_dot_to_dict(dot_notation, expected):
     result = util.dot_to_dict(dot_notation)
     assert result == expected
     assert util.dict_to_dot(result) == dot_notation
-
-
-def test_resolve_training_config():
-    config = {
-        "nlp": {"lang": "en", "disabled": []},
-        "training": {"dropout": 0.1, "optimizer": {"@optimizers": "Adam.v1"}},
-        "corpora": {},
-    }
-    resolved = util.resolve_training_config(config)
-    assert resolved["training"]["dropout"] == 0.1
-    assert isinstance(resolved["training"]["optimizer"], Optimizer)
-    assert resolved["corpora"] == {}
-    assert "nlp" not in resolved
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
index 0647b8556..f48cfba00 100644
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@@ -1,14 +1,15 @@
 import pytest
 
-from .util import get_random_doc
-
 from spacy import util
 from spacy.util import dot_to_object, SimpleFrozenList
-from thinc.api import Config, Optimizer
+from thinc.api import Config, Optimizer, ConfigValidationError
 from spacy.training.batchers import minibatch_by_words
-from ..lang.en import English
-from ..lang.nl import Dutch
-from ..language import DEFAULT_CONFIG_PATH
+from spacy.lang.en import English
+from spacy.lang.nl import Dutch
+from spacy.language import DEFAULT_CONFIG_PATH
+from spacy.schemas import ConfigSchemaTraining
+
+from .util import get_random_doc
 
 
 @pytest.mark.parametrize(
@@ -101,8 +102,8 @@ def test_util_dot_section():
         dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
     with pytest.raises(KeyError):
         dot_to_object(en_nlp.config, "nlp.unknownattribute")
-    resolved = util.resolve_training_config(nl_nlp.config)
-    assert isinstance(dot_to_object(resolved, "training.optimizer"), Optimizer)
+    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
+    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
 
 
 def test_simple_frozen_list():
@@ -120,3 +121,17 @@ def test_simple_frozen_list():
     t = SimpleFrozenList(["foo", "bar"], error="Error!")
     with pytest.raises(NotImplementedError):
         t.append("baz")
+
+
+def test_resolve_dot_names():
+    config = {
+        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
+        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
+    }
+    result = util.resolve_dot_names(config, ["foo.bar"])
+    assert isinstance(result[0], Optimizer)
+    with pytest.raises(ConfigValidationError) as e:
+        util.resolve_dot_names(config, ["foo.baz", "foo.bar"])
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ["training", "xyz"]
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index c06c9d282..7d94d5ddc 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -2,8 +2,8 @@ from typing import Dict, Iterable, Callable
 import pytest
 from thinc.api import Config
 from spacy import Language
-from spacy.util import load_model_from_config, registry, dot_to_object
-from spacy.util import resolve_training_config
+from spacy.util import load_model_from_config, registry, resolve_dot_names
+from spacy.schemas import ConfigSchemaTraining
 from spacy.training import Example
 
 
@@ -39,21 +39,21 @@ def test_readers():
 
     config = Config().from_str(config_string)
     nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
+    dot_names = ["training.train_corpus", "training.dev_corpus"]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
     assert isinstance(train_corpus, Callable)
-    optimizer = resolved["training"]["optimizer"]
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
         nlp.update([example], sgd=optimizer)
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
     scores = nlp.evaluate(list(dev_corpus(nlp)))
     assert scores["cats_score"]
     # ensure the pipeline runs
     doc = nlp("Quick test")
     assert doc.cats
-    extra_corpus = resolved["corpora"]["extra"]
+    extra_corpus = registry.resolve(nlp.config["corpora"])["extra"]
     assert isinstance(extra_corpus, Callable)
 
 
@@ -89,9 +89,10 @@ def test_cat_readers(reader, additional_config):
     config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
-    resolved = resolve_training_config(nlp.config)
-    train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
-    optimizer = resolved["training"]["optimizer"]
+    dot_names = ["training.train_corpus", "training.dev_corpus"]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
@@ -100,7 +101,6 @@ def test_cat_readers(reader, additional_config):
         assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
         nlp.update([example], sgd=optimizer)
     # simulate performance benchmark on dev corpus
-    dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
     dev_examples = list(dev_corpus(nlp))
     for example in dev_examples:
         # this shouldn't fail if each dev example has at least one positive label
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index e69de29bb..8938886fe 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -0,0 +1,205 @@
+from typing import Union, Dict, Optional, Any, List, Callable
+from thinc.api import Config, fix_random_seed, set_gpu_allocator
+from thinc.api import ConfigValidationError
+from pathlib import Path
+import srsly
+
+from .loop import create_before_to_disk_callback
+from ..language import Language
+from ..lookups import Lookups
+from ..errors import Errors
+from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
+from ..util import registry, load_model_from_config, resolve_dot_names
+from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB
+
+
+def init_nlp(  # Create an nlp object from a config and initialize it for training
+    config: Config,  # config to build the nlp object and the training settings from
+    *,
+    use_gpu: int = -1,  # the GPU allocator is only set if this is >= 0
+    logger: Callable[[Any], Any] = logger,  # needs an .info() method
+    on_success: Callable[[str], None] = lambda x: None,  # called with progress messages
+) -> Language:  # the initialized nlp object, as returned by the before_to_disk callback
+    raw_config = config  # keep the config as passed in for creating the nlp object
+    config = raw_config.interpolate()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    # Use original config here before it's resolved to functions
+    sourced_components = get_sourced_components(config)
+    nlp = load_model_from_config(raw_config, auto_fill=True)
+    on_success("Set up nlp object from config")
+    config = nlp.config.interpolate()  # from here on: the auto-filled config of the nlp object
+    # Resolve all training-relevant sections using the filled nlp config
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)  # dev_corpus is unused below
+    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+    V = I["vocab"]
+    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
+    optimizer = T["optimizer"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Sourced components that require resume_training
+    resume_components = [p for p in sourced_components if p not in frozen_components]
+    logger.info(f"Pipeline: {nlp.pipe_names}")
+    if resume_components:
+        with nlp.select_pipes(enable=resume_components):
+            logger.info(f"Resuming training for: {resume_components}")
+            nlp.resume_training(sgd=optimizer)
+    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):  # all remaining pipes
+        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+        on_success(f"Initialized pipeline components")
+    # Verify the config after calling 'begin_training' to ensure labels
+    # are properly initialized
+    verify_config(nlp)
+    if "pretraining" in config and config["pretraining"]:
+        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
+        loaded = add_tok2vec_weights(nlp, P, I)  # NOTE(review): passes I, not V = I["vocab"] - confirm
+        if loaded and P["component"]:
+            on_success(f"Loaded pretrained weights into component '{P['component']}'")
+    nlp = before_to_disk(nlp)
+    return nlp
+
+
+def must_reinitialize(train_config: Config, init_config: Config) -> bool:
+    """True if the interpolated configs differ. TODO: compare more fine-grained."""
+    return train_config.interpolate().to_str() != init_config.interpolate().to_str()
+
+
+def init_vocab(
+    nlp: Language,
+    *,
+    data: Optional[Path] = None,
+    lookups: Optional[Lookups] = None,
+    vectors: Optional[str] = None,
+    on_success: Callable[[str], None] = lambda x: None,
+) -> None:
+    """Set lookups, lexeme attributes (from JSONL) and vectors on nlp.vocab in place.
+
+    Nothing is returned; on_success is called with a message after each step.
+    """
+    if lookups:
+        nlp.vocab.lookups = lookups
+        on_success(f"Added vocab lookups: {', '.join(lookups.tables)}")
+    data_path = ensure_path(data)
+    if data_path is not None:
+        lex_attrs = srsly.read_jsonl(data_path)
+        for lexeme in nlp.vocab:
+            lexeme.rank = OOV_RANK
+        for attrs in lex_attrs:
+            if "settings" not in attrs:  # records with a "settings" key are skipped
+                nlp.vocab[attrs["orth"]].set_attrs(**attrs)
+        n_lex = len(nlp.vocab)  # one below the smallest prob, or the default if empty
+        oov_prob = min(lex.prob for lex in nlp.vocab) - 1 if n_lex else DEFAULT_OOV_PROB
+        nlp.vocab.cfg.update({"oov_prob": oov_prob})
+        on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    on_success("Created vocabulary")
+    if vectors is not None:
+        load_vectors_into_model(nlp, vectors)
+        on_success(f"Added vectors: {vectors}")
+
+
+def load_vectors_into_model(
+    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
+) -> None:
+    """Load word vectors from an installed model or path into a model instance.
+
+    add_strings (bool): Also add the strings of all vector keys to nlp.vocab.
+    """
+    try:
+        vectors_nlp = load_model(name)
+    except ConfigValidationError as e:
+        title = f"Config validation error for vectors {name}"
+        desc = (
+            "This typically means that there's a problem in the config.cfg included "
+            "with the packaged vectors. Make sure that the vectors package you're "
+            "loading is compatible with the current version of spaCy."
+        )
+        raise ConfigValidationError.from_error(e, title=title, desc=desc) from None
+    nlp.vocab.vectors = vectors_nlp.vocab.vectors
+    if add_strings:  # e.g. if someone does a similarity query, they might expect the strings
+        for key in nlp.vocab.vectors.key2row:
+            if key in vectors_nlp.vocab.strings:
+                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
+
+
+def add_tok2vec_weights(
+    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+) -> bool:
+    """Load pretrained tok2vec weights (cf. CLI command 'pretrain') into a component.
+
+    RETURNS (bool): True if weights were loaded, False if no init_tok2vec is set.
+    """
+    weights_path = ensure_path(vocab_config["init_tok2vec"])
+    if weights_path is None:
+        return False
+    needs_vectors = pretrain_config["objective"].get("type") == "vectors"
+    if needs_vectors and not vocab_config["vectors"]:
+        err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
+        errors = [{"loc": ["initialize", "vectors"], "msg": err}]
+        raise ConfigValidationError(config=nlp.config, errors=errors)
+    if not weights_path.exists():
+        err = f"can't find pretrained tok2vec: {weights_path}"
+        errors = [{"loc": ["initialize", "vectors", "init_tok2vec"], "msg": err}]
+        raise ConfigValidationError(config=nlp.config, errors=errors)
+    with weights_path.open("rb") as file_:
+        weights_data = file_.read()
+    component_name = pretrain_config["component"]
+    if component_name is None:
+        desc = (
+            f"To use pretrained tok2vec weights, [pretraining.component] "
+            f"needs to specify the component that should load them."
+        )
+        err = "component can't be null"
+        errors = [{"loc": ["pretraining", "component"], "msg": err}]
+        raise ConfigValidationError(
+            config=nlp.config["pretraining"], errors=errors, desc=desc
+        )
+    target = nlp.get_pipe(component_name).model
+    if pretrain_config["layer"]:
+        target = target.get_ref(pretrain_config["layer"])
+    target.from_bytes(weights_data)
+    return True
+
+
+def verify_config(nlp: Language) -> None:
+    """Run extra checks that depend on the config and the loaded nlp object."""
+    # TODO: maybe we should validate based on the actual components, the list
+    # in config["nlp"]["pipeline"] instead?
+    component_configs = nlp.config["components"].values()
+    for component_config in component_configs:
+        # Check the factory, because the component name can differ from it
+        if component_config["factory"] == "textcat":
+            verify_textcat_config(nlp, component_config)
+
+
+def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
+    """Check a textcat component's 'positive_label' setting against its labels."""
+    pos_label = pipe_config.get("positive_label")
+    if not pos_label:
+        return
+    # if 'positive_label' is provided: double check whether it's in the data and
+    # the task is binary
+    textcat_labels = nlp.get_pipe("textcat").labels
+    if pos_label not in textcat_labels:
+        message = Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
+        raise ValueError(message)
+    if len(list(textcat_labels)) != 2:
+        message = Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
+        raise ValueError(message)
+
+
+def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
+    """Get the names of all sourced components in the original config.
+    RETURNS (List[str]): Components whose config has a "source" key, e.g.
+        {"source": "en_core_web_sm"}, and no "factory" key.
+    """
+    sourced = []
+    for name, cfg in config.get("components", {}).items():
+        if "factory" not in cfg and "source" in cfg:
+            sourced.append(name)
+    return sourced
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
new file mode 100644
index 000000000..3e3e9f5ce
--- /dev/null
+++ b/spacy/training/loop.py
@@ -0,0 +1,301 @@
+from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
+from typing import Optional
+from pathlib import Path
+from timeit import default_timer as timer
+from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
+import random
+import tqdm
+
+from .example import Example
+from ..schemas import ConfigSchemaTraining
+from ..language import Language
+from ..errors import Errors
+from ..util import resolve_dot_names, registry, logger
+
+
+def train(
+    nlp: Language,
+    output_path: Optional[Path] = None,
+    *,
+    use_gpu: int = -1,
+    logger: Callable[[Any], Any] = logger,
+) -> Optional[Path]:
+    """Train a pipeline.
+
+    nlp (Language): The initialized nlp object with the full config.
+    output_path (Path): Optional output path to save trained model to.
+    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
+        before calling this function.
+    logger (Callable[[Any], Any]): Optional logger exposing the methods info,
+        error, debug and  warn. Defaults to regular spaCy logger but can be
+        swapped for CLI logger.
+    RETURNS (Path / None): The path to the final exported model, or None if
+        no output_path was given. Errors raised during training propagate.
+    """
+    # Fix the random seed and GPU allocator before resolving the training objects
+    config = nlp.config.interpolate()
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    optimizer = T["optimizer"]
+    score_weights = T["score_weights"]
+    batcher = T["batcher"]
+    train_logger = T["logger"]
+    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    # Components that shouldn't be updated during training
+    frozen_components = T["frozen_components"]
+    # Create iterator, which yields out info after each optimization step.
+    training_step_iterator = train_while_improving(
+        nlp,
+        optimizer,
+        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
+        create_evaluation_callback(nlp, dev_corpus, score_weights),
+        dropout=T["dropout"],
+        accumulate_gradient=T["accumulate_gradient"],
+        patience=T["patience"],
+        max_steps=T["max_steps"],
+        eval_frequency=T["eval_frequency"],
+        exclude=frozen_components,
+    )
+    logger.info(f"Pipeline: {nlp.pipe_names}")
+    if frozen_components:
+        logger.info(f"Frozen components: {frozen_components}")
+    logger.info(f"Initial learn rate: {optimizer.learn_rate}")
+    with nlp.select_pipes(disable=frozen_components):
+        print_row, finalize_logger = train_logger(nlp)
+    final_model_path = None if output_path is None else output_path / "model-final"
+    try:
+        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
+        progress.set_description(f"Epoch 1")
+        for batch, info, is_best_checkpoint in training_step_iterator:
+            progress.update(1)
+            if is_best_checkpoint is not None:
+                progress.close()
+                print_row(info)
+                if is_best_checkpoint and output_path is not None:
+                    with nlp.select_pipes(disable=frozen_components):
+                        update_meta(T, nlp, info)
+                    with nlp.use_params(optimizer.averages):
+                        nlp = before_to_disk(nlp)
+                        nlp.to_disk(output_path / "model-best")
+                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
+                progress.set_description(f"Epoch {info['epoch']}")
+    except Exception as e:
+        if final_model_path is not None:
+            # We don't want to swallow the traceback if we don't have a
+            # specific error.
+            logger.warn(
+                f"Aborting and saving the final best model. "
+                f"Encountered exception: {str(e)}"
+            )
+            nlp = before_to_disk(nlp)
+            nlp.to_disk(final_model_path)
+        raise e
+    finally:
+        # No `return` in here: it would silently discard the re-raised exception
+        finalize_logger()
+        if final_model_path is not None:
+            if optimizer.averages:
+                with nlp.use_params(optimizer.averages):
+                    nlp.to_disk(final_model_path)
+            else:
+                nlp.to_disk(final_model_path)
+    return final_model_path
+
+
+def train_while_improving(
+    nlp: Language,
+    optimizer: Optimizer,
+    train_data,
+    evaluate,
+    *,
+    dropout: float,
+    eval_frequency: int,
+    accumulate_gradient: int,
+    patience: int,
+    max_steps: int,
+    exclude: List[str],
+):
+    """Train until an evaluation stops improving. Works as a generator,
+    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`.
+    The evaluation is conducted by calling the evaluate callback.
+
+    Positional arguments:
+        nlp: The spaCy pipeline to update and evaluate.
+        optimizer: The optimizer callable.
+        train_data (Iterable[Tuple[int, Batch]]): A generator of `(epoch, batch)`
+            tuples. It needs to take care of iterating over the epochs and shuffling.
+        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
+            The callback should take no arguments and return a tuple
+            `(main_score, other_scores)`. The main_score should be a float where
+            higher is better. other_scores can be any object.
+
+    Keyword arguments:
+        dropout: A float, or an iterator yielding one dropout rate per step.
+        eval_frequency: Evaluate on every step that is divisible by this.
+        accumulate_gradient: Passed to subdivide_batch to split each batch.
+        patience: Stop once the best step is this many steps back (0: off).
+        max_steps: Stop once this step number is reached (0: off).
+        exclude: Names of components to exclude from the updates.
+
+    Every iteration, the function yields out a tuple with:
+
+    * batch: The batch that was used for the update.
+    * info: A dict with various information about the last update (see below).
+    * is_best_checkpoint: None if no evaluation was conducted on that
+        iteration, True if it produced the best score so far, False if a
+        previous evaluation was better. Use this to save model checkpoints.
+
+    The info dict provides the following information:
+
+        epoch, step: The epoch (from train_data) and step of this update.
+        score, other_scores: The results of this step's evaluation, or None.
+        losses: The losses accumulated since the last evaluation.
+        checkpoints: A list of previous results as (score, step) tuples.
+        seconds, words: Seconds since the start; summed len() of all examples.
+    """
+    if isinstance(dropout, float):
+        dropouts = constant(dropout)
+    else:
+        dropouts = dropout  # anything else is used as an iterator via next() below
+    results = []
+    losses = {}
+    words_seen = 0
+    start_time = timer()
+    for step, (epoch, batch) in enumerate(train_data):
+        dropout = next(dropouts)
+        for subbatch in subdivide_batch(batch, accumulate_gradient):
+            nlp.update(
+                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
+            )
+        # TODO: refactor this so we don't have to run it separately in here
+        for name, proc in nlp.pipeline:
+            if (
+                name not in exclude
+                and hasattr(proc, "model")
+                and proc.model not in (True, False, None)
+            ):
+                proc.model.finish_update(optimizer)
+        optimizer.step_schedules()
+        if not (step % eval_frequency):  # true for step 0, so results is never empty below
+            if optimizer.averages:
+                with nlp.use_params(optimizer.averages):
+                    score, other_scores = evaluate()
+            else:
+                score, other_scores = evaluate()
+            results.append((score, step))
+            is_best_checkpoint = score == max(results)[0]  # a tie with the best counts as best
+        else:
+            score, other_scores = (None, None)
+            is_best_checkpoint = None
+        words_seen += sum(len(eg) for eg in batch)
+        info = {
+            "epoch": epoch,
+            "step": step,
+            "score": score,
+            "other_scores": other_scores,
+            "losses": losses,
+            "checkpoints": results,
+            "seconds": int(timer() - start_time),
+            "words": words_seen,
+        }
+        yield batch, info, is_best_checkpoint
+        if is_best_checkpoint is not None:
+            losses = {}  # start a fresh dict after every evaluation
+        # Stop if no improvement in `patience` updates (if specified)
+        best_score, best_step = max(results)
+        if patience and (step - best_step) >= patience:
+            break
+        # Stop if we've exhausted our max steps (if specified)
+        if max_steps and step >= max_steps:
+            break
+
+
+def subdivide_batch(batch, accumulate_gradient):
+    """Yield the batch, sorted by predicted length, as consecutive non-empty chunks."""
+    ordered = sorted(batch, key=lambda eg: len(eg.predicted))
+    chunk_size = len(ordered) // accumulate_gradient
+    offset = 0
+    for _ in range(accumulate_gradient):
+        chunk = ordered[offset : offset + chunk_size]
+        offset += len(chunk)
+        if chunk:
+            yield chunk
+    # Whatever is left over after the even split forms one last chunk
+    if ordered[offset:]:
+        yield ordered[offset:]
+
+
+def create_evaluation_callback(
+    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
+) -> Callable[[], Tuple[float, Dict[str, float]]]:
+    """Build a callback that scores the dev corpus and computes the main score."""
+    weights = {name: weight for name, weight in weights.items() if weight is not None}
+
+    def evaluate() -> Tuple[float, Dict[str, float]]:
+        scores = nlp.evaluate(list(dev_corpus(nlp)))
+        # The main score is a weighted sum based on score_weights, so every
+        # weighted score has to be an int/float, not a dict like entity
+        # scores per type etc.
+        for name, score in scores.items():
+            if name in weights and not isinstance(score, (int, float)):
+                raise ValueError(Errors.E915.format(name=name, score_type=type(score)))
+        try:
+            main_score = sum(
+                scores.get(name, 0.0) * weight for name, weight in weights.items()
+            )
+        except KeyError as e:
+            keys = list(scores.keys())
+            err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
+            raise KeyError(err) from None
+        return main_score, scores
+
+    return evaluate
+
+
+def create_train_batches(
+    iterator: Iterator[Example],
+    batcher: Callable[[Iterable[Example]], Iterable[Example]],
+    max_epochs: int,
+):
+    """Yield (epoch, batch) tuples; max_epochs < 1 means no limit on the epochs."""
+    examples = list(iterator)
+    if len(examples) == 0:
+        raise ValueError(Errors.E986)  # Raise error if no data
+    epoch = 0
+    while not (max_epochs >= 1 and epoch == max_epochs):
+        random.shuffle(examples)  # new order for every pass over the data
+        for batch in batcher(examples):
+            yield epoch, batch
+        epoch += 1
+
+
+def update_meta(
+    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
+) -> None:
+    """Store the scores named in score_weights and the pipe losses in nlp.meta."""
+    performance = nlp.meta["performance"] = {}
+    for metric in (m for m in training["score_weights"] if m is not None):
+        performance[metric] = info["other_scores"].get(metric, 0.0)
+    for pipe_name in nlp.pipe_names:
+        performance[f"{pipe_name}_loss"] = info["losses"][pipe_name]
+
+
+def create_before_to_disk_callback(
+    callback: Optional[Callable[[Language], Language]]
+) -> Callable[[Language], Language]:
+    """Wrap an optional callback so that its result is checked to be a Language."""
+    def before_to_disk(nlp: Language) -> Language:
+        if not callback:
+            return nlp
+        result = callback(nlp)
+        if isinstance(result, Language):
+            return result
+        raise ValueError(Errors.E914.format(name="before_to_disk", value=type(result)))
+
+    return before_to_disk
diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
new file mode 100644
index 000000000..1e0f055ee
--- /dev/null
+++ b/spacy/training/pretrain.py
@@ -0,0 +1,267 @@
+from typing import Optional, Callable, Any, Iterable, Union, List
+from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
+from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
+from pathlib import Path
+from functools import partial
+from collections import Counter
+import srsly
+import numpy
+import time
+import re
+from wasabi import msg
+
+from .example import Example
+from ..tokens import Doc
+from ..attrs import ID
+from ..ml.models.multi_task import build_cloze_multi_task_model
+from ..ml.models.multi_task import build_cloze_characters_multi_task_model
+from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
+from ..util import registry, load_model_from_config, dot_to_object, logger
+
+
+def pretrain(
+    config: Config,
+    output_dir: Path,
+    resume_path: Optional[Path] = None,
+    epoch_resume: Optional[int] = None,
+    use_gpu: int = -1,
+    logger: Callable[[Any], Any] = logger,
+):
+    """Pretrain the tok2vec layer described by the [pretraining] config block and
+    save its weights to output_dir as 'model{epoch}.bin' after every epoch.
+    """
+    if config["training"]["seed"] is not None:
+        fix_random_seed(config["training"]["seed"])
+    allocator = config["training"]["gpu_allocator"]
+    if use_gpu >= 0 and allocator:
+        set_gpu_allocator(allocator)
+    nlp = load_model_from_config(config)
+    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    P = registry.resolve(nlp.config["pretraining"], schema=ConfigSchemaPretrain)
+    # The corpus dot name refers to the full config (not to T) and must be resolved
+    corpus = dot_to_object(nlp.config.interpolate(), P["corpus"])
+    corpus = registry.resolve({"corpus": corpus})["corpus"]
+    batcher = P["batcher"]
+    model = create_pretraining_model(nlp, P)
+    optimizer = P["optimizer"]
+    if resume_path is not None:
+        epoch_resume = _resume_model(model, resume_path, epoch_resume, logger)
+    else:
+        # Without '--resume-path' the '--epoch-resume' argument is ignored
+        epoch_resume = 0
+
+    # TODO: move this to logger function?
+    tracker = ProgressTracker(frequency=10000)
+    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
+    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
+    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
+
+    def _save_model(epoch, is_temp=False):
+        is_temp_str = ".temp" if is_temp else ""
+        with model.use_params(optimizer.averages):
+            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
+                file_.write(model.get_ref("tok2vec").to_bytes())
+            log = {"nr_word": tracker.nr_word, "loss": tracker.loss}
+            log.update(epoch_loss=tracker.epoch_loss, epoch=epoch)
+            with (output_dir / "log.jsonl").open("a") as file_:
+                file_.write(srsly.json_dumps(log) + "\n")
+
+    objective = create_objective(P["objective"])
+    # TODO: I think we probably want this to look more like the
+    # 'create_train_batches' function?
+    for epoch in range(epoch_resume, P["max_epochs"]):
+        for batch_id, batch in enumerate(batcher(corpus(nlp))):
+            docs = ensure_docs(batch)
+            loss = make_update(model, docs, optimizer, objective)
+            progress = tracker.update(epoch, loss, docs)
+            if progress:
+                msg.row(progress, **row_settings)
+            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
+                _save_model(epoch, is_temp=True)
+        _save_model(epoch)
+        tracker.epoch_loss = 0.0
+
+
+def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
+    """Return the Docs of a batch: Doc objects as they are, and the reference
+    Doc of anything else (i.e. of Example objects).
+    """
+    docs = []
+    for eg_or_doc in examples_or_docs:
+        docs.append(eg_or_doc if isinstance(eg_or_doc, Doc) else eg_or_doc.reference)
+    return docs
+
+
+def _resume_model(
+    model: Model,
+    resume_path: Path,
+    epoch_resume: Optional[int],
+    logger: Callable[[Any], Any] = logger,
+) -> Optional[int]:
+    """Load tok2vec weights from resume_path and return the epoch to resume from.
+
+    A default weight file name such as 'model12.bin' overrides epoch_resume (13).
+    """
+    logger.info(f"Resume training tok2vec from: {resume_path}")
+    with resume_path.open("rb") as file_:
+        model.get_ref("tok2vec").from_bytes(file_.read())
+    model_name = re.search(r"model(\d+)\.bin", str(resume_path))
+    if model_name:
+        epoch_resume = int(model_name.group(1)) + 1
+    logger.info(f"Resuming from epoch: {epoch_resume}")
+    return epoch_resume
+
+
+def make_update(
+    model: Model, docs: Iterable[Doc], optimizer: Optimizer, objective_func: Callable
+) -> float:
+    """Run one update of the model on a single batch and return the loss.
+
+    model (Model): The model to update.
+    docs (iterable): A batch of `Doc` objects.
+    optimizer (callable): An optimizer.
+    objective_func (callable): (ops, docs, predictions) -> (loss, gradients).
+    RETURNS (float): The loss as a plain Python float.
+    """
+    outputs, backward = model.begin_update(docs)
+    batch_loss, d_outputs = objective_func(model.ops, docs, outputs)
+    backward(d_outputs)
+    model.finish_update(optimizer)
+    # The gradients are modified in-place by the BERT MLM, so we get an accurate loss
+    return float(batch_loss)  # float(): don't want to return a cupy object here
+
+
+def create_objective(config: Config):
+    """Create the objective for pretraining.
+
+    We'd like to replace this with a registry function but it's tricky because
+    we're also making a model choice based on this. For now we hard-code support
+    for two types (characters, vectors). For characters you can specify
+    n_characters, for vectors you can specify the loss ("cosine" or "L2").
+
+    RETURNS (Callable): Loss function to call with (ops, docs, prediction).
+    """
+    objective_type = config["type"]
+    if objective_type == "characters":
+        return partial(get_characters_loss, nr_char=config["n_characters"])
+    if objective_type != "vectors":
+        raise ValueError("Unexpected objective_type", objective_type)
+    loss_name = config["loss"]
+    if loss_name == "cosine":
+        distance_cls = CosineDistance
+    elif loss_name == "L2":
+        distance_cls = L2Distance
+    else:
+        raise ValueError("Unexpected loss type", loss_name)
+    distance = distance_cls(normalize=True, ignore_zeros=True)
+    return partial(get_vectors_loss, distance=distance)
+
+
+def get_vectors_loss(ops, docs, prediction, distance):
+    """Compute a loss based on a distance between the documents' vectors and
+    the prediction. Returns a (loss, gradient) tuple.
+    """
+    # Rather than vstacking the token.vector values (a bit inefficient, especially
+    # on GPU), we fetch each token's index into the vectors table and look them
+    # up all at once. This prevents data copying.
+    id_arrays = [doc.to_array(ID).ravel() for doc in docs]
+    rows = ops.flatten(id_arrays)
+    vectors_table = docs[0].vocab.vectors.data
+    gradient, loss = distance(prediction, vectors_table[rows])
+    return loss, gradient
+
+
+def get_characters_loss(ops, docs, prediction, nr_char):
+    """Compute a loss based on a number of characters predicted from the docs."""
+    char_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
+    categorical = to_categorical(char_ids.reshape((-1,)), n_classes=256)
+    # Regroup into rows of 256 * nr_char values before comparing to the prediction
+    target = ops.asarray(categorical, dtype="f").reshape((-1, 256 * nr_char))
+    diff = prediction - target
+    loss = (diff ** 2).sum()
+    d_target = diff / float(prediction.shape[0])
+    return loss, d_target
+
+
+def create_pretraining_model(nlp, pretrain_config):
+    """Define a network for the pretraining. We simply add an output layer onto
+    the tok2vec input model. The tok2vec input model needs to be a model that
+    takes a batch of Doc objects (as a list), and returns a list of arrays.
+    Each array in the output needs to have one row per token in the doc.
+    The actual tok2vec layer is stored as a reference, and only this bit will be
+    serialized to file and read back in when calling the 'train' command.
+    """
+    component = nlp.get_pipe(pretrain_config["component"])
+    tok2vec = component.model
+    if pretrain_config.get("layer"):
+        tok2vec = tok2vec.get_ref(pretrain_config["layer"])
+    maxout_pieces = 3  # TODO
+    hidden_size = 300
+    objective = pretrain_config["objective"]
+    if objective["type"] == "vectors":
+        model = build_cloze_multi_task_model(
+            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
+        )
+    elif objective["type"] == "characters":
+        model = build_cloze_characters_multi_task_model(
+            nlp.vocab,
+            tok2vec,
+            hidden_size=hidden_size,
+            maxout_pieces=maxout_pieces,
+            nr_char=objective["n_characters"],
+        )
+    else:  # fail clearly instead of hitting an unbound `model` below
+        raise ValueError("Unexpected objective_type", objective["type"])
+    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
+    set_dropout_rate(model, pretrain_config["dropout"])
+    return model
+
+
+class ProgressTracker:
+    """Accumulate loss and word counts and report a status row at intervals."""
+    def __init__(self, frequency=1000000):
+        self.loss = self.prev_loss = self.epoch_loss = 0.0
+        self.nr_word = self.last_update = 0
+        self.words_per_epoch = Counter()
+        self.frequency = frequency
+        self.last_time = time.time()
+
+    def update(self, epoch, loss, docs):
+        """Add a batch to the totals. Returns a status tuple (epoch, words, loss,
+        loss since the last report, words per second) once `frequency` words
+        have been seen since the last report, otherwise None."""
+        self.loss += loss
+        self.epoch_loss += loss
+        n_words = sum(len(doc) for doc in docs)
+        self.words_per_epoch[epoch] += n_words
+        self.nr_word += n_words
+        n_unreported = self.nr_word - self.last_update
+        if n_unreported < self.frequency:
+            return None
+        words_per_second = n_unreported / (time.time() - self.last_time)
+        self.last_update = self.nr_word
+        self.last_time = time.time()
+        loss_since_report = self.loss - self.prev_loss
+        status = (
+            epoch,
+            self.nr_word,
+            _smart_round(self.loss, width=10),
+            _smart_round(loss_since_report, width=6),
+            int(words_per_second),
+        )
+        self.prev_loss = float(self.loss)
+        return status
+
+
+def _smart_round(
+    figure: Union[float, int], width: int = 10, max_decimal: int = 4
+) -> str:
+    """Round large numbers as integers, smaller numbers as decimals, using at
+    most max_decimal decimals and as many of them as the width leaves room for."""
+    whole_part = str(int(figure))
+    # One character of the width is set aside for the decimal point
+    room_for_decimals = width - (len(whole_part) + 1)
+    if room_for_decimals <= 1:
+        return whole_part
+    precision = min(room_for_decimals, max_decimal)
+    return "%.*f" % (precision, figure)
diff --git a/spacy/util.py b/spacy/util.py
index cab7af8fb..9d7199d7f 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -8,6 +8,7 @@ import re
 from pathlib import Path
 import thinc
 from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
+from thinc.api import ConfigValidationError
 import functools
 import itertools
 import numpy.random
@@ -56,6 +57,7 @@ if TYPE_CHECKING:
 
 
 OOV_RANK = numpy.iinfo(numpy.uint64).max
+DEFAULT_OOV_PROB = -20
 LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
 
 # Default order of sections in the config.cfg. Not all sections needs to exist,
@@ -239,20 +241,6 @@ def get_module_path(module: ModuleType) -> Path:
     return Path(sys.modules[module.__module__].__file__).parent
 
 
-def load_vectors_into_model(
-    nlp: "Language", name: Union[str, Path], *, add_strings=True
-) -> None:
-    """Load word vectors from an installed model or path into a model instance."""
-    vectors_nlp = load_model(name)
-    nlp.vocab.vectors = vectors_nlp.vocab.vectors
-    if add_strings:
-        # I guess we should add the strings from the vectors_nlp model?
-        # E.g. if someone does a similarity query, they might expect the strings.
-        for key in nlp.vocab.vectors.key2row:
-            if key in vectors_nlp.vocab.strings:
-                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
-
-
 def load_model(
     name: Union[str, Path],
     *,
@@ -391,32 +379,9 @@ def load_model_from_config(
     return nlp
 
 
-def resolve_training_config(
-    config: Config,
-    exclude: Iterable[str] = ("nlp", "components"),
-    validate: bool = True,
-) -> Dict[str, Any]:
-    """Resolve the config sections relevant for trainig and create all objects.
-    Mostly used in the CLI to separate training config (not resolved by default
-    because not runtime-relevant – an nlp object should load fine even if it's
-    [training] block refers to functions that are not available etc.).
-
-    config (Config): The config to resolve.
-    exclude (Iterable[str]): The config blocks to exclude. Those blocks won't
-        be available in the final resolved config.
-    validate (bool): Whether to validate the config.
-    RETURNS (Dict[str, Any]): The resolved config.
-    """
-    config = config.copy()
-    for key in exclude:
-        if key in config:
-            config.pop(key)
-    return registry.resolve(config, validate=validate)
-
-
 def resolve_dot_names(
     config: Config, dot_names: List[Optional[str]]
-) -> List[Optional[Callable]]:
+) -> Tuple[Any]:
     """Resolve one or more "dot notation" names, e.g. corpora.train.
     The paths could point anywhere into the config, so we don't know which
     top-level section we'll be looking within.
@@ -424,18 +389,42 @@ def resolve_dot_names(
     We resolve the whole top-level section, although we could resolve less --
     we could find the lowest part of the tree.
     """
+    # TODO: include schema?
+    # TODO: clean this up and avoid duplication
     resolved = {}
     output = []
+    errors = []
     for name in dot_names:
         if name is None:
             output.append(name)
         else:
             section = name.split(".")[0]
-            # We want to avoid resolving the same thing twice.
+            # We want to avoid resolving the same thing twice
             if section not in resolved:
                 resolved[section] = registry.resolve(config[section])
-            output.append(dot_to_object(resolved, name))
-    return output
+            try:
+                output.append(dot_to_object(resolved, name))
+            except KeyError:
+                msg = f"not a valid section reference: {name}"
+                errors.append({"loc": name.split("."), "msg": msg})
+    objects = []
+    for ref in output:
+        if not isinstance(ref, str):
+            msg = f"not a valid section reference: {ref} ({type(ref)})"
+            errors.append({"loc": ref.split("."), "msg": msg})
+            continue
+        section = ref.split(".")[0]
+        # We want to avoid resolving the same thing twice
+        if section not in resolved:
+            resolved[section] = registry.resolve(config[section])
+        try:
+            objects.append(dot_to_object(resolved, ref))
+        except KeyError:
+            msg = f"not a valid section reference: {name}"
+            errors.append({"loc": ref.split("."), "msg": msg})
+    if errors:
+        raise ConfigValidationError(config=config, errors=errors)
+    return tuple(objects)
 
 
 def load_model_from_init_py(

From 02838a1d470d08ab381524bb1d857a61366759ac Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 15:27:10 +0200
Subject: [PATCH 219/516] Fix resolve_dot_names

---
 spacy/util.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/spacy/util.py b/spacy/util.py
index 9d7199d7f..f9d9e6495 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -379,9 +379,7 @@ def load_model_from_config(
     return nlp
 
 
-def resolve_dot_names(
-    config: Config, dot_names: List[Optional[str]]
-) -> Tuple[Any]:
+def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[Any]:
     """Resolve one or more "dot notation" names, e.g. corpora.train.
     The paths could point anywhere into the config, so we don't know which
     top-level section we'll be looking within.
@@ -410,8 +408,7 @@ def resolve_dot_names(
     objects = []
     for ref in output:
         if not isinstance(ref, str):
-            msg = f"not a valid section reference: {ref} ({type(ref)})"
-            errors.append({"loc": ref.split("."), "msg": msg})
+            objects.append(ref)
             continue
         section = ref.split(".")[0]
         # We want to avoid resolving the same thing twice

From 2e9c9e74af52dc3f8effbd862f0b999f70d7c926 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 15:34:00 +0200
Subject: [PATCH 220/516] Fix config resolution and interpolation

TODO: auto-interpolate in Thinc if config is dict (i.e. likely subsection)
---
 spacy/cli/debug_data.py              |  4 +++-
 spacy/cli/debug_model.py             |  4 +++-
 spacy/tests/training/test_readers.py | 11 ++++++++---
 spacy/training/pretrain.py           |  5 +++--
 spacy/util.py                        |  7 ++++++-
 5 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index f0e76be2b..c4d1069c0 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -97,7 +97,9 @@ def debug_data(
     with show_validation_error(config_path):
         cfg = util.load_config(config_path, overrides=config_overrides)
         nlp = util.load_model_from_config(cfg)
-        T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+        T = registry.resolve(
+            nlp.config.interpolate()["training"], schema=ConfigSchemaTraining
+        )
     # Use original config here, not resolved version
     sourced_components = get_sourced_components(cfg)
     frozen_components = T["frozen_components"]
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index f8fc687fa..0b4db70b6 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -63,7 +63,9 @@ def debug_model_cli(
         set_gpu_allocator(allocator)
     with show_validation_error(config_path):
         nlp = util.load_model_from_config(raw_config)
-        T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+        T = registry.resolve(
+            nlp.config.interpolate()["training"], schema=ConfigSchemaTraining
+        )
     seed = T["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index 7d94d5ddc..5c02aca36 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -42,7 +42,9 @@ def test_readers():
     dot_names = ["training.train_corpus", "training.dev_corpus"]
     train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
     assert isinstance(train_corpus, Callable)
-    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    T = registry.resolve(
+        nlp.config.interpolate()["training"], schema=ConfigSchemaTraining
+    )
     optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
@@ -53,7 +55,8 @@ def test_readers():
     # ensure the pipeline runs
     doc = nlp("Quick test")
     assert doc.cats
-    extra_corpus = registry.resolve(nlp.config["corpora"])["extra"]
+    corpora = {"corpora": nlp.config.interpolate()["corpora"]}
+    extra_corpus = registry.resolve(corpora)["corpora"]["extra"]
     assert isinstance(extra_corpus, Callable)
 
 
@@ -91,7 +94,9 @@ def test_cat_readers(reader, additional_config):
     nlp = load_model_from_config(config, auto_fill=True)
     dot_names = ["training.train_corpus", "training.dev_corpus"]
     train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
-    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
+    T = registry.resolve(
+        nlp.config["training"].interpolate(), schema=ConfigSchemaTraining
+    )
     optimizer = T["optimizer"]
     # simulate a training loop
     nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
index 1e0f055ee..e8dd9df30 100644
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@@ -33,8 +33,9 @@ def pretrain(
     if use_gpu >= 0 and allocator:
         set_gpu_allocator(allocator)
     nlp = load_model_from_config(config)
-    T = registry.resolve(nlp.config["training"], schema=ConfigSchemaTraining)
-    P = registry.resolve(nlp.config["pretraining"], schema=ConfigSchemaPretrain)
+    _config = nlp.config.interpolate()
+    T = registry.resolve(_config["training"], schema=ConfigSchemaTraining)
+    P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain)
     corpus = dot_to_object(T, P["corpus"])
     batcher = P["batcher"]
     model = create_pretraining_model(nlp, P)
diff --git a/spacy/util.py b/spacy/util.py
index f9d9e6495..67c577927 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -413,7 +413,12 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[A
         section = ref.split(".")[0]
         # We want to avoid resolving the same thing twice
         if section not in resolved:
-            resolved[section] = registry.resolve(config[section])
+            if registry.is_promise(config[section]):
+                # Otherwise we can't resolve [corpus] if it's a promise
+                result = registry.resolve({"config": config[section]})["config"]
+            else:
+                result = registry.resolve(config[section])
+            resolved[section] = result
         try:
             objects.append(dot_to_object(resolved, ref))
         except KeyError:

From cd21eb24851fde435d8bd3f2c8d15c5f82d66813 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 28 Sep 2020 16:45:48 +0200
Subject: [PATCH 221/516] upgrade pydantic pin for thinc's
 field.default_factory

---
 requirements.txt | 2 +-
 setup.cfg        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index d696cd44b..3ff8bea3d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,7 +14,7 @@ pathy
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.3.0,<2.0.0
+pydantic>=1.5.0,<2.0.0
 pytokenizations
 # Official Python utilities
 setuptools
diff --git a/setup.cfg b/setup.cfg
index b55c0d376..92732dc33 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -51,7 +51,7 @@ install_requires =
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
-    pydantic>=1.3.0,<2.0.0
+    pydantic>=1.5.0,<2.0.0
     pytokenizations
     # Official Python utilities
     setuptools

From 3360825e0042a535e0da08d045f6147425edb00a Mon Sep 17 00:00:00 2001
From: walterhenry <55140654+walterhenry@users.noreply.github.com>
Date: Mon, 28 Sep 2020 16:50:15 +0200
Subject: [PATCH 222/516] Proofreading

Another round of proofreading. All the API docs have been read through and I've grazed the Usage docs.
---
 website/docs/api/doc.md                       |  3 +--
 website/docs/api/pipeline-functions.md        |  2 +-
 website/docs/api/span.md                      |  2 +-
 website/docs/api/textcategorizer.md           |  8 ++++----
 website/docs/api/tok2vec.md                   |  4 ++--
 website/docs/api/token.md                     | 14 +++++++-------
 website/docs/api/tokenizer.md                 |  8 ++++----
 website/docs/api/top-level.md                 | 16 ++++++++--------
 website/docs/api/transformer.md               | 18 +++++++++---------
 website/docs/api/vectors.md                   |  6 +++---
 website/docs/api/vocab.md                     | 12 ++++++------
 website/docs/usage/embeddings-transformers.md |  2 +-
 12 files changed, 47 insertions(+), 48 deletions(-)

diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index b4097ddb7..151b00a0a 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -444,8 +444,7 @@ invalidated, although they may accidentally continue to work.
 Mark a span for merging. The `attrs` will be applied to the resulting token (if
 they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
 underlying lexeme (if they're context-independent lexical attributes like
-`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a
-dictionary mapping attribute name to values as the `"_"` key.
+`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values.
 
 > #### Example
 >
diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md
index 8bb52d0f9..0dc03a16a 100644
--- a/website/docs/api/pipeline-functions.md
+++ b/website/docs/api/pipeline-functions.md
@@ -26,7 +26,7 @@ Merge noun chunks into a single token. Also available via the string name
 
 <Infobox variant="warning">
 
-Since noun chunks require part-of-speech tags and the dependency parser, make
+Since noun chunks require part-of-speech tags and the dependency parse, make
 sure to add this component _after_ the `"tagger"` and `"parser"` components. By
 default, `nlp.add_pipe` will add components to the end of the pipeline and after
 all other components.
diff --git a/website/docs/api/span.md b/website/docs/api/span.md
index 242ceaed0..7fa1aaa38 100644
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@@ -187,7 +187,7 @@ the character indices don't map to a valid span.
 | Name                                 | Description                                                                               |
 | ------------------------------------ | ----------------------------------------------------------------------------------------- |
 | `start`                              | The index of the first character of the span. ~~int~~                                     |
-| `end`                                | The index of the last character after the span. ~int~~                                    |
+| `end`                                | The index of the first character after the span. ~~int~~                                  |
 | `label`                              | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~               |
 | `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
 | `vector`                             | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~            |
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index b68039094..be4052f46 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -153,7 +153,7 @@ setting up the label scheme based on the data.
 
 ## TextCategorizer.predict {#predict tag="method"}
 
-Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
+Apply the component's model to a batch of [`Doc`](/api/doc) objects without
 modifying them.
 
 > #### Example
@@ -170,7 +170,7 @@ modifying them.
 
 ## TextCategorizer.set_annotations {#set_annotations tag="method"}
 
-Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
+Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.
 
 > #### Example
 >
@@ -213,7 +213,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
 ## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
-current model to make predictions similar to an initial model, to try to address
+current model to make predictions similar to an initial model to try to address
 the "catastrophic forgetting" problem. This feature is experimental.
 
 > #### Example
@@ -286,7 +286,7 @@ Create an optimizer for the pipeline component.
 
 ## TextCategorizer.use_params {#use_params tag="method, contextmanager"}
 
-Modify the pipe's model, to use the given parameter values.
+Modify the pipe's model to use the given parameter values.
 
 > #### Example
 >
diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md
index 5c7214edc..2633a7a1a 100644
--- a/website/docs/api/tok2vec.md
+++ b/website/docs/api/tok2vec.md
@@ -151,7 +151,7 @@ setting up the label scheme based on the data.
 
 ## Tok2Vec.predict {#predict tag="method"}
 
-Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
+Apply the component's model to a batch of [`Doc`](/api/doc) objects without
 modifying them.
 
 > #### Example
@@ -224,7 +224,7 @@ Create an optimizer for the pipeline component.
 
 ## Tok2Vec.use_params {#use_params tag="method, contextmanager"}
 
-Modify the pipe's model, to use the given parameter values. At the end of the
+Modify the pipe's model to use the given parameter values. At the end of the
 context, the original parameters are restored.
 
 > #### Example
diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index 0860797aa..068a1d2d2 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -243,7 +243,7 @@ A sequence of the token's immediate syntactic children.
 
 ## Token.lefts {#lefts tag="property" model="parser"}
 
-The leftward immediate children of the word, in the syntactic dependency parse.
+The leftward immediate children of the word in the syntactic dependency parse.
 
 > #### Example
 >
@@ -259,7 +259,7 @@ The leftward immediate children of the word, in the syntactic dependency parse.
 
 ## Token.rights {#rights tag="property" model="parser"}
 
-The rightward immediate children of the word, in the syntactic dependency parse.
+The rightward immediate children of the word in the syntactic dependency parse.
 
 > #### Example
 >
@@ -275,7 +275,7 @@ The rightward immediate children of the word, in the syntactic dependency parse.
 
 ## Token.n_lefts {#n_lefts tag="property" model="parser"}
 
-The number of leftward immediate children of the word, in the syntactic
+The number of leftward immediate children of the word in the syntactic
 dependency parse.
 
 > #### Example
@@ -291,7 +291,7 @@ dependency parse.
 
 ## Token.n_rights {#n_rights tag="property" model="parser"}
 
-The number of rightward immediate children of the word, in the syntactic
+The number of rightward immediate children of the word in the syntactic
 dependency parse.
 
 > #### Example
@@ -422,8 +422,8 @@ The L2 norm of the token's vector representation.
 | `norm_`                                      | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~                                                                                                 |
 | `lower`                                      | Lowercase form of the token. ~~int~~                                                                                                                                                                                                                                   |
 | `lower_`                                     | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~                                                                                                                                                                                          |
-| `shape`                                      | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
-| `shape_`                                     | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
+| `shape`                                      | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~int~~ |
+| `shape_`                                     | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~str~~ |
 | `prefix`                                     | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~                                                                                                                                                                             |
 | `prefix_`                                    | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~                                                                                                                                                                                           |
 | `suffix`                                     | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~                                                                                                                                                                               |
@@ -451,7 +451,7 @@ The L2 norm of the token's vector representation.
 | `tag`                                        | Fine-grained part-of-speech. ~~int~~                                                                                                                                                                                                                                   |
 | `tag_`                                       | Fine-grained part-of-speech. ~~str~~                                                                                                                                                                                                                                   |
 | `morph` <Tag variant="new">3</Tag>           | Morphological analysis. ~~MorphAnalysis~~                                                                                                                                                                                                                              |
-| `morph_` <Tag variant="new">3</Tag>          | Morphological analysis in the Universal Dependencies [FEATS]https://universaldependencies.org/format.html#morphological-annotation format. ~~str~~                                                                                                                     |
+| `morph_` <Tag variant="new">3</Tag>          | Morphological analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~                                                                                                                     |
 | `dep`                                        | Syntactic dependency relation. ~~int~~                                                                                                                                                                                                                                 |
 | `dep_`                                       | Syntactic dependency relation. ~~str~~                                                                                                                                                                                                                                 |
 | `lang`                                       | Language of the parent document's vocabulary. ~~int~~                                                                                                                                                                                                                  |
diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md
index 0158c5589..8ea5a1f65 100644
--- a/website/docs/api/tokenizer.md
+++ b/website/docs/api/tokenizer.md
@@ -1,6 +1,6 @@
 ---
 title: Tokenizer
-teaser: Segment text into words, punctuations marks etc.
+teaser: Segment text into words, punctuation marks, etc.
 tag: class
 source: spacy/tokenizer.pyx
 ---
@@ -15,14 +15,14 @@ source: spacy/tokenizer.pyx
 Segment text, and create `Doc` objects with the discovered segment boundaries.
 For a deeper understanding, see the docs on
 [how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works).
-The tokenizer is typically created automatically when the a
+The tokenizer is typically created automatically when a
 [`Language`](/api/language) subclass is initialized and it reads its settings
 like punctuation and special case rules from the
 [`Language.Defaults`](/api/language#defaults) provided by the language subclass.
 
 ## Tokenizer.\_\_init\_\_ {#init tag="method"}
 
-Create a `Tokenizer`, to create `Doc` objects given unicode text. For examples
+Create a `Tokenizer` to create `Doc` objects given unicode text. For examples
 of how to construct a custom tokenizer with different tokenization rules, see
 the
 [usage documentation](https://spacy.io/usage/linguistic-features#native-tokenizers).
@@ -87,7 +87,7 @@ Tokenize a stream of texts.
 | ------------ | ------------------------------------------------------------------------------------ |
 | `texts`      | A sequence of unicode texts. ~~Iterable[str]~~                                       |
 | `batch_size` | The number of texts to accumulate in an internal buffer. Defaults to `1000`. ~~int~~ |
-| **YIELDS**   | The tokenized Doc objects, in order. ~~Doc~~                                         |
+| **YIELDS**   | The tokenized `Doc` objects, in order. ~~Doc~~                                         |
 
 ## Tokenizer.find_infix {#find_infix tag="method"}
 
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index f52c63f18..94260cacb 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -196,7 +196,7 @@ browser. Will run a simple web server.
 | `page`    | Render markup as full HTML page. Defaults to `True`. ~~bool~~                                                                                                      |
 | `minify`  | Minify HTML markup. Defaults to `False`. ~~bool~~                                                                                                                  |
 | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~                                                                                  |
-| `manual`  | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
+| `manual`  | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
 | `port`    | Port to serve visualization. Defaults to `5000`. ~~int~~                                                                                                           |
 | `host`    | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~                                                                                                      |
 
@@ -221,7 +221,7 @@ Render a dependency parse tree or named entity visualization.
 | `page`      | Render markup as full HTML page. Defaults to `True`. ~~bool~~                                                                                                                          |
 | `minify`    | Minify HTML markup. Defaults to `False`. ~~bool~~                                                                                                                                      |
 | `options`   | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~                                                                                                      |
-| `manual`    | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~                     |
+| `manual`    | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~                     |
 | `jupyter`   | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
 | **RETURNS** | The rendered HTML markup. ~~str~~                                                                                                                                                      |
 
@@ -242,7 +242,7 @@ If a setting is not present in the options, the default value will be used.
 | Name                                       | Description                                                                                                                                  |
 | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
 | `fine_grained`                             | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~             |
-| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemma's in a separate row below the token texts. Defaults to `False`. ~~bool~~                                                     |
+| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~                                                     |
 | `collapse_punct`                           | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
 | `collapse_phrases`                         | Merge noun phrases into one token. Defaults to `False`. ~~bool~~                                                                             |
 | `compact`                                  | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~                                                    |
@@ -611,7 +611,7 @@ sequences in the batch.
 
 Encode labelled spans into per-token tags, using the
 [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit,
-Out). Returns a list of strings, describing the tags. Each tag string will be of
+Out). Returns a list of strings, describing the tags. Each tag string will be in
 the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of
 `"B"`, `"I"`, `"L"`, `"U"`. The string `"-"` is used where the entity offsets
 don't align with the tokenization in the `Doc` object. The training algorithm
@@ -716,7 +716,7 @@ decorator.
 ### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
 
 Check whether a `Language` subclass is already loaded. `Language` subclasses are
-loaded lazily, to avoid expensive setup code associated with the language data.
+loaded lazily to avoid expensive setup code associated with the language data.
 
 > #### Example
 >
@@ -904,7 +904,7 @@ Compile a sequence of prefix rules into a regex object.
 | Name        | Description                                                                                                                                 |
 | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
 | `entries`   | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
-| **RETURNS** | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~                                        |
+| **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~                                        |
 
 ### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
 
@@ -921,7 +921,7 @@ Compile a sequence of suffix rules into a regex object.
 | Name        | Description                                                                                                                                 |
 | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
 | `entries`   | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
-| **RETURNS** | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~                                        |
+| **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~                                        |
 
 ### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
 
@@ -938,7 +938,7 @@ Compile a sequence of infix rules into a regex object.
 | Name        | Description                                                                                                                               |
 | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
 | `entries`   | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
-| **RETURNS** | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~                                     |
+| **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~                                     |
 
 ### util.minibatch {#util.minibatch tag="function" new="2"}
 
diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md
index d5bcef229..957ce69a4 100644
--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@@ -186,7 +186,7 @@ setting up the label scheme based on the data.
 
 ## Transformer.predict {#predict tag="method"}
 
-Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
+Apply the component's model to a batch of [`Doc`](/api/doc) objects without
 modifying them.
 
 > #### Example
@@ -203,7 +203,7 @@ modifying them.
 
 ## Transformer.set_annotations {#set_annotations tag="method"}
 
-Assign the extracted features to the Doc objects. By default, the
+Assign the extracted features to the `Doc` objects. By default, the
 [`TransformerData`](/api/transformer#transformerdata) object is written to the
 [`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations`
 callback is then called, if provided.
@@ -272,7 +272,7 @@ Create an optimizer for the pipeline component.
 
 ## Transformer.use_params {#use_params tag="method, contextmanager"}
 
-Modify the pipe's model, to use the given parameter values. At the end of the
+Modify the pipe's model to use the given parameter values. At the end of the
 context, the original parameters are restored.
 
 > #### Example
@@ -388,8 +388,8 @@ by this class. Instances of this class are typically assigned to the
 
 | Name      | Description                                                                                                                                                                                                                                                                                                                                             |
 | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tokens`  | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~                                         |
-| `tensors` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ |
+| `tokens`  | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~                                         |
+| `tensors` | The activations for the `Doc` from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ |
 | `align`   | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~                                                                                                    |
 | `width`   | The width of the last hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                             |
 
@@ -409,7 +409,7 @@ objects to associate the outputs to each [`Doc`](/api/doc) in the batch.
 
 | Name       | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
 | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `spans`    | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
+| `spans`    | The batch of input spans. The outer list refers to the `Doc` objects in the batch, and the inner list contains the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each `Span` can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each `Span` may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
 | `tokens`   | The output of the tokenizer. ~~transformers.BatchEncoding~~                                                                                                                                                                                                                                                                                                                                                                                                                          |
 | `tensors`  | The output of the transformer model. ~~List[torch.Tensor]~~                                                                                                                                                                                                                                                                                                                                                                                                                          |
 | `align`    | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~                                                                                                                                                                                                                                   |
@@ -439,10 +439,10 @@ Split a `TransformerData` object that represents a batch into a list with one
 ## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
 
 Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
-return a lists of [`Span`](/api/span) objects for each doc, to be processed by
-the transformer. This is used to manage long documents, by cutting them into
+return a list of [`Span`](/api/span) objects for each doc to be processed by
+the transformer. This is used to manage long documents by cutting them into
 smaller sequences before running the transformer. The spans are allowed to
-overlap, and you can also omit sections of the Doc if they are not relevant.
+overlap, and you can also omit sections of the `Doc` if they are not relevant.
 
 Span getters can be referenced in the `[components.transformer.model.get_spans]`
 block of the config to customize the sequences processed by the transformer. You
diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md
index 7e97b4ca3..ba2d5ab42 100644
--- a/website/docs/api/vectors.md
+++ b/website/docs/api/vectors.md
@@ -290,7 +290,7 @@ If a table is full, it can be resized using
 ## Vectors.n_keys {#n_keys tag="property"}
 
 Get the number of keys in the table. Note that this is the number of _all_ keys,
-not just unique vectors. If several keys are mapped are mapped to the same
+not just unique vectors. If several keys are mapped to the same
 vectors, they will be counted individually.
 
 > #### Example
@@ -307,10 +307,10 @@ vectors, they will be counted individually.
 
 ## Vectors.most_similar {#most_similar tag="method"}
 
-For each of the given vectors, find the `n` most similar entries to it, by
+For each of the given vectors, find the `n` most similar entries to it by
 cosine. Queries are by vector. Results are returned as a
 `(keys, best_rows, scores)` tuple. If `queries` is large, the calculations are
-performed in chunks, to avoid consuming too much memory. You can set the
+performed in chunks to avoid consuming too much memory. You can set the
 `batch_size` to control the size/space trade-off during the calculations.
 
 > #### Example
diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md
index 71a678cb3..a2ca63002 100644
--- a/website/docs/api/vocab.md
+++ b/website/docs/api/vocab.md
@@ -29,7 +29,7 @@ Create the vocabulary.
 | `oov_prob`                                  | The default OOV probability. Defaults to `-20.0`. ~~float~~                                                                                             |
 | `vectors_name` <Tag variant="new">2.2</Tag> | A name to identify the vectors table. ~~str~~                                                                                                           |
 | `writing_system`                            | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~          |
-| `get_noun_chunks`                           | A function that yields base noun phrases, used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ |
+| `get_noun_chunks`                           | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ |
 
 ## Vocab.\_\_len\_\_ {#len tag="method"}
 
@@ -150,7 +150,7 @@ rows, we would discard the vectors for "feline" and "reclined". These words
 would then be remapped to the closest remaining vector – so "feline" would have
 the same vector as "cat", and "reclined" would have the same vector as "sat".
 The similarities are judged by cosine. The original vectors may be large, so the
-cosines are calculated in minibatches, to reduce memory usage.
+cosines are calculated in minibatches to reduce memory usage.
 
 > #### Example
 >
@@ -170,7 +170,7 @@ cosines are calculated in minibatches, to reduce memory usage.
 Retrieve a vector for a word in the vocabulary. Words can be looked up by string
 or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn`
 is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s
-subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`).
+subword features by averaging over n-grams of `orth` (introduced in spaCy `v2.1`).
 
 > #### Example
 >
@@ -182,13 +182,13 @@ subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`).
 | Name                                | Description                                                                                                            |
 | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- |
 | `orth`                              | The hash value of a word, or its unicode string. ~~Union[int, str]~~                                                   |
-| `minn` <Tag variant="new">2.1</Tag> | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~                 |
-| `maxn` <Tag variant="new">2.1</Tag> | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~                 |
+| `minn` <Tag variant="new">2.1</Tag> | Minimum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~                 |
+| `maxn` <Tag variant="new">2.1</Tag> | Maximum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~                 |
 | **RETURNS**                         | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
 
 ## Vocab.set_vector {#set_vector tag="method" new="2"}
 
-Set a vector for a word in the vocabulary. Words can be referenced by by string
+Set a vector for a word in the vocabulary. Words can be referenced by string
 or hash value.
 
 > #### Example
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 8dd104ead..c61d7e144 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -36,7 +36,7 @@ models such as [transformers](#transformers) is that word vectors model
 context around them, a transformer model like BERT can't really help you. BERT
 is designed to understand language **in context**, which isn't what you have. A
 word vectors table will be a much better fit for your task. However, if you do
-have words in context — whole sentences or paragraphs of running text — word
+have words in context – whole sentences or paragraphs of running text – word
 vectors will only provide a very rough approximation of what the text is about.
 
 Word vectors are also very computationally efficient, as they map a word to a

From a139fe672bbf465a829bb2d73558fa61351dfc7e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 21:17:10 +0200
Subject: [PATCH 223/516] Fix typos and refactor CLI logging

---
 spacy/cli/_util.py           | 15 ++----------
 spacy/cli/init_pipeline.py   |  4 ++--
 spacy/cli/pretrain.py        |  4 ++--
 spacy/cli/train.py           |  8 +++----
 spacy/training/initialize.py | 45 ++++++++++++++++++------------------
 spacy/training/loop.py       | 19 +++++++--------
 spacy/training/pretrain.py   | 24 +++++++++----------
 7 files changed, 52 insertions(+), 67 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index c41905970..2c944bf3a 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -448,19 +448,8 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
     return result
 
 
-class CliLogger:
-    """Helper mocking up the most commonly used logger methods. Can be passed
-    into functions like train() to make them output pretty-printed messages
-    on the CLI and regular logging if used from within Python.
-    """
-
-    debug = msg.text
-    info = msg.info
-    warn = msg.info
-    error = msg.fail
-
-
-def setup_gpu(use_gpu: int):
+def setup_gpu(use_gpu: int) -> None:
+    """Configure the GPU and log info."""
     if use_gpu >= 0:
         msg.info(f"Using GPU: {use_gpu}")
         require_gpu(use_gpu)
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index de1dc8a46..a92705cb0 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -7,7 +7,7 @@ import typer
 from .. import util
 from ..training.initialize import init_nlp
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, CliLogger, setup_gpu
+from ._util import import_code, setup_gpu
 
 
 @init_cli.command(
@@ -32,6 +32,6 @@ def init_pipeline_cli(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
     with show_validation_error(hint_fill=False):
-        nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_succcess=msg.good)
+        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
     nlp.to_disk(output_path)
     msg.good(f"Saved initialized pipeline to {output_path}")
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 6494486a9..de9341449 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -5,7 +5,7 @@ import typer
 import re
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu, CliLogger
+from ._util import import_code, setup_gpu
 from ..training.pretrain import pretrain
 from ..util import load_config
 
@@ -73,7 +73,7 @@ def pretrain_cli(
         resume_path=resume_path,
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
-        logger=CliLogger,
+        silent=False,
     )
     msg.good("Successfully finished pretrain")
 
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index aa0e71b5a..b0bd48ddb 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -6,7 +6,7 @@ import typer
 import logging
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, CliLogger, setup_gpu
+from ._util import import_code, setup_gpu
 from ..language import Language
 from ..training.loop import train
 from ..training.initialize import init_nlp, must_reinitialize
@@ -50,15 +50,13 @@ def train_cli(
     msg.divider("Initializing pipeline")
     nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
     msg.divider("Training pipeline")
-    final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger)
-    if final_path:
-        msg.good(f"Saved pipeline to output directory", final_path)
+    train(nlp, output_path, use_gpu=use_gpu, silent=False)
 
 
 def init_pipeline(
     config: Config, output_path: Optional[Path], *, use_gpu: int = -1
 ) -> Language:
-    init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good}
+    init_kwargs = {"use_gpu": use_gpu, "silent": False}
     if output_path is not None:
         init_path = output_path / "model-initial"
         if not init_path.exists():
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 8938886fe..ecfc57ee9 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,7 +1,8 @@
-from typing import Union, Dict, Optional, Any, List, Callable
+from typing import Union, Dict, Optional, Any, List
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
+from wasabi import Printer
 import srsly
 
 from .loop import create_before_to_disk_callback
@@ -10,16 +11,11 @@ from ..lookups import Lookups
 from ..errors import Errors
 from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
 from ..util import registry, load_model_from_config, resolve_dot_names
-from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB
+from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
 
 
-def init_nlp(
-    config: Config,
-    *,
-    use_gpu: int = -1,
-    logger: Callable[[Any], Any] = logger,
-    on_success: Callable[[str], None] = lambda x: None,
-) -> Language:
+def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language:
+    msg = Printer(no_print=silent)
     raw_config = config
     config = raw_config.interpolate()
     if config["training"]["seed"] is not None:
@@ -30,7 +26,7 @@ def init_nlp(
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     nlp = load_model_from_config(raw_config, auto_fill=True)
-    on_success("Set up nlp object from config")
+    msg.good("Set up nlp object from config")
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
@@ -38,29 +34,31 @@ def init_nlp(
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
     V = I["vocab"]
-    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
+    init_vocab(
+        nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent
+    )
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
     # Sourced components that require resume_training
     resume_components = [p for p in sourced_components if p not in frozen_components]
-    logger.info(f"Pipeline: {nlp.pipe_names}")
+    msg.info(f"Pipeline: {nlp.pipe_names}")
     if resume_components:
         with nlp.select_pipes(enable=resume_components):
-            logger.info(f"Resuming training for: {resume_components}")
+            msg.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
-        on_success(f"Initialized pipeline components")
+        msg.good(f"Initialized pipeline components")
     # Verify the config after calling 'begin_training' to ensure labels
     # are properly initialized
     verify_config(nlp)
     if "pretraining" in config and config["pretraining"]:
         P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        loaded = add_tok2vec_weights(nlp, P, I)
+        loaded = add_tok2vec_weights(nlp, P, V)
         if loaded and P["component"]:
-            on_success(f"Loaded pretrained weights into component '{P['component']}'")
+            msg.good(f"Loaded pretrained weights into component '{P['component']}'")
     nlp = before_to_disk(nlp)
     return nlp
 
@@ -76,11 +74,12 @@ def init_vocab(
     data: Optional[Path] = None,
     lookups: Optional[Lookups] = None,
     vectors: Optional[str] = None,
-    on_success: Callable[[str], None] = lambda x: None,
+    silent: bool = True,
 ) -> Language:
+    msg = Printer(no_print=silent)
     if lookups:
         nlp.vocab.lookups = lookups
-        on_success(f"Added vocab lookups: {', '.join(lookups.tables)}")
+        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
     data_path = ensure_path(data)
     if data_path is not None:
         lex_attrs = srsly.read_jsonl(data_path)
@@ -96,11 +95,11 @@ def init_vocab(
         else:
             oov_prob = DEFAULT_OOV_PROB
         nlp.vocab.cfg.update({"oov_prob": oov_prob})
-        on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    on_success("Created vocabulary")
+        msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    msg.good("Created vocabulary")
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
-        on_success(f"Added vectors: {vectors}")
+        msg.good(f"Added vectors: {vectors}")
 
 
 def load_vectors_into_model(
@@ -137,8 +136,8 @@ def add_tok2vec_weights(
     init_tok2vec = ensure_path(V["init_tok2vec"])
     if init_tok2vec is not None:
         if P["objective"].get("type") == "vectors" and not V["vectors"]:
-            err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
-            errors = [{"loc": ["initialize", "vectors"], "msg": err}]
+            err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"'
+            errors = [{"loc": ["initialize", "vocab"], "msg": err}]
             raise ConfigValidationError(config=nlp.config, errors=errors)
         if not init_tok2vec.exists():
             err = f"can't find pretrained tok2vec: {init_tok2vec}"
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 3e3e9f5ce..5153be66c 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -5,12 +5,13 @@ from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
 import random
 import tqdm
+from wasabi import Printer
 
 from .example import Example
 from ..schemas import ConfigSchemaTraining
 from ..language import Language
 from ..errors import Errors
-from ..util import resolve_dot_names, registry, logger
+from ..util import resolve_dot_names, registry
 
 
 def train(
@@ -18,8 +19,8 @@ def train(
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,
-    logger: Callable[[Any], Any] = logger,
-) -> Optional[Path]:
+    silent: bool = False,
+) -> None:
     """Train a pipeline.
 
     nlp (Language): The initialized nlp object with the full config.
@@ -31,7 +32,7 @@ def train(
         swapped for CLI logger.
     RETURNS (Path / None): The path to the final exported model.
     """
-
+    msg = Printer(no_print=silent)
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
     if config["training"]["seed"] is not None:
@@ -62,10 +63,10 @@ def train(
         eval_frequency=T["eval_frequency"],
         exclude=frozen_components,
     )
-    logger.info(f"Pipeline: {nlp.pipe_names}")
+    msg.info(f"Pipeline: {nlp.pipe_names}")
     if frozen_components:
-        logger.info(f"Frozen components: {frozen_components}")
-    logger.info(f"Initial learn rate: {optimizer.learn_rate}")
+        msg.info(f"Frozen components: {frozen_components}")
+    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
     with nlp.select_pipes(disable=frozen_components):
         print_row, finalize_logger = train_logger(nlp)
     try:
@@ -89,7 +90,7 @@ def train(
         if output_path is not None:
             # We don't want to swallow the traceback if we don't have a
             # specific error.
-            logger.warn(
+            msg.warn(
                 f"Aborting and saving the final best model. "
                 f"Encountered exception: {str(e)}"
             )
@@ -105,7 +106,7 @@ def train(
                     nlp.to_disk(final_model_path)
             else:
                 nlp.to_disk(final_model_path)
-            return final_model_path
+            msg.good(f"Saved pipeline to output directory", final_model_path)
 
 
 def train_while_improving(
diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
index e8dd9df30..5e136cdf1 100644
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@@ -1,4 +1,4 @@
-from typing import Optional, Callable, Any, Iterable, Union, List
+from typing import Optional, Callable, Iterable, Union, List
 from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
 from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
 from pathlib import Path
@@ -8,7 +8,7 @@ import srsly
 import numpy
 import time
 import re
-from wasabi import msg
+from wasabi import Printer
 
 from .example import Example
 from ..tokens import Doc
@@ -16,7 +16,7 @@ from ..attrs import ID
 from ..ml.models.multi_task import build_cloze_multi_task_model
 from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
-from ..util import registry, load_model_from_config, dot_to_object, logger
+from ..util import registry, load_model_from_config, dot_to_object
 
 
 def pretrain(
@@ -25,8 +25,9 @@ def pretrain(
     resume_path: Optional[Path] = None,
     epoch_resume: Optional[int] = None,
     use_gpu: int = -1,
-    logger: Callable[[Any], Any] = logger,
+    silent: bool = True,
 ):
+    msg = Printer(no_print=silent)
     if config["training"]["seed"] is not None:
         fix_random_seed(config["training"]["seed"])
     allocator = config["training"]["gpu_allocator"]
@@ -42,11 +43,10 @@ def pretrain(
     optimizer = P["optimizer"]
     # Load in pretrained weights to resume from
     if resume_path is not None:
-        _resume_model(model, resume_path, epoch_resume)
+        _resume_model(model, resume_path, epoch_resume, silent=silent)
     else:
         # Without '--resume-path' the '--epoch-resume' argument is ignored
         epoch_resume = 0
-
     # TODO: move this to logger function?
     tracker = ProgressTracker(frequency=10000)
     msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
@@ -94,12 +94,10 @@ def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
 
 
 def _resume_model(
-    model: Model,
-    resume_path: Path,
-    epoch_resume: int,
-    logger: Callable[[Any], Any] = logger,
+    model: Model, resume_path: Path, epoch_resume: int, silent: bool = True,
 ) -> None:
-    logger.info(f"Resume training tok2vec from: {resume_path}")
+    msg = Printer(no_print=silent)
+    msg.info(f"Resume training tok2vec from: {resume_path}")
     with resume_path.open("rb") as file_:
         weights_data = file_.read()
         model.get_ref("tok2vec").from_bytes(weights_data)
@@ -108,9 +106,9 @@ def _resume_model(
     if model_name:
         # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
         epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
-        logger.info(f"Resuming from epoch: {epoch_resume}")
+        msg.info(f"Resuming from epoch: {epoch_resume}")
     else:
-        logger.info(f"Resuming from epoch: {epoch_resume}")
+        msg.info(f"Resuming from epoch: {epoch_resume}")
 
 
 def make_update(

From 046f655d860601b54265a24af04a7b3352209772 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 21:17:45 +0200
Subject: [PATCH 224/516] Fix error

---
 spacy/training/initialize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index ecfc57ee9..24b00a764 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -141,7 +141,7 @@ def add_tok2vec_weights(
             raise ConfigValidationError(config=nlp.config, errors=errors)
         if not init_tok2vec.exists():
             err = f"can't find pretrained tok2vec: {init_tok2vec}"
-            errors = [{"loc": ["initialize", "vectors", "init_tok2vec"], "msg": err}]
+            errors = [{"loc": ["initialize", "vocab", "init_tok2vec"], "msg": err}]
             raise ConfigValidationError(config=nlp.config, errors=errors)
         with init_tok2vec.open("rb") as file_:
             weights_data = file_.read()

From ff9a63bfbd70b0fe140f352da22833c0109eaa2c Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 28 Sep 2020 21:35:09 +0200
Subject: [PATCH 225/516] begin_training -> initialize

---
 spacy/cli/debug_model.py                      |  4 +-
 spacy/errors.py                               |  7 ++-
 spacy/language.py                             | 20 +++++--
 spacy/pipeline/dep_parser.pyx                 |  2 +-
 spacy/pipeline/entity_linker.py               |  4 +-
 spacy/pipeline/morphologizer.pyx              |  4 +-
 spacy/pipeline/multitask.pyx                  |  6 +-
 spacy/pipeline/ner.pyx                        |  2 +-
 spacy/pipeline/pipe.pyx                       |  4 +-
 spacy/pipeline/sentencizer.pyx                |  2 +-
 spacy/pipeline/senter.pyx                     |  4 +-
 spacy/pipeline/tagger.pyx                     |  4 +-
 spacy/pipeline/textcat.py                     |  4 +-
 spacy/pipeline/tok2vec.py                     |  4 +-
 spacy/pipeline/transition_parser.pyx          |  2 +-
 spacy/tests/doc/test_add_entities.py          |  4 +-
 spacy/tests/parser/test_add_label.py          |  4 +-
 spacy/tests/parser/test_ner.py                | 20 +++----
 spacy/tests/parser/test_parse.py              |  2 +-
 spacy/tests/parser/test_preset_sbd.py         |  2 +-
 spacy/tests/pipeline/test_entity_linker.py    |  8 +--
 spacy/tests/pipeline/test_morphologizer.py    | 18 +++---
 spacy/tests/pipeline/test_senter.py           | 12 ++--
 spacy/tests/pipeline/test_tagger.py           | 30 +++++-----
 spacy/tests/pipeline/test_textcat.py          | 24 ++++----
 spacy/tests/pipeline/test_tok2vec.py          |  4 +-
 spacy/tests/regression/test_issue1-1000.py    |  2 +-
 spacy/tests/regression/test_issue1501-2000.py |  2 +-
 spacy/tests/regression/test_issue2001-2500.py |  2 +-
 spacy/tests/regression/test_issue2501-3000.py |  4 +-
 spacy/tests/regression/test_issue3001-3500.py |  4 +-
 spacy/tests/regression/test_issue3501-4000.py |  8 +--
 spacy/tests/regression/test_issue4001-4500.py | 12 ++--
 spacy/tests/regression/test_issue4501-5000.py |  2 +-
 spacy/tests/regression/test_issue5230.py      |  4 +-
 spacy/tests/regression/test_issue5551.py      |  2 +-
 .../tests/serialize/test_serialize_config.py  |  6 +-
 spacy/tests/test_language.py                  |  2 +-
 spacy/tests/training/test_readers.py          |  4 +-
 spacy/tests/training/test_training.py         |  2 +-
 spacy/training/initialize.py                  |  4 +-
 website/docs/api/architectures.md             | 60 +++++++++----------
 website/docs/api/dependencyparser.md          | 21 ++++---
 website/docs/api/entitylinker.md              | 12 +++-
 website/docs/api/entityrecognizer.md          | 21 ++++---
 website/docs/api/language.md                  | 23 +++----
 website/docs/api/morphologizer.md             | 17 +++---
 website/docs/api/pipe.md                      | 24 +++++---
 website/docs/api/sentencerecognizer.md        |  6 +-
 website/docs/api/tagger.md                    | 22 ++++---
 website/docs/api/textcategorizer.md           | 26 ++++----
 website/docs/api/tok2vec.md                   |  6 +-
 website/docs/api/transformer.md               |  6 +-
 website/docs/usage/layers-architectures.md    |  6 +-
 website/docs/usage/processing-pipelines.md    | 12 ++--
 website/docs/usage/training.md                |  6 +-
 website/docs/usage/v3.md                      | 25 ++++----
 57 files changed, 301 insertions(+), 253 deletions(-)

diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 0b4db70b6..eca85dc04 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -103,12 +103,12 @@ def debug_model(
     with data_validation(False):
         try:
             train_corpus = dot_to_object(config, config["training"]["train_corpus"])
-            nlp.begin_training(lambda: train_corpus(nlp))
+            nlp.initialize(lambda: train_corpus(nlp))
             msg.info("Initialized the model with the training corpus.")
         except ValueError:
             try:
                 _set_output_dim(nO=7, model=model)
-                nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
+                nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
                 msg.info("Initialized the model with dummy data.")
             except Exception:
                 msg.fail(
diff --git a/spacy/errors.py b/spacy/errors.py
index 640419182..1f9bcb0ae 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -85,6 +85,7 @@ class Warnings:
             "attribute or operator.")
 
     # TODO: fix numbering after merging develop into master
+    W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
     W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
@@ -306,7 +307,7 @@ class Errors:
             "settings: {opts}")
     E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
     E109 = ("Component '{name}' could not be run. Did you forget to "
-            "call begin_training()?")
+            "call initialize()?")
     E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
     E111 = ("Pickling a token is not supported, because tokens are only views "
             "of the parent Doc and can't exist on their own. A pickled token "
@@ -376,7 +377,7 @@ class Errors:
             "provided {found}.")
     E143 = ("Labels for component '{name}' not initialized. This can be fixed "
             "by calling add_label, or by providing a representative batch of "
-            "examples to the component's begin_training method.")
+            "examples to the component's initialize method.")
     E145 = ("Error reading `{param}` from input file.")
     E146 = ("Could not access `{path}`.")
     E147 = ("Unexpected error in the {method} functionality of the "
@@ -517,7 +518,7 @@ class Errors:
             "but the provided argument {loc} points to a file.")
     E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
             "not seem to exist.")
-    E930 = ("Received invalid get_examples callback in {name}.begin_training. "
+    E930 = ("Received invalid get_examples callback in {name}.initialize. "
             "Expected function that returns an iterable of Example objects but "
             "got: {obj}")
     E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
diff --git a/spacy/language.py b/spacy/language.py
index c1d2df026..a5b78b178 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1154,6 +1154,16 @@ class Language:
         *,
         sgd: Optional[Optimizer] = None,
         device: int = -1,
+    ) -> Optimizer:
+        warnings.warn(Warnings.W089, DeprecationWarning)
+        return self.initialize(get_examples, sgd=sgd, device=device)
+
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        sgd: Optional[Optimizer] = None,
+        device: int = -1,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.
 
@@ -1163,11 +1173,11 @@ class Language:
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
-        DOCS: https://nightly.spacy.io/api/language#begin_training
+        DOCS: https://nightly.spacy.io/api/language#initialize
         """
         if get_examples is None:
             util.logger.debug(
-                "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
+                "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
             )
             doc = Doc(self.vocab, words=["x", "y", "z"])
             get_examples = lambda: [Example.from_dict(doc, {})]
@@ -1179,7 +1189,7 @@ class Language:
         for example in get_examples():
             if not isinstance(example, Example):
                 err = Errors.E978.format(
-                    name="Language.begin_training", types=type(example)
+                    name="Language.initialize", types=type(example)
                 )
                 raise ValueError(err)
             else:
@@ -1198,8 +1208,8 @@ class Language:
             sgd = create_default_optimizer()
         self._optimizer = sgd
         for name, proc in self.pipeline:
-            if hasattr(proc, "begin_training"):
-                proc.begin_training(
+            if hasattr(proc, "initialize"):
+                proc.initialize(
                     get_examples, pipeline=self.pipeline, sgd=self._optimizer
                 )
         self._link_components()
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index a447434d2..95effac59 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -132,7 +132,7 @@ cdef class DependencyParser(Parser):
             labeller.model.set_dim("nO", len(self.labels))
             if labeller.model.has_ref("output_layer"):
                 labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-            labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
+            labeller.initialize(get_examples, pipeline=pipeline, sgd=sgd)
 
     @property
     def labels(self):
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 039e2a891..0f33378b4 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -140,7 +140,7 @@ class EntityLinker(Pipe):
         if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))
 
-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
@@ -159,7 +159,7 @@ class EntityLinker(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
-        DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
+        DOCS: https://nightly.spacy.io/api/entitylinker#initialize
         """
         self._ensure_examples(get_examples)
         self._require_kb()
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 5fee9a900..d035172a8 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -129,7 +129,7 @@ class Morphologizer(Tagger):
             self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         return 1
 
-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
@@ -142,7 +142,7 @@ class Morphologizer(Tagger):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
-        DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
+        DOCS: https://nightly.spacy.io/api/morphologizer#initialize
         """
         self._ensure_examples(get_examples)
         # First, fetch all labels from the data
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
index 2f8940124..3fd034b30 100644
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
     def set_annotations(self, docs, dep_ids):
         pass
 
-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None, sgd=None):
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
             raise ValueError(err)
@@ -177,10 +177,10 @@ class ClozeMultitask(Pipe):
     def set_annotations(self, docs, dep_ids):
         pass
 
-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None, sgd=None):
         self.model.initialize()  # TODO: fix initialization by defining X and Y
         X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
-        self.model.output_layer.begin_training(X)
+        self.model.output_layer.initialize(X)
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index fc0dda40d..effcef2e3 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -103,7 +103,7 @@ cdef class EntityRecognizer(Parser):
             labeller.model.set_dim("nO", len(self.labels))
             if labeller.model.has_ref("output_layer"):
                 labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-            labeller.begin_training(get_examples, pipeline=pipeline)
+            labeller.initialize(get_examples, pipeline=pipeline)
 
     @property
     def labels(self):
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 324c8e19c..bff2be1af 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -183,7 +183,7 @@ cdef class Pipe:
         """
         return util.create_default_optimizer()
 
-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using data examples if available.
         This method needs to be implemented by each Pipe component,
         ensuring the internal model (if available) is initialized properly
@@ -198,7 +198,7 @@ cdef class Pipe:
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
-        DOCS: https://nightly.spacy.io/api/pipe#begin_training
+        DOCS: https://nightly.spacy.io/api/pipe#initialize
         """
         raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
 
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 2882f6f8b..0f49033ff 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -58,7 +58,7 @@ class Sentencizer(Pipe):
         else:
             self.punct_chars = set(self.default_punct_chars)
 
-    def begin_training(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None, sgd=None):
         pass
 
     def __call__(self, doc):
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index da85a9cf2..68a9860a5 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -124,7 +124,7 @@ class SentenceRecognizer(Tagger):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores
 
-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
@@ -137,7 +137,7 @@ class SentenceRecognizer(Tagger):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
+        DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 3efe29916..66f8b38b6 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -256,7 +256,7 @@ class Tagger(Pipe):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores
 
-    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
@@ -269,7 +269,7 @@ class Tagger(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
-        DOCS: https://nightly.spacy.io/api/tagger#begin_training
+        DOCS: https://nightly.spacy.io/api/tagger#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 6b8c0ca65..37665adfc 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -334,7 +334,7 @@ class TextCategorizer(Pipe):
         self.labels = tuple(list(self.labels) + [label])
         return 1
 
-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
@@ -353,7 +353,7 @@ class TextCategorizer(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
-        DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
+        DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
         """
         self._ensure_examples(get_examples)
         subbatch = []  # Select a subbatch of examples to initialize the model
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 9ab4e42b7..7c8bbf5e5 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -203,7 +203,7 @@ class Tok2Vec(Pipe):
     def get_loss(self, examples, scores) -> None:
         pass
 
-    def begin_training(
+    def initialize(
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
@@ -222,7 +222,7 @@ class Tok2Vec(Pipe):
             create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
-        DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
+        DOCS: https://nightly.spacy.io/api/tok2vec#initialize
         """
         self._ensure_examples(get_examples)
         doc_sample = []
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 1350e1f12..5a4503cf9 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -405,7 +405,7 @@ cdef class Parser(Pipe):
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
 
-    def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
+    def initialize(self, get_examples, pipeline=None, sgd=None, **kwargs):
         self._ensure_examples(get_examples)
         self.cfg.update(kwargs)
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 86aa883bd..fa0206fdd 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training(lambda: [_ner_example(ner)])
+    ner.initialize(lambda: [_ner_example(ner)])
     ner(doc)
 
     doc.ents = [("ANIMAL", 3, 4)]
@@ -48,7 +48,7 @@ def test_ents_reset(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training(lambda: [_ner_example(ner)])
+    ner.initialize(lambda: [_ner_example(ner)])
     ner(doc)
     orig_iobs = [t.ent_iob_ for t in doc]
     doc.ents = list(doc.ents)
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index cd376e0fc..fb1eabf7d 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -35,7 +35,7 @@ def test_init_parser(parser):
 def _train_parser(parser):
     fix_random_seed(1)
     parser.add_label("left")
-    parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
+    parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
     sgd = Adam(0.001)
 
     for i in range(5):
@@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly():
     ner1.add_label("C")
     ner1.add_label("B")
     ner1.add_label("A")
-    ner1.begin_training(lambda: [_ner_example(ner1)])
+    ner1.initialize(lambda: [_ner_example(ner1)])
     ner2 = EntityRecognizer(Vocab(), model, **config)
 
     # the second model needs to be resized before we can call from_bytes
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index cd5581769..b657ae2e8 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -202,7 +202,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.begin_training()
+    nlp.initialize()
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -213,7 +213,7 @@ def test_train_empty():
 def test_overwrite_token():
     nlp = English()
     nlp.add_pipe("ner")
-    nlp.begin_training()
+    nlp.initialize()
     # The untrained NER will predict O for each token
     doc = nlp("I live in New York")
     assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
@@ -235,7 +235,7 @@ def test_empty_ner():
     nlp = English()
     ner = nlp.add_pipe("ner")
     ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("John is watching the news about Croatia's elections")
     # if this goes wrong, the initialization of the parser's upper layer is probably broken
     result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
@@ -254,7 +254,7 @@ def test_ruler_before_ner():
     # 2: untrained NER - should set everything else to O
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("This is Antti Korhonen speaking in Finland")
     expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
     expected_types = ["THING", "", "", "", "", "", ""]
@@ -269,7 +269,7 @@ def test_ner_before_ruler():
     # 1: untrained NER - should set everything to O
     untrained_ner = nlp.add_pipe("ner", name="uner")
     untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
 
     # 2 : Entity Ruler - should set "this" to B and keep everything else O
     patterns = [{"label": "THING", "pattern": "This"}]
@@ -290,7 +290,7 @@ def test_block_ner():
     nlp.add_pipe("blocker", config={"start": 2, "end": 5})
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("This is Antti L Korhonen speaking in Finland")
     expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
     expected_types = ["", "", "", "", "", "", "", ""]
@@ -307,7 +307,7 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for ent in annotations.get("entities"):
             ner.add_label(ent[2])
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
 
     for i in range(50):
         losses = {}
@@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog):
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
     with caplog.at_level(logging.DEBUG):
-        nlp.begin_training()
+        nlp.initialize()
         assert "W033" in caplog.text
     caplog.clear()
     nlp.vocab.lookups.add_table("lexeme_norm")
     nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
     with caplog.at_level(logging.DEBUG):
-        nlp.begin_training()
+        nlp.initialize()
         assert "W033" not in caplog.text
 
 
@@ -358,5 +358,5 @@ class BlockerComponent1:
         self.name = name
 
     def __call__(self, doc):
-        doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
+        doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified")
         return doc
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 8648f2018..ffb6f23f1 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -191,7 +191,7 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for dep in annotations.get("deps", []):
             parser.add_label(dep)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(100):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index e8dfa68c7..d8f861b02 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -34,7 +34,7 @@ def parser(vocab):
     parser.cfg["hidden_width"] = 32
     # parser.add_label('right')
     parser.add_label("left")
-    parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
+    parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
     sgd = Adam(0.001)
 
     for i in range(10):
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 878f41a28..d5c8de36b 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -134,7 +134,7 @@ def test_kb_undefined(nlp):
     """Test that the EL can't train without defining a KB"""
     entity_linker = nlp.add_pipe("entity_linker", config={})
     with pytest.raises(ValueError):
-        entity_linker.begin_training(lambda: [])
+        entity_linker.initialize(lambda: [])
 
 
 def test_kb_empty(nlp):
@@ -143,7 +143,7 @@ def test_kb_empty(nlp):
     entity_linker = nlp.add_pipe("entity_linker", config=config)
     assert len(entity_linker.kb) == 0
     with pytest.raises(ValueError):
-        entity_linker.begin_training(lambda: [])
+        entity_linker.initialize(lambda: [])
 
 
 def test_kb_serialize(nlp):
@@ -360,7 +360,7 @@ def test_preserving_links_asdoc(nlp):
     ruler.add_patterns(patterns)
     el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
     entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
-    nlp.begin_training()
+    nlp.initialize()
     assert entity_linker.model.get_dim("nO") == vector_length
 
     # test whether the entity links are preserved by the `as_doc()` function
@@ -463,7 +463,7 @@ def test_overfitting_IO():
     )
 
     # train the NEL pipe
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert entity_linker.model.get_dim("nO") == vector_length
     assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
 
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 864c7332e..c86ee3617 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -33,7 +33,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("morphologizer")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()
 
 
 def test_implicit_label():
@@ -42,7 +42,7 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)
 
 
 def test_no_resize():
@@ -50,13 +50,13 @@ def test_no_resize():
     morphologizer = nlp.add_pipe("morphologizer")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
-    nlp.begin_training()
+    nlp.initialize()
     # this throws an error because the morphologizer can't be resized after initialization
     with pytest.raises(ValueError):
         morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")
 
 
-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     morphologizer = nlp.add_pipe("morphologizer")
     morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
@@ -64,12 +64,12 @@ def test_begin_training_examples():
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)
 
 
 def test_overfitting_IO():
@@ -79,7 +79,7 @@ def test_overfitting_IO():
     train_examples = []
     for inst in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
 
     for i in range(50):
         losses = {}
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 5827f8ff1..5d8a8be41 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -31,19 +31,19 @@ TRAIN_DATA = [
 ]
 
 
-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     nlp.add_pipe("senter")
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)
 
 
 def test_overfitting_IO():
@@ -58,7 +58,7 @@ def test_overfitting_IO():
     train_examples[1].reference[11].is_sent_start = False
 
     nlp.add_pipe("senter")
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
 
     for i in range(200):
         losses = {}
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index cd5927675..69a6dd414 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -15,14 +15,14 @@ def test_label_types():
         tagger.add_label(9)
 
 
-def test_tagger_begin_training_tag_map():
-    """Test that Tagger.begin_training() without gold tuples does not clobber
+def test_tagger_initialize_tag_map():
+    """Test that Tagger.initialize() without gold tuples does not clobber
     the tag map."""
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     orig_tag_count = len(tagger.labels)
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
 
 
@@ -38,7 +38,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("tagger")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()
 
 
 def test_no_resize():
@@ -47,7 +47,7 @@ def test_no_resize():
     tagger.add_label("N")
     tagger.add_label("V")
     assert tagger.labels == ("N", "V")
-    nlp.begin_training()
+    nlp.initialize()
     assert tagger.model.get_dim("nO") == 2
     # this throws an error because the tagger can't be resized after initialization
     with pytest.raises(ValueError):
@@ -60,10 +60,10 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)
 
 
-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     train_examples = []
@@ -72,16 +72,16 @@ def test_begin_training_examples():
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: train_examples[0])
+        nlp.initialize(get_examples=lambda: train_examples[0])
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=lambda: [])
+        nlp.initialize(get_examples=lambda: [])
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)
 
 
 def test_overfitting_IO():
@@ -91,7 +91,7 @@ def test_overfitting_IO():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert tagger.model.get_dim("nO") == len(TAGS)
 
     for i in range(50):
@@ -122,4 +122,4 @@ def test_tagger_requires_labels():
     nlp = English()
     nlp.add_pipe("tagger")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 02e189834..2870229c8 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -26,7 +26,7 @@ def test_simple_train():
     nlp = Language()
     textcat = nlp.add_pipe("textcat")
     textcat.add_label("answer")
-    nlp.begin_training()
+    nlp.initialize()
     for i in range(5):
         for text, answer in [
             ("aaaa", 1.0),
@@ -56,7 +56,7 @@ def test_textcat_learns_multilabel():
     textcat = TextCategorizer(nlp.vocab, width=8)
     for letter in letters:
         textcat.add_label(letter)
-    optimizer = textcat.begin_training(lambda: [])
+    optimizer = textcat.initialize(lambda: [])
     for i in range(30):
         losses = {}
         examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
@@ -86,7 +86,7 @@ def test_no_label():
     nlp = Language()
     nlp.add_pipe("textcat")
     with pytest.raises(ValueError):
-        nlp.begin_training()
+        nlp.initialize()
 
 
 def test_implicit_label():
@@ -95,7 +95,7 @@ def test_implicit_label():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=lambda: train_examples)
 
 
 def test_no_resize():
@@ -103,14 +103,14 @@ def test_no_resize():
     textcat = nlp.add_pipe("textcat")
     textcat.add_label("POSITIVE")
     textcat.add_label("NEGATIVE")
-    nlp.begin_training()
+    nlp.initialize()
     assert textcat.model.get_dim("nO") == 2
     # this throws an error because the textcat can't be resized after initialization
     with pytest.raises(ValueError):
         textcat.add_label("NEUTRAL")
 
 
-def test_begin_training_examples():
+def test_initialize_examples():
     nlp = Language()
     textcat = nlp.add_pipe("textcat")
     train_examples = []
@@ -119,12 +119,12 @@ def test_begin_training_examples():
         for label, value in annotations.get("cats").items():
             textcat.add_label(label)
     # you shouldn't really call this more than once, but for testing it should be fine
-    nlp.begin_training()
-    nlp.begin_training(get_examples=lambda: train_examples)
+    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     with pytest.raises(TypeError):
-        nlp.begin_training(get_examples=lambda: None)
+        nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
-        nlp.begin_training(get_examples=train_examples)
+        nlp.initialize(get_examples=train_examples)
 
 
 def test_overfitting_IO():
@@ -139,7 +139,7 @@ def test_overfitting_IO():
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert textcat.model.get_dim("nO") == 2
 
     for i in range(50):
@@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config):
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for label, value in annotations.get("cats").items():
             textcat.add_label(label)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 558b9079c..f84b78247 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -88,7 +88,7 @@ def test_init_tok2vec():
     nlp = English()
     tok2vec = nlp.add_pipe("tok2vec")
     assert tok2vec.listeners == []
-    nlp.begin_training()
+    nlp.initialize()
     assert tok2vec.model.get_dim("nO")
 
 
@@ -154,7 +154,7 @@ def test_tok2vec_listener():
 
     # Check that the Tok2Vec component finds it listeners
     assert tok2vec.listeners == []
-    optimizer = nlp.begin_training(lambda: train_examples)
+    optimizer = nlp.initialize(lambda: train_examples)
     assert tok2vec.listeners == [tagger_tok2vec]
 
     for i in range(5):
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index d841ee24b..6bb71f6f4 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -428,7 +428,7 @@ def test_issue999():
     for _, offsets in TRAIN_DATA:
         for start, end, label in offsets:
             ner.add_label(label)
-    nlp.begin_training()
+    nlp.initialize()
     for itn in range(20):
         random.shuffle(TRAIN_DATA)
         for raw_text, entity_offsets in TRAIN_DATA:
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index dce3e8298..f85ec70e1 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -250,7 +250,7 @@ def test_issue1915():
     ner = nlp.add_pipe("ner")
     ner.add_label("answer")
     with pytest.raises(ValueError):
-        nlp.begin_training(**cfg)
+        nlp.initialize(**cfg)
 
 
 def test_issue1945():
diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py
index c4c755153..09baab4d8 100644
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ b/spacy/tests/regression/test_issue2001-2500.py
@@ -30,7 +30,7 @@ def test_issue2179():
     nlp = Italian()
     ner = nlp.add_pipe("ner")
     ner.add_label("CITIZENSHIP")
-    nlp.begin_training()
+    nlp.initialize()
     nlp2 = Italian()
     nlp2.add_pipe("ner")
     assert len(nlp2.get_pipe("ner").labels) == 0
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index 5895b616e..4952a545d 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -18,7 +18,7 @@ def test_issue2564():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     doc = nlp("hello world")
     assert doc.has_annotation("TAG")
     docs = nlp.pipe(["hello", "world"])
@@ -149,7 +149,7 @@ def test_issue2800():
     ner = nlp.add_pipe("ner")
     for entity_type in list(entity_types):
         ner.add_label(entity_type)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(20):
         losses = {}
         random.shuffle(train_data)
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index 56ef23dbf..6fc42e83f 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -92,7 +92,7 @@ def test_issue3209():
     nlp = English()
     ner = nlp.add_pipe("ner")
     ner.add_label("ANIMAL")
-    nlp.begin_training()
+    nlp.initialize()
     move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
     assert ner.move_names == move_names
     nlp2 = English()
@@ -239,7 +239,7 @@ def test_issue3456():
     nlp = English()
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     list(nlp.pipe(["hi", ""]))
 
 
diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py
index 304e654c3..31e441d86 100644
--- a/spacy/tests/regression/test_issue3501-4000.py
+++ b/spacy/tests/regression/test_issue3501-4000.py
@@ -223,7 +223,7 @@ def test_issue3611():
         textcat.add_label(label)
     # training the network
     with nlp.select_pipes(enable="textcat"):
-        optimizer = nlp.begin_training()
+        optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@@ -268,7 +268,7 @@ def test_issue3830_no_subtok():
     parser = DependencyParser(Vocab(), model, **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
-    parser.begin_training(lambda: [_parser_example(parser)])
+    parser.initialize(lambda: [_parser_example(parser)])
     assert "subtok" not in parser.labels
 
 
@@ -283,7 +283,7 @@ def test_issue3830_with_subtok():
     parser = DependencyParser(Vocab(), model, **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
-    parser.begin_training(lambda: [_parser_example(parser)])
+    parser.initialize(lambda: [_parser_example(parser)])
     assert "subtok" in parser.labels
 
 
@@ -342,7 +342,7 @@ def test_issue3880():
     nlp.add_pipe("parser").add_label("dep")
     nlp.add_pipe("ner").add_label("PERSON")
     nlp.add_pipe("tagger").add_label("NN")
-    nlp.begin_training()
+    nlp.initialize()
     for doc in nlp.pipe(texts):
         pass
 
diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py
index 7b7ddfe0d..753cff37f 100644
--- a/spacy/tests/regression/test_issue4001-4500.py
+++ b/spacy/tests/regression/test_issue4001-4500.py
@@ -66,7 +66,7 @@ def test_issue4030():
         textcat.add_label(label)
     # training the network
     with nlp.select_pipes(enable="textcat"):
-        optimizer = nlp.begin_training()
+        optimizer = nlp.initialize()
         for i in range(3):
             losses = {}
             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@@ -87,7 +87,7 @@ def test_issue4042():
     # add ner pipe
     ner = nlp.add_pipe("ner")
     ner.add_label("SOME_LABEL")
-    nlp.begin_training()
+    nlp.initialize()
     # Add entity ruler
     patterns = [
         {"label": "MY_ORG", "pattern": "Apple"},
@@ -118,7 +118,7 @@ def test_issue4042_bug2():
     # add ner pipe
     ner1 = nlp1.add_pipe("ner")
     ner1.add_label("SOME_LABEL")
-    nlp1.begin_training()
+    nlp1.initialize()
     # add a new label to the doc
     doc1 = nlp1("What do you think about Apple ?")
     assert len(ner1.labels) == 1
@@ -244,7 +244,7 @@ def test_issue4267():
     nlp = English()
     ner = nlp.add_pipe("ner")
     ner.add_label("PEOPLE")
-    nlp.begin_training()
+    nlp.initialize()
     assert "ner" in nlp.pipe_names
     # assert that we have correct IOB annotations
     doc1 = nlp("hi")
@@ -299,7 +299,7 @@ def test_issue4313():
     config = {}
     ner = nlp.create_pipe("ner", config=config)
     ner.add_label("SOME_LABEL")
-    ner.begin_training(lambda: [])
+    ner.initialize(lambda: [])
     # add a new label to the doc
     doc = nlp("What do you think about Apple ?")
     assert len(ner.labels) == 1
@@ -327,7 +327,7 @@ def test_issue4348():
     TRAIN_DATA = [example, example]
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py
index e351858f5..6dbbc233b 100644
--- a/spacy/tests/regression/test_issue4501-5000.py
+++ b/spacy/tests/regression/test_issue4501-5000.py
@@ -180,7 +180,7 @@ def test_issue4725_2():
     vocab.set_vector("dog", data[1])
     nlp = English(vocab=vocab)
     nlp.add_pipe("ner")
-    nlp.begin_training()
+    nlp.initialize()
     docs = ["Kurt is in London."] * 10
     for _ in nlp.pipe(docs, batch_size=2, n_process=2):
         pass
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index 531e48ec3..5e320996a 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -64,7 +64,7 @@ def tagger():
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
     tagger.add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     return tagger
 
 
@@ -85,7 +85,7 @@ def entity_linker():
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
-    nlp.begin_training()
+    nlp.initialize()
     return entity_linker
 
 
diff --git a/spacy/tests/regression/test_issue5551.py b/spacy/tests/regression/test_issue5551.py
index b7139d463..655764362 100644
--- a/spacy/tests/regression/test_issue5551.py
+++ b/spacy/tests/regression/test_issue5551.py
@@ -25,7 +25,7 @@ def test_issue5551():
         pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
         for label in set(example[1]["cats"]):
             pipe.add_label(label)
-        nlp.begin_training()
+        nlp.initialize()
 
         # Store the result of each iteration
         result = pipe.model.predict([nlp.make_doc(example[0])])
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index eb5f15007..663e76550 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -152,7 +152,7 @@ def test_serialize_nlp():
     nlp_config = Config().from_str(nlp_config_string)
     nlp = load_model_from_config(nlp_config, auto_fill=True)
     nlp.get_pipe("tagger").add_label("A")
-    nlp.begin_training()
+    nlp.initialize()
     assert "tok2vec" in nlp.pipe_names
     assert "tagger" in nlp.pipe_names
     assert "parser" not in nlp.pipe_names
@@ -173,7 +173,7 @@ def test_serialize_custom_nlp():
     parser_cfg = dict()
     parser_cfg["model"] = {"@architectures": "my_test_parser"}
     nlp.add_pipe("parser", config=parser_cfg)
-    nlp.begin_training()
+    nlp.initialize()
 
     with make_tempdir() as d:
         nlp.to_disk(d)
@@ -191,7 +191,7 @@ def test_serialize_parser():
     model_config = Config().from_str(parser_config_string)
     parser = nlp.add_pipe("parser", config=model_config)
     parser.add_label("nsubj")
-    nlp.begin_training()
+    nlp.initialize()
 
     with make_tempdir() as d:
         nlp.to_disk(d)
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index da46ad424..6a487303e 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -18,7 +18,7 @@ def nlp():
     textcat = nlp.add_pipe("textcat")
     for label in ("POSITIVE", "NEGATIVE"):
         textcat.add_label(label)
-    nlp.begin_training()
+    nlp.initialize()
     return nlp
 
 
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index 5c02aca36..ea39e8b90 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -47,7 +47,7 @@ def test_readers():
     )
     optimizer = T["optimizer"]
     # simulate a training loop
-    nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
         nlp.update([example], sgd=optimizer)
     scores = nlp.evaluate(list(dev_corpus(nlp)))
@@ -99,7 +99,7 @@ def test_cat_readers(reader, additional_config):
     )
     optimizer = T["optimizer"]
     # simulate a training loop
-    nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
     for example in train_corpus(nlp):
         assert example.y.cats
         # this shouldn't fail if each training example has at least one positive label
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index a04e6aadd..9655dd1b6 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -600,7 +600,7 @@ def _train_tuples(train_data):
     train_examples = []
     for t in train_data:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    optimizer = nlp.begin_training()
+    optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
         batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 24b00a764..23debfb28 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -49,9 +49,9 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
             msg.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
         msg.good(f"Initialized pipeline components")
-    # Verify the config after calling 'begin_training' to ensure labels
+    # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
     if "pretraining" in config and config["pretraining"]:
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index ef2666ec0..3f6258be9 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -517,18 +517,18 @@ specific data and challenge.
 Stacked ensemble of a bag-of-words model and a neural network model. The neural
 network has an internal CNN Tok2Vec layer and uses attention.
 
-| Name                 | Description                                                                                                                                                                                        |
-| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes`  | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                         |
-| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~                                                                                                        |
-| `width`              | Output dimension of the feature encoding step. ~~int~~                                                                                                                                             |
-| `embed_size`         | Input dimension of the feature encoding step. ~~int~~                                                                                                                                              |
-| `conv_depth`         | Depth of the tok2vec layer. ~~int~~                                                                                                                                                                |
-| `window_size`        | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~                                                        |
-| `ngram_size`         | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~                                                |
-| `dropout`            | The dropout rate. ~~float~~                                                                                                                                                                        |
-| `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
-| **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                   |
+| Name                 | Description                                                                                                                                                                                    |
+| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes`  | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~                                                                                                    |
+| `width`              | Output dimension of the feature encoding step. ~~int~~                                                                                                                                         |
+| `embed_size`         | Input dimension of the feature encoding step. ~~int~~                                                                                                                                          |
+| `conv_depth`         | Depth of the tok2vec layer. ~~int~~                                                                                                                                                            |
+| `window_size`        | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~                                                    |
+| `ngram_size`         | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~                                            |
+| `dropout`            | The dropout rate. ~~float~~                                                                                                                                                                    |
+| `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 
 ### spacy.TextCatCNN.v1 {#TextCatCNN}
 
@@ -555,12 +555,12 @@ A neural network model where token vectors are calculated using a CNN. The
 vectors are mean pooled and used as features in a feed-forward network. This
 architecture is usually less accurate than the ensemble, but runs faster.
 
-| Name                | Description                                                                                                                                                                                        |
-| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                         |
-| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                            |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                   |
+| Name                | Description                                                                                                                                                                                    |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 
 ### spacy.TextCatBOW.v1 {#TextCatBOW}
 
@@ -578,13 +578,13 @@ architecture is usually less accurate than the ensemble, but runs faster.
 An ngram "bag-of-words" model. This architecture should run much faster than the
 others, but may not be as accurate, especially if texts are short.
 
-| Name                | Description                                                                                                                                                                                        |
-| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                         |
-| `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~                                                |
-| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~                                                               |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                   |
+| Name                | Description                                                                                                                                                                                    |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~                                           |
+| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          |
+| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 
 ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
 
@@ -629,11 +629,11 @@ into the "real world". This requires 3 main components:
 The `EntityLinker` model architecture is a Thinc `Model` with a
 [`Linear`](https://thinc.ai/api-layers#linear) output layer.
 
-| Name        | Description                                                                                                                                                                                                             |
-| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`   | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                                                 |
-| `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ |
-| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                                        |
+| Name        | Description                                                                                                                                                                                                         |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec`   | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                                             |
+| `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                                    |
 
 ### spacy.EmptyKB.v1 {#EmptyKB}
 
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index 8af4455d3..c7c41f2a1 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -140,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## DependencyParser.begin_training {#begin_training tag="method"}
+## DependencyParser.initialize {#initialize tag="method"}
 
 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@@ -151,11 +151,17 @@ validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data.
 
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
 > parser = nlp.add_pipe("parser")
-> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = parser.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```
 
 | Name           | Description                                                                                                                           |
@@ -210,7 +216,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
 >
 > ```python
 > parser = nlp.add_pipe("parser")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = parser.update(examples, sgd=optimizer)
 > ```
 
@@ -294,11 +300,10 @@ context, the original parameters are restored.
 ## DependencyParser.add_label {#add_label tag="method"}
 
 Add a new label to the pipe. Note that you don't have to call this method if you
-provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+provide a **representative data sample** to the [`initialize`](#initialize)
+method. In this case, all labels found in the sample will be automatically added
+to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
 
 > #### Example
 >
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 945a1568a..1dbe78703 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## EntityLinker.begin_training {#begin_training tag="method"}
+## EntityLinker.initialize {#initialize tag="method"}
 
 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@@ -150,11 +150,17 @@ validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data.
 
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
 > entity_linker = nlp.add_pipe("entity_linker", last=True)
-> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = entity_linker.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```
 
 | Name           | Description                                                                                                                           |
@@ -211,7 +217,7 @@ pipe's entity linking model and context encoder. Delegates to
 >
 > ```python
 > entity_linker = nlp.add_pipe("entity_linker")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = entity_linker.update(examples, sgd=optimizer)
 > ```
 
diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 6d710f425..2c32ff753 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## EntityRecognizer.begin_training {#begin_training tag="method"}
+## EntityRecognizer.initialize {#initialize tag="method"}
 
 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@@ -140,11 +140,17 @@ validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data.
 
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
 > ner = nlp.add_pipe("ner")
-> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = ner.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```
 
 | Name           | Description                                                                                                                           |
@@ -199,7 +205,7 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
 >
 > ```python
 > ner = nlp.add_pipe("ner")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = ner.update(examples, sgd=optimizer)
 > ```
 
@@ -282,11 +288,10 @@ context, the original parameters are restored.
 ## EntityRecognizer.add_label {#add_label tag="method"}
 
 Add a new label to the pipe. Note that you don't have to call this method if you
-provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+provide a **representative data sample** to the [`initialize`](#initialize)
+method. In this case, all labels found in the sample will be automatically added
+to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
 
 > #### Example
 >
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index dd3cc57dd..11631502c 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -201,30 +201,31 @@ more efficient than processing texts one-by-one.
 | `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~                                                                                                               |
 | **YIELDS**                                 | Documents in the order of the original text. ~~Doc~~                                                                                                                |
 
-## Language.begin_training {#begin_training tag="method"}
+## Language.initialize {#initialize tag="method"}
 
 Initialize the pipeline for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
 function that returns an iterable of [`Example`](/api/example) objects. The data
 examples can either be the full training data or a representative sample. They
 are used to **initialize the models** of trainable pipeline components and are
-passed each component's [`begin_training`](/api/pipe#begin_training) method, if
+passed to each component's [`initialize`](/api/pipe#initialize) method, if
 available. Initialization includes validating the network,
 [inferring missing shapes](/usage/layers-architectures#thinc-shape-inference)
 and setting up the label scheme based on the data.
 
-If no `get_examples` function is provided when calling `nlp.begin_training`, the
+If no `get_examples` function is provided when calling `nlp.initialize`, the
 pipeline components will be initialized with generic data. In this case, it is
 crucial that the output dimension of each component has already been defined
 either in the [config](/usage/training#config), or by calling
 [`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for
 the tagger or textcat).
 
-<Infobox variant="warning" title="Changed in v3.0">
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
 
-The `Language.update` method now takes a **function** that is called with no
-arguments and returns a sequence of [`Example`](/api/example) objects instead of
-tuples of `Doc` and `GoldParse` objects.
+This method was previously called `begin_training`. It now also takes a
+**function** that is called with no arguments and returns a sequence of
+[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse`
+objects.
 
 </Infobox>
 
@@ -232,7 +233,7 @@ tuples of `Doc` and `GoldParse` objects.
 >
 > ```python
 > get_examples = lambda: examples
-> optimizer = nlp.begin_training(get_examples)
+> optimizer = nlp.initialize(get_examples)
 > ```
 
 | Name           | Description                                                                                                                                              |
@@ -636,13 +637,13 @@ list, will be disabled. Under the hood, this method calls into
 >
 > ```python
 > with nlp.select_pipes(disable=["tagger", "parser"]):
->    nlp.begin_training()
+>     nlp.initialize()
 >
 > with nlp.select_pipes(enable="ner"):
->     nlp.begin_training()
+>     nlp.initialize()
 >
 > disabled = nlp.select_pipes(disable=["tagger", "parser"])
-> nlp.begin_training()
+> nlp.initialize()
 > disabled.restore()
 > ```
 
diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md
index e1a166474..4f00a09ef 100644
--- a/website/docs/api/morphologizer.md
+++ b/website/docs/api/morphologizer.md
@@ -117,7 +117,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## Morphologizer.begin_training {#begin_training tag="method"}
+## Morphologizer.initialize {#initialize tag="method"}
 
 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@@ -133,7 +133,7 @@ setting up the label scheme based on the data.
 > ```python
 > morphologizer = nlp.add_pipe("morphologizer")
 > nlp.pipeline.append(morphologizer)
-> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = morphologizer.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```
 
 | Name           | Description                                                                                                                           |
@@ -189,7 +189,7 @@ Delegates to [`predict`](/api/morphologizer#predict) and
 >
 > ```python
 > morphologizer = nlp.add_pipe("morphologizer")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = morphologizer.update(examples, sgd=optimizer)
 > ```
 
@@ -259,12 +259,11 @@ context, the original parameters are restored.
 Add a new label to the pipe. If the `Morphologizer` should set annotations for
 both `pos` and `morph`, the label should include the UPOS as the feature `POS`.
 Raises an error if the output dimension is already set, or if the model has
-already been fully [initialized](#begin_training). Note that you don't have to
-call this method if you provide a **representative data sample** to the
-[`begin_training`](#begin_training) method. In this case, all labels found in
-the sample will be automatically added to the model, and the output dimension
-will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
+already been fully [initialized](#initialize). Note that you don't have to call
+this method if you provide a **representative data sample** to the
+[`initialize`](#initialize) method. In this case, all labels found in the sample
+will be automatically added to the model, and the output dimension will be
+[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
 
 > #### Example
 >
diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index e4e1e97f1..17752ed5e 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## Pipe.begin_training {#begin_training tag="method"}
+## Pipe.initialize {#initialize tag="method"}
 
 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@@ -109,11 +109,17 @@ validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data.
 
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
 > pipe = nlp.add_pipe("your_custom_pipe")
-> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = pipe.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```
 
 | Name           | Description                                                                                                                           |
@@ -180,7 +186,7 @@ predictions and gold-standard annotations, and update the component's model.
 >
 > ```python
 > pipe = nlp.add_pipe("your_custom_pipe")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = pipe.update(examples, sgd=optimizer)
 > ```
 
@@ -296,9 +302,9 @@ context, the original parameters are restored.
 Add a new label to the pipe, to be predicted by the model. The actual
 implementation depends on the specific component, but in general `add_label`
 shouldn't be called if the output dimension is already set, or if the model has
-already been fully [initialized](#begin_training). If these conditions are
-violated, the function will raise an Error. The exception to this rule is when
-the component is [resizable](#is_resizable), in which case
+already been fully [initialized](#initialize). If these conditions are violated,
+the function will raise an Error. The exception to this rule is when the
+component is [resizable](#is_resizable), in which case
 [`set_output`](#set_output) should be called to ensure that the model is
 properly resized.
 
@@ -314,9 +320,9 @@ This method needs to be overwritten with your own custom `add_label` method.
 | **RETURNS** | 0 if the label is already present, otherwise 1. ~~int~~ |
 
 Note that in general, you don't have to call `pipe.add_label` if you provide a
-representative data sample to the [`begin_training`](#begin_training) method. In
-this case, all labels found in the sample will be automatically added to the
-model, and the output dimension will be
+representative data sample to the [`initialize`](#initialize) method. In this
+case, all labels found in the sample will be automatically added to the model,
+and the output dimension will be
 [inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
 
 ## Pipe.is_resizable {#is_resizable tag="method"}
diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md
index acf94fb8e..d81725343 100644
--- a/website/docs/api/sentencerecognizer.md
+++ b/website/docs/api/sentencerecognizer.md
@@ -114,7 +114,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## SentenceRecognizer.begin_training {#begin_training tag="method"}
+## SentenceRecognizer.initialize {#initialize tag="method"}
 
 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@@ -129,7 +129,7 @@ setting up the label scheme based on the data.
 >
 > ```python
 > senter = nlp.add_pipe("senter")
-> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = senter.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```
 
 | Name           | Description                                                                                                                           |
@@ -185,7 +185,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
 >
 > ```python
 > senter = nlp.add_pipe("senter")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = senter.update(examples, sgd=optimizer)
 > ```
 
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md
index d428d376e..6ca554f49 100644
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## Tagger.begin_training {#begin_training tag="method"}
+## Tagger.initialize {#initialize tag="method"}
 
 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@@ -123,11 +123,17 @@ validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data.
 
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
 > tagger = nlp.add_pipe("tagger")
-> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = tagger.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```
 
 | Name           | Description                                                                                                                           |
@@ -183,7 +189,7 @@ Delegates to [`predict`](/api/tagger#predict) and
 >
 > ```python
 > tagger = nlp.add_pipe("tagger")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = tagger.update(examples, sgd=optimizer)
 > ```
 
@@ -289,12 +295,12 @@ context, the original parameters are restored.
 ## Tagger.add_label {#add_label tag="method"}
 
 Add a new label to the pipe. Raises an error if the output dimension is already
-set, or if the model has already been fully [initialized](#begin_training). Note
+set, or if the model has already been fully [initialized](#initialize). Note
 that you don't have to call this method if you provide a **representative data
-sample** to the [`begin_training`](#begin_training) method. In this case, all
-labels found in the sample will be automatically added to the model, and the
-output dimension will be
-[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
+sample** to the [`initialize`](#initialize) method. In this case, all labels
+found in the sample will be automatically added to the model, and the output
+dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
+automatically.
 
 > #### Example
 >
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index b68039094..4c99d6984 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## TextCategorizer.begin_training {#begin_training tag="method"}
+## TextCategorizer.initialize {#initialize tag="method"}
 
 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@@ -136,11 +136,17 @@ validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data.
 
+<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
+
+This method was previously called `begin_training`.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
 > textcat = nlp.add_pipe("textcat")
-> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = textcat.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```
 
 | Name           | Description                                                                                                                           |
@@ -196,14 +202,14 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
 >
 > ```python
 > textcat = nlp.add_pipe("textcat")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = textcat.update(examples, sgd=optimizer)
 > ```
 
 | Name              | Description                                                                                                                        |
 | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
 | `examples`        | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                  |
-| _keyword-only_    |                                                                                                                                    | 
+| _keyword-only_    |                                                                                                                                    |
 | `drop`            | The dropout rate. ~~float~~                                                                                                        |
 | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
 | `sgd`             | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                      |
@@ -227,7 +233,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
 | Name           | Description                                                                                                              |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                        |
-| _keyword-only_ |                                                                                                                          | 
+| _keyword-only_ |                                                                                                                          |
 | `drop`         | The dropout rate. ~~float~~                                                                                              |
 | `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~            |
 | `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
@@ -303,12 +309,12 @@ Modify the pipe's model, to use the given parameter values.
 ## TextCategorizer.add_label {#add_label tag="method"}
 
 Add a new label to the pipe. Raises an error if the output dimension is already
-set, or if the model has already been fully [initialized](#begin_training). Note
+set, or if the model has already been fully [initialized](#initialize). Note
 that you don't have to call this method if you provide a **representative data
-sample** to the [`begin_training`](#begin_training) method. In this case, all
-labels found in the sample will be automatically added to the model, and the
-output dimension will be
-[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
+sample** to the [`initialize`](#initialize) method. In this case, all labels
+found in the sample will be automatically added to the model, and the output
+dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
+automatically.
 
 > #### Example
 >
diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md
index 5c7214edc..8269ad7cf 100644
--- a/website/docs/api/tok2vec.md
+++ b/website/docs/api/tok2vec.md
@@ -123,7 +123,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## Tok2Vec.begin_training {#begin_training tag="method"}
+## Tok2Vec.initialize {#initialize tag="method"}
 
 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@@ -138,7 +138,7 @@ setting up the label scheme based on the data.
 >
 > ```python
 > tok2vec = nlp.add_pipe("tok2vec")
-> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = tok2vec.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```
 
 | Name           | Description                                                                                                                           |
@@ -193,7 +193,7 @@ Delegates to [`predict`](/api/tok2vec#predict).
 >
 > ```python
 > tok2vec = nlp.add_pipe("tok2vec")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = tok2vec.update(examples, sgd=optimizer)
 > ```
 
diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md
index d5bcef229..712214fec 100644
--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@@ -158,7 +158,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## Transformer.begin_training {#begin_training tag="method"}
+## Transformer.initialize {#initialize tag="method"}
 
 Initialize the component for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@@ -173,7 +173,7 @@ setting up the label scheme based on the data.
 >
 > ```python
 > trf = nlp.add_pipe("transformer")
-> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline)
+> optimizer = trf.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```
 
 | Name           | Description                                                                                                                           |
@@ -241,7 +241,7 @@ and call the optimizer, while the others simply increment the gradients.
 >
 > ```python
 > trf = nlp.add_pipe("transformer")
-> optimizer = nlp.begin_training()
+> optimizer = nlp.initialize()
 > losses = trf.update(examples, sgd=optimizer)
 > ```
 
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index a58ba2ba9..b65c3d903 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -460,8 +460,8 @@ The built-in [pipeline components](/usage/processing-pipelines) in spaCy ensure
 that their internal models are **always initialized** with appropriate sample
 data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a
 ~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This
-functionality is triggered when
-[`nlp.begin_training`](/api/language#begin_training) is called.
+functionality is triggered when [`nlp.initialize`](/api/language#initialize) is
+called.
 
 ### Dropout and normalization in Thinc {#thinc-dropout-norm}
 
@@ -491,7 +491,7 @@ with Model.define_operators({">>": chain}):
 
 <!-- TODO: write trainable component section
 - Interaction with `predict`, `get_loss` and `set_annotations`
-- Initialization life-cycle with `begin_training`, correlation with add_label
+- Initialization life-cycle with `initialize`, correlation with add_label
 Example: relation extraction component (implemented as project template)
 Avoid duplication with usage/processing-pipelines#trainable-components ?
 -->
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index dbf0881ac..b1cf2723b 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1126,12 +1126,12 @@ For some use cases, it makes sense to also overwrite additional methods to
 customize how the model is updated from examples, how it's initialized, how the
 loss is calculated and to add evaluation scores to the training output.
 
-| Name                                         | Description                                                                                                                                                                                                                                                                                                        |
-| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| [`update`](/api/pipe#update)                 | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                |
-| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided.                                                                                                                 |
-| [`get_loss`](/api/pipe#get_loss)             | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                      |
-| [`score`](/api/pipe#score)                   | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_socre_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
+| Name                                 | Description                                                                                                                                                                                                                                                                                                        |
+| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [`update`](/api/pipe#update)         | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                |
+| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided.                                                                                                                 |
+| [`get_loss`](/api/pipe#get_loss)     | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                      |
+| [`score`](/api/pipe#score)           | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
 
 <Infobox title="Custom trainable components and models" emoji="📖">
 
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 54be6b367..1c1b92e03 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -1045,8 +1045,8 @@ of being dropped.
 
 > - [`nlp`](/api/language): The `nlp` object with the pipeline components and
 >   their models.
-> - [`nlp.begin_training`](/api/language#begin_training): Start the training and
->   return an optimizer to update the component model weights.
+> - [`nlp.initialize`](/api/language#initialize): Start the training and return
+>   an optimizer to update the component model weights.
 > - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds
 >   state between updates.
 > - [`nlp.update`](/api/language#update): Update component models with examples.
@@ -1057,7 +1057,7 @@ of being dropped.
 
 ```python
 ### Example training loop
-optimizer = nlp.begin_training()
+optimizer = nlp.initialize()
 for itn in range(100):
     random.shuffle(train_data)
     for raw_text, entity_offsets in train_data:
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 94c50e1ec..44f902cd5 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -526,10 +526,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
   [`Pipe.update`](/api/pipe#update) methods now all take batches of
   [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
   raw text and a dictionary of annotations.
-  [`Language.begin_training`](/api/language#begin_training) and
-  [`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
-  returns a sequence of `Example` objects to initialize the model instead of a
-  list of tuples.
+  [`Language.initialize`](/api/language#initialize) and
+  [`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
+  sequence of `Example` objects to initialize the model instead of a list of
+  tuples.
+- The `begin_training` methods have been renamed to `initialize`.
 - [`Matcher.add`](/api/matcher#add) and
   [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
   patterns as the second argument (instead of a variable number of arguments).
@@ -555,6 +556,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | Removed                                                                                      | Replacement                                                                                                                                                                                                              |
 | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `Language.disable_pipes`                                                                     | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe)                                                                                                             |
+| `Language.begin_training`, `Pipe.begin_training`, ...                                        | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ...                                                                                                                        |
 | `Doc.is_tagged`, `Doc.is_parsed`, ...                                                        | [`Doc.has_annotation`](/api/doc#has_annotation)                                                                                                                                                                          |
 | `GoldParse`                                                                                  | [`Example`](/api/example)                                                                                                                                                                                                |
 | `GoldCorpus`                                                                                 | [`Corpus`](/api/corpus)                                                                                                                                                                                                  |
@@ -936,7 +938,7 @@ TRAIN_DATA = [
     ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
     ("I like London.", {"entities": [(7, 13, "LOC")]}),
 ]
-nlp.begin_training()
+nlp.initialize()
 for i in range(20):
     random.shuffle(TRAIN_DATA)
     for batch in minibatch(TRAIN_DATA):
@@ -946,17 +948,18 @@ for i in range(20):
         nlp.update(examples)
 ```
 
-[`Language.begin_training`](/api/language#begin_training) and
-[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
-returns a sequence of `Example` objects to initialize the model instead of a
-list of tuples. The data examples are used to **initialize the models** of
+`Language.begin_training` and `Pipe.begin_training` have been renamed to
+[`Language.initialize`](/api/language#initialize) and
+[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function
+that returns a sequence of `Example` objects to initialize the model instead of
+a list of tuples. The data examples are used to **initialize the models** of
 trainable pipeline components, which includes validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme.
 
 ```diff
-- nlp.begin_training(examples)
-+ nlp.begin_training(lambda: examples)
+- nlp.begin_training(examples)
++ nlp.initialize(lambda: examples)
 ```
 
 #### Packaging trained pipelines {#migrating-training-packaging}

From 64d90039a1ae42a1ecb77abe71622398d3bc289b Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 29 Sep 2020 10:54:42 +0200
Subject: [PATCH 226/516] encoding UTF8

---
 spacy/cli/project/document.py  | 2 +-
 website/docs/usage/training.md | 2 +-
 website/setup/jinja_to_js.py   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py
index d0265029a..811b7c746 100644
--- a/spacy/cli/project/document.py
+++ b/spacy/cli/project/document.py
@@ -114,6 +114,6 @@ def project_document(
                 content = f"{before}{content}{after}"
             else:
                 msg.warn("Replacing existing file")
-        with output_file.open("w") as f:
+        with output_file.open("w", encoding="utf8") as f:
             f.write(content)
         msg.good("Saved project documentation", output_file)
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index eb02b135a..97992287b 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -700,7 +700,7 @@ from pathlib import Path
 @spacy.registry.loggers("my_custom_logger.v1")
 def custom_logger(log_path):
     def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
-        with Path(log_path).open("w") as file_:
+        with Path(log_path).open("w", encoding="utf8") as file_:
             file_.write("step\\t")
             file_.write("score\\t")
             for pipe in nlp.pipe_names:
diff --git a/website/setup/jinja_to_js.py b/website/setup/jinja_to_js.py
index 114d0e172..e2eca7ffb 100644
--- a/website/setup/jinja_to_js.py
+++ b/website/setup/jinja_to_js.py
@@ -1256,7 +1256,7 @@ def main(template_path, output=None, data_path=None):
     data_str = f"export const DATA = {data}"
     result = compiler.get_output()
     if output is not None:
-        with output.open("w") as f:
+        with output.open("w", encoding="utf8") as f:
             f.write(f"{header}\n{result}\n{data_str}")
         print(f"Updated {output.parts[-1]}")
     else:

From 4925ad760a87d84b7cc4bb2fb48b45845a2e0c30 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 10:58:50 +0200
Subject: [PATCH 227/516] Add init vectors

---
 spacy/cli/init_pipeline.py   |  26 +++++++-
 spacy/training/initialize.py | 117 +++++++++++++++++++++++++++++++++--
 2 files changed, 138 insertions(+), 5 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index a92705cb0..0e9de0eb4 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -5,11 +5,35 @@ from wasabi import msg
 import typer
 
 from .. import util
-from ..training.initialize import init_nlp
+from ..training.initialize import init_nlp, convert_vectors
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu
 
 
+@init_cli.command("vectors")
+def init_vectors_cli(
+    # fmt: off
+    lang: str = Arg(..., help="The language of the nlp object to create"),
+    vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
+    output_dir: Path = Arg(..., help="Pipeline output directory"),
+    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
+    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
+    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
+    # fmt: on
+):
+    msg.info(f"Creating blank nlp object for language '{lang}'")
+    nlp = util.get_lang_class(lang)()
+    convert_vectors(
+        nlp, vectors_loc, truncate=truncate, prune=prune, name=name, silent=False
+    )
+    nlp.to_disk(output_dir)
+    msg.good(
+        "Saved nlp object with vectors to output directory. You can now use the "
+        "path to it in your config as the 'vectors' setting in [initialize.vocab].",
+        output_dir,
+    )
+
+
 @init_cli.command(
     "nlp",
     context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 23debfb28..9a47a7f69 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,13 +1,19 @@
-from typing import Union, Dict, Optional, Any, List
+from typing import Union, Dict, Optional, Any, List, IO
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
 from wasabi import Printer
 import srsly
+import numpy
+import tarfile
+import gzip
+import zipfile
+import tqdm
 
 from .loop import create_before_to_disk_callback
 from ..language import Language
 from ..lookups import Lookups
+from ..vectors import Vectors
 from ..errors import Errors
 from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
 from ..util import registry, load_model_from_config, resolve_dot_names
@@ -49,8 +55,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
             msg.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-        msg.good(f"Initialized pipeline components")
+        nlp.initialize(
+            lambda: train_corpus(nlp), sgd=optimizer, settings=I["components"]
+        )
+        msg.good("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
@@ -103,7 +111,7 @@ def init_vocab(
 
 
 def load_vectors_into_model(
-    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
+    nlp: Language, name: Union[str, Path], *, add_strings: bool = True
 ) -> None:
     """Load word vectors from an installed model or path into a model instance."""
     try:
@@ -202,3 +210,104 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
         for name, cfg in config.get("components", {}).items()
         if "factory" not in cfg and "source" in cfg
     ]
+
+
+def convert_vectors(
+    nlp: Language,
+    vectors_loc: Optional[Path],
+    *,
+    truncate: int,
+    prune: int,
+    name: Optional[str] = None,
+    silent: bool = True,
+) -> None:
+    msg = Printer(no_print=silent)
+    vectors_loc = ensure_path(vectors_loc)
+    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
+        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
+        for lex in nlp.vocab:
+            if lex.rank and lex.rank != OOV_RANK:
+                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
+    else:
+        if vectors_loc:
+            with msg.loading(f"Reading vectors from {vectors_loc}"):
+                vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
+            msg.good(f"Loaded vectors from {vectors_loc}")
+        else:
+            vectors_data, vector_keys = (None, None)
+        if vector_keys is not None:
+            for word in vector_keys:
+                if word not in nlp.vocab:
+                    nlp.vocab[word]
+        if vectors_data is not None:
+            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
+    if name is None:
+        # TODO: Is this correct? Does this matter?
+        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
+    else:
+        nlp.vocab.vectors.name = name
+    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
+    if prune >= 1:
+        nlp.vocab.prune_vectors(prune)
+    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
+
+
+def read_vectors(vectors_loc: Path, truncate_vectors: int):
+    f = open_file(vectors_loc)
+    f = ensure_shape(f)
+    shape = tuple(int(size) for size in next(f).split())
+    if truncate_vectors >= 1:
+        shape = (truncate_vectors, shape[1])
+    vectors_data = numpy.zeros(shape=shape, dtype="f")
+    vectors_keys = []
+    for i, line in enumerate(tqdm.tqdm(f)):
+        line = line.rstrip()
+        pieces = line.rsplit(" ", vectors_data.shape[1])
+        word = pieces.pop(0)
+        if len(pieces) != vectors_data.shape[1]:
+            raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
+        vectors_data[i] = numpy.asarray(pieces, dtype="f")
+        vectors_keys.append(word)
+        if i == truncate_vectors - 1:
+            break
+    return vectors_data, vectors_keys
+
+
+def open_file(loc: Union[str, Path]) -> IO:
+    """Handle .gz, .tar.gz or unzipped files"""
+    loc = ensure_path(loc)
+    if tarfile.is_tarfile(str(loc)):
+        return tarfile.open(str(loc), "r:gz")
+    elif loc.parts[-1].endswith("gz"):
+        return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
+    elif loc.parts[-1].endswith("zip"):
+        zip_file = zipfile.ZipFile(str(loc))
+        names = zip_file.namelist()
+        file_ = zip_file.open(names[0])
+        return (line.decode("utf8") for line in file_)
+    else:
+        return loc.open("r", encoding="utf8")
+
+
+def ensure_shape(lines):
+    """Ensure that the first line of the data is the vectors shape.
+    If it's not, we read in the data and output the shape as the first result,
+    so that the reader doesn't have to deal with the problem.
+    """
+    first_line = next(lines)
+    try:
+        shape = tuple(int(size) for size in first_line.split())
+    except ValueError:
+        shape = None
+    if shape is not None:
+        # All good, give the data
+        yield first_line
+        yield from lines
+    else:
+        # Figure out the shape, make it the first value, and then give the
+        # rest of the data.
+        width = len(first_line.split()) - 1
+        captured = [first_line] + list(lines)
+        length = len(captured)
+        yield f"{length} {width}"
+        yield from captured

From 5276db6f3f4f44eb98cf984e7e54f9790b00d08e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 11:42:19 +0200
Subject: [PATCH 228/516] Remove 'device' argument from Language, clean up
 'sgd' arg

---
 spacy/language.py | 57 ++++++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 30 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index a5b78b178..5b1f50ee2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -19,7 +19,7 @@ from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .training import Example, validate_examples
 from .scorer import Scorer
-from .util import create_default_optimizer, registry, SimpleFrozenList
+from .util import registry, SimpleFrozenList
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
 from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
@@ -1065,7 +1065,7 @@ class Language:
         validate_examples(examples, "Language.update")
         if sgd is None:
             if self._optimizer is None:
-                self._optimizer = create_default_optimizer()
+                self._optimizer = self.create_optimizer()
             sgd = self._optimizer
         if component_cfg is None:
             component_cfg = {}
@@ -1123,7 +1123,7 @@ class Language:
         validate_examples(examples, "Language.rehearse")
         if sgd is None:
             if self._optimizer is None:
-                self._optimizer = create_default_optimizer()
+                self._optimizer = self.create_optimizer()
             sgd = self._optimizer
         pipes = list(self.pipeline)
         random.shuffle(pipes)
@@ -1161,16 +1161,14 @@ class Language:
     def initialize(
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
-        *,
-        sgd: Optional[Optimizer] = None,
-        device: int = -1,
-    ) -> Optimizer:
+        sgd: Optional[Optimizer]=None
+    ) -> None:
         """Initialize the pipe for training, using data examples if available.
 
         get_examples (Callable[[], Iterable[Example]]): Optional function that
             returns gold-standard Example objects.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
+        sgd (Optional[Optimizer]): An optimizer to use for updates. If not 
+            provided, will be created using the .create_optimizer() method.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
         DOCS: https://nightly.spacy.io/api/language#initialize
@@ -1199,25 +1197,22 @@ class Language:
         if not valid_examples:
             err = Errors.E930.format(name="Language", obj="empty list")
             raise ValueError(err)
-        if device >= 0:  # TODO: do we need this here?
-            require_gpu(device)
-            if self.vocab.vectors.data.shape[1] >= 1:
-                ops = get_current_ops()
-                self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        if sgd is None:
-            sgd = create_default_optimizer()
-        self._optimizer = sgd
+        if self.vocab.vectors.data.shape[1] >= 1:
+            ops = get_current_ops()
+            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
         for name, proc in self.pipeline:
             if hasattr(proc, "initialize"):
                 proc.initialize(
-                    get_examples, pipeline=self.pipeline, sgd=self._optimizer
+                    get_examples, pipeline=self.pipeline
                 )
         self._link_components()
+        if sgd is not None:
+            self._optimizer = sgd
+        elif self._optimizer is None:
+            self._optimizer = self.create_optimizer()
         return self._optimizer
 
-    def resume_training(
-        self, *, sgd: Optional[Optimizer] = None, device: int = -1
-    ) -> Optimizer:
+    def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
         """Continue training a pretrained model.
 
         Create and return an optimizer, and initialize "rehearsal" for any pipeline
@@ -1226,22 +1221,20 @@ class Language:
         rehearsal, collect samples of text you want the models to retain performance
         on, and call nlp.rehearse() with a batch of Example objects.
 
-        sgd (Optional[Optimizer]): An optimizer.
         RETURNS (Optimizer): The optimizer.
 
         DOCS: https://nightly.spacy.io/api/language#resume_training
         """
-        if device >= 0:  # TODO: do we need this here?
-            require_gpu(device)
-            ops = get_current_ops()
-            if self.vocab.vectors.data.shape[1] >= 1:
-                self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        if sgd is None:
-            sgd = create_default_optimizer()
-        self._optimizer = sgd
+        ops = get_current_ops()
+        if self.vocab.vectors.data.shape[1] >= 1:
+            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
         for name, proc in self.pipeline:
             if hasattr(proc, "_rehearsal_model"):
                 proc._rehearsal_model = deepcopy(proc.model)
+        if sgd is not None:
+            self._optimizer = sgd
+        elif self._optimizer is None:
+            self._optimizer = self.create_optimizer()
         return self._optimizer
 
     def evaluate(
@@ -1302,6 +1295,10 @@ class Language:
         n_words = sum(len(doc) for doc in docs)
         results["speed"] = n_words / (end_time - start_time)
         return results
+    
+    def create_optimizer(self):
+        """Create an optimizer, usually using the [training.optimizer] config."""
+        return registry.resolve(self.config["training"]["optimizer"])
 
     @contextmanager
     def use_params(self, params: Optional[dict]):

From b3b6868639f3982f9cbe584784faa7371f7d7b07 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 11:42:35 +0200
Subject: [PATCH 229/516] Remove 'sgd' arg from component initialize

---
 spacy/pipeline/morphologizer.pyx     | 7 +------
 spacy/pipeline/multitask.pyx         | 4 ++--
 spacy/pipeline/pipe.pyx              | 4 +---
 spacy/pipeline/senter.pyx            | 6 ++----
 spacy/pipeline/tagger.pyx            | 7 +------
 spacy/pipeline/textcat.py            | 3 +--
 spacy/pipeline/transition_parser.pyx | 9 +++------
 7 files changed, 11 insertions(+), 29 deletions(-)

diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index d035172a8..580b6b831 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -129,7 +129,7 @@ class Morphologizer(Tagger):
             self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         return 1
 
-    def initialize(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
@@ -138,8 +138,6 @@ class Morphologizer(Tagger):
         pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
             components that this component is part of. Corresponds to
             nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
         DOCS: https://nightly.spacy.io/api/morphologizer#initialize
@@ -178,9 +176,6 @@ class Morphologizer(Tagger):
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
 
     def set_annotations(self, docs, batch_tag_ids):
         """Modify a batch of documents, using pre-computed scores.
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
index 3fd034b30..ba406dabe 100644
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
     def set_annotations(self, docs, dep_ids):
         pass
 
-    def initialize(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None):
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
             raise ValueError(err)
@@ -177,7 +177,7 @@ class ClozeMultitask(Pipe):
     def set_annotations(self, docs, dep_ids):
         pass
 
-    def initialize(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None):
         self.model.initialize()  # TODO: fix initialization by defining X and Y
         X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
         self.model.output_layer.initialize(X)
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index bff2be1af..08015e60e 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -183,7 +183,7 @@ cdef class Pipe:
         """
         return util.create_default_optimizer()
 
-    def initialize(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None):
         """Initialize the pipe for training, using data examples if available.
         This method needs to be implemented by each Pipe component,
         ensuring the internal model (if available) is initialized properly
@@ -194,8 +194,6 @@ cdef class Pipe:
         pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
             components that this component is part of. Corresponds to
             nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
         DOCS: https://nightly.spacy.io/api/pipe#initialize
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 68a9860a5..91ce9f1bb 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -124,7 +124,7 @@ class SentenceRecognizer(Tagger):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores
 
-    def initialize(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
@@ -133,9 +133,7 @@ class SentenceRecognizer(Tagger):
         pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
             components that this component is part of. Corresponds to
             nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        RETURNS: None
 
         DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
         """
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 66f8b38b6..ecf93600e 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -256,7 +256,7 @@ class Tagger(Pipe):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores
 
-    def initialize(self, get_examples, *, pipeline=None, sgd=None):
+    def initialize(self, get_examples, *, pipeline=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
@@ -265,8 +265,6 @@ class Tagger(Pipe):
         pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
             components that this component is part of. Corresponds to
             nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
         DOCS: https://nightly.spacy.io/api/tagger#initialize
@@ -289,9 +287,6 @@ class Tagger(Pipe):
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
 
     def add_label(self, label):
         """Add a new label to the pipe.
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 37665adfc..67e8777c5 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -338,8 +338,7 @@ class TextCategorizer(Pipe):
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
-        pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
-        sgd: Optional[Optimizer] = None,
+        pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None
     ) -> Optimizer:
         """Initialize the pipe for training, using a representative set
         of data examples.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 5a4503cf9..9a2e5d8d0 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -354,7 +354,7 @@ cdef class Parser(Pipe):
             # If all weights for an output are 0 in the original model, don't
             # supervise that output. This allows us to add classes.
             loss += (d_scores**2).sum()
-            backprop(d_scores, sgd=sgd)
+            backprop(d_scores)
             # Follow the predicted action
             self.transition_states(states, guesses)
             states = [state for state in states if not state.is_final()]
@@ -405,9 +405,8 @@ cdef class Parser(Pipe):
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
 
-    def initialize(self, get_examples, pipeline=None, sgd=None, **kwargs):
+    def initialize(self, get_examples, pipeline=None, settings=None):
         self._ensure_examples(get_examples)
-        self.cfg.update(kwargs)
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
         if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
             langs = ", ".join(util.LEXEME_NORM_LANGS)
@@ -425,8 +424,6 @@ cdef class Parser(Pipe):
         self.moves.initialize_actions(actions)
         # make sure we resize so we have an appropriate upper layer
         self._resize()
-        if sgd is None:
-            sgd = self.create_optimizer()
         doc_sample = []
         if pipeline is not None:
             for name, component in pipeline:
@@ -442,7 +439,7 @@ cdef class Parser(Pipe):
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(doc_sample)
         if pipeline is not None:
-            self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
+            self.init_multitask_objectives(get_examples, pipeline)
         return sgd
 
     def to_disk(self, path, exclude=tuple()):

From dec984a9c1c067bc1538959da44e49df5b715965 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 11:52:45 +0200
Subject: [PATCH 230/516] Update Language.initialize and support
 components/tokenizer settings

---
 spacy/language.py                    | 21 ++++++-
 spacy/pipeline/transition_parser.pyx |  2 +-
 spacy/schemas.py                     | 93 +++++++++++++++++++++++++++-
 3 files changed, 111 insertions(+), 5 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index a5b78b178..20b7a7256 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -27,7 +27,7 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp
+from .schemas import ConfigSchema, ConfigSchemaNlp, validate_init_settings
 from .git_info import GIT_VERSION
 from . import util
 from . import about
@@ -1162,6 +1162,7 @@ class Language:
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
+        settings: Dict[str, Dict[str, Any]] = SimpleFrozenDict(),
         sgd: Optional[Optimizer] = None,
         device: int = -1,
     ) -> Optimizer:
@@ -1207,10 +1208,26 @@ class Language:
         if sgd is None:
             sgd = create_default_optimizer()
         self._optimizer = sgd
+        if hasattr(self.tokenizer, "initialize"):
+            tok_settings = settings.get("tokenizer", {})
+            tok_settings = validate_init_settings(
+                self.tokenizer.initialize,
+                tok_settings,
+                section="tokenizer",
+                name="tokenizer",
+            )
+            self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
         for name, proc in self.pipeline:
             if hasattr(proc, "initialize"):
+                p_settings = settings.get(name, {})
+                p_settings = validate_init_settings(
+                    proc.initialize, p_settings, section="components", name=name
+                )
                 proc.initialize(
-                    get_examples, pipeline=self.pipeline, sgd=self._optimizer
+                    get_examples,
+                    pipeline=self.pipeline,
+                    sgd=self._optimizer,
+                    **p_settings,
                 )
         self._link_components()
         return self._optimizer
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 5a4503cf9..78e3422f6 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
+# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 from __future__ import print_function
 from cymem.cymem cimport Pool
 cimport numpy as np
diff --git a/spacy/schemas.py b/spacy/schemas.py
index b98498b8b..cdd8c11ed 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -1,11 +1,13 @@
 from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
 from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
-from pydantic import BaseModel, Field, ValidationError, validator
+from pydantic import BaseModel, Field, ValidationError, validator, create_model
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
+from pydantic.main import ModelMetaclass
+from thinc.api import Optimizer, ConfigValidationError
 from thinc.config import Promise
 from collections import defaultdict
-from thinc.api import Optimizer
+import inspect
 
 from .attrs import NAMES
 from .lookups import Lookups
@@ -43,6 +45,93 @@ def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
         return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()]
 
 
+# Initialization
+
+
+class ArgSchemaConfig:
+    extra = "forbid"
+    arbitrary_types_allowed = True
+
+
+class ArgSchemaConfigExtra:
+    extra = "forbid"
+    arbitrary_types_allowed = True
+
+
+def get_arg_model(
+    func: Callable,
+    *,
+    exclude: Iterable[str] = tuple(),
+    name: str = "ArgModel",
+    strict: bool = True,
+) -> ModelMetaclass:
+    """Generate a pydantic model for function arguments.
+
+    func (Callable): The function to generate the schema for.
+    exclude (Iterable[str]): Parameter names to ignore.
+    name (str): Name of created model class.
+    strict (bool): Don't allow extra arguments if no variable keyword arguments
+        are allowed on the function.
+    RETURNS (ModelMetaclass): A pydantic model.
+    """
+    sig_args = {}
+    try:
+        sig = inspect.signature(func)
+    except ValueError:
+        # Typically happens if the method is part of a Cython module without
+        # binding=True. Here we just use an empty model that allows everything.
+        return create_model(name, __config__=ArgSchemaConfigExtra)
+    has_variable = False
+    for param in sig.parameters.values():
+        if param.name in exclude:
+            continue
+        if param.kind == param.VAR_KEYWORD:
+            # The function allows variable keyword arguments so we shouldn't
+            # include **kwargs etc. in the schema and switch to non-strict
+            # mode and pass through all other values
+            has_variable = True
+            continue
+        # If no annotation is specified assume it's anything
+        annotation = param.annotation if param.annotation != param.empty else Any
+        # If no default value is specified assume that it's required
+        default = param.default if param.default != param.empty else ...
+        sig_args[param.name] = (annotation, default)
+    is_strict = strict and not has_variable
+    sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra
+    return create_model(name, **sig_args)
+
+
+def validate_init_settings(
+    func: Callable,
+    settings: Dict[str, Any],
+    *,
+    section: Optional[str] = None,
+    name: str = "",
+    exclude: Iterable[str] = ("get_examples", "pipeline", "sgd"),
+) -> Dict[str, Any]:
+    """Validate initialization settings against the expected arguments in
+    the method signature. Will parse values if possible (e.g. int to string)
+    and return the updated settings dict. Will raise a ConfigValidationError
+    if types don't match or required values are missing.
+
+    func (Callable): The initialize method of a given component etc.
+    settings (Dict[str, Any]): The settings from the repsective [initialize] block.
+    section (str): Initialize section, for error message.
+    name (str): Name of the block in the section.
+    exclude (Iterable[str]): Parameter names to exclude from schema.
+    RETURNS (Dict[str, Any]): The validated settings.
+    """
+    schema = get_arg_model(func, exclude=exclude, name="InitArgModel")
+    try:
+        return schema(**settings).dict()
+    except ValidationError as e:
+        block = "initialize" if not section else f"initialize.{section}"
+        title = f"Error validating initialization settings in [{block}]"
+        raise ConfigValidationError(
+            title=title, errors=e.errors(), config=settings, parent=name,
+        ) from None
+
+
 # Matcher token patterns
 
 

From 78396d137fa2faced8a0a612ed5009fa52e3b721 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 11:57:08 +0200
Subject: [PATCH 231/516] Integrate initialize settings

---
 spacy/language.py            | 3 ++-
 spacy/training/initialize.py | 4 +---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 20b7a7256..5ba7e38f8 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1217,9 +1217,10 @@ class Language:
                 name="tokenizer",
             )
             self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
+        proc_settings = settings.get("components", {})
         for name, proc in self.pipeline:
             if hasattr(proc, "initialize"):
-                p_settings = settings.get(name, {})
+                p_settings = proc_settings.get(name, {})
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name
                 )
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 9a47a7f69..b42732d48 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -55,9 +55,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
             msg.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.initialize(
-            lambda: train_corpus(nlp), sgd=optimizer, settings=I["components"]
-        )
+        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I)
         msg.good("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized

From f2d1b7feb597194746dfd973434a0d683aecd18e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 12:00:08 +0200
Subject: [PATCH 232/516] Clean up sgd

---
 spacy/language.py                    | 3 ++-
 spacy/pipeline/multitask.pyx         | 6 ------
 spacy/pipeline/senter.pyx            | 3 ---
 spacy/pipeline/textcat.py            | 5 -----
 spacy/pipeline/transition_parser.pyx | 1 -
 5 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 5b1f50ee2..8d8f3175b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1298,7 +1298,8 @@ class Language:
     
     def create_optimizer(self):
         """Create an optimizer, usually using the [training.optimizer] config."""
-        return registry.resolve(self.config["training"]["optimizer"])
+        subconfig = {"optimizer": self.config["training"]["optimizer"]}
+        return registry.resolve(subconfig)["optimizer"]
 
     @contextmanager
     def use_params(self, params: Optional[dict]):
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
index ba406dabe..d03fd3ae8 100644
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@@ -91,9 +91,6 @@ class MultitaskObjective(Tagger):
                 if label is not None and label not in self.labels:
                     self.labels[label] = len(self.labels)
         self.model.initialize()   # TODO: fix initialization by defining X and Y
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
 
     def predict(self, docs):
         tokvecs = self.model.get_ref("tok2vec")(docs)
@@ -181,9 +178,6 @@ class ClozeMultitask(Pipe):
         self.model.initialize()  # TODO: fix initialization by defining X and Y
         X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
         self.model.output_layer.initialize(X)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
 
     def predict(self, docs):
         tokvecs = self.model.get_ref("tok2vec")(docs)
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 91ce9f1bb..76767712f 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -149,9 +149,6 @@ class SentenceRecognizer(Tagger):
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
 
     def add_label(self, label, values=None):
         raise NotImplementedError
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 67e8777c5..67ee38217 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -348,8 +348,6 @@ class TextCategorizer(Pipe):
         pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
             components that this component is part of. Corresponds to
             nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
@@ -367,9 +365,6 @@ class TextCategorizer(Pipe):
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         assert len(label_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(X=doc_sample, Y=label_sample)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
 
     def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples.
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 9a2e5d8d0..65f6fa928 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -440,7 +440,6 @@ cdef class Parser(Pipe):
         self.model.initialize(doc_sample)
         if pipeline is not None:
             self.init_multitask_objectives(get_examples, pipeline)
-        return sgd
 
     def to_disk(self, path, exclude=tuple()):
         serializers = {

From 50410c17ac7572fb0eab317cdefe0f55342e5560 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 12:05:38 +0200
Subject: [PATCH 233/516] Update schemas.py

---
 spacy/schemas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/schemas.py b/spacy/schemas.py
index cdd8c11ed..594fc92ad 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -107,7 +107,7 @@ def validate_init_settings(
     *,
     section: Optional[str] = None,
     name: str = "",
-    exclude: Iterable[str] = ("get_examples", "pipeline", "sgd"),
+    exclude: Iterable[str] = ("get_examples", "nlp", "pipeline", "sgd"),
 ) -> Dict[str, Any]:
     """Validate initialization settings against the expected arguments in
     the method signature. Will parse values if possible (e.g. int to string)

From e1fdf2b7c5ef601c19c008f4dc0f4fa6198c077d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 12:05:38 +0200
Subject: [PATCH 234/516] Upd tests

---
 spacy/tests/parser/test_add_label.py  | 2 +-
 spacy/tests/parser/test_preset_sbd.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index fb1eabf7d..2f750b60c 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -35,7 +35,7 @@ def test_init_parser(parser):
 def _train_parser(parser):
     fix_random_seed(1)
     parser.add_label("left")
-    parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
+    parser.initialize(lambda: [_parser_example(parser)])
     sgd = Adam(0.001)
 
     for i in range(5):
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index d8f861b02..ab58ac17b 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -34,7 +34,7 @@ def parser(vocab):
     parser.cfg["hidden_width"] = 32
     # parser.add_label('right')
     parser.add_label("left")
-    parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
+    parser.initialize(lambda: [_parser_example(parser)])
     sgd = Adam(0.001)
 
     for i in range(10):

From 42f0e4c946bc5a5e68d3132ed518a15d994e9eb4 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 12:14:08 +0200
Subject: [PATCH 235/516] Clean up

---
 spacy/language.py              | 10 ++++------
 spacy/pipeline/dep_parser.pyx  |  2 +-
 spacy/pipeline/sentencizer.pyx |  2 +-
 spacy/schemas.py               |  2 +-
 spacy/training/initialize.py   |  2 +-
 5 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 97a317101..6c0a8394d 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -8,7 +8,7 @@ from contextlib import contextmanager
 from copy import deepcopy
 from pathlib import Path
 import warnings
-from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer
+from thinc.api import Model, get_current_ops, Config, Optimizer
 import srsly
 import multiprocessing as mp
 from itertools import chain, cycle
@@ -1153,10 +1153,9 @@ class Language:
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
         sgd: Optional[Optimizer] = None,
-        device: int = -1,
     ) -> Optimizer:
         warnings.warn(Warnings.W089, DeprecationWarning)
-        return self.initialize(get_examples, sgd=sgd, device=device)
+        return self.initialize(get_examples, sgd=sgd)
 
     def initialize(
         self,
@@ -1169,7 +1168,7 @@ class Language:
 
         get_examples (Callable[[], Iterable[Example]]): Optional function that
             returns gold-standard Example objects.
-        sgd (Optional[Optimizer]): An optimizer to use for updates. If not 
+        sgd (Optional[Optimizer]): An optimizer to use for updates. If not
             provided, will be created using the .create_optimizer() method.
         RETURNS (thinc.api.Optimizer): The optimizer.
 
@@ -1220,7 +1219,6 @@ class Language:
                     proc.initialize, p_settings, section="components", name=name
                 )
                 proc.initialize(
-                    get_examples, pipeline=self.pipeline
                     get_examples,
                     pipeline=self.pipeline,
                     **p_settings,
@@ -1315,7 +1313,7 @@ class Language:
         n_words = sum(len(doc) for doc in docs)
         results["speed"] = n_words / (end_time - start_time)
         return results
-    
+
     def create_optimizer(self):
         """Create an optimizer, usually using the [training.optimizer] config."""
         subconfig = {"optimizer": self.config["training"]["optimizer"]}
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 95effac59..eedb4cba9 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -132,7 +132,7 @@ cdef class DependencyParser(Parser):
             labeller.model.set_dim("nO", len(self.labels))
             if labeller.model.has_ref("output_layer"):
                 labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-            labeller.initialize(get_examples, pipeline=pipeline, sgd=sgd)
+            labeller.initialize(get_examples, pipeline=pipeline)
 
     @property
     def labels(self):
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 0f49033ff..3cd480d20 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -58,7 +58,7 @@ class Sentencizer(Pipe):
         else:
             self.punct_chars = set(self.default_punct_chars)
 
-    def initialize(self, get_examples, pipeline=None, sgd=None):
+    def initialize(self, get_examples, pipeline=None):
         pass
 
     def __call__(self, doc):
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 594fc92ad..e183e0a75 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -107,7 +107,7 @@ def validate_init_settings(
     *,
     section: Optional[str] = None,
     name: str = "",
-    exclude: Iterable[str] = ("get_examples", "nlp", "pipeline", "sgd"),
+    exclude: Iterable[str] = ("get_examples", "nlp", "pipeline"),
 ) -> Dict[str, Any]:
     """Validate initialization settings against the expected arguments in
     the method signature. Will parse values if possible (e.g. int to string)
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index b42732d48..498fd890c 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -55,7 +55,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
             msg.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I)
+        nlp.initialize(lambda: train_corpus(nlp), settings=I)
         msg.good("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized

From 612bbf85abb26eacf9b6d41399b1a761f8732f15 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 12:14:47 +0200
Subject: [PATCH 236/516] Update initialize.py

---
 spacy/training/initialize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 498fd890c..b42732d48 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -55,7 +55,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
             msg.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.initialize(lambda: train_corpus(nlp), settings=I)
+        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I)
         msg.good("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized

From f171903139732ccbe514819da8e0d28f819c5256 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 12:20:26 +0200
Subject: [PATCH 237/516] Clean up sgd and pipeline -> nlp

---
 spacy/pipeline/entity_linker.py      | 17 ++++-------------
 spacy/pipeline/morphologizer.pyx     |  7 ++-----
 spacy/pipeline/multitask.pyx         |  4 ++--
 spacy/pipeline/pipe.pyx              |  9 +++------
 spacy/pipeline/sentencizer.pyx       |  2 +-
 spacy/pipeline/senter.pyx            |  7 ++-----
 spacy/pipeline/tagger.pyx            |  7 ++-----
 spacy/pipeline/textcat.py            |  9 +++------
 spacy/pipeline/tok2vec.py            | 12 +++---------
 spacy/pipeline/transition_parser.pyx | 10 +++++-----
 spacy/schemas.py                     |  2 +-
 11 files changed, 28 insertions(+), 58 deletions(-)

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 0f33378b4..b67a15d32 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,5 +1,5 @@
 from itertools import islice
-from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple
+from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List
 from pathlib import Path
 import srsly
 import random
@@ -144,20 +144,14 @@ class EntityLinker(Pipe):
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
-        pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
-        sgd: Optional[Optimizer] = None,
-    ) -> Optimizer:
+        nlp: Optional[Language] = None,
+    ):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.
 
         DOCS: https://nightly.spacy.io/api/entitylinker#initialize
         """
@@ -174,9 +168,6 @@ class EntityLinker(Pipe):
         self.model.initialize(
             X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
         )
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
 
     def update(
         self,
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 580b6b831..9b28a7ca1 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -129,16 +129,13 @@ class Morphologizer(Tagger):
             self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         return 1
 
-    def initialize(self, get_examples, *, pipeline=None):
+    def initialize(self, get_examples, *, nlp=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.
 
         DOCS: https://nightly.spacy.io/api/morphologizer#initialize
         """
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
index d03fd3ae8..ba351f16e 100644
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
     def set_annotations(self, docs, dep_ids):
         pass
 
-    def initialize(self, get_examples, pipeline=None):
+    def initialize(self, get_examples, nlp=None):
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
             raise ValueError(err)
@@ -174,7 +174,7 @@ class ClozeMultitask(Pipe):
     def set_annotations(self, docs, dep_ids):
         pass
 
-    def initialize(self, get_examples, pipeline=None):
+    def initialize(self, get_examples, nlp=None):
         self.model.initialize()  # TODO: fix initialization by defining X and Y
         X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
         self.model.output_layer.initialize(X)
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 08015e60e..b8961f307 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -183,7 +183,7 @@ cdef class Pipe:
         """
         return util.create_default_optimizer()
 
-    def initialize(self, get_examples, *, pipeline=None):
+    def initialize(self, get_examples, *, nlp=None):
         """Initialize the pipe for training, using data examples if available.
         This method needs to be implemented by each Pipe component,
         ensuring the internal model (if available) is initialized properly
@@ -191,14 +191,11 @@ cdef class Pipe:
 
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.
 
         DOCS: https://nightly.spacy.io/api/pipe#initialize
         """
-        raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
+        raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name))
 
     def _ensure_examples(self, get_examples):
         if get_examples is None or not hasattr(get_examples, "__call__"):
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 3cd480d20..13fcd15e2 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -58,7 +58,7 @@ class Sentencizer(Pipe):
         else:
             self.punct_chars = set(self.default_punct_chars)
 
-    def initialize(self, get_examples, pipeline=None):
+    def initialize(self, get_examples, nlp=None):
         pass
 
     def __call__(self, doc):
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 76767712f..ec635de5c 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -124,16 +124,13 @@ class SentenceRecognizer(Tagger):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores
 
-    def initialize(self, get_examples, *, pipeline=None):
+    def initialize(self, get_examples, *, nlp=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        RETURNS: None
+        nlp (Language): The current nlp object the component is part of.
 
         DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
         """
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index ecf93600e..3d5aca14e 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -256,16 +256,13 @@ class Tagger(Pipe):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores
 
-    def initialize(self, get_examples, *, pipeline=None):
+    def initialize(self, get_examples, *, nlp=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects..
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.
 
         DOCS: https://nightly.spacy.io/api/tagger#initialize
         """
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 67ee38217..ea058ad31 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -338,17 +338,14 @@ class TextCategorizer(Pipe):
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
-        pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None
-    ) -> Optimizer:
+        nlp: Optional[Language] = None,
+    ):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
         """
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 7c8bbf5e5..89f9df757 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -1,4 +1,4 @@
-from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
+from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List
 from thinc.api import Model, set_dropout_rate, Optimizer, Config
 from itertools import islice
 
@@ -207,20 +207,14 @@ class Tok2Vec(Pipe):
         self,
         get_examples: Callable[[], Iterable[Example]],
         *,
-        pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
-        sgd: Optional[Optimizer] = None,
+        nlp: Optional[Language] = None,
     ):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
-        pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-            components that this component is part of. Corresponds to
-            nlp.pipeline.
-        sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-            create_optimizer if it doesn't exist.
-        RETURNS (thinc.api.Optimizer): The optimizer.
+        nlp (Language): The current nlp object the component is part of.
 
         DOCS: https://nightly.spacy.io/api/tok2vec#initialize
         """
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 413ea968c..c250d2522 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -405,7 +405,7 @@ cdef class Parser(Pipe):
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
 
-    def initialize(self, get_examples, pipeline=None, settings=None):
+    def initialize(self, get_examples, nlp=None):
         self._ensure_examples(get_examples)
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
         if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
@@ -425,8 +425,8 @@ cdef class Parser(Pipe):
         # make sure we resize so we have an appropriate upper layer
         self._resize()
         doc_sample = []
-        if pipeline is not None:
-            for name, component in pipeline:
+        if nlp is not None:
+            for name, component in nlp.pipeline:
                 if component is self:
                     break
                 if hasattr(component, "pipe"):
@@ -438,8 +438,8 @@ cdef class Parser(Pipe):
                 doc_sample.append(example.predicted)
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         self.model.initialize(doc_sample)
-        if pipeline is not None:
-            self.init_multitask_objectives(get_examples, pipeline)
+        if nlp is not None:
+            self.init_multitask_objectives(get_examples, nlp.pipeline)
 
     def to_disk(self, path, exclude=tuple()):
         serializers = {
diff --git a/spacy/schemas.py b/spacy/schemas.py
index e183e0a75..0b2eeba68 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -107,7 +107,7 @@ def validate_init_settings(
     *,
     section: Optional[str] = None,
     name: str = "",
-    exclude: Iterable[str] = ("get_examples", "nlp", "pipeline"),
+    exclude: Iterable[str] = ("get_examples", "nlp"),
 ) -> Dict[str, Any]:
     """Validate initialization settings against the expected arguments in
     the method signature. Will parse values if possible (e.g. int to string)

From adca08a12fcd011df2d94e4701dfd193a6cbb5ea Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 12:21:52 +0200
Subject: [PATCH 238/516] Pass nlp forward

---
 spacy/language.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 6c0a8394d..8ef2f1d61 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1218,11 +1218,7 @@ class Language:
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name
                 )
-                proc.initialize(
-                    get_examples,
-                    pipeline=self.pipeline,
-                    **p_settings,
-                )
+                proc.initialize(get_examples, nlp=self, **p_settings)
         self._link_components()
         if sgd is not None:
             self._optimizer = sgd

From 1d80b3dc1b23ffb2e2659d637fa073f7aebb9012 Mon Sep 17 00:00:00 2001
From: walterhenry <55140654+walterhenry@users.noreply.github.com>
Date: Tue, 29 Sep 2020 12:39:10 +0200
Subject: [PATCH 239/516] Proofreading

Finished with the API docs and started on the Usage docs, but only Embeddings & Transformers so far
---
 website/docs/usage/embeddings-transformers.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index b00760e62..e3a8ae448 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -41,8 +41,8 @@ transformers is that word vectors model **lexical types**, rather than _tokens_.
 If you have a list of terms with no context around them, a transformer model
 like BERT can't really help you. BERT is designed to understand language **in
 context**, which isn't what you have. A word vectors table will be a much better
-fit for your task. However, if you do have words in context — whole sentences or
-paragraphs of running text — word vectors will only provide a very rough
+fit for your task. However, if you do have words in context – whole sentences or
+paragraphs of running text – word vectors will only provide a very rough
 approximation of what the text is about.
 
 Word vectors are also very computationally efficient, as they map a word to a
@@ -256,7 +256,7 @@ for doc in nlp.pipe(["some text", "some other text"]):
 ```
 
 You can also customize how the [`Transformer`](/api/transformer) component sets
-annotations onto the [`Doc`](/api/doc), by specifying a custom
+annotations onto the [`Doc`](/api/doc) by specifying a custom
 `set_extra_annotations` function. This callback will be called with the raw
 input and output data for the whole batch, along with the batch of `Doc`
 objects, allowing you to implement whatever you need. The annotation setter is
@@ -675,7 +675,7 @@ given you a 10% error reduction, pretraining with spaCy might give you another
 
 The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific
 subnetwork** within one of your components, and add additional layers to build a
-network for a temporary task, that forces the model to learn something about
+network for a temporary task that forces the model to learn something about
 sentence structure and word cooccurrence statistics. Pretraining produces a
 **binary weights file** that can be loaded back in at the start of training. The
 weights file specifies an initial set of weights. Training then proceeds as

From 591038b1a4eac783506bee845a308f3991e39548 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 12:54:52 +0200
Subject: [PATCH 240/516] Add test

---
 spacy/tests/pipeline/test_initialize.py | 42 +++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 spacy/tests/pipeline/test_initialize.py

diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py
new file mode 100644
index 000000000..974556b1c
--- /dev/null
+++ b/spacy/tests/pipeline/test_initialize.py
@@ -0,0 +1,42 @@
+import pytest
+from spacy.language import Language
+from spacy.lang.en import English
+from spacy.training import Example
+from thinc.api import ConfigValidationError
+from pydantic import StrictBool
+
+
+def test_initialize_arguments():
+    name = "test_initialize_arguments"
+
+    class Component:
+        def __init__(self):
+            ...
+
+        def initialize(
+            self, get_examples, nlp, custom1: str, custom2: StrictBool = False
+        ):
+            ...
+
+    Language.factory(name, func=lambda nlp, name: Component())
+
+    nlp = English()
+    example = Example.from_dict(nlp("x"), {})
+    get_examples = lambda: [example]
+    nlp.add_pipe(name)
+    # The settings here will typically come from the [initialize] block
+    with pytest.raises(ConfigValidationError) as e:
+        # Empty settings, no required custom1 argument
+        nlp.initialize(get_examples, settings={"components": {name: {}}})
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ("custom1",)
+    assert errors[0]["type"] == "value_error.missing"
+    with pytest.raises(ConfigValidationError) as e:
+        # Wrong type
+        settings = {"components": {name: {"custom1": "x", "custom2": 1}}}
+        nlp.initialize(get_examples, settings=settings)
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ("custom2",)
+    assert errors[0]["type"] == "value_error.strictbool"

From 6a04e5adeae6387074d890988c957e7e2c4f9a34 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 29 Sep 2020 14:49:55 +0200
Subject: [PATCH 241/516] encoding UTF8 (#6161)

---
 spacy/cli/project/document.py  | 2 +-
 website/docs/usage/training.md | 2 +-
 website/setup/jinja_to_js.py   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py
index d0265029a..811b7c746 100644
--- a/spacy/cli/project/document.py
+++ b/spacy/cli/project/document.py
@@ -114,6 +114,6 @@ def project_document(
                 content = f"{before}{content}{after}"
             else:
                 msg.warn("Replacing existing file")
-        with output_file.open("w") as f:
+        with output_file.open("w", encoding="utf8") as f:
             f.write(content)
         msg.good("Saved project documentation", output_file)
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index eb02b135a..97992287b 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -700,7 +700,7 @@ from pathlib import Path
 @spacy.registry.loggers("my_custom_logger.v1")
 def custom_logger(log_path):
     def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
-        with Path(log_path).open("w") as file_:
+        with Path(log_path).open("w", encoding="utf8") as file_:
             file_.write("step\\t")
             file_.write("score\\t")
             for pipe in nlp.pipe_names:
diff --git a/website/setup/jinja_to_js.py b/website/setup/jinja_to_js.py
index 114d0e172..e2eca7ffb 100644
--- a/website/setup/jinja_to_js.py
+++ b/website/setup/jinja_to_js.py
@@ -1256,7 +1256,7 @@ def main(template_path, output=None, data_path=None):
     data_str = f"export const DATA = {data}"
     result = compiler.get_output()
     if output is not None:
-        with output.open("w") as f:
+        with output.open("w", encoding="utf8") as f:
             f.write(f"{header}\n{result}\n{data_str}")
         print(f"Updated {output.parts[-1]}")
     else:

From 56f8bc73ef1880ded2abe9da5a5ff26ca6babc20 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 15:23:34 +0200
Subject: [PATCH 242/516] Add more tests

---
 spacy/tests/pipeline/test_initialize.py | 32 +++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py
index 974556b1c..1d2e7e5a3 100644
--- a/spacy/tests/pipeline/test_initialize.py
+++ b/spacy/tests/pipeline/test_initialize.py
@@ -9,34 +9,58 @@ from pydantic import StrictBool
 def test_initialize_arguments():
     name = "test_initialize_arguments"
 
+    class CustomTokenizer:
+        def __init__(self, tokenizer):
+            self.tokenizer = tokenizer
+            self.from_initialize = None
+
+        def __call__(self, text):
+            return self.tokenizer(text)
+
+        def initialize(self, get_examples, nlp, custom: int):
+            self.from_initialize = custom
+
     class Component:
         def __init__(self):
-            ...
+            self.from_initialize = None
 
         def initialize(
             self, get_examples, nlp, custom1: str, custom2: StrictBool = False
         ):
-            ...
+            self.from_initialize = (custom1, custom2)
 
     Language.factory(name, func=lambda nlp, name: Component())
 
     nlp = English()
+    nlp.tokenizer = CustomTokenizer(nlp.tokenizer)
     example = Example.from_dict(nlp("x"), {})
     get_examples = lambda: [example]
     nlp.add_pipe(name)
     # The settings here will typically come from the [initialize] block
     with pytest.raises(ConfigValidationError) as e:
         # Empty settings, no required custom1 argument
-        nlp.initialize(get_examples, settings={"components": {name: {}}})
+        settings = {"tokenizer": {"custom": 1}, "components": {name: {}}}
+        nlp.initialize(get_examples, settings=settings)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom1",)
     assert errors[0]["type"] == "value_error.missing"
     with pytest.raises(ConfigValidationError) as e:
         # Wrong type
-        settings = {"components": {name: {"custom1": "x", "custom2": 1}}}
+        settings = {
+            "tokenizer": {"custom": 1},
+            "components": {name: {"custom1": "x", "custom2": 1}},
+        }
         nlp.initialize(get_examples, settings=settings)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom2",)
     assert errors[0]["type"] == "value_error.strictbool"
+    settings = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x", "custom2": True}},
+    }
+    nlp.initialize(get_examples, settings=settings)
+    assert nlp.tokenizer.from_initialize == 1
+    pipe = nlp.get_pipe(name)
+    assert pipe.from_initialize == ("x", True)

From 63d15981377aa207591380ba6eaf816c7696830c Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 16:05:48 +0200
Subject: [PATCH 243/516] Simplify config use in Language.initialize

---
 spacy/language.py                       | 25 +++++++++++-----
 spacy/tests/pipeline/test_initialize.py | 25 +++++++++-------
 spacy/training/initialize.py            | 38 ++++++++++---------------
 spacy/training/loop.py                  | 20 +++++++------
 4 files changed, 59 insertions(+), 49 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 8ef2f1d61..8d546529d 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -18,6 +18,7 @@ from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .training import Example, validate_examples
+from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
 from .util import registry, SimpleFrozenList
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
@@ -27,7 +28,8 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp, validate_init_settings
+from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
+from .schemas import ConfigSchemaPretrain, validate_init_settings
 from .git_info import GIT_VERSION
 from . import util
 from . import about
@@ -1161,7 +1163,6 @@ class Language:
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
-        settings: Dict[str, Dict[str, Any]] = SimpleFrozenDict(),
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.
@@ -1198,28 +1199,38 @@ class Language:
         if not valid_examples:
             err = Errors.E930.format(name="Language", obj="empty list")
             raise ValueError(err)
+        # Make sure the config is interpolated so we can resolve subsections
+        config = self.config.interpolate()
+        # These are the settings provided in the [initialize] block in the config
+        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        V = I["vocab"]
+        init_vocab(
+            self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
+        )
+        pretrain_cfg = config.get("pretraining")
+        if pretrain_cfg:
+            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
+            init_tok2vec(self, P, V)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
-        self._optimizer = sgd
         if hasattr(self.tokenizer, "initialize"):
-            tok_settings = settings.get("tokenizer", {})
             tok_settings = validate_init_settings(
                 self.tokenizer.initialize,
-                tok_settings,
+                I["tokenizer"],
                 section="tokenizer",
                 name="tokenizer",
             )
             self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
-        proc_settings = settings.get("components", {})
         for name, proc in self.pipeline:
             if hasattr(proc, "initialize"):
-                p_settings = proc_settings.get(name, {})
+                p_settings = I["components"].get(name, {})
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name
                 )
                 proc.initialize(get_examples, nlp=self, **p_settings)
         self._link_components()
+        self._optimizer = sgd
         if sgd is not None:
             self._optimizer = sgd
         elif self._optimizer is None:
diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py
index 1d2e7e5a3..b6c22ee09 100644
--- a/spacy/tests/pipeline/test_initialize.py
+++ b/spacy/tests/pipeline/test_initialize.py
@@ -37,30 +37,33 @@ def test_initialize_arguments():
     get_examples = lambda: [example]
     nlp.add_pipe(name)
     # The settings here will typically come from the [initialize] block
+    init_cfg = {"tokenizer": {"custom": 1}, "components": {name: {}}}
+    nlp.config["initialize"].update(init_cfg)
     with pytest.raises(ConfigValidationError) as e:
-        # Empty settings, no required custom1 argument
-        settings = {"tokenizer": {"custom": 1}, "components": {name: {}}}
-        nlp.initialize(get_examples, settings=settings)
+        # Empty config for component, no required custom1 argument
+        nlp.initialize(get_examples)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom1",)
     assert errors[0]["type"] == "value_error.missing"
+    init_cfg = {
+        "tokenizer": {"custom": 1},
+        "components": {name: {"custom1": "x", "custom2": 1}},
+    }
+    nlp.config["initialize"].update(init_cfg)
     with pytest.raises(ConfigValidationError) as e:
-        # Wrong type
-        settings = {
-            "tokenizer": {"custom": 1},
-            "components": {name: {"custom1": "x", "custom2": 1}},
-        }
-        nlp.initialize(get_examples, settings=settings)
+        # Wrong type of custom 2
+        nlp.initialize(get_examples)
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ("custom2",)
     assert errors[0]["type"] == "value_error.strictbool"
-    settings = {
+    init_cfg = {
         "tokenizer": {"custom": 1},
         "components": {name: {"custom1": "x", "custom2": True}},
     }
-    nlp.initialize(get_examples, settings=settings)
+    nlp.config["initialize"].update(init_cfg)
+    nlp.initialize(get_examples)
     assert nlp.tokenizer.from_initialize == 1
     pipe = nlp.get_pipe(name)
     assert pipe.from_initialize == ("x", True)
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index b42732d48..9517c6c48 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,4 +1,4 @@
-from typing import Union, Dict, Optional, Any, List, IO
+from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
@@ -11,16 +11,18 @@ import zipfile
 import tqdm
 
 from .loop import create_before_to_disk_callback
-from ..language import Language
 from ..lookups import Lookups
 from ..vectors import Vectors
 from ..errors import Errors
-from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
+from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
 from ..util import registry, load_model_from_config, resolve_dot_names
 from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
 
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
 
-def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language:
+
+def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Language":
     msg = Printer(no_print=silent)
     raw_config = config
     config = raw_config.interpolate()
@@ -38,11 +40,6 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
-    I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    V = I["vocab"]
-    init_vocab(
-        nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent
-    )
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
@@ -55,16 +52,11 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
             msg.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer, settings=I)
+        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
         msg.good("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
-    if "pretraining" in config and config["pretraining"]:
-        P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        loaded = add_tok2vec_weights(nlp, P, V)
-        if loaded and P["component"]:
-            msg.good(f"Loaded pretrained weights into component '{P['component']}'")
     nlp = before_to_disk(nlp)
     return nlp
 
@@ -75,13 +67,13 @@ def must_reinitialize(train_config: Config, init_config: Config) -> bool:
 
 
 def init_vocab(
-    nlp: Language,
+    nlp: "Language",
     *,
     data: Optional[Path] = None,
     lookups: Optional[Lookups] = None,
     vectors: Optional[str] = None,
     silent: bool = True,
-) -> Language:
+) -> "Language":
     msg = Printer(no_print=silent)
     if lookups:
         nlp.vocab.lookups = lookups
@@ -109,7 +101,7 @@ def init_vocab(
 
 
 def load_vectors_into_model(
-    nlp: Language, name: Union[str, Path], *, add_strings: bool = True
+    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
 ) -> None:
     """Load word vectors from an installed model or path into a model instance."""
     try:
@@ -132,8 +124,8 @@ def load_vectors_into_model(
                 nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
 
 
-def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+def init_tok2vec(
+    nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 ) -> bool:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
@@ -171,7 +163,7 @@ def add_tok2vec_weights(
     return False
 
 
-def verify_config(nlp: Language) -> None:
+def verify_config(nlp: "Language") -> None:
     """Perform additional checks based on the config, loaded nlp object and training data."""
     # TODO: maybe we should validate based on the actual components, the list
     # in config["nlp"]["pipeline"] instead?
@@ -182,7 +174,7 @@ def verify_config(nlp: Language) -> None:
             verify_textcat_config(nlp, pipe_config)
 
 
-def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
+def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
     # if 'positive_label' is provided: double check whether it's in the data and
     # the task is binary
     if pipe_config.get("positive_label"):
@@ -211,7 +203,7 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
 
 
 def convert_vectors(
-    nlp: Language,
+    nlp: "Language",
     vectors_loc: Optional[Path],
     *,
     truncate: int,
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 5153be66c..41e6464e0 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -1,5 +1,5 @@
 from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
-from typing import Optional
+from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
@@ -9,13 +9,15 @@ from wasabi import Printer
 
 from .example import Example
 from ..schemas import ConfigSchemaTraining
-from ..language import Language
 from ..errors import Errors
 from ..util import resolve_dot_names, registry
 
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
+
 
 def train(
-    nlp: Language,
+    nlp: "Language",
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,
@@ -110,7 +112,7 @@ def train(
 
 
 def train_while_improving(
-    nlp: Language,
+    nlp: "Language",
     optimizer: Optimizer,
     train_data,
     evaluate,
@@ -233,7 +235,7 @@ def subdivide_batch(batch, accumulate_gradient):
 
 
 def create_evaluation_callback(
-    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
+    nlp: "Language", dev_corpus: Callable, weights: Dict[str, float]
 ) -> Callable[[], Tuple[float, Dict[str, float]]]:
     weights = {key: value for key, value in weights.items() if value is not None}
 
@@ -277,7 +279,7 @@ def create_train_batches(
 
 
 def update_meta(
-    training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
+    training: Union[Dict[str, Any], Config], nlp: "Language", info: Dict[str, Any]
 ) -> None:
     nlp.meta["performance"] = {}
     for metric in training["score_weights"]:
@@ -288,8 +290,10 @@ def update_meta(
 
 
 def create_before_to_disk_callback(
-    callback: Optional[Callable[[Language], Language]]
-) -> Callable[[Language], Language]:
+    callback: Optional[Callable[["Language"], "Language"]]
+) -> Callable[["Language"], "Language"]:
+    from ..language import Language  # noqa: F811
+
     def before_to_disk(nlp: Language) -> Language:
         if not callback:
             return nlp

From aa2a6882d064924165ee697cac0e431a92e64eb2 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 16:08:39 +0200
Subject: [PATCH 244/516] Fix logging

---
 spacy/cli/init_pipeline.py   | 11 ++++++++---
 spacy/training/initialize.py | 35 ++++++++++++++---------------------
 2 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 0e9de0eb4..ac1cdb7be 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -19,13 +19,18 @@ def init_vectors_cli(
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
+    """Convert word vectors for use with spaCy. Will export an nlp object that
+    you can use in the [initialize.vocab] block of your config to initialize
+    a model with vectors.
+    """
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
-    convert_vectors(
-        nlp, vectors_loc, truncate=truncate, prune=prune, name=name, silent=False
-    )
+    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
+    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)
     msg.good(
         "Saved nlp object with vectors to output directory. You can now use the "
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 9517c6c48..ef0938321 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -2,7 +2,6 @@ from typing import Union, Dict, Optional, Any, List, IO, TYPE_CHECKING
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
-from wasabi import Printer
 import srsly
 import numpy
 import tarfile
@@ -14,16 +13,15 @@ from .loop import create_before_to_disk_callback
 from ..lookups import Lookups
 from ..vectors import Vectors
 from ..errors import Errors
-from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
-from ..util import registry, load_model_from_config, resolve_dot_names
+from ..schemas import ConfigSchemaTraining
+from ..util import registry, load_model_from_config, resolve_dot_names, logger
 from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
 
 
-def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Language":
-    msg = Printer(no_print=silent)
+def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     raw_config = config
     config = raw_config.interpolate()
     if config["training"]["seed"] is not None:
@@ -34,7 +32,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Lang
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     nlp = load_model_from_config(raw_config, auto_fill=True)
-    msg.good("Set up nlp object from config")
+    logger.info("Set up nlp object from config")
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
@@ -46,14 +44,14 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> "Lang
     frozen_components = T["frozen_components"]
     # Sourced components that require resume_training
     resume_components = [p for p in sourced_components if p not in frozen_components]
-    msg.info(f"Pipeline: {nlp.pipe_names}")
+    logger.info(f"Pipeline: {nlp.pipe_names}")
     if resume_components:
         with nlp.select_pipes(enable=resume_components):
-            msg.info(f"Resuming training for: {resume_components}")
+            logger.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-        msg.good("Initialized pipeline components")
+        logger.good("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
@@ -72,12 +70,10 @@ def init_vocab(
     data: Optional[Path] = None,
     lookups: Optional[Lookups] = None,
     vectors: Optional[str] = None,
-    silent: bool = True,
 ) -> "Language":
-    msg = Printer(no_print=silent)
     if lookups:
         nlp.vocab.lookups = lookups
-        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
+        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
     data_path = ensure_path(data)
     if data_path is not None:
         lex_attrs = srsly.read_jsonl(data_path)
@@ -93,11 +89,11 @@ def init_vocab(
         else:
             oov_prob = DEFAULT_OOV_PROB
         nlp.vocab.cfg.update({"oov_prob": oov_prob})
-        msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    msg.good("Created vocabulary")
+        logger.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    logger.good("Created vocabulary")
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
-        msg.good(f"Added vectors: {vectors}")
+        logger.good(f"Added vectors: {vectors}")
 
 
 def load_vectors_into_model(
@@ -209,9 +205,7 @@ def convert_vectors(
     truncate: int,
     prune: int,
     name: Optional[str] = None,
-    silent: bool = True,
 ) -> None:
-    msg = Printer(no_print=silent)
     vectors_loc = ensure_path(vectors_loc)
     if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
         nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@@ -220,9 +214,9 @@ def convert_vectors(
                 nlp.vocab.vectors.add(lex.orth, row=lex.rank)
     else:
         if vectors_loc:
-            with msg.loading(f"Reading vectors from {vectors_loc}"):
-                vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
-            msg.good(f"Loaded vectors from {vectors_loc}")
+            logger.info(f"Reading vectors from {vectors_loc}")
+            vectors_data, vector_keys = read_vectors(vectors_loc, truncate)
+            logger.info(f"Loaded vectors from {vectors_loc}")
         else:
             vectors_data, vector_keys = (None, None)
         if vector_keys is not None:
@@ -239,7 +233,6 @@ def convert_vectors(
     nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
     if prune >= 1:
         nlp.vocab.prune_vectors(prune)
-    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
 
 
 def read_vectors(vectors_loc: Path, truncate_vectors: int):

From 58c8d4b414e61ecd612d44521216ff3e8fa9affa Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 16:22:13 +0200
Subject: [PATCH 245/516] Add label_data property to pipeline

---
 spacy/pipeline/morphologizer.pyx     |  7 ++++++-
 spacy/pipeline/pipe.pyx              | 15 +++++++++++++++
 spacy/pipeline/senter.pyx            |  4 ++++
 spacy/pipeline/tagger.pyx            | 10 ++++++++++
 spacy/pipeline/textcat.py            | 15 +++++++++++++++
 spacy/pipeline/transition_parser.pyx |  4 ++++
 6 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 9b28a7ca1..c9798a638 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True, binding=True
-from typing import Optional
+from typing import Optional, Union, Dict
 import srsly
 from thinc.api import SequenceCategoricalCrossentropy, Model, Config
 from itertools import islice
@@ -101,6 +101,11 @@ class Morphologizer(Tagger):
         """RETURNS (Tuple[str]): The labels currently added to the component."""
         return tuple(self.cfg["labels_morph"].keys())
 
+    @property
+    def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
+        """RETURNS (Dict): A dictionary with all labels data."""
+        return {"morph": self.cfg["labels_morph"], "pos": self.cfg["labels_pos"]}
+
     def add_label(self, label):
         """Add a new label to the pipe.
 
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index b8961f307..481430a2c 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -1,4 +1,5 @@
 # cython: infer_types=True, profile=True
+from typing import Optional, Tuple
 import srsly
 from thinc.api import set_dropout_rate, Model
 
@@ -32,6 +33,20 @@ cdef class Pipe:
         self.name = name
         self.cfg = dict(cfg)
 
+    @property
+    def labels(self) -> Optional[Tuple[str]]:
+        if "labels" in self.cfg:
+            return tuple(self.cfg["labels"])
+        else:
+            return None
+    
+    @property
+    def label_data(self):
+        """Optional JSON-serializable data that would be sufficient to recreate
+        the label set if provided to the `pipe.initialize()` method.
+        """
+        return None
+
     def __call__(self, Doc doc):
         """Apply the pipe to one document. The document is modified in place,
         and returned. This usually happens under the hood when the nlp object
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index ec635de5c..65c17c771 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger):
         # are 0
         return tuple(["I", "S"])
 
+    @property
+    def label_data(self):
+        return self.labels
+
     def set_annotations(self, docs, batch_tag_ids):
         """Modify a batch of documents, using pre-computed scores.
 
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 3d5aca14e..253b6f08c 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -90,6 +90,16 @@ class Tagger(Pipe):
         """
         return tuple(self.cfg["labels"])
 
+    @property
+    def label_data(self):
+        """Data about the labels currently added to the component.
+
+        RETURNS (Dict): The labels data.
+
+        DOCS: https://nightly.spacy.io/api/tagger#labels
+        """
+        return tuple(self.cfg["labels"])
+
     def __call__(self, doc):
         """Apply the pipe to a Doc.
 
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index ea058ad31..63b040333 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -154,8 +154,23 @@ class TextCategorizer(Pipe):
 
     @labels.setter
     def labels(self, value: List[str]) -> None:
+        # TODO: This really shouldn't be here. I had a look and I added it when
+        # I added the labels property, but it's pretty nasty to have this, and
+        # will lead to problems.
         self.cfg["labels"] = tuple(value)
 
+    @property
+    def label_data(self) -> Dict:
+        """RETURNS (Dict): Information about the component's labels.
+
+        DOCS: https://nightly.spacy.io/api/textcategorizer#labels
+        """
+        return {
+            "labels": self.labels,
+            "positive": self.cfg["positive_label"],
+            "threshold": self.cfg["threshold"]
+        }
+
     def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index c250d2522..9f165cb15 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -95,6 +95,10 @@ cdef class Parser(Pipe):
         class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
         return class_names
 
+    @property
+    def label_data(self):
+        return self.moves.labels
+
     @property
     def tok2vec(self):
         """Return the embedding and convolutional layer of the model."""

From 45daf5c9fe1c0b7a42f0af2dccf2f7ef2faeded9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 16:22:37 +0200
Subject: [PATCH 246/516] Add init labels command

---
 spacy/cli/__init__.py    |  1 +
 spacy/cli/init_labels.py | 43 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 spacy/cli/init_labels.py

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 7368bcef3..c5f60adfc 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -16,6 +16,7 @@ from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_pipeline import init_pipeline_cli  # noqa: F401
+from .init_labels import init_labels_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
diff --git a/spacy/cli/init_labels.py b/spacy/cli/init_labels.py
new file mode 100644
index 000000000..29cb23072
--- /dev/null
+++ b/spacy/cli/init_labels.py
@@ -0,0 +1,43 @@
+from typing import Optional
+import logging
+from pathlib import Path
+from wasabi import msg
+import typer
+import srsly
+
+from .. import util
+from ..training.initialize import init_nlp, convert_vectors
+from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code, setup_gpu
+
+
+@init_cli.command(
+    "labels",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def init_labels_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    output_path: Path = Arg(..., help="Output directory for the labels"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    if not output_path.exists():
+        output_path.mkdir()
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
+    for name, component in nlp.pipeline:
+        if getattr(component, "label_data", None) is not None:
+            srsly.write_json(output_path / f"{name}.json", component.label_data)
+            msg.good(f"Saving {name} labels to {output_path}/{name}.json")
+        else:
+            msg.info(f"No labels found for {name}")

From 978ab54a84262682f75b8bb0aa196cd4f93976aa Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 16:22:41 +0200
Subject: [PATCH 247/516] Fix logging

---
 spacy/training/initialize.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index ef0938321..862c76448 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -51,7 +51,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-        logger.good("Initialized pipeline components")
+        logger.info("Initialized pipeline components")
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
@@ -89,11 +89,11 @@ def init_vocab(
         else:
             oov_prob = DEFAULT_OOV_PROB
         nlp.vocab.cfg.update({"oov_prob": oov_prob})
-        logger.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    logger.good("Created vocabulary")
+        logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    logger.info("Created vocabulary")
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
-        logger.good(f"Added vectors: {vectors}")
+        logger.info(f"Added vectors: {vectors}")
 
 
 def load_vectors_into_model(

From 3f0d61232dbc8b45845463b27d766cdbb813af5e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 16:47:44 +0200
Subject: [PATCH 248/516] Remove outdated arg from train

---
 spacy/cli/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index b0bd48ddb..79c3d893c 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -56,7 +56,7 @@ def train_cli(
 def init_pipeline(
     config: Config, output_path: Optional[Path], *, use_gpu: int = -1
 ) -> Language:
-    init_kwargs = {"use_gpu": use_gpu, "silent": False}
+    init_kwargs = {"use_gpu": use_gpu}
     if output_path is not None:
         init_path = output_path / "model-initial"
         if not init_path.exists():

From e70a00fa76f50e6c49ece17b20d7c5246609ed35 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 16:47:54 +0200
Subject: [PATCH 249/516] Remove unnecessary warning from train

---
 spacy/cli/train.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 79c3d893c..7bbfe9315 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -74,12 +74,6 @@ def init_pipeline(
             else:
                 msg.good(f"Loaded initialized pipeline from {init_path}")
         return nlp
-    msg.warn(
-        "Not saving initialized model: no output directory specified. "
-        "To speed up training, spaCy can save the initialized nlp object with "
-        "the vocabulary, vectors and label scheme. To take advantage of this, "
-        "provide an output directory."
-    )
     return init_nlp(config, **init_kwargs)
 
 

From fd594cfb9b80e47614d72020a50b4a12b925bc01 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 16:47:55 +0200
Subject: [PATCH 250/516] Tighten up format

---
 spacy/default_config.cfg     | 24 +++++++++++-------------
 spacy/language.py            |  5 ++---
 spacy/schemas.py             | 16 ++++------------
 spacy/training/initialize.py |  8 ++++----
 4 files changed, 21 insertions(+), 32 deletions(-)

diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 86293fd40..c0fd27c3c 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -1,8 +1,9 @@
 [paths]
 train = ""
 dev = ""
-init_tok2vec = null
+vectors = null
 vocab_data = null
+init_tok2vec = null
 
 [system]
 seed = 0
@@ -96,19 +97,16 @@ eps = 1e-8
 learn_rate = 0.001
 
 # The 'initialize' step is run before training or pretraining. Components and
-# the tokenizer can each define their own prepare step, giving them a chance
-# to gather resources like lookup-tables, build label sets, construct vocabularies,
-# etc. After 'prepare' is finished, the result will be saved out to disk, which
-# will then be read in at the start of training. You can call the prepare step
-# separately with the `spacy prepare` command, or you can let the train script
-# do it for you.
+# the tokenizer can each define their own arguments via their .initialize
+# methods that are populated by the config. This lets them gather resources like
+# lookup tables and build label sets, construct vocabularies, etc.
 [initialize]
-tokenizer = {}
-components = {}
-
-[initialize.vocab]
-data = ${paths.vocab_data}
+vocab_data = ${paths.vocab_data}
 lookups = null
-vectors = null
+vectors = ${paths.vectors}
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
+# Arguments passed to the tokenizer's initialize method
+tokenizer = {}
+# Arguments passed to the initialize methods of the components (keyed by component name)
+components = {}
diff --git a/spacy/language.py b/spacy/language.py
index 8d546529d..7450db720 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1203,14 +1203,13 @@ class Language:
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-        V = I["vocab"]
         init_vocab(
-            self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
+            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"],
         )
         pretrain_cfg = config.get("pretraining")
         if pretrain_cfg:
             P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
-            init_tok2vec(self, P, V)
+            init_tok2vec(self, P, I)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 0b2eeba68..658eeb574 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -357,12 +357,14 @@ class ConfigSchemaPretrain(BaseModel):
         arbitrary_types_allowed = True
 
 
-class ConfigSchemaInitVocab(BaseModel):
+class ConfigSchemaInit(BaseModel):
     # fmt: off
-    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
+    vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
     lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
+    tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
+    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for Pipe.initialize methods of pipeline components, keyed by component")
     # fmt: on
 
     class Config:
@@ -370,16 +372,6 @@ class ConfigSchemaInitVocab(BaseModel):
         arbitrary_types_allowed = True
 
 
-class ConfigSchemaInit(BaseModel):
-    vocab: ConfigSchemaInitVocab
-    tokenizer: Any
-    components: Dict[StrictStr, Any]
-
-    class Config:
-        extra = "forbid"
-        arbitrary_types_allowed = True
-
-
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 862c76448..aa5edde5d 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -121,15 +121,15 @@ def load_vectors_into_model(
 
 
 def init_tok2vec(
-    nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+    nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
 ) -> bool:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
-    V = vocab_config
+    I = init_config
     weights_data = None
-    init_tok2vec = ensure_path(V["init_tok2vec"])
+    init_tok2vec = ensure_path(I["init_tok2vec"])
     if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not V["vectors"]:
+        if P["objective"].get("type") == "vectors" and not I["vectors"]:
             err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"'
             errors = [{"loc": ["initialize", "vocab"], "msg": err}]
             raise ConfigValidationError(config=nlp.config, errors=errors)

From 10847c7f4e61ac08533efb1dee1eacea9e939d71 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 16:48:07 +0200
Subject: [PATCH 251/516] Fix arg

---
 spacy/cli/init_labels.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/init_labels.py b/spacy/cli/init_labels.py
index 29cb23072..e675901a3 100644
--- a/spacy/cli/init_labels.py
+++ b/spacy/cli/init_labels.py
@@ -34,7 +34,7 @@ def init_labels_cli(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
     with show_validation_error(hint_fill=False):
-        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
+        nlp = init_nlp(config, use_gpu=use_gpu)
     for name, component in nlp.pipeline:
         if getattr(component, "label_data", None) is not None:
             srsly.write_json(output_path / f"{name}.json", component.label_data)

From ca726080592abfae0f9045c61f4e7d15b1188f9d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 16:48:33 +0200
Subject: [PATCH 252/516] Fix language

---
 spacy/language.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 8d546529d..ec2e42a35 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1181,24 +1181,9 @@ class Language:
             )
             doc = Doc(self.vocab, words=["x", "y", "z"])
             get_examples = lambda: [Example.from_dict(doc, {})]
-        # Populate vocab
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(name="Language", obj=type(get_examples))
             raise ValueError(err)
-        valid_examples = False
-        for example in get_examples():
-            if not isinstance(example, Example):
-                err = Errors.E978.format(
-                    name="Language.initialize", types=type(example)
-                )
-                raise ValueError(err)
-            else:
-                valid_examples = True
-            for word in [t.text for t in example.reference]:
-                _ = self.vocab[word]  # noqa: F841
-        if not valid_examples:
-            err = Errors.E930.format(name="Language", obj="empty list")
-            raise ValueError(err)
         # Make sure the config is interpolated so we can resolve subsections
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config

From 99bff78617388d077b4113c602becfa09a23c344 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 16:48:44 +0200
Subject: [PATCH 253/516] Use labels in tagger

---
 spacy/pipeline/tagger.pyx | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 253b6f08c..f4e8ecebd 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -266,7 +266,7 @@ class Tagger(Pipe):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores
 
-    def initialize(self, get_examples, *, nlp=None):
+    def initialize(self, get_examples, *, nlp=None, labels=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
@@ -277,15 +277,19 @@ class Tagger(Pipe):
         DOCS: https://nightly.spacy.io/api/tagger#initialize
         """
         self._ensure_examples(get_examples)
+        if labels is not None:
+            for tag in labels:
+                self.add_label(tag)
+        else:
+            tags = set()
+            for example in get_examples():
+                for token in example.y:
+                    if token.tag_:
+                        tags.add(token.tag_)
+            for tag in sorted(tags):
+                self.add_label(tag)
         doc_sample = []
         label_sample = []
-        tags = set()
-        for example in get_examples():
-            for token in example.y:
-                if token.tag_:
-                    tags.add(token.tag_)
-        for tag in sorted(tags):
-            self.add_label(tag)
         for example in islice(get_examples(), 10):
             doc_sample.append(example.x)
             gold_tags = example.get_aligned("TAG", as_string=True)

From 1fd002180e98d830da26f4593ce6bc7a838e2131 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 16:48:56 +0200
Subject: [PATCH 254/516] Allow more components to use labels

---
 spacy/pipeline/textcat.py            | 25 ++++++++++++-------------
 spacy/pipeline/transition_parser.pyx | 15 +++++++++------
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 63b040333..d6dafa3f5 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -160,16 +160,12 @@ class TextCategorizer(Pipe):
         self.cfg["labels"] = tuple(value)
 
     @property
-    def label_data(self) -> Dict:
-        """RETURNS (Dict): Information about the component's labels.
+    def label_data(self) -> List[str]:
+        """RETURNS (List[str]): Information about the component's labels.
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#labels
         """
-        return {
-            "labels": self.labels,
-            "positive": self.cfg["positive_label"],
-            "threshold": self.cfg["threshold"]
-        }
+        return self.labels
 
     def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
@@ -354,6 +350,7 @@ class TextCategorizer(Pipe):
         get_examples: Callable[[], Iterable[Example]],
         *,
         nlp: Optional[Language] = None,
+        labels: Optional[Dict] = None
     ):
         """Initialize the pipe for training, using a representative set
         of data examples.
@@ -365,12 +362,14 @@ class TextCategorizer(Pipe):
         DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
         """
         self._ensure_examples(get_examples)
-        subbatch = []  # Select a subbatch of examples to initialize the model
-        for example in islice(get_examples(), 10):
-            if len(subbatch) < 2:
-                subbatch.append(example)
-            for cat in example.y.cats:
-                self.add_label(cat)
+        if labels is None:
+            for example in get_examples():
+                for cat in example.y.cats:
+                    self.add_label(cat)
+        else:
+            for label in labels:
+                self.add_label(label)
+        subbatch = list(islice(get_examples(), 10))
         doc_sample = [eg.reference for eg in subbatch]
         label_sample, _ = self._examples_to_truth(subbatch)
         self._require_labels()
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 9f165cb15..11e0e5af8 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -409,17 +409,20 @@ cdef class Parser(Pipe):
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
 
-    def initialize(self, get_examples, nlp=None):
+    def initialize(self, get_examples, *, nlp=None, labels=None):
         self._ensure_examples(get_examples)
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
         if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
             langs = ", ".join(util.LEXEME_NORM_LANGS)
             util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
-        actions = self.moves.get_actions(
-            examples=get_examples(),
-            min_freq=self.cfg['min_action_freq'],
-            learn_tokens=self.cfg["learn_tokens"]
-        )
+        if labels is not None:
+            actions = dict(labels)
+        else:
+            actions = self.moves.get_actions(
+                examples=get_examples(),
+                min_freq=self.cfg['min_action_freq'],
+                learn_tokens=self.cfg["learn_tokens"]
+            )
         for action, labels in self.moves.labels.items():
             actions.setdefault(action, {})
             for label, freq in labels.items():

From 43fc7a316d415a0e5ef9fecc02112502928c9fd3 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 16:49:09 +0200
Subject: [PATCH 255/516] Add registry function for reading jsonl

---
 spacy/training/corpus.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 12bda486e..bd431ab83 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -30,6 +30,11 @@ def create_jsonl_reader(
     return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
 
 
+@util.registry.readers("srsly.read_json.v1")
+def _read_json(loc: Path):
+    return srsly.read_json(loc)
+
+
 def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
     path = util.ensure_path(path)
     if not path.is_dir() and path.parts[-1].endswith(file_type):

From 4ad26f4a2f5ea73eff3b179c3a234d4713c1da6c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 16:54:53 +0200
Subject: [PATCH 256/516] Move reader

---
 spacy/training/corpus.py | 5 -----
 spacy/util.py            | 3 +++
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index bd431ab83..12bda486e 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -30,11 +30,6 @@ def create_jsonl_reader(
     return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
 
 
-@util.registry.readers("srsly.read_json.v1")
-def _read_json(loc: Path):
-    return srsly.read_json(loc)
-
-
 def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
     path = util.ensure_path(path)
     if not path.is_dir() and path.parts[-1].endswith(file_type):
diff --git a/spacy/util.py b/spacy/util.py
index 67c577927..948c4ab11 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -97,6 +97,9 @@ class registry(thinc.registry):
     models = catalogue.create("spacy", "models", entry_points=True)
     cli = catalogue.create("spacy", "cli", entry_points=True)
 
+# We want json loading in the registry, so manually register srsly.read_json.
+registry.readers("srsly.read_json.v0", srsly.read_json)
+
 
 class SimpleFrozenDict(dict):
     """Simplified implementation of a frozen dict, mainly used as default

From e4f535a964da107a5fd558acf3c975388f1dce75 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 16:55:07 +0200
Subject: [PATCH 257/516] Fix Pipe.labels

---
 spacy/pipeline/pipe.pyx | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 481430a2c..49d0bea35 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -35,10 +35,7 @@ cdef class Pipe:
 
     @property
     def labels(self) -> Optional[Tuple[str]]:
-        if "labels" in self.cfg:
-            return tuple(self.cfg["labels"])
-        else:
-            return None
+        return []
     
     @property
     def label_data(self):

From d7469283c5bcce87c7cdc088a161e909d5e87f19 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 16:59:21 +0200
Subject: [PATCH 258/516] Update docs [ci skip]

---
 website/docs/api/data-formats.md       | 35 +++++++++++++++++++++++---
 website/docs/api/dependencyparser.md   | 20 +++++++--------
 website/docs/api/entitylinker.md       | 22 ++++++++--------
 website/docs/api/entityrecognizer.md   | 20 +++++++--------
 website/docs/api/language.md           | 19 +++++++++-----
 website/docs/api/morphologizer.md      | 21 +++++++---------
 website/docs/api/pipe.md               | 20 +++++++--------
 website/docs/api/sentencerecognizer.md | 20 +++++++--------
 website/docs/api/tagger.md             | 20 +++++++--------
 website/docs/api/textcategorizer.md    | 20 +++++++--------
 website/docs/api/tok2vec.md            |  9 +++----
 website/docs/api/transformer.md        |  9 +++----
 12 files changed, 126 insertions(+), 109 deletions(-)

diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 6ff3bfd0d..0d2c78598 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -190,8 +190,6 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                 |
 | `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                           |
 | `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                                         |
-| `init_tok2vec`        | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                                                           |
-| `lookups`             | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~                                                                                  |
 | `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                                                           |
 | `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                                                 |
 | `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                   |
@@ -200,7 +198,6 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                             |
 | `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                           |
 | `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                                                       |
-| `vectors`             | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~                                                                          |
 
 ### pretraining {#config-pretraining tag="section,optional"}
 
@@ -220,6 +217,38 @@ used when you run [`spacy pretrain`](/api/cli#pretrain).
 | `component`    | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~                              |
 | `layer`        | The layer to pretrain. If empty, the whole component model will be used. ~~str~~                       |
 
+### initialize {#config-initialize tag="section"}
+
+This config block lets you define resources for **initializing the pipeline**.
+It's used by [`Language.initialize`](/api/language#initialize), which is
+typically called right before training (but not at runtime). The section lets
+you specify local file paths or custom functions to load data resources from,
+without requiring them at runtime when you load the trained pipeline back in.
+
+> #### Example
+>
+> ```ini
+> [initialize]
+> vectors = "/path/to/vectors_nlp"
+> init_tok2vec = "/path/to/pretrain.bin"
+>
+> [initialize.components]
+>
+> [initialize.components.my_component]
+> data_path = "/path/to/component_data"
+> ```
+
+<!-- TODO: -->
+
+| Name           | Description                                                                                                                                                                                                                                                                                                                                                                                                    |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `components`   | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~                                                                      |
+| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                                                                                                                                                                                                                                |
+| `lookups`      | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~                                                                                                                                                                                                                                                       |
+| `tokenizer`    | Additional arguments passed to the `initialize` method of the specified tokenizer. Can be used for languages like Chinese that depend on dictionaries or trained models for tokenization. If type annotations are available on the method, the config will be validated against them. The `initialize` method will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Any]~~ |
+| `vectors`      | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~                                                                                                                                                                                                                                               |
+| `vocab_data`   | Path to JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) to initialize vocabulary. ~~Optional[str]~~                                                                                                                                                                                                                                                                                           |
+
 ## Training data {#training}
 
 ### Binary training format {#binary-training new="3"}
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index c7c41f2a1..7c56ce84e 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -142,14 +142,14 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
 
 ## DependencyParser.initialize {#initialize tag="method"}
 
-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
 
 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
 
@@ -161,16 +161,14 @@ This method was previously called `begin_training`.
 >
 > ```python
 > parser = nlp.add_pipe("parser")
-> optimizer = parser.initialize(lambda: [], pipeline=nlp.pipeline)
+> parser.initialize(lambda: [], nlp=nlp)
 > ```
 
 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
 
 ## DependencyParser.predict {#predict tag="method"}
 
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 1dbe78703..b104fb69a 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -141,14 +141,14 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 
 ## EntityLinker.initialize {#initialize tag="method"}
 
-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
 
 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
 
@@ -159,17 +159,15 @@ This method was previously called `begin_training`.
 > #### Example
 >
 > ```python
-> entity_linker = nlp.add_pipe("entity_linker", last=True)
-> optimizer = entity_linker.initialize(lambda: [], pipeline=nlp.pipeline)
+> entity_linker = nlp.add_pipe("entity_linker")
+> entity_linker.initialize(lambda: [], nlp=nlp)
 > ```
 
 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
 
 ## EntityLinker.predict {#predict tag="method"}
 
diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 2c32ff753..b930660d9 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -131,14 +131,14 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
 
 ## EntityRecognizer.initialize {#initialize tag="method"}
 
-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
 
 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
 
@@ -150,16 +150,14 @@ This method was previously called `begin_training`.
 >
 > ```python
 > ner = nlp.add_pipe("ner")
-> optimizer = ner.initialize(lambda: [], pipeline=nlp.pipeline)
+> ner.initialize(lambda: [], nlp=nlp)
 > ```
 
 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
 
 ## EntityRecognizer.predict {#predict tag="method"}
 
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 11631502c..8dbb0d821 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -204,12 +204,19 @@ more efficient than processing texts one-by-one.
 ## Language.initialize {#initialize tag="method"}
 
 Initialize the pipeline for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples can either be the full training data or a representative sample. They
-are used to **initialize the models** of trainable pipeline components and are
-passed each component's [`initialize`](/api/pipe#initialize) method, if
-available. Initialization includes validating the network,
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the
+settings defined in the [`[initialize]`](/api/data-formats#config-initialize)
+config block to set up the vocabulary, load in vectors and tok2vec weights and
+pass optional arguments to the `initialize` methods implemented by pipeline
+components or the tokenizer. This method is typically called automatically when
+you run [`spacy train`](/api/cli#train).
+
+`get_examples` should be a function that returns an iterable of
+[`Example`](/api/example) objects. The data examples can either be the full
+training data or a representative sample. They are used to **initialize the
+models** of trainable pipeline components and are passed each component's
+[`initialize`](/api/pipe#initialize) method, if available. Initialization
+includes validating the network,
 [inferring missing shapes](/usage/layers-architectures#thinc-shape-inference)
 and setting up the label scheme based on the data.
 
diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md
index 4f00a09ef..68e096ab7 100644
--- a/website/docs/api/morphologizer.md
+++ b/website/docs/api/morphologizer.md
@@ -119,30 +119,27 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
 
 ## Morphologizer.initialize {#initialize tag="method"}
 
-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
 
 > #### Example
 >
 > ```python
 > morphologizer = nlp.add_pipe("morphologizer")
-> nlp.pipeline.append(morphologizer)
-> optimizer = morphologizer.initialize(lambda: [], pipeline=nlp.pipeline)
+> morphologizer.initialize(lambda: [], nlp=nlp)
 > ```
 
 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
 
 ## Morphologizer.predict {#predict tag="method"}
 
diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index 17752ed5e..385ad7ec9 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -100,14 +100,14 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
 
 ## Pipe.initialize {#initialize tag="method"}
 
-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
 
 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
 
@@ -119,16 +119,14 @@ This method was previously called `begin_training`.
 >
 > ```python
 > pipe = nlp.add_pipe("your_custom_pipe")
-> optimizer = pipe.initialize(lambda: [], pipeline=nlp.pipeline)
+> pipe.initialize(lambda: [], pipeline=nlp.pipeline)
 > ```
 
 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
 
 ## Pipe.predict {#predict tag="method"}
 
diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md
index d81725343..ac7008465 100644
--- a/website/docs/api/sentencerecognizer.md
+++ b/website/docs/api/sentencerecognizer.md
@@ -116,29 +116,27 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
 
 ## SentenceRecognizer.initialize {#initialize tag="method"}
 
-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
 
 > #### Example
 >
 > ```python
 > senter = nlp.add_pipe("senter")
-> optimizer = senter.initialize(lambda: [], pipeline=nlp.pipeline)
+> senter.initialize(lambda: [], nlp=nlp)
 > ```
 
 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
 
 ## SentenceRecognizer.predict {#predict tag="method"}
 
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md
index 6ca554f49..ff9763e61 100644
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@@ -114,14 +114,14 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
 
 ## Tagger.initialize {#initialize tag="method"}
 
-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
 
 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
 
@@ -133,16 +133,14 @@ This method was previously called `begin_training`.
 >
 > ```python
 > tagger = nlp.add_pipe("tagger")
-> optimizer = tagger.initialize(lambda: [], pipeline=nlp.pipeline)
+> tagger.initialize(lambda: [], nlp=nlp)
 > ```
 
 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
 
 ## Tagger.predict {#predict tag="method"}
 
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index 4c99d6984..6db960ea0 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -127,14 +127,14 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
 
 ## TextCategorizer.initialize {#initialize tag="method"}
 
-Initialize the component for training and return an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
-function that returns an iterable of [`Example`](/api/example) objects. The data
-examples are used to **initialize the model** of the component and can either be
-the full training data or a representative sample. Initialization includes
-validating the network,
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. The data examples are
+used to **initialize the model** of the component and can either be the full
+training data or a representative sample. Initialization includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
 
 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
 
@@ -146,16 +146,14 @@ This method was previously called `begin_training`.
 >
 > ```python
 > textcat = nlp.add_pipe("textcat")
-> optimizer = textcat.initialize(lambda: [], pipeline=nlp.pipeline)
+> textcat.initialize(lambda: [], nlp=nlp)
 > ```
 
 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
 
 ## TextCategorizer.predict {#predict tag="method"}
 
diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md
index 8269ad7cf..fa6e6c689 100644
--- a/website/docs/api/tok2vec.md
+++ b/website/docs/api/tok2vec.md
@@ -132,22 +132,21 @@ examples are used to **initialize the model** of the component and can either be
 the full training data or a representative sample. Initialization includes
 validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
 
 > #### Example
 >
 > ```python
 > tok2vec = nlp.add_pipe("tok2vec")
-> optimizer = tok2vec.initialize(lambda: [], pipeline=nlp.pipeline)
+> tok2vec.initialize(lambda: [], nlp=nlp)
 > ```
 
 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
 
 ## Tok2Vec.predict {#predict tag="method"}
 
diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md
index 712214fec..938574f2e 100644
--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@@ -167,22 +167,21 @@ examples are used to **initialize the model** of the component and can either be
 the full training data or a representative sample. Initialization includes
 validating the network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data.
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
 
 > #### Example
 >
 > ```python
 > trf = nlp.add_pipe("transformer")
-> optimizer = trf.initialize(lambda: [], pipeline=nlp.pipeline)
+> trf.initialize(lambda: [], nlp=nlp)
 > ```
 
 | Name           | Description                                                                                                                           |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
-| `pipeline`     | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~             |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                         |
-| **RETURNS**    | The optimizer. ~~Optimizer~~                                                                                                          |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
 
 ## Transformer.predict {#predict tag="method"}
 

From f2352eb701a3fc968b0a20ba2e143ccd995347c3 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 17:00:40 +0200
Subject: [PATCH 259/516] Test with default value

---
 spacy/tests/pipeline/test_initialize.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py
index b6c22ee09..c9b514770 100644
--- a/spacy/tests/pipeline/test_initialize.py
+++ b/spacy/tests/pipeline/test_initialize.py
@@ -60,10 +60,10 @@ def test_initialize_arguments():
     assert errors[0]["type"] == "value_error.strictbool"
     init_cfg = {
         "tokenizer": {"custom": 1},
-        "components": {name: {"custom1": "x", "custom2": True}},
+        "components": {name: {"custom1": "x"}},
     }
     nlp.config["initialize"].update(init_cfg)
     nlp.initialize(get_examples)
     assert nlp.tokenizer.from_initialize == 1
     pipe = nlp.get_pipe(name)
-    assert pipe.from_initialize == ("x", True)
+    assert pipe.from_initialize == ("x", False)

From 534e1ef49889a3da771e273e14552078490c92b2 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 17:02:55 +0200
Subject: [PATCH 260/516] Fix template

---
 spacy/cli/templates/quickstart_training.jinja | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 5e990611e..adad72995 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -314,8 +314,6 @@ compound = 1.001
 {% endif %}
 
 [initialize]
-
-[initialize.vocab]
 {% if use_transformer or optimize == "efficiency" or not word_vectors -%}
 vectors = null
 {% else -%}

From 9353a82076f4babd60b4d150b5ff9b0632eae5f8 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 18:07:48 +0200
Subject: [PATCH 261/516] Auto-format

---
 spacy/util.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/util.py b/spacy/util.py
index 948c4ab11..1e0a8e7d4 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -97,6 +97,7 @@ class registry(thinc.registry):
     models = catalogue.create("spacy", "models", entry_points=True)
     cli = catalogue.create("spacy", "cli", entry_points=True)
 
+
 # We want json loading in the registry, so manually register srsly.read_json.
 registry.readers("srsly.read_json.v0", srsly.read_json)
 

From dba26186efa4eb437e0f4f50d8dba395027164ec Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 18:08:02 +0200
Subject: [PATCH 262/516] Handle None default args in Cython methods

---
 spacy/schemas.py |  8 ++++++--
 spacy/util.py    | 18 ++++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/spacy/schemas.py b/spacy/schemas.py
index 658eeb574..555a505d7 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -11,6 +11,7 @@ import inspect
 
 from .attrs import NAMES
 from .lookups import Lookups
+from .util import is_cython_func
 
 if TYPE_CHECKING:
     # This lets us add type hints for mypy etc. without causing circular imports
@@ -93,8 +94,11 @@ def get_arg_model(
             continue
         # If no annotation is specified assume it's anything
         annotation = param.annotation if param.annotation != param.empty else Any
-        # If no default value is specified assume that it's required
-        default = param.default if param.default != param.empty else ...
+        # If no default value is specified assume that it's required. Cython
+        # functions/methods will have param.empty for default value None so we
+        # need to treat them differently
+        default_empty = None if is_cython_func(func) else ...
+        default = param.default if param.default != param.empty else default_empty
         sig_args[param.name] = (annotation, default)
     is_strict = strict and not has_variable
     sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra
diff --git a/spacy/util.py b/spacy/util.py
index 1e0a8e7d4..98c2a4083 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1310,3 +1310,21 @@ def minibatch(items, size):
         if len(batch) == 0:
             break
         yield list(batch)
+
+
+def is_cython_func(func: Callable) -> bool:
+    """Slightly hacky check for whether a callable is implemented in Cython.
+    Can be used to implement slightly different behaviors, especially around
+    inspecting and parameter annotations.
+
+    func (Callable): The callable to check.
+    RETURNS (bool): Whether the callable is Cython (probably).
+    """
+    attr = "__reduce_cython__"
+    if hasattr(func, attr):  # function or class instance
+        return True
+    # https://stackoverflow.com/a/55767059
+    if hasattr(func, "__qualname__") and hasattr(func, "__module__"):  # method
+        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
+        return hasattr(cls_func, attr)
+    return False

From 71a0ee274a1f40c06f1c2ad077eb006aa42ac702 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 18:09:33 +0200
Subject: [PATCH 263/516] Move init labels to init pipeline module

---
 spacy/cli/__init__.py      |  1 -
 spacy/cli/init_labels.py   | 43 --------------------------------------
 spacy/cli/init_pipeline.py | 33 +++++++++++++++++++++++++++++
 3 files changed, 33 insertions(+), 44 deletions(-)
 delete mode 100644 spacy/cli/init_labels.py

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index c5f60adfc..7368bcef3 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -16,7 +16,6 @@ from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_pipeline import init_pipeline_cli  # noqa: F401
-from .init_labels import init_labels_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
 from .validate import validate  # noqa: F401
 from .project.clone import project_clone  # noqa: F401
diff --git a/spacy/cli/init_labels.py b/spacy/cli/init_labels.py
deleted file mode 100644
index e675901a3..000000000
--- a/spacy/cli/init_labels.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from typing import Optional
-import logging
-from pathlib import Path
-from wasabi import msg
-import typer
-import srsly
-
-from .. import util
-from ..training.initialize import init_nlp, convert_vectors
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
-
-
-@init_cli.command(
-    "labels",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-)
-def init_labels_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    config_path: Path = Arg(..., help="Path to config file", exists=True),
-    output_path: Path = Arg(..., help="Output directory for the labels"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
-    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
-    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
-    # fmt: on
-):
-    if not output_path.exists():
-        output_path.mkdir()
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
-    overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
-    setup_gpu(use_gpu)
-    with show_validation_error(config_path):
-        config = util.load_config(config_path, overrides=overrides)
-    with show_validation_error(hint_fill=False):
-        nlp = init_nlp(config, use_gpu=use_gpu)
-    for name, component in nlp.pipeline:
-        if getattr(component, "label_data", None) is not None:
-            srsly.write_json(output_path / f"{name}.json", component.label_data)
-            msg.good(f"Saving {name} labels to {output_path}/{name}.json")
-        else:
-            msg.info(f"No labels found for {name}")
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index ac1cdb7be..43b95cec1 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -3,6 +3,7 @@ import logging
 from pathlib import Path
 from wasabi import msg
 import typer
+import srsly
 
 from .. import util
 from ..training.initialize import init_nlp, convert_vectors
@@ -64,3 +65,35 @@ def init_pipeline_cli(
         nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
     nlp.to_disk(output_path)
     msg.good(f"Saved initialized pipeline to {output_path}")
+
+
+@init_cli.command(
+    "labels",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def init_labels_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    output_path: Path = Arg(..., help="Output directory for the labels"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    if not output_path.exists():
+        output_path.mkdir()
+    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    setup_gpu(use_gpu)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides)
+    with show_validation_error(hint_fill=False):
+        nlp = init_nlp(config, use_gpu=use_gpu)
+    for name, component in nlp.pipeline:
+        if getattr(component, "label_data", None) is not None:
+            srsly.write_json(output_path / f"{name}.json", component.label_data)
+            msg.good(f"Saving {name} labels to {output_path}/{name}.json")
+        else:
+            msg.info(f"No labels found for {name}")

From 78510206537eb6a7b0afe0eccfc62a84a5f96d6e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 18:14:15 +0200
Subject: [PATCH 264/516] Update tests

---
 spacy/tests/pipeline/test_morphologizer.py | 2 +-
 spacy/tests/pipeline/test_senter.py        | 2 +-
 spacy/tests/pipeline/test_tagger.py        | 2 +-
 spacy/tests/pipeline/test_textcat.py       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index c86ee3617..5d605f4e6 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -66,7 +66,7 @@ def test_initialize_examples():
     # you shouldn't really call this more than once, but for testing it should be fine
     nlp.initialize()
     nlp.initialize(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
+    with pytest.raises(ValueError):
         nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
         nlp.initialize(get_examples=train_examples)
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 5d8a8be41..c64dfcbd6 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -40,7 +40,7 @@ def test_initialize_examples():
     # you shouldn't really call this more than once, but for testing it should be fine
     nlp.initialize()
     nlp.initialize(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
+    with pytest.raises(ValueError):
         nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
         nlp.initialize(get_examples=train_examples)
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 69a6dd414..b32925d84 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -74,7 +74,7 @@ def test_initialize_examples():
     # you shouldn't really call this more than once, but for testing it should be fine
     nlp.initialize()
     nlp.initialize(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
+    with pytest.raises(ValueError):
         nlp.initialize(get_examples=lambda: None)
     with pytest.raises(TypeError):
         nlp.initialize(get_examples=lambda: train_examples[0])
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 2870229c8..ff36bbda9 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -121,7 +121,7 @@ def test_initialize_examples():
     # you shouldn't really call this more than once, but for testing it should be fine
     nlp.initialize()
     nlp.initialize(get_examples=lambda: train_examples)
-    with pytest.raises(TypeError):
+    with pytest.raises(ValueError):
         nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
         nlp.initialize(get_examples=train_examples)

From 0b5c72fce23bdb14f5fe2473d8bdea3d07d6609d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 18:30:38 +0200
Subject: [PATCH 265/516] Fix incorrect docstrings

---
 spacy/pipeline/morphologizer.pyx     | 2 +-
 spacy/pipeline/tagger.pyx            | 7 +------
 spacy/pipeline/textcat.py            | 5 +----
 spacy/pipeline/transition_parser.pyx | 3 ++-
 4 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index c9798a638..60ad10a2b 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -103,7 +103,7 @@ class Morphologizer(Tagger):
 
     @property
     def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
-        """RETURNS (Dict): A dictionary with all labels data."""
+        """A dictionary with all labels data."""
         return {"morph": self.cfg["labels_morph"], "pos": self.cfg["labels_pos"]}
 
     def add_label(self, label):
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index f4e8ecebd..a4f9d395f 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -92,12 +92,7 @@ class Tagger(Pipe):
 
     @property
     def label_data(self):
-        """Data about the labels currently added to the component.
-
-        RETURNS (Dict): The labels data.
-
-        DOCS: https://nightly.spacy.io/api/tagger#labels
-        """
+        """Data about the labels currently added to the component."""
         return tuple(self.cfg["labels"])
 
     def __call__(self, doc):
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index d6dafa3f5..776b0a178 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -161,10 +161,7 @@ class TextCategorizer(Pipe):
 
     @property
     def label_data(self) -> List[str]:
-        """RETURNS (List[str]): Information about the component's labels.
-
-        DOCS: https://nightly.spacy.io/api/textcategorizer#labels
-        """
+        """RETURNS (List[str]): Information about the component's labels."""
         return self.labels
 
     def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 11e0e5af8..bcaa8e8d4 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -7,6 +7,7 @@ from libcpp.vector cimport vector
 from libc.string cimport memset
 from libc.stdlib cimport calloc, free
 import random
+from typing import Optional
 
 import srsly
 from thinc.api import set_dropout_rate
@@ -409,7 +410,7 @@ cdef class Parser(Pipe):
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
 
-    def initialize(self, get_examples, *, nlp=None, labels=None):
+    def initialize(self, get_examples, nlp=None, labels=None):
         self._ensure_examples(get_examples)
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
         if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:

From a4da3120b4244c2599fa2b184189df52533f6fea Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 18:33:16 +0200
Subject: [PATCH 266/516] Fix multitasks

---
 spacy/pipeline/dep_parser.pyx | 4 ++--
 spacy/pipeline/ner.pyx        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index eedb4cba9..bdef332cc 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -126,13 +126,13 @@ cdef class DependencyParser(Parser):
     def add_multitask_objective(self, mt_component):
         self._multitasks.append(mt_component)
 
-    def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+    def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
         # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
         for labeller in self._multitasks:
             labeller.model.set_dim("nO", len(self.labels))
             if labeller.model.has_ref("output_layer"):
                 labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-            labeller.initialize(get_examples, pipeline=pipeline)
+            labeller.initialize(get_examples, nlp=nlp)
 
     @property
     def labels(self):
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index effcef2e3..6482d6125 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -96,14 +96,14 @@ cdef class EntityRecognizer(Parser):
         """Register another component as a multi-task objective. Experimental."""
         self._multitasks.append(mt_component)
 
-    def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+    def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
         """Setup multi-task objective components. Experimental and internal."""
         # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
         for labeller in self._multitasks:
             labeller.model.set_dim("nO", len(self.labels))
             if labeller.model.has_ref("output_layer"):
                 labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
-            labeller.initialize(get_examples, pipeline=pipeline)
+            labeller.initialize(get_examples, nlp=nlp)
 
     @property
     def labels(self):

From 2be80379ec644c253af7d312c0a5bcad4fd3515d Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 20:38:35 +0200
Subject: [PATCH 267/516] Fix small issues, resolve_dot_names and debug model

---
 spacy/cli/debug_config.py            |  8 +++++--
 spacy/cli/debug_model.py             | 32 ++++++++++++++++++----------
 spacy/schemas.py                     | 14 ++++++------
 spacy/tests/test_util.py             |  4 ++--
 spacy/tests/training/test_readers.py | 10 ++++-----
 spacy/training/initialize.py         |  6 +++---
 spacy/training/loop.py               |  4 +---
 spacy/util.py                        | 29 ++++++-------------------
 8 files changed, 51 insertions(+), 56 deletions(-)

diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index d1dcc45b9..a6c7345f0 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -7,6 +7,8 @@ import typer
 
 from ._util import Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli
+from ..schemas import ConfigSchemaTraining
+from ..util import registry
 from .. import util
 
 
@@ -52,8 +54,10 @@ def debug_config(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
         nlp = util.load_model_from_config(config)
-        dot_names = ["training.dev_corpus", "training.train_corpus"]
-        util.resolve_dot_names(nlp.config, dot_names)
+        config = nlp.config.interpolate()
+        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
+        dot_names = [T["train_corpus"], T["dev_corpus"]]
+        util.resolve_dot_names(config, dot_names)
     msg.good("Config is valid")
     if show_vars:
         variables = get_variables(config)
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index eca85dc04..3b8ba7dae 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -2,7 +2,7 @@ from typing import Dict, Any, Optional, Iterable
 from pathlib import Path
 
 from spacy.training import Example
-from spacy.util import dot_to_object
+from spacy.util import resolve_dot_names
 from wasabi import msg
 from thinc.api import fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation, set_gpu_allocator
@@ -15,7 +15,10 @@ from ..util import registry
 from .. import util
 
 
-@debug_cli.command("model")
+@debug_cli.command(
+    "model",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
 def debug_model_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
@@ -57,15 +60,14 @@ def debug_model_cli(
         raw_config = util.load_config(
             config_path, overrides=config_overrides, interpolate=False
         )
-    config = raw_config.iterpolate()
+    config = raw_config.interpolate()
     allocator = config["training"]["gpu_allocator"]
     if use_gpu >= 0 and allocator:
         set_gpu_allocator(allocator)
     with show_validation_error(config_path):
         nlp = util.load_model_from_config(raw_config)
-        T = registry.resolve(
-            nlp.config.interpolate()["training"], schema=ConfigSchemaTraining
-        )
+        config = nlp.config.interpolate()
+        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     seed = T["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
@@ -77,11 +79,16 @@ def debug_model_cli(
             exits=1,
         )
     model = pipe.model
-    debug_model(T, nlp, model, print_settings=print_settings)
+    debug_model(config, T, nlp, model, print_settings=print_settings)
 
 
 def debug_model(
-    config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None
+    config,
+    resolved_train_config,
+    nlp,
+    model: Model,
+    *,
+    print_settings: Optional[Dict[str, Any]] = None,
 ):
     if not isinstance(model, Model):
         msg.fail(
@@ -102,13 +109,16 @@ def debug_model(
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
         try:
-            train_corpus = dot_to_object(config, config["training"]["train_corpus"])
-            nlp.initialize(lambda: train_corpus(nlp))
+            dot_names = [resolved_train_config["train_corpus"]]
+            with show_validation_error():
+                (train_corpus,) = resolve_dot_names(config, dot_names)
+                nlp.initialize(lambda: train_corpus(nlp))
             msg.info("Initialized the model with the training corpus.")
         except ValueError:
             try:
                 _set_output_dim(nO=7, model=model)
-                nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
+                with show_validation_error():
+                    nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
                 msg.info("Initialized the model with dummy data.")
             except Exception:
                 msg.fail(
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 555a505d7..d9a31c742 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -389,14 +389,12 @@ class ConfigSchema(BaseModel):
         arbitrary_types_allowed = True
 
 
-class TrainingSchema(BaseModel):
-    training: ConfigSchemaTraining
-    pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
-    corpora: Dict[str, Reader]
-
-    class Config:
-        extra = "allow"
-        arbitrary_types_allowed = True
+CONFIG_SCHEMAS = {
+    "nlp": ConfigSchemaNlp,
+    "training": ConfigSchemaTraining,
+    "pretraining": ConfigSchemaPretrain,
+    "initialize": ConfigSchemaInit,
+}
 
 
 # Project config Schema
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
index f48cfba00..f710a38eb 100644
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@@ -128,10 +128,10 @@ def test_resolve_dot_names():
         "training": {"optimizer": {"@optimizers": "Adam.v1"}},
         "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
     }
-    result = util.resolve_dot_names(config, ["foo.bar"])
+    result = util.resolve_dot_names(config, ["training.optimizer"])
     assert isinstance(result[0], Optimizer)
     with pytest.raises(ConfigValidationError) as e:
-        util.resolve_dot_names(config, ["foo.baz", "foo.bar"])
+        util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ["training", "xyz"]
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index ea39e8b90..9d82ca50a 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -39,12 +39,12 @@ def test_readers():
 
     config = Config().from_str(config_string)
     nlp = load_model_from_config(config, auto_fill=True)
-    dot_names = ["training.train_corpus", "training.dev_corpus"]
-    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
-    assert isinstance(train_corpus, Callable)
     T = registry.resolve(
         nlp.config.interpolate()["training"], schema=ConfigSchemaTraining
     )
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
+    assert isinstance(train_corpus, Callable)
     optimizer = T["optimizer"]
     # simulate a training loop
     nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
@@ -92,11 +92,11 @@ def test_cat_readers(reader, additional_config):
     config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
-    dot_names = ["training.train_corpus", "training.dev_corpus"]
-    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
     T = registry.resolve(
         nlp.config["training"].interpolate(), schema=ConfigSchemaTraining
     )
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names)
     optimizer = T["optimizer"]
     # simulate a training loop
     nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index aa5edde5d..09ac2b0ac 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -130,12 +130,12 @@ def init_tok2vec(
     init_tok2vec = ensure_path(I["init_tok2vec"])
     if init_tok2vec is not None:
         if P["objective"].get("type") == "vectors" and not I["vectors"]:
-            err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"'
-            errors = [{"loc": ["initialize", "vocab"], "msg": err}]
+            err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
+            errors = [{"loc": ["initialize"], "msg": err}]
             raise ConfigValidationError(config=nlp.config, errors=errors)
         if not init_tok2vec.exists():
             err = f"can't find pretrained tok2vec: {init_tok2vec}"
-            errors = [{"loc": ["initialize", "vocab", "init_tok2vec"], "msg": err}]
+            errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}]
             raise ConfigValidationError(config=nlp.config, errors=errors)
         with init_tok2vec.open("rb") as file_:
             weights_data = file_.read()
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 41e6464e0..e20cddd3e 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -29,9 +29,7 @@ def train(
     output_path (Path): Optional output path to save trained model to.
     use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
         before calling this function.
-    logger (Callable[[Any], Any]): Optional logger exposing the methods info,
-        error, debug and  warn. Defaults to regular spaCy logger but can be
-        swapped for CLI logger.
+    silent (bool): Whether to suppress pretty-printed output.
     RETURNS (Path / None): The path to the final exported model.
     """
     msg = Printer(no_print=silent)
diff --git a/spacy/util.py b/spacy/util.py
index 98c2a4083..2dfd00e2f 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -392,7 +392,6 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[A
     we could find the lowest part of the tree.
     """
     # TODO: include schema?
-    # TODO: clean this up and avoid duplication
     resolved = {}
     output = []
     errors = []
@@ -403,34 +402,20 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[A
             section = name.split(".")[0]
             # We want to avoid resolving the same thing twice
             if section not in resolved:
-                resolved[section] = registry.resolve(config[section])
+                if registry.is_promise(config[section]):
+                    # Otherwise we can't resolve [corpus] if it's a promise
+                    result = registry.resolve({"config": config[section]})["config"]
+                else:
+                    result = registry.resolve(config[section])
+                resolved[section] = result
             try:
                 output.append(dot_to_object(resolved, name))
             except KeyError:
                 msg = f"not a valid section reference: {name}"
                 errors.append({"loc": name.split("."), "msg": msg})
-    objects = []
-    for ref in output:
-        if not isinstance(ref, str):
-            objects.append(ref)
-            continue
-        section = ref.split(".")[0]
-        # We want to avoid resolving the same thing twice
-        if section not in resolved:
-            if registry.is_promise(config[section]):
-                # Otherwise we can't resolve [corpus] if it's a promise
-                result = registry.resolve({"config": config[section]})["config"]
-            else:
-                result = registry.resolve(config[section])
-            resolved[section] = result
-        try:
-            objects.append(dot_to_object(resolved, ref))
-        except KeyError:
-            msg = f"not a valid section reference: {name}"
-            errors.append({"loc": ref.split("."), "msg": msg})
     if errors:
         raise ConfigValidationError(config=config, errors=errors)
-    return tuple(objects)
+    return tuple(output)
 
 
 def load_model_from_init_py(

From b486389eece1984d932472353e650e14ef1849d4 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 20:48:43 +0200
Subject: [PATCH 268/516] Update website/docs/api/doc.md

---
 website/docs/api/doc.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 40fd8d531..45ecd4d8c 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -479,7 +479,7 @@ invalidated, although they may accidentally continue to work.
 Mark a span for merging. The `attrs` will be applied to the resulting token (if
 they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
 underlying lexeme (if they're context-independent lexical attributes like
-`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute name to values.
+`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values.
 
 > #### Example
 >

From 798040bc1d7073b44c349a4120bdf99a2a7dea99 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 21:08:13 +0200
Subject: [PATCH 269/516] Fix language detection

---
 spacy/language.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index ee73faed3..9591cb61d 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1518,7 +1518,7 @@ class Language:
             ).merge(config)
         if "nlp" not in config:
             raise ValueError(Errors.E985.format(config=config))
-        config_lang = config["nlp"]["lang"]
+        config_lang = config["nlp"].get("lang")
         if config_lang is not None and config_lang != cls.lang:
             raise ValueError(
                 Errors.E958.format(

From 4f3102d09cb9100d79d5e3cada8db4653a5adac5 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 21:09:10 +0200
Subject: [PATCH 270/516] Auto-format

---
 spacy/lang/zh/__init__.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index ea145ea78..752f77d11 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -17,8 +17,10 @@ from .stop_words import STOP_WORDS
 from ... import util
 
 
+# fmt: off
 _PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
 _PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
+# fmt: on
 
 DEFAULT_CONFIG = """
 [nlp]
@@ -177,7 +179,7 @@ class ChineseTokenizer(DummyTokenizer):
                     with open(tempdir / "features.pkl", "wb") as fileh:
                         pickle5.dump(features, fileh, protocol=4)
                 except ImportError as e:
-                    raise(e)
+                    raise e
                 except Exception:
                     warnings.warn(_PKUSEG_PICKLE_WARNING)
                 with open(tempdir / "features.pkl", "rb") as fileh:
@@ -261,7 +263,7 @@ class ChineseTokenizer(DummyTokenizer):
                     with open(path / "features.pkl", "wb") as fileh:
                         pickle5.dump(features, fileh, protocol=4)
                 except ImportError as e:
-                    raise(e)
+                    raise e
                 except Exception:
                     warnings.warn(_PKUSEG_PICKLE_WARNING)
 
@@ -349,7 +351,9 @@ def try_jieba_import(segmenter: str) -> None:
             raise ImportError(msg) from None
 
 
-def try_pkuseg_import(segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: str) -> None:
+def try_pkuseg_import(
+    segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: str
+) -> None:
     try:
         import pkuseg
 

From 6467a560e30052d79c3a9dd1b5649f12ddcb13f6 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 21:10:22 +0200
Subject: [PATCH 271/516] WIP: Test updating Chinese tokenizer

---
 spacy/lang/zh/__init__.py | 33 ++++++++++++++++++++-------------
 spacy/tests/conftest.py   | 13 +++++++++----
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 752f77d11..457502e21 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable
 from enum import Enum
 import tempfile
 import srsly
@@ -10,7 +10,7 @@ from ...errors import Warnings, Errors
 from ...language import Language
 from ...scorer import Scorer
 from ...tokens import Doc
-from ...training import validate_examples
+from ...training import validate_examples, Example
 from ...util import DummyTokenizer, registry
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
@@ -28,6 +28,10 @@ DEFAULT_CONFIG = """
 [nlp.tokenizer]
 @tokenizers = "spacy.zh.ChineseTokenizer"
 segmenter = "char"
+
+[initialize]
+
+[initialize.tokenizer]
 pkuseg_model = null
 pkuseg_user_dict = "default"
 """
@@ -44,18 +48,9 @@ class Segmenter(str, Enum):
 
 
 @registry.tokenizers("spacy.zh.ChineseTokenizer")
-def create_chinese_tokenizer(
-    segmenter: Segmenter = Segmenter.char,
-    pkuseg_model: Optional[str] = None,
-    pkuseg_user_dict: Optional[str] = "default",
-):
+def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
     def chinese_tokenizer_factory(nlp):
-        return ChineseTokenizer(
-            nlp,
-            segmenter=segmenter,
-            pkuseg_model=pkuseg_model,
-            pkuseg_user_dict=pkuseg_user_dict,
-        )
+        return ChineseTokenizer(nlp, segmenter=segmenter)
 
     return chinese_tokenizer_factory
 
@@ -78,6 +73,18 @@ class ChineseTokenizer(DummyTokenizer):
         self.jieba_seg = None
         self.configure_segmenter(segmenter)
 
+    def initialize(
+        self,
+        get_examples: Callable[[], Iterable[Example]],
+        *,
+        nlp: Optional[Language],
+        pkuseg_model: Optional[str] = None,
+        pkuseg_user_dict: Optional[str] = None
+    ):
+        self.pkuseg_model = pkuseg_model
+        self.pkuseg_user_dict = pkuseg_user_dict
+        self.configure_segmenter(self.segmenter)
+
     def configure_segmenter(self, segmenter: str):
         if segmenter not in Segmenter.values():
             warn_msg = Warnings.W103.format(
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 23fc5e98f..6cf019173 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -284,11 +284,16 @@ def zh_tokenizer_pkuseg():
     pytest.importorskip("pkuseg")
     pytest.importorskip("pickle5")
     config = {
-        "@tokenizers": "spacy.zh.ChineseTokenizer",
-        "segmenter": "pkuseg",
-        "pkuseg_model": "default",
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.zh.ChineseTokenizer",
+                "segmenter": "pkuseg",
+            }
+        },
+        "initialize": {"tokenizer": {"pkuseg_model": "default"}},
     }
-    nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
+    nlp = get_lang_class("zh").from_config(config)
+    nlp.initialize()
     return nlp.tokenizer
 
 

From 604be54a5cb59361b664eb3bb33bce3c624b375c Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 21:20:56 +0200
Subject: [PATCH 272/516] Support --code in evaluate CLI [ci skip]

---
 spacy/cli/evaluate.py   |  4 +++-
 website/docs/api/cli.md | 29 +++++++++++++++--------------
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 4c1eeb9e8..566820283 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -7,7 +7,7 @@ from thinc.api import fix_random_seed
 
 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu
+from ._util import app, Arg, Opt, setup_gpu, import_code
 from ..scorer import Scorer
 from .. import util
 from .. import displacy
@@ -19,6 +19,7 @@ def evaluate_cli(
     model: str = Arg(..., help="Model name or path"),
     data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
     output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
@@ -37,6 +38,7 @@ def evaluate_cli(
 
     DOCS: https://nightly.spacy.io/api/cli#evaluate
     """
+    import_code(code_path)
     evaluate(
         model,
         data_path,
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index e22b0bb2a..5c9f7e480 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -81,9 +81,9 @@ $ python -m spacy info [model] [--markdown] [--silent]
 Find all trained pipeline packages installed in the current environment and
 check whether they are compatible with the currently installed version of spaCy.
 Should be run after upgrading spaCy via `pip install -U spacy` to ensure that
-all installed packages can be used with the new version. It will show a list
-of packages and their installed versions. If any package is out of date, the
-latest compatible versions and command for updating are shown.
+all installed packages can be used with the new version. It will show a list of
+packages and their installed versions. If any package is out of date, the latest
+compatible versions and command for updating are shown.
 
 > #### Automated validation
 >
@@ -826,17 +826,18 @@ skew. To render a sample of dependency parses in a HTML file using the
 $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit]
 ```
 
-| Name                      | Description                                                                                                                                                               |
-| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model`                   | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                |
-| `data_path`               | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                 |
-| `--output`, `-o`          | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~                                                                       |
-| `--gold-preproc`, `-G`    | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                   |
-| `--gpu-id`, `-g`          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                            |
-| `--displacy-path`, `-dp`  | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~                                                |
-| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
-| `--help`, `-h`            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                |
-| **CREATES**               | Training results and optional metrics and visualizations.                                                                                                                 |
+| Name                      | Description                                                                                                                                                                          |
+| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                   | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
+| `data_path`               | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                            |
+| `--output`, `-o`          | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~                                                                                  |
+| `--code`, `-c`            | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gold-preproc`, `-G`    | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
+| `--gpu-id`, `-g`          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
+| `--displacy-path`, `-dp`  | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~                                                           |
+| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~            |
+| `--help`, `-h`            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
+| **CREATES**               | Training results and optional metrics and visualizations.                                                                                                                            |
 
 ## package {#package tag="command"}
 

From fa47f87924c1c9cfcc30ade50933488bcd62c423 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 21:39:28 +0200
Subject: [PATCH 273/516] Tidy up and auto-format

---
 spacy/cli/convert.py                          |  3 ++-
 spacy/cli/debug_data.py                       |  2 +-
 spacy/cli/project/dvc.py                      |  2 +-
 spacy/lang/en/lemmatizer.py                   |  3 +--
 spacy/lang/es/syntax_iterators.py             |  2 +-
 spacy/lang/sa/lex_attrs.py                    |  4 ++--
 spacy/lang/vi/__init__.py                     |  2 +-
 spacy/language.py                             |  2 +-
 spacy/pipe_analysis.py                        |  2 +-
 spacy/pipeline/attributeruler.py              |  7 +++---
 spacy/pipeline/lemmatizer.py                  |  2 +-
 spacy/pipeline/textcat.py                     |  2 +-
 spacy/schemas.py                              |  2 +-
 spacy/scorer.py                               |  7 ++----
 spacy/tests/doc/test_doc_api.py               |  2 +-
 spacy/tests/lang/de/test_noun_chunks.py       |  3 +--
 spacy/tests/lang/el/test_noun_chunks.py       |  3 +--
 spacy/tests/lang/en/test_noun_chunks.py       |  3 +--
 spacy/tests/lang/es/test_noun_chunks.py       |  3 +--
 spacy/tests/lang/fa/test_noun_chunks.py       |  3 +--
 spacy/tests/lang/fr/test_exceptions.py        |  4 +---
 spacy/tests/lang/fr/test_noun_chunks.py       |  3 +--
 spacy/tests/lang/id/test_noun_chunks.py       |  3 +--
 spacy/tests/lang/ja/test_tokenizer.py         |  2 +-
 spacy/tests/lang/nb/test_noun_chunks.py       |  3 +--
 spacy/tests/lang/ne/test_text.py              |  2 +-
 spacy/tests/lang/sa/test_text.py              |  2 +-
 spacy/tests/lang/sv/test_noun_chunks.py       |  3 +--
 spacy/tests/pipeline/test_entity_linker.py    |  8 +++----
 spacy/tests/pipeline/test_textcat.py          | 23 +++++++++++--------
 spacy/tests/pipeline/test_tok2vec.py          |  3 +--
 spacy/tests/regression/test_issue3501-4000.py | 11 ++++-----
 spacy/tests/regression/test_issue4001-4500.py |  7 ++----
 spacy/tests/test_cli.py                       |  2 +-
 spacy/tests/test_language.py                  |  4 +---
 spacy/tests/test_models.py                    |  2 +-
 spacy/tests/test_scorer.py                    |  2 +-
 spacy/tests/training/test_training.py         |  6 +++--
 spacy/training/augment.py                     |  6 ++---
 spacy/training/initialize.py                  |  4 ++--
 spacy/training/pretrain.py                    |  2 +-
 spacy/util.py                                 |  2 +-
 42 files changed, 71 insertions(+), 92 deletions(-)

diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 3fc530822..e4559929e 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -9,7 +9,8 @@ import sys
 from ._util import app, Arg, Opt
 from ..training import docs_to_json
 from ..tokens import DocBin
-from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
+from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
+from ..training.converters import conllu_to_docs
 
 
 # Converters are matched by file extension except for ner/iob, which are
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index c4d1069c0..b4c420660 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -27,7 +27,7 @@ BLANK_MODEL_THRESHOLD = 2000
 
 
 @debug_cli.command(
-    "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
 )
 @app.command(
     "debug-data",
diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py
index 541253234..6eedc9c20 100644
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@@ -134,7 +134,7 @@ def update_dvc_config(
 
 
 def run_dvc_commands(
-    commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {},
+    commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
 ) -> None:
     """Run a sequence of DVC commands in a subprocess, in order.
 
diff --git a/spacy/lang/en/lemmatizer.py b/spacy/lang/en/lemmatizer.py
index be389f117..2cb0f9a53 100644
--- a/spacy/lang/en/lemmatizer.py
+++ b/spacy/lang/en/lemmatizer.py
@@ -3,8 +3,7 @@ from ...tokens import Token
 
 
 class EnglishLemmatizer(Lemmatizer):
-    """English lemmatizer. Only overrides is_base_form.
-    """
+    """English lemmatizer. Only overrides is_base_form."""
 
     def is_base_form(self, token: Token) -> bool:
         """
diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py
index ad0a1b838..4dd4f99be 100644
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@@ -58,7 +58,7 @@ def noun_bounds(
                 doc, token, np_left_deps, np_right_deps, stop_deps
             )
             filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
-            if list(filter(filter_func, doc[left_bound.i : right.i],)):
+            if list(filter(filter_func, doc[left_bound.i : right.i])):
                 break
             else:
                 right_bound = right
diff --git a/spacy/lang/sa/lex_attrs.py b/spacy/lang/sa/lex_attrs.py
index f2b51650b..bdceb7ec2 100644
--- a/spacy/lang/sa/lex_attrs.py
+++ b/spacy/lang/sa/lex_attrs.py
@@ -108,8 +108,8 @@ _num_words = [
 
 def like_num(text):
     """
-   Check if text resembles a number
-   """
+    Check if text resembles a number
+    """
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
     text = text.replace(",", "").replace(".", "")
diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py
index 1db762adb..71f51eac6 100644
--- a/spacy/lang/vi/__init__.py
+++ b/spacy/lang/vi/__init__.py
@@ -17,7 +17,7 @@ use_pyvi = true
 
 
 @registry.tokenizers("spacy.vi.VietnameseTokenizer")
-def create_vietnamese_tokenizer(use_pyvi: bool = True,):
+def create_vietnamese_tokenizer(use_pyvi: bool = True):
     def vietnamese_tokenizer_factory(nlp):
         return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
 
diff --git a/spacy/language.py b/spacy/language.py
index 9591cb61d..14b9f4eb0 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1189,7 +1189,7 @@ class Language:
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
         init_vocab(
-            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"],
+            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
         )
         pretrain_cfg = config.get("pretraining")
         if pretrain_cfg:
diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py
index 008ac3384..d0362e7e1 100644
--- a/spacy/pipe_analysis.py
+++ b/spacy/pipe_analysis.py
@@ -78,7 +78,7 @@ def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
 
 
 def analyze_pipes(
-    nlp: "Language", *, keys: List[str] = DEFAULT_KEYS,
+    nlp: "Language", *, keys: List[str] = DEFAULT_KEYS
 ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
     """Print a formatted summary for the current nlp object's pipeline. Shows
     a table with the pipeline components and why they assign and require, as
diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index 4243ebcfb..f314953e9 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -82,8 +82,7 @@ class AttributeRuler(Pipe):
         matches = self.matcher(doc, allow_missing=True)
         # Sort by the attribute ID, so that later rules have precendence
         matches = [
-            (int(self.vocab.strings[m_id]), m_id, s, e)
-            for m_id, s, e in matches
+            (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches
         ]
         matches.sort()
         for attr_id, match_id, start, end in matches:
@@ -93,7 +92,7 @@ class AttributeRuler(Pipe):
             try:
                 # The index can be negative, which makes it annoying to do
                 # the boundscheck. Let Span do it instead.
-                token = span[index]
+                token = span[index]  # noqa: F841
             except IndexError:
                 # The original exception is just our conditional logic, so we
                 # raise from.
@@ -103,7 +102,7 @@ class AttributeRuler(Pipe):
                         span=[t.text for t in span],
                         index=index,
                     )
-                ) from None 
+                ) from None
             set_token_attrs(span[index], attrs)
         return doc
 
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index c30d09f62..391769604 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -67,7 +67,7 @@ class Lemmatizer(Pipe):
         return {}
 
     @classmethod
-    def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups],) -> Lookups:
+    def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
         """Load and validate lookups tables. If the provided lookups is None,
         load the default lookups tables according to the language and mode
         settings. Confirm that all required tables for the language and mode
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 776b0a178..c5b8b615b 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -347,7 +347,7 @@ class TextCategorizer(Pipe):
         get_examples: Callable[[], Iterable[Example]],
         *,
         nlp: Optional[Language] = None,
-        labels: Optional[Dict] = None
+        labels: Optional[Dict] = None,
     ):
         """Initialize the pipe for training, using a representative set
         of data examples.
diff --git a/spacy/schemas.py b/spacy/schemas.py
index d9a31c742..1125fa7da 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -132,7 +132,7 @@ def validate_init_settings(
         block = "initialize" if not section else f"initialize.{section}"
         title = f"Error validating initialization settings in [{block}]"
         raise ConfigValidationError(
-            title=title, errors=e.errors(), config=settings, parent=name,
+            title=title, errors=e.errors(), config=settings, parent=name
         ) from None
 
 
diff --git a/spacy/scorer.py b/spacy/scorer.py
index b2f97e163..db32dabae 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -32,9 +32,7 @@ class PRFScore:
 
     def __add__(self, other):
         return PRFScore(
-            tp=self.tp+other.tp,
-            fp=self.fp+other.fp,
-            fn=self.fn+other.fn
+            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
         )
 
     def score_set(self, cand: set, gold: set) -> None:
@@ -485,7 +483,7 @@ class Scorer:
                     (pred_ent.start_char, pred_ent.end_char), None
                 )
                 label = gold_span.label_
-                if not label in f_per_type:
+                if label not in f_per_type:
                     f_per_type[label] = PRFScore()
                 gold = gold_span.kb_id_
                 # only evaluating entities that overlap between gold and pred,
@@ -632,7 +630,6 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]:
             continue
         golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
         align_x2y = eg.alignment.x2y
-        preds = set()
         for pred_ent in eg.x.ents:
             if pred_ent.label_ not in scores:
                 scores[pred_ent.label_] = PRFScore()
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index e5e72fe2a..b4b853701 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -19,7 +19,7 @@ def test_doc_api_init(en_vocab):
     assert [t.is_sent_start for t in doc] == [True, False, True, False]
     # heads override sent_starts
     doc = Doc(
-        en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4,
+        en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4
     )
     assert [t.is_sent_start for t in doc] == [True, False, True, False]
 
diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py
index 0ed12d208..7b8b15b1c 100644
--- a/spacy/tests/lang/de/test_noun_chunks.py
+++ b/spacy/tests/lang/de/test_noun_chunks.py
@@ -2,8 +2,7 @@ import pytest
 
 
 def test_noun_chunks_is_parsed_de(de_tokenizer):
-    """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
-    """
+    """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed."""
     doc = de_tokenizer("Er lag auf seinem")
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py
index 2d376c612..2684a5cfb 100644
--- a/spacy/tests/lang/el/test_noun_chunks.py
+++ b/spacy/tests/lang/el/test_noun_chunks.py
@@ -2,8 +2,7 @@ import pytest
 
 
 def test_noun_chunks_is_parsed_el(el_tokenizer):
-    """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
-    """
+    """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed."""
     doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py
index 0189a26d4..540f3ed84 100644
--- a/spacy/tests/lang/en/test_noun_chunks.py
+++ b/spacy/tests/lang/en/test_noun_chunks.py
@@ -7,8 +7,7 @@ import pytest
 
 
 def test_noun_chunks_is_parsed(en_tokenizer):
-    """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
-    """
+    """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
     doc = en_tokenizer("This is a sentence")
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py
index db89fd903..e5afd81c9 100644
--- a/spacy/tests/lang/es/test_noun_chunks.py
+++ b/spacy/tests/lang/es/test_noun_chunks.py
@@ -2,8 +2,7 @@ import pytest
 
 
 def test_noun_chunks_is_parsed_es(es_tokenizer):
-    """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
-    """
+    """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed."""
     doc = es_tokenizer("en Oxford este verano")
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/fa/test_noun_chunks.py b/spacy/tests/lang/fa/test_noun_chunks.py
index 53b39d9a1..d2411e6d3 100644
--- a/spacy/tests/lang/fa/test_noun_chunks.py
+++ b/spacy/tests/lang/fa/test_noun_chunks.py
@@ -2,8 +2,7 @@ import pytest
 
 
 def test_noun_chunks_is_parsed_fa(fa_tokenizer):
-    """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.
-    """
+    """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed."""
 
     doc = fa_tokenizer("این یک جمله نمونه می باشد.")
     with pytest.raises(ValueError):
diff --git a/spacy/tests/lang/fr/test_exceptions.py b/spacy/tests/lang/fr/test_exceptions.py
index 77e72a76b..d75c653d0 100644
--- a/spacy/tests/lang/fr/test_exceptions.py
+++ b/spacy/tests/lang/fr/test_exceptions.py
@@ -36,9 +36,7 @@ def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
     assert len(tokens) == 1
 
 
-@pytest.mark.parametrize(
-    "text", ["janv.", "juill.", "Dr.", "av.", "sept."],
-)
+@pytest.mark.parametrize("text", ["janv.", "juill.", "Dr.", "av.", "sept."])
 def test_fr_tokenizer_handles_abbr(fr_tokenizer, text):
     tokens = fr_tokenizer(text)
     assert len(tokens) == 1
diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py
index d81199a3e..48ac88ead 100644
--- a/spacy/tests/lang/fr/test_noun_chunks.py
+++ b/spacy/tests/lang/fr/test_noun_chunks.py
@@ -2,8 +2,7 @@ import pytest
 
 
 def test_noun_chunks_is_parsed_fr(fr_tokenizer):
-    """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
-    """
+    """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
     doc = fr_tokenizer("trouver des travaux antérieurs")
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py
index fef1524f1..a39456581 100644
--- a/spacy/tests/lang/id/test_noun_chunks.py
+++ b/spacy/tests/lang/id/test_noun_chunks.py
@@ -2,8 +2,7 @@ import pytest
 
 
 def test_noun_chunks_is_parsed_id(id_tokenizer):
-    """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
-    """
+    """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed."""
     doc = id_tokenizer("sebelas")
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py
index e52741b70..c8c85d655 100644
--- a/spacy/tests/lang/ja/test_tokenizer.py
+++ b/spacy/tests/lang/ja/test_tokenizer.py
@@ -112,7 +112,7 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
 
 
 @pytest.mark.parametrize(
-    "text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS,
+    "text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
 )
 def test_ja_tokenizer_sub_tokens(
     ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c
diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py
index 9965fcd14..dd259f2b7 100644
--- a/spacy/tests/lang/nb/test_noun_chunks.py
+++ b/spacy/tests/lang/nb/test_noun_chunks.py
@@ -2,8 +2,7 @@ import pytest
 
 
 def test_noun_chunks_is_parsed_nb(nb_tokenizer):
-    """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
-    """
+    """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed."""
     doc = nb_tokenizer("Smørsausen brukes bl.a. til")
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/lang/ne/test_text.py b/spacy/tests/lang/ne/test_text.py
index 7dd971132..e8a6c2e98 100644
--- a/spacy/tests/lang/ne/test_text.py
+++ b/spacy/tests/lang/ne/test_text.py
@@ -8,7 +8,7 @@ def test_ne_tokenizer_handlers_long_text(ne_tokenizer):
 
 
 @pytest.mark.parametrize(
-    "text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)],
+    "text,length", [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)]
 )
 def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length):
     tokens = ne_tokenizer(text)
diff --git a/spacy/tests/lang/sa/test_text.py b/spacy/tests/lang/sa/test_text.py
index 41257a4d8..daa8d20c0 100644
--- a/spacy/tests/lang/sa/test_text.py
+++ b/spacy/tests/lang/sa/test_text.py
@@ -10,7 +10,7 @@ def test_sa_tokenizer_handles_long_text(sa_tokenizer):
 @pytest.mark.parametrize(
     "text,length",
     [
-        ("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9,),
+        ("श्री भगवानुवाच पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः।", 9),
         ("गुणान् सर्वान् स्वभावो मूर्ध्नि वर्तते ।", 6),
     ],
 )
diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py
index 3791d8021..d2410156c 100644
--- a/spacy/tests/lang/sv/test_noun_chunks.py
+++ b/spacy/tests/lang/sv/test_noun_chunks.py
@@ -3,8 +3,7 @@ from spacy.tokens import Doc
 
 
 def test_noun_chunks_is_parsed_sv(sv_tokenizer):
-    """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
-    """
+    """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed."""
     doc = sv_tokenizer("Studenten läste den bästa boken")
     with pytest.raises(ValueError):
         list(doc.noun_chunks)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index d5c8de36b..66de54c06 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -254,14 +254,12 @@ def test_vocab_serialization(nlp):
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
 
     # adding entities
-    q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
+    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
     q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
-    q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
+    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
 
     # adding aliases
-    douglas_hash = mykb.add_alias(
-        alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
-    )
+    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
     adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
 
     candidates = mykb.get_alias_candidates("adam")
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index ff36bbda9..e0a785851 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -226,6 +226,7 @@ def test_positive_class_not_binary():
     with pytest.raises(ValueError):
         verify_textcat_config(nlp, pipe_config)
 
+
 def test_textcat_evaluation():
     train_examples = []
     nlp = English()
@@ -241,15 +242,17 @@ def test_textcat_evaluation():
     pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
     train_examples.append(Example(pred2, ref2))
 
-    scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"])
-    assert scores["cats_f_per_type"]["winter"]["p"] == 1/2
-    assert scores["cats_f_per_type"]["winter"]["r"] == 1/1
+    scores = Scorer().score_cats(
+        train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]
+    )
+    assert scores["cats_f_per_type"]["winter"]["p"] == 1 / 2
+    assert scores["cats_f_per_type"]["winter"]["r"] == 1 / 1
     assert scores["cats_f_per_type"]["summer"]["p"] == 0
-    assert scores["cats_f_per_type"]["summer"]["r"] == 0/1
-    assert scores["cats_f_per_type"]["spring"]["p"] == 1/1
-    assert scores["cats_f_per_type"]["spring"]["r"] == 1/2
-    assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2
-    assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2
+    assert scores["cats_f_per_type"]["summer"]["r"] == 0 / 1
+    assert scores["cats_f_per_type"]["spring"]["p"] == 1 / 1
+    assert scores["cats_f_per_type"]["spring"]["r"] == 1 / 2
+    assert scores["cats_f_per_type"]["autumn"]["p"] == 2 / 2
+    assert scores["cats_f_per_type"]["autumn"]["r"] == 2 / 2
 
-    assert scores["cats_micro_p"] == 4/5
-    assert scores["cats_micro_r"] == 4/6
+    assert scores["cats_micro_p"] == 4 / 5
+    assert scores["cats_micro_r"] == 4 / 6
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index f84b78247..06212e351 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -73,8 +73,7 @@ def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_co
     encode_config["width"] = width
     docs = get_batch(3)
     tok2vec = build_Tok2Vec_model(
-        embed_arch(**embed_config),
-        encode_arch(**encode_config)
+        embed_arch(**embed_config), encode_arch(**encode_config)
     )
     tok2vec.initialize(docs)
     vectors, backprop = tok2vec.begin_update(docs)
diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py
index 31e441d86..0505571c2 100644
--- a/spacy/tests/regression/test_issue3501-4000.py
+++ b/spacy/tests/regression/test_issue3501-4000.py
@@ -229,9 +229,7 @@ def test_issue3611():
             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
 
             for batch in batches:
-                nlp.update(
-                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
-                )
+                nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
 
 
 def test_issue3625():
@@ -390,7 +388,7 @@ def test_issue3959():
 
 
 def test_issue3962(en_vocab):
-    """ Ensure that as_doc does not result in out-of-bound access of tokens.
+    """Ensure that as_doc does not result in out-of-bound access of tokens.
     This is achieved by setting the head to itself if it would lie out of the span otherwise."""
     # fmt: off
     words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
@@ -428,7 +426,7 @@ def test_issue3962(en_vocab):
 
 
 def test_issue3962_long(en_vocab):
-    """ Ensure that as_doc does not result in out-of-bound access of tokens.
+    """Ensure that as_doc does not result in out-of-bound access of tokens.
     This is achieved by setting the head to itself if it would lie out of the span otherwise."""
     # fmt: off
     words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
@@ -463,8 +461,7 @@ def test_issue3962_long(en_vocab):
 
 
 def test_issue3972(en_vocab):
-    """Test that the PhraseMatcher returns duplicates for duplicate match IDs.
-    """
+    """Test that the PhraseMatcher returns duplicates for duplicate match IDs."""
     matcher = PhraseMatcher(en_vocab)
     matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
     matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py
index 753cff37f..0e2579ac4 100644
--- a/spacy/tests/regression/test_issue4001-4500.py
+++ b/spacy/tests/regression/test_issue4001-4500.py
@@ -19,8 +19,7 @@ from ..util import make_tempdir
 
 
 def test_issue4002(en_vocab):
-    """Test that the PhraseMatcher can match on overwritten NORM attributes.
-    """
+    """Test that the PhraseMatcher can match on overwritten NORM attributes."""
     matcher = PhraseMatcher(en_vocab, attr="NORM")
     pattern1 = Doc(en_vocab, words=["c", "d"])
     assert [t.norm_ for t in pattern1] == ["c", "d"]
@@ -72,9 +71,7 @@ def test_issue4030():
             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
 
             for batch in batches:
-                nlp.update(
-                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
-                )
+                nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
     # processing of an empty doc should result in 0.0 for all categories
     doc = nlp("")
     assert doc.cats["offensive"] == 0.0
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index ee103208c..bba71d6da 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -7,7 +7,7 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
 from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
-from thinc.api import ConfigValidationError, Config
+from thinc.api import ConfigValidationError
 import srsly
 import os
 
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 6a487303e..917e7552e 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -290,9 +290,7 @@ def test_spacy_blank():
     assert nlp.meta["name"] == "my_custom_model"
 
 
-@pytest.mark.parametrize(
-    "value", [False, None, ["x", "y"], Language, Vocab],
-)
+@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
 def test_language_init_invalid_vocab(value):
     err_fragment = "invalid value"
     with pytest.raises(ValueError) as e:
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index 8f1bb1c3d..a123f459d 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -64,7 +64,7 @@ def get_tok2vec_kwargs():
             width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
         ),
         "encode": MaxoutWindowEncoder(
-            width=32, depth=2, maxout_pieces=2, window_size=1,
+            width=32, depth=2, maxout_pieces=2, window_size=1
         ),
     }
 
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 2825f1703..89864d579 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -137,7 +137,7 @@ def test_las_per_type(en_vocab):
     examples = []
     for input_, annot in test_las_apple:
         doc = Doc(
-            en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"],
+            en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
         )
         gold = {"heads": annot["heads"], "deps": annot["deps"]}
         example = Example.from_dict(doc, gold)
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 454f412e1..81e533a5a 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -496,8 +496,10 @@ def test_make_orth_variants(doc):
         output_file = tmpdir / "roundtrip.spacy"
         DocBin(docs=[doc]).to_disk(output_file)
         # due to randomness, test only that this runs with no errors for now
-        reader = Corpus(output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5))
-        train_examples = list(reader(nlp))
+        reader = Corpus(
+            output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)
+        )
+        list(reader(nlp))
 
 
 @pytest.mark.skip("Outdated")
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 4d487ce93..1756144e6 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -23,7 +23,7 @@ def dont_augment(nlp, example):
     yield example
 
 
-def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.0):
+def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float = 0.0):
     if random.random() >= level:
         yield example
     else:
@@ -36,14 +36,14 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.
                 nlp,
                 raw_text,
                 orig_dict["token_annotation"],
-                lower=raw_text is not None and random.random() < lower
+                lower=raw_text is not None and random.random() < lower,
             )
             doc = nlp.make_doc(variant_text)
             orig_dict["token_annotation"] = variant_token_annot
             yield example.from_dict(doc, orig_dict)
 
 
-def make_orth_variants(nlp, raw, token_dict, *, lower: bool=False):
+def make_orth_variants(nlp, raw, token_dict, *, lower: bool = False):
     orig_token_dict = copy.deepcopy(token_dict)
     orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
     ndsv = orth_variants.get("single", [])
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 09ac2b0ac..267b77f05 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -188,8 +188,8 @@ def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
 
 def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
     """RETURNS (List[str]): All sourced components in the original config,
-        e.g. {"source": "en_core_web_sm"}. If the config contains a key
-        "factory", we assume it refers to a component factory.
+    e.g. {"source": "en_core_web_sm"}. If the config contains a key
+    "factory", we assume it refers to a component factory.
     """
     return [
         name
diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
index 5e136cdf1..4f05c6344 100644
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@@ -94,7 +94,7 @@ def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
 
 
 def _resume_model(
-    model: Model, resume_path: Path, epoch_resume: int, silent: bool = True,
+    model: Model, resume_path: Path, epoch_resume: int, silent: bool = True
 ) -> None:
     msg = Printer(no_print=silent)
     msg.info(f"Resume training tok2vec from: {resume_path}")
diff --git a/spacy/util.py b/spacy/util.py
index 2584d4752..2c33d737e 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -488,7 +488,7 @@ def load_config_from_str(
     RETURNS (Config): The loaded config.
     """
     return Config(section_order=CONFIG_SECTION_ORDER).from_str(
-        text, overrides=overrides, interpolate=interpolate,
+        text, overrides=overrides, interpolate=interpolate
     )
 
 

From 43c92ec8c99dfbc873ab62f63e33e93d167649d6 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 22:01:04 +0200
Subject: [PATCH 274/516] Resolve dir for better output [ci skip]

---
 spacy/cli/init_pipeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 43b95cec1..d217ad366 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -36,7 +36,7 @@ def init_vectors_cli(
     msg.good(
         "Saved nlp object with vectors to output directory. You can now use the "
         "path to it in your config as the 'vectors' setting in [initialize.vocab].",
-        output_dir,
+        output_dir.resolve(),
     )
 
 

From da30bae8a6f74bad87bcda5671a6a703b40315b2 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 22:04:17 +0200
Subject: [PATCH 275/516] Use __pyx_vtable__ instead of __reduce_cython__

---
 spacy/util.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/util.py b/spacy/util.py
index 2c33d737e..83a18b78a 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1301,12 +1301,14 @@ def minibatch(items, size):
 def is_cython_func(func: Callable) -> bool:
     """Slightly hacky check for whether a callable is implemented in Cython.
     Can be used to implement slightly different behaviors, especially around
-    inspecting and parameter annotations.
+    inspecting and parameter annotations. Note that this will only return True
+    for actual cdef functions and methods, not regular Python functions defined
+    in Python modules.
 
     func (Callable): The callable to check.
     RETURNS (bool): Whether the callable is Cython (probably).
     """
-    attr = "__reduce_cython__"
+    attr = "__pyx_vtable__"
     if hasattr(func, attr):  # function or class instance
         return True
     # https://stackoverflow.com/a/55767059

From 0250bcf6a397130bd8d772a42c5b45d4463a461d Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 22:29:09 +0200
Subject: [PATCH 276/516] Show validation error during init

---
 spacy/cli/train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 7bbfe9315..0b3e2580e 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -48,7 +48,8 @@ def train_cli(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
-    nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
+    with show_validation_error(config_path, hint_fill=False):
+        nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
     msg.divider("Training pipeline")
     train(nlp, output_path, use_gpu=use_gpu, silent=False)
 

From 1aeef3bfbbf71210d467b616787e37eef5f6e258 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 22:33:46 +0200
Subject: [PATCH 277/516] Make corpus paths default to None and improve errors

---
 spacy/cli/templates/quickstart_training.jinja  |  4 ++--
 spacy/cli/train.py                             |  2 +-
 spacy/default_config.cfg                       |  4 ++--
 spacy/errors.py                                |  2 ++
 spacy/tests/serialize/test_serialize_config.py | 10 +++++-----
 spacy/training/corpus.py                       |  6 ++++--
 6 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index efe19d315..0e498ee20 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements.
 {%- set use_transformer = (transformer_data and hardware != "cpu") -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null
 
 [system]
 {% if use_transformer -%}
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 0b3e2580e..e8a422926 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -66,7 +66,7 @@ def init_pipeline(
             nlp.to_disk(init_path)
             msg.good(f"Saved initialized pipeline to {init_path}")
         else:
-            nlp = util.load_model(init_path)
+            nlp = util.load_model(init_path).from_config(config)
             if must_reinitialize(config, nlp.config):
                 msg.warn("Config has changed: need to re-initialize pipeline")
                 nlp = init_nlp(config, **init_kwargs)
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 222ef7d38..272dc7848 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -1,6 +1,6 @@
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null
 vectors = null
 vocab_data = null
 init_tok2vec = null
diff --git a/spacy/errors.py b/spacy/errors.py
index 09b722a7b..233ff29bd 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -477,6 +477,8 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
+            "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "
             "return the nlp object but got: {value}. Maybe you forgot to return "
             "the modified object in your function?")
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 663e76550..da048f3d6 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -14,8 +14,8 @@ from ..util import make_tempdir
 
 nlp_config_string = """
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null
 
 [corpora]
 
@@ -309,7 +309,7 @@ def test_config_interpolation():
     config = Config().from_str(nlp_config_string, interpolate=False)
     assert config["corpora"]["train"]["path"] == "${paths.train}"
     interpolated = config.interpolate()
-    assert interpolated["corpora"]["train"]["path"] == ""
+    assert interpolated["corpora"]["train"]["path"] is None
     nlp = English.from_config(config)
     assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
     # Ensure that variables are preserved in nlp config
@@ -317,10 +317,10 @@ def test_config_interpolation():
     assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
     assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
     interpolated2 = nlp.config.interpolate()
-    assert interpolated2["corpora"]["train"]["path"] == ""
+    assert interpolated2["corpora"]["train"]["path"] is None
     assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
     nlp2 = English.from_config(interpolated)
-    assert nlp2.config["corpora"]["train"]["path"] == ""
+    assert nlp2.config["corpora"]["train"]["path"] is None
     assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
 
 
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 90eb62474..e85b50cd2 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -7,7 +7,7 @@ import srsly
 from .. import util
 from .augment import dont_augment
 from .example import Example
-from ..errors import Warnings
+from ..errors import Warnings, Errors
 from ..tokens import DocBin, Doc
 from ..vocab import Vocab
 
@@ -20,12 +20,14 @@ FILE_TYPE = ".spacy"
 
 @util.registry.readers("spacy.Corpus.v1")
 def create_docbin_reader(
-    path: Path,
+    path: Optional[Path],
     gold_preproc: bool,
     max_length: int = 0,
     limit: int = 0,
     augmenter: Optional[Callable] = None,
 ) -> Callable[["Language"], Iterable[Example]]:
+    if path is None:
+        raise ValueError(Errors.E913)
     return Corpus(
         path,
         gold_preproc=gold_preproc,

From c334a7d45f5a895950a139f40ac7fb6ff24af5a0 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 22:38:39 +0200
Subject: [PATCH 278/516] Remove from_config call on loaded initial pipeline

---
 spacy/cli/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index e8a422926..0b3e2580e 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -66,7 +66,7 @@ def init_pipeline(
             nlp.to_disk(init_path)
             msg.good(f"Saved initialized pipeline to {init_path}")
         else:
-            nlp = util.load_model(init_path).from_config(config)
+            nlp = util.load_model(init_path)
             if must_reinitialize(config, nlp.config):
                 msg.warn("Config has changed: need to re-initialize pipeline")
                 nlp = init_nlp(config, **init_kwargs)

From ad6d40d028192aa8e974f8ac69ba965a2b4fa978 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 22:53:14 +0200
Subject: [PATCH 279/516] Add logging

---
 spacy/training/corpus.py     | 1 +
 spacy/training/initialize.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 90eb62474..85079f41c 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -26,6 +26,7 @@ def create_docbin_reader(
     limit: int = 0,
     augmenter: Optional[Callable] = None,
 ) -> Callable[["Language"], Iterable[Example]]:
+    util.logger.debug(f"Loading corpus from path: {path}")
     return Corpus(
         path,
         gold_preproc=gold_preproc,
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 267b77f05..e248cf314 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -94,6 +94,7 @@ def init_vocab(
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
         logger.info(f"Added vectors: {vectors}")
+    logger.info("Finished initializing nlp object")
 
 
 def load_vectors_into_model(

From 0a1ee109db2fc30d98e41b269a751b0d3dcd8168 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 22:53:18 +0200
Subject: [PATCH 280/516] Remove init from path

---
 spacy/cli/train.py | 26 +-------------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 0b3e2580e..36a9d08d9 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -49,35 +49,11 @@ def train_cli(
         config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
     with show_validation_error(config_path, hint_fill=False):
-        nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
+        nlp = init_nlp(config, use_gpu=use_gpu)
     msg.divider("Training pipeline")
     train(nlp, output_path, use_gpu=use_gpu, silent=False)
 
 
-def init_pipeline(
-    config: Config, output_path: Optional[Path], *, use_gpu: int = -1
-) -> Language:
-    init_kwargs = {"use_gpu": use_gpu}
-    if output_path is not None:
-        init_path = output_path / "model-initial"
-        if not init_path.exists():
-            msg.info(f"Initializing the pipeline in {init_path}")
-            nlp = init_nlp(config, **init_kwargs)
-            nlp.to_disk(init_path)
-            msg.good(f"Saved initialized pipeline to {init_path}")
-        else:
-            nlp = util.load_model(init_path)
-            if must_reinitialize(config, nlp.config):
-                msg.warn("Config has changed: need to re-initialize pipeline")
-                nlp = init_nlp(config, **init_kwargs)
-                nlp.to_disk(init_path)
-                msg.good(f"Re-initialized pipeline in {init_path}")
-            else:
-                msg.good(f"Loaded initialized pipeline from {init_path}")
-        return nlp
-    return init_nlp(config, **init_kwargs)
-
-
 def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
     # Make sure all files and paths exists if they are needed
     if not config_path or not config_path.exists():

From a2aa1f68820e31c9eda2243a1a230387ebef7021 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 23:02:40 +0200
Subject: [PATCH 281/516] Disable the OVL augmentation by default

---
 spacy/default_config.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 222ef7d38..a55fad097 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -40,7 +40,7 @@ limit = 0
 # This is especially useful for punctuation and case replacement, to help
 # generalize beyond corpora that don't have smart-quotes, or only have smart
 # quotes, etc.
-augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
+augmenter = null
 
 [corpora.dev]
 @readers = "spacy.Corpus.v1"

From 9bb958fd0a117342dfc73b5b784489cc14803168 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 23:07:11 +0200
Subject: [PATCH 282/516] Fix debug data [ci skip]

---
 spacy/cli/debug_data.py | 53 ++++++++++-------------------------------
 website/docs/api/cli.md |  4 ++--
 2 files changed, 14 insertions(+), 43 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index b4c420660..3dc8d262d 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -8,12 +8,12 @@ import typer
 
 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli
-from ..training import Corpus, Example
+from ..training import Example
 from ..training.initialize import get_sourced_components
 from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
-from ..util import registry
+from ..util import registry, resolve_dot_names
 from .. import util
 
 
@@ -37,8 +37,6 @@ BLANK_MODEL_THRESHOLD = 2000
 def debug_data_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
-    train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
-    dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
@@ -62,8 +60,6 @@ def debug_data_cli(
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     debug_data(
-        train_path,
-        dev_path,
         config_path,
         config_overrides=overrides,
         ignore_warnings=ignore_warnings,
@@ -74,8 +70,6 @@ def debug_data_cli(
 
 
 def debug_data(
-    train_path: Path,
-    dev_path: Path,
     config_path: Path,
     *,
     config_overrides: Dict[str, Any] = {},
@@ -88,18 +82,11 @@ def debug_data(
         no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
     )
     # Make sure all files and paths exists if they are needed
-    if not train_path.exists():
-        msg.fail("Training data not found", train_path, exits=1)
-    if not dev_path.exists():
-        msg.fail("Development data not found", dev_path, exits=1)
-    if not config_path.exists():
-        msg.fail("Config file not found", config_path, exists=1)
     with show_validation_error(config_path):
         cfg = util.load_config(config_path, overrides=config_overrides)
         nlp = util.load_model_from_config(cfg)
-        T = registry.resolve(
-            nlp.config.interpolate()["training"], schema=ConfigSchemaTraining
-        )
+        config = nlp.config.interpolate()
+        T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
     # Use original config here, not resolved version
     sourced_components = get_sourced_components(cfg)
     frozen_components = T["frozen_components"]
@@ -109,25 +96,15 @@ def debug_data(
     msg.divider("Data file validation")
 
     # Create the gold corpus to be able to better analyze data
-    loading_train_error_message = ""
-    loading_dev_error_message = ""
-    with msg.loading("Loading corpus..."):
-        try:
-            train_dataset = list(Corpus(train_path)(nlp))
-        except ValueError as e:
-            loading_train_error_message = f"Training data cannot be loaded: {e}"
-        try:
-            dev_dataset = list(Corpus(dev_path)(nlp))
-        except ValueError as e:
-            loading_dev_error_message = f"Development data cannot be loaded: {e}"
-    if loading_train_error_message or loading_dev_error_message:
-        if loading_train_error_message:
-            msg.fail(loading_train_error_message)
-        if loading_dev_error_message:
-            msg.fail(loading_dev_error_message)
-        sys.exit(1)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
+    train_dataset = list(train_corpus(nlp))
+    dev_dataset = list(dev_corpus(nlp))
     msg.good("Corpus is loadable")
 
+    nlp.initialize(lambda: train_dataset)
+    msg.good("Pipeline can be initialized with data")
+
     # Create all gold data here to avoid iterating over the train_dataset constantly
     gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
     gold_train_unpreprocessed_data = _compile_gold(
@@ -348,17 +325,11 @@ def debug_data(
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
         # TODO: does this need to be updated?
-        tag_map = nlp.vocab.morphology.tag_map
-        msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
+        msg.info(f"{len(labels)} label(s) in data")
         labels_with_counts = _format_labels(
             gold_train_data["tags"].most_common(), counts=True
         )
         msg.text(labels_with_counts, show=verbose)
-        non_tagmap = [l for l in labels if l not in tag_map]
-        if not non_tagmap:
-            msg.good(f"All labels present in tag map for language '{nlp.lang}'")
-        for label in non_tagmap:
-            msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
 
     if "parser" in factory_names:
         has_low_data_warning = False
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 5c9f7e480..66e26f11f 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -436,6 +436,7 @@ $ python -m spacy debug data [config_path] [--code] [--ignore-warnings] [--verbo
 ```
 =========================== Data format validation ===========================
 ✔ Corpus is loadable
+✔ Pipeline can be initialized with data
 
 =============================== Training stats ===============================
 Training pipeline: tagger, parser, ner
@@ -465,7 +466,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
 ✔ No entities consisting of or starting/ending with whitespace
 
 =========================== Part-of-speech Tagging ===========================
-ℹ 49 labels in data (57 labels in tag map)
+ℹ 49 labels in data
 'NN' (266331), 'IN' (227365), 'DT' (185600), 'NNP' (164404), 'JJ' (119830),
 'NNS' (110957), '.' (101482), ',' (92476), 'RB' (90090), 'PRP' (90081), 'VB'
 (74538), 'VBD' (68199), 'CC' (62862), 'VBZ' (50712), 'VBP' (43420), 'VBN'
@@ -476,7 +477,6 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
 '-RRB-' (2825), '-LRB-' (2788), 'PDT' (2078), 'XX' (1316), 'RBS' (1142), 'FW'
 (794), 'NFP' (557), 'SYM' (440), 'WP$' (294), 'LS' (293), 'ADD' (191), 'AFX'
 (24)
-✔ All labels present in tag map for language 'en'
 
 ============================= Dependency Parsing =============================
 ℹ Found 111703 sentences with an average length of 18.6 words.

From ae518434684d64b191b31a5bc74986844ee4f0cb Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 29 Sep 2020 23:08:50 +0200
Subject: [PATCH 283/516] Remove augmenter from jinja template [ci skip]

---
 spacy/cli/templates/quickstart_training.jinja | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index efe19d315..adad72995 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -270,7 +270,6 @@ factory = "{{ pipe }}"
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
 max_length = {{ 500 if hardware == "gpu" else 2000 }}
-augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
 
 [corpora.dev]
 @readers = "spacy.Corpus.v1"

From 14c4da547f483d6fa6a741e5ea09118775294e71 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 23:08:56 +0200
Subject: [PATCH 284/516] Try to fix augmentation

---
 spacy/training/augment.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 1756144e6..caa24c054 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -38,7 +38,10 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
                 orig_dict["token_annotation"],
                 lower=raw_text is not None and random.random() < lower,
             )
-            doc = nlp.make_doc(variant_text)
+            if variant_text is None:
+                doc = Doc(nlp.vocab, words=variant_token_annot["words"])
+            else:
+                doc = nlp.make_doc(variant_text)
             orig_dict["token_annotation"] = variant_token_annot
             yield example.from_dict(doc, orig_dict)
 

From f52249fe2eb5afca9e68060a99d8cb31a6175c72 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 23:40:54 +0200
Subject: [PATCH 285/516] Fix data augmentation

---
 spacy/training/augment.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index caa24c054..95662eafa 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -4,6 +4,7 @@ import itertools
 import copy
 from functools import partial
 from ..util import registry
+from ..tokens import Doc
 
 
 @registry.augmenters("spacy.dont_augment.v1")
@@ -38,10 +39,12 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
                 orig_dict["token_annotation"],
                 lower=raw_text is not None and random.random() < lower,
             )
-            if variant_text is None:
-                doc = Doc(nlp.vocab, words=variant_token_annot["words"])
-            else:
+            if variant_text:
                 doc = nlp.make_doc(variant_text)
+            else:
+                doc = Doc(nlp.vocab, words=variant_token_annot["ORTH"])
+                variant_token_annot["ORTH"] = [w.text for w in doc]
+                variant_token_annot["SPACY"] = [w.whitespace_ for w in doc]
             orig_dict["token_annotation"] = variant_token_annot
             yield example.from_dict(doc, orig_dict)
 

From 7d04ba20c0da5742904db628922ca17f59be46b2 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 30 Sep 2020 00:05:17 +0200
Subject: [PATCH 286/516] Update Thinc

---
 pyproject.toml   | 2 +-
 requirements.txt | 2 +-
 setup.cfg        | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6d3a29fe9..e88ba7db9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a41,<8.0.0a50",
+    "thinc>=8.0.0a42,<8.0.0a50",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index 3ff8bea3d..064efed42 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a41,<8.0.0a50
+thinc>=8.0.0a42,<8.0.0a50
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index bcab59487..2da84c829 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a41,<8.0.0a50
+    thinc>=8.0.0a42,<8.0.0a50
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a41,<8.0.0a50
+    thinc>=8.0.0a42,<8.0.0a50
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0

From b799af16de31ec61b6757c80b1da36aaea7921e0 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 30 Sep 2020 00:05:27 +0200
Subject: [PATCH 287/516] Don't raise in Pipe.initialize if not implemented

---
 spacy/pipeline/pipe.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 49d0bea35..5316620e9 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -36,7 +36,7 @@ cdef class Pipe:
     @property
     def labels(self) -> Optional[Tuple[str]]:
         return []
-    
+
     @property
     def label_data(self):
         """Optional JSON-serializable data that would be sufficient to recreate
@@ -207,7 +207,7 @@ cdef class Pipe:
 
         DOCS: https://nightly.spacy.io/api/pipe#initialize
         """
-        raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name))
+        pass
 
     def _ensure_examples(self, get_examples):
         if get_examples is None or not hasattr(get_examples, "__call__"):

From 95b2a448cf674544d77109f461dd3e6f6cbb2b46 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 30 Sep 2020 00:24:42 +0200
Subject: [PATCH 288/516] Update lookups data pin [ci skip]

---
 Makefile  | 2 +-
 setup.cfg | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index a180063b9..a4df0f8c8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
 endif
 
 ifndef PYVER
diff --git a/setup.cfg b/setup.cfg
index 2da84c829..36ab64bd9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -65,7 +65,7 @@ console_scripts =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data==0.4.0.dev0
+    spacy_lookups_data==1.0.0rc0
 cuda =
     cupy>=5.0.0b4,<9.0.0
 cuda80 =

From 56a2f778c404d2cd9d3561279438364ad0010033 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 30 Sep 2020 01:08:55 +0200
Subject: [PATCH 289/516] Add logging [ci skip]

---
 spacy/cli/train.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 36a9d08d9..ed2dd7c83 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -50,6 +50,7 @@ def train_cli(
     msg.divider("Initializing pipeline")
     with show_validation_error(config_path, hint_fill=False):
         nlp = init_nlp(config, use_gpu=use_gpu)
+    msg.good("Initialized pipeline")
     msg.divider("Training pipeline")
     train(nlp, output_path, use_gpu=use_gpu, silent=False)
 

From a5debb356da68f2ef00a6c037d38f24d6ea41b92 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 30 Sep 2020 01:22:08 +0200
Subject: [PATCH 290/516] Tidy up and adjust logging [ci skip]

---
 spacy/cli/init_pipeline.py | 16 ++++++++++------
 spacy/cli/train.py         |  2 +-
 spacy/util.py              |  2 +-
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index d217ad366..f241133ca 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -27,7 +27,7 @@ def init_vectors_cli(
     you can use in the [initialize.vocab] block of your config to initialize
     a model with vectors.
     """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
     convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
@@ -55,14 +55,14 @@ def init_pipeline_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
     with show_validation_error(hint_fill=False):
-        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
+        nlp = init_nlp(config, use_gpu=use_gpu)
     nlp.to_disk(output_path)
     msg.good(f"Saved initialized pipeline to {output_path}")
 
@@ -81,9 +81,12 @@ def init_labels_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
+    """Generate a JSON file for labels in the data. This helps speed up the
+    training process, since spaCy won't have to preprocess the data to
+    extract the labels."""
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     if not output_path.exists():
         output_path.mkdir()
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
@@ -93,7 +96,8 @@ def init_labels_cli(
         nlp = init_nlp(config, use_gpu=use_gpu)
     for name, component in nlp.pipeline:
         if getattr(component, "label_data", None) is not None:
-            srsly.write_json(output_path / f"{name}.json", component.label_data)
-            msg.good(f"Saving {name} labels to {output_path}/{name}.json")
+            output_file = output_path / f"{name}.json"
+            srsly.write_json(output_file, component.label_data)
+            msg.good(f"Saving {name} labels to {output_file}")
         else:
             msg.info(f"No labels found for {name}")
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index ed2dd7c83..aede0e8f4 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -40,7 +40,7 @@ def train_cli(
 
     DOCS: https://nightly.spacy.io/api/cli#train
     """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.ERROR)
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     verify_cli_args(config_path, output_path)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
diff --git a/spacy/util.py b/spacy/util.py
index 83a18b78a..761eb9f0a 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -67,7 +67,7 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
 # fmt: on
 
 
-logging.basicConfig()
+logging.basicConfig(format="%(message)s")
 logger = logging.getLogger("spacy")
 
 

From 34f9c26c6235842db219a543897baba95fd980ff Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 30 Sep 2020 10:20:14 +0200
Subject: [PATCH 291/516] Add lexeme norm defaults

---
 spacy/lang/da/__init__.py      | 12 ++++++++++++
 spacy/lang/de/__init__.py      | 12 ++++++++++++
 spacy/lang/el/__init__.py      | 12 ++++++++++++
 spacy/lang/en/__init__.py      | 13 ++++++++++++-
 spacy/lang/id/__init__.py      | 12 ++++++++++++
 spacy/lang/ja/__init__.py      |  5 ++---
 spacy/lang/ko/__init__.py      |  5 ++---
 spacy/lang/lb/__init__.py      | 12 ++++++++++++
 spacy/lang/pt/__init__.py      | 12 ++++++++++++
 spacy/lang/ru/__init__.py      | 13 ++++++++++++-
 spacy/lang/sr/__init__.py      | 12 ++++++++++++
 spacy/lang/ta/__init__.py      | 12 ++++++++++++
 spacy/lang/th/__init__.py      | 13 +++++++++----
 spacy/lang/vi/__init__.py      | 10 ++++------
 spacy/lang/zh/__init__.py      |  5 ++---
 spacy/tests/parser/test_ner.py |  1 +
 16 files changed, 140 insertions(+), 21 deletions(-)

diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index 8cac30b26..7128338af 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class DanishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index b645d3480..99c161961 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class GermanDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 1a7b19914..818405842 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -9,9 +9,21 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .lemmatizer import GreekLemmatizer
 from ...lookups import Lookups
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class GreekDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index bf7e9987f..f4ea10f9c 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -10,9 +9,21 @@ from .punctuation import TOKENIZER_INFIXES
 from .lemmatizer import EnglishLemmatizer
 from ...language import Language
 from ...lookups import Lookups
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class EnglishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index 87373551c..46bef57ca 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -4,9 +4,21 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class IndonesianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index e7cc1ef3b..4e6bf9d3c 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -2,7 +2,6 @@ from typing import Optional, Union, Dict, Any
 from pathlib import Path
 import srsly
 from collections import namedtuple
-from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -16,7 +15,7 @@ from ...scorer import Scorer
 from ...symbols import POS
 from ...tokens import Doc
 from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 from ... import util
 
 
@@ -166,7 +165,7 @@ class JapaneseTokenizer(DummyTokenizer):
 
 
 class JapaneseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index dd07ef89c..83c9f4962 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -1,5 +1,4 @@
 from typing import Optional, Any, Dict
-from thinc.api import Config
 
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
@@ -10,7 +9,7 @@ from ...compat import copy_reg
 from ...scorer import Scorer
 from ...symbols import POS
 from ...training import validate_examples
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -70,7 +69,7 @@ class KoreanTokenizer(DummyTokenizer):
 
 
 class KoreanDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py
index da6fe55d7..ead5f5d10 100644
--- a/spacy/lang/lb/__init__.py
+++ b/spacy/lang/lb/__init__.py
@@ -3,9 +3,21 @@ from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class LuxembourgishDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 0447099f0..1c95c11d9 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -3,9 +3,21 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class PortugueseDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 4a296dd23..857e197e9 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
@@ -8,9 +7,21 @@ from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
 from ...lookups import Lookups
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class RussianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index 165e54975..5da19c6f3 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -2,9 +2,21 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class SerbianDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py
index ac5fc7124..7a5a3ac8f 100644
--- a/spacy/lang/ta/__init__.py
+++ b/spacy/lang/ta/__init__.py
@@ -1,9 +1,21 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
+from ...util import load_config_from_str
+
+
+DEFAULT_CONFIG = """
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
+"""
 
 
 class TamilDefaults(Language.Defaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index a35ae987f..834fe1871 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -1,10 +1,8 @@
-from thinc.api import Config
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -12,6 +10,13 @@ DEFAULT_CONFIG = """
 
 [nlp.tokenizer]
 @tokenizers = "spacy.th.ThaiTokenizer"
+
+[initialize]
+
+[initialize.lookups]
+@misc = "spacy.LookupsDataLoader.v1"
+lang = ${nlp.lang}
+tables = ["lexeme_norm"]
 """
 
 
@@ -42,7 +47,7 @@ class ThaiTokenizer(DummyTokenizer):
 
 
 class ThaiDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 
diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py
index 1db762adb..e2f7b3e35 100644
--- a/spacy/lang/vi/__init__.py
+++ b/spacy/lang/vi/__init__.py
@@ -1,10 +1,8 @@
-from thinc.api import Config
-
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
-from .stop_words import STOP_WORDS
-from ...util import DummyTokenizer, registry
-from .lex_attrs import LEX_ATTRS
+from ...util import DummyTokenizer, registry, load_config_from_str
 
 
 DEFAULT_CONFIG = """
@@ -55,7 +53,7 @@ class VietnameseTokenizer(DummyTokenizer):
 
 
 class VietnameseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 457502e21..a413d86eb 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -4,14 +4,13 @@ import tempfile
 import srsly
 import warnings
 from pathlib import Path
-from thinc.api import Config
 
 from ...errors import Warnings, Errors
 from ...language import Language
 from ...scorer import Scorer
 from ...tokens import Doc
 from ...training import validate_examples, Example
-from ...util import DummyTokenizer, registry
+from ...util import DummyTokenizer, registry, load_config_from_str
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ... import util
@@ -329,7 +328,7 @@ class ChineseTokenizer(DummyTokenizer):
 
 
 class ChineseDefaults(Language.Defaults):
-    config = Config().from_str(DEFAULT_CONFIG)
+    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index b657ae2e8..78a20c1e8 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -339,6 +339,7 @@ def test_ner_warns_no_lookups(caplog):
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
+    nlp.config["initialize"]["lookups"] = None
     with caplog.at_level(logging.DEBUG):
         nlp.initialize()
         assert "W033" in caplog.text

From 469f0e539c2bba97a1c46207f5528e9ec94fe98e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 30 Sep 2020 10:24:06 +0200
Subject: [PATCH 292/516] Fix docs [ci skip]

---
 website/docs/usage/training.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index a18c2af32..4c75ad771 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -6,7 +6,7 @@ menu:
   - ['Introduction', 'basics']
   - ['Quickstart', 'quickstart']
   - ['Config System', 'config']
-  <!-- - ['Data Utilities', 'data'] -->
+  # - ['Data Utilities', 'data']
   - ['Custom Functions', 'custom-functions']
   - ['Parallel Training', 'parallel-training']
   - ['Internal API', 'api']

From 6b7bb32834c412367c2d49aa62a3bd1deeb4f921 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 30 Sep 2020 11:46:45 +0200
Subject: [PATCH 293/516] Refactor Chinese initialization

---
 spacy/errors.py                       | 20 +++++--
 spacy/lang/zh/__init__.py             | 81 ++++++++++-----------------
 spacy/tests/conftest.py               | 15 +++--
 spacy/tests/lang/zh/test_serialize.py | 16 ++++--
 4 files changed, 66 insertions(+), 66 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 09b722a7b..f8fb7dd8b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -672,14 +672,22 @@ class Errors:
     E999 = ("Unable to merge the `Doc` objects because they do not all share "
             "the same `Vocab`.")
     E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
-             "specified. Provide the name of a pretrained model or the path to "
-             "a model when initializing the pipeline:\n"
+             "loaded. Provide the name of a pretrained model or the path to "
+             "a model and initialize the pipeline:\n\n"
              'config = {\n'
-             '   "@tokenizers": "spacy.zh.ChineseTokenizer",\n'
-             '   "segmenter": "pkuseg",\n'
-             '   "pkuseg_model": "default", # or "/path/to/pkuseg_model" \n'
+             '    "nlp": {\n'
+             '        "tokenizer": {\n'
+             '            "@tokenizers": "spacy.zh.ChineseTokenizer",\n'
+             '            "segmenter": "pkuseg",\n'
+             '        }\n'
+             '    },\n'
+             '    "initialize": {"tokenizer": {\n'
+             '            "pkuseg_model": "default", # or /path/to/model\n'
+             '        }\n'
+             '    },\n'
              '}\n'
-             'nlp = Chinese.from_config({"nlp": {"tokenizer": config}})')
+             'nlp = Chinese.from_config(config)\n'
+             'nlp.initialize()')
     E1001 = ("Target token outside of matched span for match with tokens "
              "'{span}' and offset '{index}' matched by patterns '{patterns}'.")
     E1002 = ("Span index out of range.")
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index a413d86eb..ecabb6555 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -59,32 +59,13 @@ class ChineseTokenizer(DummyTokenizer):
         self,
         nlp: Language,
         segmenter: Segmenter = Segmenter.char,
-        pkuseg_model: Optional[str] = None,
-        pkuseg_user_dict: Optional[str] = None,
     ):
         self.vocab = nlp.vocab
         if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
         self.segmenter = segmenter
-        self.pkuseg_model = pkuseg_model
-        self.pkuseg_user_dict = pkuseg_user_dict
         self.pkuseg_seg = None
         self.jieba_seg = None
-        self.configure_segmenter(segmenter)
-
-    def initialize(
-        self,
-        get_examples: Callable[[], Iterable[Example]],
-        *,
-        nlp: Optional[Language],
-        pkuseg_model: Optional[str] = None,
-        pkuseg_user_dict: Optional[str] = None
-    ):
-        self.pkuseg_model = pkuseg_model
-        self.pkuseg_user_dict = pkuseg_user_dict
-        self.configure_segmenter(self.segmenter)
-
-    def configure_segmenter(self, segmenter: str):
         if segmenter not in Segmenter.values():
             warn_msg = Warnings.W103.format(
                 lang="Chinese",
@@ -94,12 +75,21 @@ class ChineseTokenizer(DummyTokenizer):
             )
             warnings.warn(warn_msg)
             self.segmenter = Segmenter.char
-        self.jieba_seg = try_jieba_import(self.segmenter)
-        self.pkuseg_seg = try_pkuseg_import(
-            self.segmenter,
-            pkuseg_model=self.pkuseg_model,
-            pkuseg_user_dict=self.pkuseg_user_dict,
-        )
+        if segmenter == Segmenter.jieba:
+            self.jieba_seg = try_jieba_import()
+
+    def initialize(
+        self,
+        get_examples: Callable[[], Iterable[Example]],
+        *,
+        nlp: Optional[Language],
+        pkuseg_model: Optional[str] = None,
+        pkuseg_user_dict: str = "default",
+    ):
+        if self.segmenter == Segmenter.pkuseg:
+            self.pkuseg_seg = try_pkuseg_import(
+                pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
+            )
 
     def __call__(self, text: str) -> Doc:
         if self.segmenter == Segmenter.jieba:
@@ -154,14 +144,10 @@ class ChineseTokenizer(DummyTokenizer):
     def _get_config(self) -> Dict[str, Any]:
         return {
             "segmenter": self.segmenter,
-            "pkuseg_model": self.pkuseg_model,
-            "pkuseg_user_dict": self.pkuseg_user_dict,
         }
 
     def _set_config(self, config: Dict[str, Any] = {}) -> None:
         self.segmenter = config.get("segmenter", Segmenter.char)
-        self.pkuseg_model = config.get("pkuseg_model", None)
-        self.pkuseg_user_dict = config.get("pkuseg_user_dict", "default")
 
     def to_bytes(self, **kwargs):
         pkuseg_features_b = b""
@@ -339,42 +325,33 @@ class Chinese(Language):
     Defaults = ChineseDefaults
 
 
-def try_jieba_import(segmenter: str) -> None:
+def try_jieba_import() -> None:
     try:
         import jieba
 
-        if segmenter == Segmenter.jieba:
-            # segment a short text to have jieba initialize its cache in advance
-            list(jieba.cut("作为", cut_all=False))
+        # segment a short text to have jieba initialize its cache in advance
+        list(jieba.cut("作为", cut_all=False))
 
         return jieba
     except ImportError:
-        if segmenter == Segmenter.jieba:
-            msg = (
-                "Jieba not installed. To use jieba, install it with `pip "
-                " install jieba` or from https://github.com/fxsjy/jieba"
-            )
-            raise ImportError(msg) from None
+        msg = (
+            "Jieba not installed. To use jieba, install it with `pip "
+            " install jieba` or from https://github.com/fxsjy/jieba"
+        )
+        raise ImportError(msg) from None
 
 
-def try_pkuseg_import(
-    segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: str
-) -> None:
+def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
     try:
         import pkuseg
 
-        if pkuseg_model is None:
-            return None
-        else:
-            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
+        return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
     except ImportError:
-        if segmenter == Segmenter.pkuseg:
-            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
-            raise ImportError(msg) from None
+        msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
+        raise ImportError(msg) from None
     except FileNotFoundError:
-        if segmenter == Segmenter.pkuseg:
-            msg = "Unable to load pkuseg model from: " + pkuseg_model
-            raise FileNotFoundError(msg) from None
+        msg = "Unable to load pkuseg model from: " + pkuseg_model
+        raise FileNotFoundError(msg) from None
 
 
 def _get_pkuseg_trie_data(node, path=""):
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 6cf019173..bcf582388 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -272,10 +272,14 @@ def zh_tokenizer_char():
 def zh_tokenizer_jieba():
     pytest.importorskip("jieba")
     config = {
-        "@tokenizers": "spacy.zh.ChineseTokenizer",
-        "segmenter": "jieba",
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.zh.ChineseTokenizer",
+                "segmenter": "jieba",
+            }
+        }
     }
-    nlp = get_lang_class("zh").from_config({"nlp": {"tokenizer": config}})
+    nlp = get_lang_class("zh").from_config(config)
     return nlp.tokenizer
 
 
@@ -290,7 +294,10 @@ def zh_tokenizer_pkuseg():
                 "segmenter": "pkuseg",
             }
         },
-        "initialize": {"tokenizer": {"pkuseg_model": "default"}},
+        "initialize": {"tokenizer": {
+                "pkuseg_model": "default",
+            }
+        },
     }
     nlp = get_lang_class("zh").from_config(config)
     nlp.initialize()
diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py
index 5491314e2..58c084ec8 100644
--- a/spacy/tests/lang/zh/test_serialize.py
+++ b/spacy/tests/lang/zh/test_serialize.py
@@ -28,9 +28,17 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
 @pytest.mark.slow
 def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
     config = {
-        "@tokenizers": "spacy.zh.ChineseTokenizer",
-        "segmenter": "pkuseg",
-        "pkuseg_model": "medicine",
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.zh.ChineseTokenizer",
+                "segmenter": "pkuseg",
+            }
+        },
+        "initialize": {"tokenizer": {
+                "pkuseg_model": "medicine",
+            }
+        },
     }
-    nlp = Chinese.from_config({"nlp": {"tokenizer": config}})
+    nlp = Chinese.from_config(config)
+    nlp.initialize()
     zh_tokenizer_serialize(nlp.tokenizer)

From 4cbb954281ad47148667de130e5c4eb23e579edf Mon Sep 17 00:00:00 2001
From: Elijah Rippeth <erippeth@mitre.org>
Date: Wed, 30 Sep 2020 07:26:06 -0400
Subject: [PATCH 294/516] reorder so tagmap is replaced only if a custom file
 is provided. (#6164)

* reorder so tagmap is replaced only if a custom file is provided.

* Remove unneeded variable initialization

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/cli/debug_data.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 22540c779..7e6c99c06 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -59,10 +59,6 @@ def debug_data(
     if not dev_path.exists():
         msg.fail("Development data not found", dev_path, exits=1)
 
-    tag_map = {}
-    if tag_map_path is not None:
-        tag_map = srsly.read_json(tag_map_path)
-
     # Initialize the model and pipeline
     pipeline = [p.strip() for p in pipeline.split(",")]
     if base_model:
@@ -70,8 +66,11 @@ def debug_data(
     else:
         lang_cls = get_lang_class(lang)
         nlp = lang_cls()
-    # Replace tag map with provided mapping
-    nlp.vocab.morphology.load_tag_map(tag_map)
+
+    if tag_map_path is not None:
+        tag_map = srsly.read_json(tag_map_path)
+        # Replace tag map with provided mapping
+        nlp.vocab.morphology.load_tag_map(tag_map)
 
     msg.divider("Data format validation")
 

From 23c63eefafa3e91eb802e1dba6b1ef3145b71b58 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 30 Sep 2020 15:15:11 +0200
Subject: [PATCH 295/516] Tidy up env vars [ci skip]

---
 spacy/cli/_util.py      |  4 ++--
 spacy/tests/test_cli.py | 12 +++++++-----
 spacy/util.py           |  4 ++++
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 2c944bf3a..69c32bbad 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -16,6 +16,7 @@ import os
 
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
+from ..util import ENV_VARS
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -39,7 +40,6 @@ commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
 INIT_HELP = """Commands for initializing configs and pipeline packages."""
-OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES"
 
 # Wrappers for Typer's annotations. Initially created to set defaults and to
 # keep the names short, but not needed at the moment.
@@ -65,7 +65,7 @@ def setup_cli() -> None:
 
 
 def parse_config_overrides(
-    args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR
+    args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
 ) -> Dict[str, Any]:
     """Generate a dictionary of config overrides based on the extra arguments
     provided on the CLI, e.g. --training.batch_size to override
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index bba71d6da..62584d0ce 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -3,10 +3,11 @@ from click import NoSuchOption
 from spacy.training import docs_to_json, offsets_to_biluo_tags
 from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
+from spacy.util import ENV_VARS
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
 from spacy.cli._util import load_project_config, substitute_project_variables
-from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
+from spacy.cli._util import string_to_list
 from thinc.api import ConfigValidationError
 import srsly
 import os
@@ -342,21 +343,22 @@ def test_parse_config_overrides_invalid_2(args):
 
 
 def test_parse_cli_overrides():
-    os.environ[OVERRIDES_ENV_VAR] = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
+    overrides = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello"
+    os.environ[ENV_VARS.CONFIG_OVERRIDES] = overrides
     result = parse_config_overrides([])
     assert len(result) == 4
     assert result["x.foo"] == "bar"
     assert result["x.bar"] == 12
     assert result["x.baz"] is False
     assert result["y.foo"] == "hello"
-    os.environ[OVERRIDES_ENV_VAR] = "--x"
+    os.environ[ENV_VARS.CONFIG_OVERRIDES] = "--x"
     assert parse_config_overrides([], env_var=None) == {}
     with pytest.raises(SystemExit):
         parse_config_overrides([])
-    os.environ[OVERRIDES_ENV_VAR] = "hello world"
+    os.environ[ENV_VARS.CONFIG_OVERRIDES] = "hello world"
     with pytest.raises(SystemExit):
         parse_config_overrides([])
-    del os.environ[OVERRIDES_ENV_VAR]
+    del os.environ[ENV_VARS.CONFIG_OVERRIDES]
 
 
 @pytest.mark.parametrize("lang", ["en", "nl"])
diff --git a/spacy/util.py b/spacy/util.py
index 761eb9f0a..8a96ba4fe 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -71,6 +71,10 @@ logging.basicConfig(format="%(message)s")
 logger = logging.getLogger("spacy")
 
 
+class ENV_VARS:
+    CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
+
+
 class registry(thinc.registry):
     languages = catalogue.create("spacy", "languages", entry_points=True)
     architectures = catalogue.create("spacy", "architectures", entry_points=True)

From 115481aca7b5eb50127c16821cb0c13c4c608307 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 30 Sep 2020 15:16:00 +0200
Subject: [PATCH 296/516] Update docs [ci skip]

---
 website/docs/api/corpus.md                 |  16 +-
 website/docs/api/top-level.md              |  41 ++--
 website/docs/images/lifecycle.svg          |  93 +++++++
 website/docs/usage/101/_pipelines.md       |   2 +-
 website/docs/usage/processing-pipelines.md |  20 +-
 website/docs/usage/training.md             | 267 +++++++++++++--------
 website/docs/usage/v3.md                   |   7 +-
 7 files changed, 307 insertions(+), 139 deletions(-)
 create mode 100644 website/docs/images/lifecycle.svg

diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md
index e7d6773e6..37f24819d 100644
--- a/website/docs/api/corpus.md
+++ b/website/docs/api/corpus.md
@@ -32,14 +32,16 @@ streaming.
 > gold_preproc = false
 > max_length = 0
 > limit = 0
+> augmenter = null
 > ```
 
-| Name            | Description                                                                                                                                              |
-| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `path`          | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~                    |
-|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
-| `max_length`    | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~      |
-| `limit`         | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                                          |
+| Name            | Description                                                                                                                                                                                                                                                                              |
+| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path`          | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~                                                                                                                                                    |
+|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~                                                                                                                                 |
+| `max_length`    | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~                                                                                                                                      |
+| `limit`         | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                                                                                                                                                                          |
+| `augmenter`     | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
 
 ```python
 %%GITHUB_SPACY/spacy/training/corpus.py
@@ -74,7 +76,7 @@ train/test skew.
 |  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~                     |
 | `max_length`    | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
 | `limit`         | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                                     |
-| `augmenter`     | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~
+| `augmenter`     | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~                                                           |
 
 ## Corpus.\_\_call\_\_ {#call tag="method"}
 
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 8d5556c7a..7f1b1ed7f 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -191,16 +191,16 @@ browser. Will run a simple web server.
 > displacy.serve([doc1, doc2], style="dep")
 > ```
 
-| Name      | Description                                                                                                                                                        |
-| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `docs`    | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~                                                                              |
-| `style`   | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~                                                                                              |
-| `page`    | Render markup as full HTML page. Defaults to `True`. ~~bool~~                                                                                                      |
-| `minify`  | Minify HTML markup. Defaults to `False`. ~~bool~~                                                                                                                  |
-| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~                                                                                  |
+| Name      | Description                                                                                                                                                       |
+| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `docs`    | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~                                                                             |
+| `style`   | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~                                                                                             |
+| `page`    | Render markup as full HTML page. Defaults to `True`. ~~bool~~                                                                                                     |
+| `minify`  | Minify HTML markup. Defaults to `False`. ~~bool~~                                                                                                                 |
+| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~                                                                                 |
 | `manual`  | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
-| `port`    | Port to serve visualization. Defaults to `5000`. ~~int~~                                                                                                           |
-| `host`    | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~                                                                                                      |
+| `port`    | Port to serve visualization. Defaults to `5000`. ~~int~~                                                                                                          |
+| `host`    | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~                                                                                                     |
 
 ### displacy.render {#displacy.render tag="method" new="2"}
 
@@ -223,7 +223,7 @@ Render a dependency parse tree or named entity visualization.
 | `page`      | Render markup as full HTML page. Defaults to `True`. ~~bool~~                                                                                                                          |
 | `minify`    | Minify HTML markup. Defaults to `False`. ~~bool~~                                                                                                                                      |
 | `options`   | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~                                                                                                      |
-| `manual`    | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~                     |
+| `manual`    | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~                      |
 | `jupyter`   | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
 | **RETURNS** | The rendered HTML markup. ~~str~~                                                                                                                                                      |
 
@@ -244,7 +244,7 @@ If a setting is not present in the options, the default value will be used.
 | Name                                       | Description                                                                                                                                  |
 | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
 | `fine_grained`                             | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~             |
-| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~                                                     |
+| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~                                                      |
 | `collapse_punct`                           | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
 | `collapse_phrases`                         | Merge noun phrases into one token. Defaults to `False`. ~~bool~~                                                                             |
 | `compact`                                  | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~                                                    |
@@ -498,12 +498,13 @@ the [`Corpus`](/api/corpus) class.
 > limit = 0
 > ```
 
-| Name            | Description                                                                                                                                              |
-| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `path`          | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~        |
-|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
-| `max_length`    | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~      |
-| `limit`         | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                                          |
+| Name            | Description                                                                                                                                                                                                                                                                              |
+| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path`          | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Union[str, Path]~~                                                                                                                                        |
+|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~                                                                                                                                 |
+| `max_length`    | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~                                                                                                                                      |
+| `limit`         | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                                                                                                                                                                          |
+| `augmenter`     | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
 
 ### JsonlReader {#jsonlreader}
 
@@ -935,7 +936,7 @@ Compile a sequence of prefix rules into a regex object.
 | Name        | Description                                                                                                                                 |
 | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
 | `entries`   | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
-| **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~                                        |
+| **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~                                         |
 
 ### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
 
@@ -952,7 +953,7 @@ Compile a sequence of suffix rules into a regex object.
 | Name        | Description                                                                                                                                 |
 | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
 | `entries`   | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
-| **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~                                        |
+| **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~                                         |
 
 ### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
 
@@ -969,7 +970,7 @@ Compile a sequence of infix rules into a regex object.
 | Name        | Description                                                                                                                               |
 | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
 | `entries`   | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
-| **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~                                     |
+| **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~                                      |
 
 ### util.minibatch {#util.minibatch tag="function" new="2"}
 
diff --git a/website/docs/images/lifecycle.svg b/website/docs/images/lifecycle.svg
new file mode 100644
index 000000000..2f4b304b8
--- /dev/null
+++ b/website/docs/images/lifecycle.svg
@@ -0,0 +1,93 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="1788" height="925" viewBox="0 0 1788 925">
+  <!-- Icons: Twemoji (https://twemoji.twitter.com/) -->
+  <g fill="none" fill-rule="evenodd">
+    <rect width="710" height="525" x="13" y="271" fill="#FFE6ED" stroke="#F03969" stroke-width="6" rx="25"/>
+    <rect width="710" height="393" x="1064" y="274" fill="#E1F9FF" stroke="#3AC" stroke-width="6" rx="25"/>
+    <path stroke="#CCD6DD" stroke-dasharray="3 15" stroke-width="5" d="M896.9 85v772.3h2-2V85h-2 2zm0 0v-1 1h324.6-324.6zm0 772.3L480 859.5l416.9-2.2v2-2z"/>
+    <path fill="#CCD6DD" d="M1237 84.5l-19 9.5V75z"/>
+    <rect width="212" height="203" x="786" y="373" fill="#EAEFF3" rx="22"/>
+    <path fill="#F03969" fill-rule="nonzero" d="M818.6 540.1a9.5 9.5 0 11-18.2 5.3 9.5 9.5 0 0118.2-5.3zm-238.6 71l-.9-2.9 3.9-1 .8 2.8-3.8 1.1zm8.6-2.5l-.8-2.9 3.8-1.1.9 2.9-3.9 1zm8.7-2.5l-.9-2.9 3.9-1.1.8 2.9-3.8 1zm8.6-2.5l-.8-3 3.8-1 .8 2.8-3.8 1.2zm8.6-2.6l-.8-2.8 3.8-1.1.9 2.8-3.9 1.1zm8.7-2.5l-.9-2.8 3.9-1.2.8 3-3.8 1zm8.6-2.5l-.8-2.9 3.8-1 .9 2.8-3.9 1.1zm8.7-2.5l-.9-2.9 3.9-1 .8 2.8-3.8 1.1zm8.6-2.5l-.8-2.9 3.8-1.1.9 2.9-3.9 1zm8.7-2.5l-.9-3 3.9-1 .8 2.9-3.8 1zm8.6-2.5l-.8-3 3.8-1 .8 2.8-3.8 1.2zm8.6-2.6l-.8-2.8 3.8-1.2.9 3-3.9 1zm8.7-2.5l-.9-2.9 3.9-1 .8 2.8-3.8 1.1zm8.6-2.5l-.8-2.9 3.8-1 .9 2.8-3.9 1.1zm8.7-2.5l-.9-2.9 3.9-1.1.8 2.9-3.8 1.1zm8.6-2.5l-.8-2.9 3.8-1.1.8 2.9-3.8 1zm8.6-2.5l-.8-3 3.8-1 .9 2.9-3.9 1zm8.7-2.5l-.9-3 3.9-1 .8 2.8-3.8 1.2zm8.6-2.6l-.8-2.8 3.8-1.2.9 3-3.9 1zm8.7-2.5l-.9-2.9 3.9-1 .8 2.8-3.8 1.1zm8.6-2.5l-.8-2.9 3.8-1 .8 2.8-3.8 1.1zm8.7-2.5l-.9-2.9 3.9-1.1.8 2.9-3.8 1.1zm8.6-2.5l-.8-2.9 3.8-1.1.8 2.9-3.8 1zm8.6-2.5l-.8-3 3.8-1 .9 2.8-3.9 1.2zm8.7-2.5l-.9-3 3.9-1 .8 2.8-3.8 1.2zm8.6-2.6l-.8-2.8 3.8-1.2.9 3-3.9 1zM819 501.9a9.5 9.5 0 11-19 2.2 9.5 9.5 0 0119-2.2zm-176.3 22.3l-.4-3 4-.5.4 3-4 .5zm9-1l-.4-3 4-.5.3 3-4 .4zm8.9-1.1l-.4-3 4-.5.3 3-4 .5zm8.9-1l-.4-3 4-.5.4 3-4 .4zm9-1.1l-.4-3 4-.5.3 3-4 .5zm8.9-1.1l-.4-3 4-.4.3 3-4 .4zm9-1l-.4-3 4-.5.3 3-4 .4zm8.9-1.1l-.4-3 4-.5.3 3-4 .5zm8.9-1l-.4-3 4-.5.4 3-4 .4zm9-1.1l-.4-3 4-.5.3 3-4 .5zm8.9-1l-.4-3 4-.5.3 3-4 .4zm9-1.1l-.4-3 4-.5.3 3-4 .5zm8.8-1l-.3-3 4-.5.3 3-4 .4zm9-1.1l-.4-3 4-.5.4 3-4 .5zm9-1l-.4-3 4-.5.3 3-4 .4zm8.9-1.1l-.4-3 4-.5.3 3-4 .5zm8.9-1l-.4-3 4-.5.4 3-4 .4zm9-1.1l-.4-3 4-.5.3 3-4 .5zM804 423l-.6.1-.3-3h.7a9.5 9.5 0 11.3 3zM696 434l-.4-3 4-.4.3 3-4 .4zm8.9-1l-.3-2.9 4-.4.2 3-4 .4zm9-.8l-.4-3 4-.4.3 3-4 .4zm8.9-.9l-.3-3 4-.4.3 3-4 .4zm9-.9l-.4-3 4-.4.3 3-4 .4zm8.9-.9l-.3-3 4-.4.3 3-4 .4zm9-.9l-.4-3 4-.4.3 3-4 .4zm8.9-.9l-.3-3 4-.4.3 3-4 .4zm9-.9l-.4-3 4-.4.3 3-4 .4zm8.9-.9l-.3-3 4-.4.3 3-4 .4zm9-.9l-.3-3 4-.4.2 3-4 .4zm9-.9l-.4-3 4-.4.3 3-4 .4zM823.7 459.4a9.5 9.5 0 
11-19-1.6 9.5 9.5 0 0119 1.6zm-126.3-8.8l.2-3 4 .3-.2 3-4-.3zm9 .7l.2-3 4 .3-.3 3-4-.3zm9 .7l.2-3 4 .4-.3 3-4-.4zm8.9.8l.2-3 4 .3-.2 3-4-.3zm9 .7l.2-3 4 .3-.3 3-4-.3zm9 .7l.2-3 4 .4-.3 3-4-.4zm8.9.8l.2-3 4 .3-.2 3-4-.3zm9 .7l.2-3 4 .3-.2 3-4-.3zm9 .7l.2-3 4 .4-.3 3-4-.4zm9 .8l.2-3 4 .3-.3 3-4-.3zm8.9.7l.2-3 4 .3-.2 3-4-.3zm9 .7l.2-3 4 .4-.3 3-4-.4z"/>
+    <path fill="#3AC" fill-rule="nonzero" d="M993.7 456.5h.7l.3 3h-.7a9.5 9.5 0 11-.3-3zm90.3-8.6l.3 3-4 .3-.3-3 4-.3zm-9 .8l.3 3-4 .4-.3-3 4-.4zm-9 .9l.3 3-4 .4-.2-3 4-.4zm-8.9.8l.3 3-4 .4-.3-3 4-.4zm-9 1l.3 2.9-4 .4-.2-3 4-.4zm-9 .8l.4 3-4 .3-.3-3 4-.3zm-8.9.8l.3 3-4 .4-.3-3 4-.4zm-9 .9l.4 3-4 .4-.3-3 4-.4zm-8.9.9l.3 3-4 .3-.3-3 4-.3zm-9 .8l.3 3-4 .4-.2-3 4-.4zM898.6 421a9.5 9.5 0 1119 .6 9.5 9.5 0 01-19-.6zm185.2 4.6l-.1 3h-4l.1-3h4zm-9-.3l-.1 3h-4l.1-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3h-4v-3h4zm-9-.3v3l-4-.1v-3l4 .1z"/>
+    <rect width="135.5" height="23.5" x="4.8" y="1.8" fill="#3D4251" stroke="#3D4251" stroke-width="3.5" rx="11.8" transform="translate(814 361)"/>
+    <rect width="638" height="56" fill="#FFF" rx="28" transform="translate(50 409)"/>
+    <rect width="638" height="56" fill="#FFF" rx="28" transform="translate(1099 412)"/>
+    <rect width="448" height="56" x="92" fill="#3D4251" rx="28" transform="translate(47 321)"/>
+    <rect width="421" height="56" x="107" fill="#3D4251" rx="28" transform="translate(1101 324)"/>
+    <rect width="520" height="56" x="56" fill="#FFF" rx="28" transform="translate(51 495)"/>
+    <rect width="386" height="56" x="123" fill="#FFF" rx="28" transform="translate(50 581)"/>
+    <rect width="333" height="56" x="150" fill="#FFF" rx="28" transform="translate(50 667)"/>
+    <rect width="385" height="56" x="128" fill="#FFF" rx="28" transform="translate(1099 498)"/>
+    <g fill-rule="nonzero">
+      <path fill="#E1E8ED" d="M416 67l-30-30a6.7 6.7 0 00-9.4 9.4l-12.8 12.8 30 30 12.8-12.8A6.7 6.7 0 00416 67z"/>
+      <path fill="#CCD6DD" d="M381.3 35h-50C324 35 318 41 318 48.3v93.4c0 7.3 6 13.3 13.3 13.3h73.4c7.3 0 13.3-6 13.3-13.3v-70h-30a7.2 7.2 0 01-6.7-6.7V35z"/>
+      <path fill="#99AAB5" d="M381.3 35h-6.6v30c0 7.4 6 13.3 13.3 13.3h30v-6.6h-30a7.2 7.2 0 01-6.7-6.7V35zm-16.6 29.3c0 2.2-1.5 4-3.4 4h-26.6c-1.9 0-3.4-1.8-3.4-4s1.5-4 3.4-4h26.6c1.9 0 3.4 1.8 3.4 4zm0 13.4c0 2.2-1.5 4-3.4 4h-26.6c-1.9 0-3.4-1.8-3.4-4s1.5-4 3.4-4h26.6c1.9 0 3.4 1.8 3.4 4zm40 13.3c0 2.2-1.5 4-3.4 4h-66.6c-1.9 0-3.4-1.8-3.4-4s1.5-4 3.4-4h66.6c1.9 0 3.4 1.8 3.4 4zm0 13.3c0 2.2-1.5 4-3.4 4h-66.6c-1.9 0-3.4-1.8-3.4-4s1.5-4 3.4-4h66.6c1.9 0 3.4 1.8 3.4 4zm0 13.4c0 2.2-1.5 4-3.4 4h-66.6c-1.9 0-3.4-1.8-3.4-4s1.5-4 3.4-4h66.6c1.9 0 3.4 1.8 3.4 4zm0 13.3c0 2.2-1.5 4-3.4 4h-66.6c-1.9 0-3.4-1.8-3.4-4s1.5-4 3.4-4h66.6c1.9 0 3.4 1.8 3.4 4z"/>
+    </g>
+    <rect width="121.5" height="23.5" x="4.8" y="1.8" fill="#3D4251" stroke="#3D4251" stroke-width="3.5" rx="11.8" transform="translate(303 141)"/>
+    <g fill-rule="nonzero">
+      <path fill="#662113" d="M298.4 787.9v55c0 9 4.9 11 4.9 11l51.4 40.5c8.1 6.3 6-7 6-7v-50.6l-62.3-49z"/>
+      <path fill="#C1694F" d="M422.9 787.9v55c0 9-4.7 11-4.7 11l-51.5 40.5c-8.1 6.3-6-7-6-7v-50.6l62.2-49z"/>
+      <path fill="#D99E82" d="M366.4 741.2a10.1 10.1 0 00-12.2 0L301 783.6c-3.4 2.7-3.4 7.1 0 9.8l53.4 42.9c3.3 2.7 8.8 2.7 12.2 0l53.8-43.3c3.3-2.7 3.3-7 0-9.8l-54-42z"/>
+      <path fill="#D99E82" d="M361.4 897.9c-2.8 0-5.2-2.2-5.2-4.8v-57.5c0-2.6 2.4-4.7 5.2-4.7 2.9 0 5.2 2.1 5.2 4.7v57.5c0 2.6-2.3 4.8-5.2 4.8z"/>
+      <path fill="#D0A800" d="M405.1 822.7c0 5 .5 7.3-4.4 10.9l-11 8.3c-5 3.6-6.8.5-6.8-4.4v-13.2c0-.8-.1-1.7-1.2-2.7-11.5-9.4-56-44.7-64.1-51.3l20.6-16.3c5.6 4.1 47.7 36.7 65.4 50.6.9.6 1.5 1.4 1.5 2.2v16z"/>
+      <path fill="#FFDA3C" d="M403.6 804.6a5735 5735 0 00-65.4-50.6l-7.7 6-12.9 10.3c8.1 6.6 52.6 41.9 64 51.3.6.5 1 1 1.2 1.7l21.9-17.6c-.3-.5-.7-.8-1.1-1.1z"/>
+      <path fill="#CFC59C" d="M405.1 806.8c0-.8-.6-1.5-1.5-2.2a5735 5735 0 00-65.4-50.6l-9.3 7.3c15.1 11.9 55.5 43 66 51.5a3 3 0 011.3 2.7V837l4.5-3.4c4.9-3.6 4.4-6 4.4-10.9v-15.9z"/>
+      <path fill="#FFF9DF" d="M403.6 804.6a5735 5735 0 00-65.4-50.6l-9.3 7.3c15.1 11.9 55.5 43 66 51.5.3 0 .4.2.5.3l9.3-7.4c-.3-.5-.7-.8-1.1-1.1z"/>
+    </g>
+    <rect width="179.5" height="47.5" x="1.8" y="1.8" fill="#F03969" stroke="#F03969" stroke-width="3.5" rx="23.8" transform="translate(278 243)"/>
+    <rect width="193.5" height="47.5" x="1.8" y="1.8" fill="#3AC" stroke="#3AC" stroke-width="3.5" rx="23.8" transform="translate(1320 246)"/>
+    <path fill="#F03969" fill-rule="nonzero" d="M371.8 211h7.2l-9.5 19-9.5-19h7.8v-28h4v28z"/>
+    <path fill="#3AC" fill-rule="nonzero" d="M1421.8 214h7.2l-9.5 19-9.5-19h7.8v-28h4v28z"/>
+    <g fill-rule="nonzero">
+      <path fill="#662113" d="M1356.4 61.9v55c0 9 4.9 11 4.9 11l51.4 40.5c8.1 6.3 6-7 6-7v-50.6l-62.3-49z"/>
+      <path fill="#C1694F" d="M1480.9 61.9v55c0 9-4.7 11-4.7 11l-51.5 40.5c-8.1 6.3-6-7-6-7v-50.6l62.2-49z"/>
+      <path fill="#D99E82" d="M1424.4 15.2a10.1 10.1 0 00-12.2 0L1359 57.6c-3.4 2.7-3.4 7.1 0 9.8l53.4 42.9c3.3 2.7 8.8 2.7 12.2 0l53.8-43.3c3.3-2.7 3.3-7 0-9.8l-54-42z"/>
+      <path fill="#D99E82" d="M1419.4 171.9c-2.8 0-5.2-2.2-5.2-4.8v-57.5c0-2.6 2.4-4.7 5.2-4.7 2.9 0 5.2 2.1 5.2 4.7v57.5c0 2.6-2.3 4.8-5.2 4.8z"/>
+      <path fill="#D0A800" d="M1463.1 96.7c0 5 .5 7.3-4.4 10.9l-11 8.3c-5 3.6-6.8.5-6.8-4.4V98.3c0-.8-.1-1.7-1.2-2.7-11.5-9.4-56-44.7-64.1-51.3l20.6-16.3c5.6 4.1 47.7 36.7 65.4 50.6.9.6 1.5 1.4 1.5 2.2v16z"/>
+      <path fill="#FFDA3C" d="M1461.6 78.6a5735 5735 0 00-65.4-50.6l-7.7 6-12.9 10.3c8.1 6.6 52.6 41.9 64 51.3.6.5 1 1 1.2 1.7l21.9-17.6c-.3-.5-.7-.8-1.1-1.1z"/>
+      <path fill="#CFC59C" d="M1463.1 80.8c0-.8-.6-1.5-1.5-2.2a5735 5735 0 00-65.4-50.6l-9.3 7.3c15.1 11.9 55.5 43 66 51.5a3 3 0 011.3 2.7V111l4.5-3.4c4.9-3.6 4.4-6 4.4-10.9V80.8z"/>
+      <path fill="#FFF9DF" d="M1461.6 78.6a5735 5735 0 00-65.4-50.6l-9.3 7.3c15.1 11.9 55.5 43 66 51.5.3 0 .4.2.5.3l9.3-7.4c-.3-.5-.7-.8-1.1-1.1z"/>
+    </g>
+    <g fill-rule="nonzero">
+      <path fill="#E1E8ED" d="M1445.6 64l-.7-20.4a3.2 3.2 0 00-6.4.3l-8.7.3.7 20.3 8.7-.3a3.2 3.2 0 006.4-.2z"/>
+      <path fill="#CCD6DD" d="M1444 41.4L1426.3 25a6.4 6.4 0 00-9 .4L1386.7 58a6.4 6.4 0 00.3 9l25.7 24a6.4 6.4 0 009-.2l23-24.6-10.5-9.8a3.4 3.4 0 01-.2-4.6l9.8-10.5z"/>
+      <path fill="#99AAB5" d="M1444 41.4l-2.4-2.2-9.8 10.6a6.4 6.4 0 00.3 9l10.5 9.8 2.2-2.3-10.5-9.8a3.4 3.4 0 01-.2-4.6l9.8-10.5zm-15.5 4.9c-.8.7-1.9.9-2.5.3l-9.4-8.8c-.6-.6-.6-1.7.2-2.5.7-.7 1.8-.9 2.4-.3l9.4 8.8c.7.6.6 1.7-.1 2.5zm-4.4 4.6c-.7.8-1.8 1-2.5.4l-9.3-8.8c-.7-.6-.6-1.7.1-2.5.7-.7 1.8-.9 2.5-.3l9.3 8.7c.7.6.6 1.8-.1 2.5zm9.7 17.8c-.7.8-1.9 1-2.5.3l-23.4-21.8c-.7-.6-.6-1.7.1-2.5.8-.8 1.9-1 2.5-.3l23.4 21.8c.7.6.6 1.7-.1 2.5zm-4.4 4.7c-.7.8-1.8 1-2.5.3L1403.5 52c-.6-.6-.6-1.7.2-2.5.7-.8 1.8-1 2.4-.3l23.5 21.8c.6.6.5 1.7-.2 2.5zM1425 78c-.7.7-1.8.9-2.4.3l-23.4-21.8c-.7-.6-.6-1.8.1-2.5.7-.8 1.8-1 2.5-.3l23.4 21.8c.6.6.6 1.7-.2 2.5zm-4.3 4.7c-.7.7-1.8.9-2.5.3l-23.4-21.9c-.6-.6-.6-1.7.1-2.5.8-.7 1.9-.9 2.5-.3l23.4 21.9c.7.6.6 1.7-.1 2.5z"/>
+    </g>
+    <rect width="58.3" height="11.3" x="2.3" y=".8" fill="#3D4251" stroke="#3D4251" stroke-width="1.7" rx="5.6" transform="rotate(43 623.6 1780.3)"/>
+    <g fill-rule="nonzero">
+      <path fill="#CCD6DD" d="M1484 704.4c-1.7-3.9-3.6-8-8-8h-4a8 8 0 008-8V620a8 8 0 00-8-8h-104a8 8 0 00-8 8v68.3a8 8 0 008 8h-4c-4.4 0-6.7 4-8 8l-8 19.7a8 8 0 008 8h128a8 8 0 008-8l-8-19.6z"/>
+      <path fill="#9AAAB4" d="M1348 724.3v.4l.7 5.2c1.3 5.3 4.2 8.8 7.3 8.8h128c4.3 0 7.8-6.4 8-14.4h-144z"/>
+      <path fill="#4CC0E6" d="M1472 684.3a4 4 0 01-4 4h-96a4 4 0 01-4-4v-60.2a4 4 0 014-4h96a4 4 0 014 4v60.2z"/>
+      <path fill="#AEBBC1" d="M1479.6 712.2l-3-8.7c-1-1.9-2.6-3.4-4.8-3.4h-103.4c-2.2 0-3.6 1.7-4.6 4.2l-2.7 7.9c-1 2.4 1.8 4 4 4h30.3s3.7-.2 4.4-2.4c.8-2.6 1.6-6.5 1.8-7.2.3-1 1.6-2.2 3.5-2.2h34.3c2 0 3 1 3.2 2.3.2.8 1.3 4.6 1.8 7.1.4 2.3 4.9 2.4 4.9 2.4h26.3c2.2 0 4.9-1.9 4-4z"/>
+      <path fill="#9AAAB4" d="M1437.5 722.8h-31.1c-1.5 0-2.2-1.6-2-3.2l1.6-9.4c0-.8 1-1.8 2-1.8h28.1c1.2 0 2 .8 2.2 2.3l1.3 9.3c0 1.5-.7 2.8-2.1 2.8z"/>
+      <g fill="#FFF">
+        <path d="M1390.2 651.3c-1.8-.2-2-2.8-4.4-2.6-1.2 0-2.3.5-2.3 1.6 0 1.6 2.5 1.8 4 2.2 2.6.8 5 1.3 5 4.1 0 3.6-2.7 4.8-6.3 4.8-3 0-6.2-1-6.2-3.9 0-.8.7-1.4 1.5-1.4.9 0 1.2.4 1.5 1 .8 1.3 1.6 2 3.5 2 1.3 0 2.6-.5 2.6-1.6 0-1.6-1.6-1.9-3.2-2.3-3-.8-5.4-1.2-5.7-4.4-.3-5.5 11.1-5.7 11.7-1 0 1-.8 1.5-1.7 1.5zm13.6-4.8c4 0 6.2 3.3 6.2 7.5 0 4.1-2.2 7.5-6.2 7.5-2.3 0-3.7-1-4.7-2.5v5.5c0 1.7-.5 2.5-1.7 2.5-1.5 0-1.8-1-1.8-2.5v-16c0-1.3.6-2 1.8-2 1 0 1.7.8 1.7 2v.5c1.1-1.4 2.4-2.5 4.7-2.5zm-1 12.1c2.4 0 3.4-2.1 3.4-4.7 0-2.4-1-4.7-3.4-4.7s-3.5 2-3.5 4.7c0 2.6 1.1 4.7 3.5 4.7zm10-8.2c0-2.9 3.3-4 6.5-4 4.5 0 6.3 1.4 6.3 5.8v4.2c0 1 .6 3 .6 3.6 0 .9-.8 1.4-1.7 1.4-1 0-1.7-1.1-2.2-2a8 8 0 01-5.4 2c-2.7 0-4.7-1.5-4.7-4 0-2.4 1.6-3.7 3.6-4.1l6.2-1.5c0-2-.7-2.8-2.7-2.8-1.8 0-2.7.5-3.4 1.6-.5.8-.5 1.3-1.5 1.3-.9 0-1.6-.6-1.6-1.5zm5.3 8.7c2.8 0 4-1.5 4-4.4v-.6l-4.6 1.1c-.9.2-1.7.8-1.7 1.8 0 1.2 1.1 2 2.3 2zm20.2-18.1c3.9 0 8 2.3 8 6 0 1-.7 1.8-1.6 1.8-1.2 0-1.4-.7-2-1.7-1-1.7-2-2.9-4.4-2.9-3.7 0-5.4 3.2-5.4 7.2s1.4 6.9 5.2 6.9c2.6 0 4-1.5 4.7-3.4.3-1 .8-1.8 2-1.8.8 0 1.7 1 1.7 1.9 0 3.9-4 6.6-8.1 6.6-4.5 0-7-1.9-8.4-5a12 12 0 01-1.1-5.3c-.1-6 3.4-10.3 9.4-10.3zm22.1 5.5c1 0 1.6.6 1.6 1.6l-.4 1.7-4.8 12.6c-1 2.7-1.9 4.6-5.5 4.6-1.7 0-3.2-.2-3.2-1.7 0-.8.7-1.3 1.6-1.3l.6.1.6.1c1.8 0 2-1.9 2.7-3.1l-4.6-11.5c-.3-.6-.4-1-.4-1.4 0-1 .7-1.7 1.8-1.7s1.6.9 2 2l3 9 3-8.4c.4-1.3.5-2.6 2-2.6z"/>
+      </g>
+    </g>
+    <g fill-rule="nonzero">
+      <path fill="#535D64" d="M829.4 429.2v-18.8h6.1v2.8h-3.1v13.2h3.1v2.8h-6.1zm9.8-13.2h2.9v1.9h.2c0-.7.3-1.2.8-1.6.5-.3 1.2-.5 2-.5 1.1 0 2 .4 2.6 1 .7.8 1 1.8 1 3.1v7.1h-3v-6.8c0-.6-.2-1-.5-1.3-.3-.3-.7-.5-1.2-.5-.6 0-1 .2-1.3.5-.3.3-.5.7-.5 1.3v6.8h-3v-11zm11.3-.7v-2.7h6.4v10.6c0 .3.1.6.3.8.2.2.5.3.8.3h3.3v2.7h-3.6c-1.2 0-2.1-.3-2.8-1-.7-.7-1-1.6-1-2.8v-7.9h-3.4zm12.8.7h2.9v1.9h.2c0-.7.4-1.2.9-1.5.5-.4 1.1-.6 1.9-.6a4 4 0 011.6.3l1.2.9.8 1.3c.2.6.2 1.1.2 1.8v2.7c0 .7 0 1.3-.2 1.8-.2.6-.5 1-.8 1.4a3.5 3.5 0 01-2.8 1.2c-.8 0-1.4-.2-2-.6-.4-.3-.7-.9-.8-1.5h-.2a18.1 18.1 0 010 1.9v3.4h-3V416zm3 4.2v2.6c0 .5.1 1 .5 1.3.3.3.8.5 1.4.5.6 0 1-.2 1.3-.5.4-.4.5-.8.5-1.4v-2.5c0-.6-.1-1-.5-1.3a2 2 0 00-1.3-.5 2 2 0 00-1.4.5c-.4.3-.5.8-.5 1.3zm10.2 9v-2.8h3.1v-13.2h-3.1v-2.8h6.1v18.8h-6.1zm-47.1 38v-18.8h6.1v2.8h-3.1v13.2h3.1v2.8h-6.1zm14.7-2c-.8 0-1.4-.1-2-.3-.7-.2-1.2-.5-1.7-.9-.4-.3-.7-.8-1-1.3-.2-.5-.3-1.1-.3-1.8v-2.8c0-.7 0-1.2.3-1.8.3-.5.6-1 1-1.3.5-.4 1-.7 1.6-.9a6.5 6.5 0 014 0c.5.2 1 .4 1.5.8a4.5 4.5 0 011.7 2.9h-3c-.1-.4-.3-.8-.7-1-.4-.3-.9-.4-1.4-.4-.6 0-1.1.1-1.5.5-.4.3-.5.7-.5 1.2v2.8c0 .6.1 1 .5 1.3.4.3.9.4 1.5.4.5 0 1-.1 1.4-.4.3-.3.6-.6.7-1h3a4.8 4.8 0 01-1.7 3l-1.5.7a6 6 0 01-1.9.3zm11.9 0c-.8 0-1.4-.1-2-.3-.7-.2-1.2-.5-1.7-1a4 4 0 01-1.3-3.1v-2.6c0-.7 0-1.2.3-1.8l1-1.4c.5-.4 1-.7 1.6-.9.7-.2 1.3-.3 2.1-.3s1.4.1 2 .3c.7.2 1.2.5 1.7 1 .4.3.7.8 1 1.3.2.6.3 1.1.3 1.8v2.6c0 .6 0 1.2-.3 1.8a4 4 0 01-1 1.4c-.5.4-1 .7-1.6.9-.7.2-1.3.3-2.1.3zm-2-4.4c0 .5.1 1 .5 1.3.4.3.9.5 1.5.5s1.1-.2 1.5-.5c.4-.3.5-.8.5-1.3v-2.6c0-.6-.1-1-.5-1.3-.4-.3-.9-.5-1.5-.5s-1.1.2-1.5.5c-.4.3-.5.7-.5 1.3v2.6zm8.8 4.2v-11h2.4v1.6h.2c0-.6.2-1 .5-1.3.4-.3.8-.5 1.3-.5.6 0 1 .2 1.2.5.3.3.4.7.5 1.3h.2c0-.6.2-1 .5-1.3.4-.3.8-.5 1.4-.5a2 2 0 011.6.8c.4.5.6 1.1.6 2v8.4h-2.6v-8c0-.8-.2-1.2-.8-1.2-.5 0-.8.4-.8 1.1v8.1h-2v-8c0-.8-.3-1.2-.9-1.2l-.6.3-.1.8v8.1h-2.6zm12.5-11h2.9v1.9h.2c0-.7.4-1.2.9-1.5.5-.4 1.1-.6 1.9-.6a4 4 0 011.6.3l1.2.9.8 1.3c.2.6.2 1.1.2 1.8v2.7c0 .7 0 1.3-.2 1.8-.2.6-.5 1-.8 
1.4a3.5 3.5 0 01-2.8 1.2c-.8 0-1.4-.2-2-.6-.4-.3-.7-.9-.8-1.5h-.2a18.1 18.1 0 010 1.9v3.4h-3V454zm3 4.2v2.6c0 .5.1 1 .5 1.3.3.3.8.5 1.4.5.6 0 1-.2 1.3-.5.4-.4.5-.8.5-1.4v-2.5c0-.6-.1-1-.5-1.3a2 2 0 00-1.3-.5 2 2 0 00-1.4.5c-.4.3-.5.8-.5 1.3zm13.7 7c-.8 0-1.4-.1-2-.3-.7-.2-1.2-.5-1.7-1a4 4 0 01-1.3-3.1v-2.6c0-.7 0-1.2.3-1.8l1-1.4c.5-.4 1-.7 1.6-.9.7-.2 1.3-.3 2.1-.3s1.4.1 2 .3c.7.2 1.2.5 1.7 1 .4.3.7.8 1 1.3.2.6.3 1.1.3 1.8v2.6c0 .6 0 1.2-.3 1.8a4 4 0 01-1 1.4c-.5.4-1 .7-1.6.9-.7.2-1.3.3-2.1.3zm-2-4.4c0 .5.1 1 .5 1.3.4.3.9.5 1.5.5s1.1-.2 1.5-.5c.4-.3.5-.8.5-1.3v-2.6c0-.6-.1-1-.5-1.3-.4-.3-.9-.5-1.5-.5s-1.1.2-1.5.5c-.4.3-.5.7-.5 1.3v2.6zm9.2-6.8h2.9v1.9h.2c0-.7.3-1.2.8-1.6.5-.3 1.2-.5 2-.5 1.1 0 2 .4 2.6 1 .7.8 1 1.8 1 3.1v7.1h-3v-6.8c0-.6-.2-1-.5-1.3-.3-.3-.7-.5-1.2-.5-.6 0-1 .2-1.3.5-.3.3-.5.7-.5 1.3v6.8h-3v-11zm11.8 4.2c0-.7 0-1.3.3-1.8.3-.6.6-1 1-1.4.5-.4 1-.7 1.6-.9.7-.2 1.3-.3 2.1-.3.7 0 1.4.1 2 .3.7.2 1.2.5 1.6 1 .5.3.8.8 1 1.3.3.6.4 1.1.4 1.8v2h-7v.6c0 .7.1 1.2.5 1.6.3.3.8.5 1.5.5.5 0 1 0 1.3-.2l.7-.7h2.9c-.1.5-.3 1-.6 1.3l-1.1 1c-.4.3-1 .5-1.5.7l-1.7.2c-.8 0-1.4-.1-2-.3-.7-.2-1.2-.5-1.7-1-.4-.3-.7-.8-1-1.3-.2-.5-.3-1.1-.3-1.8v-2.6zm3 .2h4.1v-.2c0-.7-.2-1.2-.6-1.6-.3-.3-.8-.5-1.5-.5a2 2 0 00-1.5.5 2 2 0 00-.6 1.6v.2zm9.2-4.4h2.9v1.9h.2c0-.7.3-1.2.8-1.6.5-.3 1.2-.5 2-.5 1.1 0 2 .4 2.6 1 .7.8 1 1.8 1 3.1v7.1h-3v-6.8c0-.6-.2-1-.5-1.3-.3-.3-.7-.5-1.2-.5-.6 0-1 .2-1.3.5-.3.3-.5.7-.5 1.3v6.8h-3v-11zm11.5 0h2.9v-3.4h3v3.4h4.1v2.7h-4.1v4.5c0 .3.1.6.3.8.2.2.5.3.8.3h2.8v2.7h-3.1c-1.2 0-2.1-.3-2.8-1-.7-.7-1-1.6-1-2.8v-4.5h-2.9V454zm15.2 8.1c0 .3.3.5.6.7.2.2.6.2 1 .2h1c.5 0 1 0 1.3-.3.3-.2.4-.4.4-.8 0-.7-.5-1-1.6-1.2l-1.5-.1a5.5 5.5 0 01-3-1 3 3 0 01-1-2.5c0-1 .4-1.9 1.1-2.4.8-.6 1.9-.9 3.3-.9h1c1.3 0 2.4.3 3.1.8a3 3 0 011.3 2.2h-3c-.1-.6-.6-.9-1.4-.9h-1c-1 0-1.5.4-1.5 1 0 .7.5 1 1.4 1.2l1.6.1c2.7.4 4.1 1.5 4.1 3.6 0 1-.4 2-1.2 2.5-.8.6-2 .9-3.4.9h-1c-1.4 0-2.5-.3-3.2-.8a3.1 3.1 0 01-1.4-2.3h3zm10.6 5.1v-2.8h3.1v-13.2h-3.1v-2.8h6.1v18.8h-6.1zm-131.1 
38v-18.8h6.1v2.8h-3.1v13.2h3.1v2.8h-6.1zm10.1-5h3.9v-5.5H840V492h6.2v8.3h3.1v2.7h-9.8v-2.7zm3.3-11.1c0-.4.1-.8.4-1 .3-.3.6-.4 1.1-.4h.6c.5 0 .8.1 1 .4.4.2.5.6.5 1s-.1.8-.4 1c-.3.3-.7.4-1.1.4h-.6c-.5 0-.8-.1-1-.4-.4-.2-.5-.6-.5-1zm8.4 2.9h2.9v1.9h.2c0-.7.3-1.2.8-1.6.5-.3 1.2-.5 2-.5 1.1 0 2 .4 2.6 1 .7.8 1 1.8 1 3.1v7.1h-3v-6.8c0-.6-.2-1-.5-1.3-.3-.3-.7-.5-1.2-.5-.6 0-1 .2-1.3.5-.3.3-.5.7-.5 1.3v6.8h-3v-11zm12.3 8.3h3.9v-5.6H864V492h6.2v8.3h3.1v2.7h-9.8v-2.7zm3.3-11.2c0-.4.1-.8.4-1 .3-.3.6-.4 1.1-.4h.6c.5 0 .8.1 1 .4.4.2.5.6.5 1s-.1.8-.4 1c-.3.3-.7.4-1.1.4h-.6c-.5 0-.8-.1-1-.4-.4-.2-.5-.6-.5-1zm7.9 2.9h2.9v-3.4h3v3.4h4.1v2.7h-4.1v4.5c0 .3.1.6.3.8.2.2.5.3.8.3h2.8v2.7h-3.1c-1.2 0-2.1-.3-2.8-1-.7-.7-1-1.6-1-2.8v-4.5h-2.9V492zm12.8 8.3h3.9v-5.6H888V492h6.2v8.3h3.1v2.7h-9.8v-2.7zm3.3-11.2c0-.4.1-.8.4-1 .3-.3.6-.4 1.1-.4h.6c.5 0 .8.1 1 .4.4.2.5.6.5 1s-.1.8-.4 1c-.3.3-.7.4-1.1.4h-.6c-.5 0-.8-.1-1-.4-.4-.2-.5-.6-.5-1zm8 10.9c0-1.1.5-2 1.3-2.6.8-.7 1.9-1 3.3-1h2.4v-1c0-.4-.2-.7-.5-1-.3-.2-.8-.3-1.3-.3s-1 0-1.3.3c-.3.2-.5.4-.6.7h-2.9a3.5 3.5 0 011.5-2.4c.4-.3 1-.5 1.5-.7l1.8-.2a7 7 0 012 .3c.5.1 1 .4 1.5.7l1 1.1.3 1.5v7.6h-2.9v-2.2h-.2c0 .7-.4 1.3-1 1.8a4 4 0 01-2.4.6c-1 0-1.9-.3-2.5-.9a3 3 0 01-1-2.3zm3-.5c0 .4.2.8.5 1 .3.3.7.4 1.3.4.6 0 1.2-.1 1.6-.4.4-.4.6-.7.6-1.2v-1.1h-2.4c-.5 0-.8.1-1.1.3-.3.3-.4.6-.4 1zm8.7-8.2v-2.7h6.4v10.6c0 .3.1.6.3.8.2.2.5.3.8.3h3.3v2.7h-3.6c-1.2 0-2.1-.3-2.8-1-.7-.7-1-1.6-1-2.8v-7.9h-3.4zm13 9h3.9v-5.6H924V492h6.2v8.3h3.1v2.7h-9.8v-2.7zm3.3-11.2c0-.4.1-.8.4-1 .3-.3.6-.4 1.1-.4h.6c.5 0 .8.1 1 .4.4.2.5.6.5 1s-.1.8-.4 1c-.3.3-.7.4-1.1.4h-.6c-.5 0-.8-.1-1-.4-.4-.2-.5-.6-.5-1zm8.5 11.3l5.5-5.9v-.1a5.2 5.2 0 01-1 .2h-4.3V492h9v2.6l-5.6 6a35.3 35.3 0 011.7-.2h4.1v2.6h-9.4v-2.6zm11.7-4.2c0-.7 0-1.3.3-1.8.3-.6.6-1 1-1.4.5-.4 1-.7 1.6-.9.7-.2 1.3-.3 2.1-.3.7 0 1.4.1 2 .3.7.2 1.2.5 1.6 1 .5.3.8.8 1 1.3.3.6.4 1.1.4 1.8v2h-7v.6c0 .7.1 1.2.5 1.6.3.3.8.5 1.5.5.5 0 1 0 1.3-.2l.7-.7h2.9c-.1.5-.3 1-.6 1.3l-1.1 1c-.4.3-1 .5-1.5.7l-1.7.2c-.8 
0-1.4-.1-2-.3-.7-.2-1.2-.5-1.7-1-.4-.3-.7-.8-1-1.3-.2-.5-.3-1.1-.3-1.8v-2.6zm3 .2h4.1v-.2c0-.7-.2-1.2-.6-1.6-.3-.3-.8-.5-1.5-.5a2 2 0 00-1.5.5 2 2 0 00-.6 1.6v.2zm10.5 8.8v-2.8h3.1v-13.2h-3.1v-2.8h6.1v18.8h-6.1zm-131.1 38v-18.8h6.1v2.8h-3.1v13.2h3.1v2.8h-6.1zm9.3-13.2h2.9v-3.4h3v3.4h4.1v2.7h-4.1v4.5c0 .3.1.6.3.8.2.2.5.3.8.3h2.8v2.7h-3.1c-1.2 0-2.1-.3-2.8-1-.7-.7-1-1.6-1-2.8v-4.5h-2.9V530zm15.8 0v1.9h.2c0-.7.4-1.2 1-1.6.4-.3 1.1-.5 2-.5 1.2 0 2.1.4 2.8 1.1.7.8 1 1.8 1 3.1v.9h-3.2v-.7c0-.6-.2-1-.5-1.4a2 2 0 00-1.4-.5c-.6 0-1 .2-1.3.5a2 2 0 00-.5 1.4v6.8h-3v-11h2.9zm8.4 8c0-1.1.4-2 1.2-2.6.8-.7 1.9-1 3.3-1h2.4v-1c0-.4-.2-.7-.5-1-.3-.2-.8-.3-1.3-.3s-1 0-1.3.3c-.3.2-.5.4-.6.7h-2.9a3.5 3.5 0 011.5-2.4c.4-.3 1-.5 1.5-.7l1.8-.2a7 7 0 012 .3c.5.1 1 .4 1.5.7l1 1.1.3 1.5v7.6h-2.9v-2.2h-.2c0 .7-.4 1.3-1 1.8a4 4 0 01-2.4.6c-1 0-1.9-.3-2.5-.9a3 3 0 01-1-2.3zm3-.5c0 .4.1.8.4 1 .3.3.7.4 1.3.4.6 0 1.2-.1 1.6-.4.4-.4.6-.7.6-1.2v-1.1h-2.4c-.5 0-.8.1-1.1.3-.3.3-.4.6-.4 1zm9.6.8h3.9v-5.6H876V530h6.2v8.3h3.1v2.7h-9.8v-2.7zm3.3-11.2c0-.4.1-.8.4-1 .3-.3.6-.4 1.1-.4h.6c.5 0 .8.1 1 .4.4.2.5.6.5 1s-.1.8-.4 1c-.3.3-.7.4-1.1.4h-.6c-.5 0-.8-.1-1-.4-.4-.2-.5-.6-.5-1zm8.4 2.9h2.9v1.9h.2c0-.7.3-1.2.8-1.6.5-.3 1.2-.5 2-.5 1.1 0 2 .4 2.6 1 .7.8 1 1.8 1 3.1v7.1h-3v-6.8c0-.6-.2-1-.5-1.3-.3-.3-.7-.5-1.2-.5-.6 0-1 .2-1.3.5-.3.3-.5.7-.5 1.3v6.8h-3v-11zm12.3 8.3h3.9v-5.6H900V530h6.2v8.3h3.1v2.7h-9.8v-2.7zm3.3-11.2c0-.4.1-.8.4-1 .3-.3.6-.4 1.1-.4h.6c.5 0 .8.1 1 .4.4.2.5.6.5 1s-.1.8-.4 1c-.3.3-.7.4-1.1.4h-.6c-.5 0-.8-.1-1-.4-.4-.2-.5-.6-.5-1zm8.4 2.9h2.9v1.9h.2c0-.7.3-1.2.8-1.6.5-.3 1.2-.5 2-.5 1.1 0 2 .4 2.6 1 .7.8 1 1.8 1 3.1v7.1h-3v-6.8c0-.6-.2-1-.5-1.3-.3-.3-.7-.5-1.2-.5-.6 0-1 .2-1.3.5-.3.3-.5.7-.5 1.3v6.8h-3v-11zm11.8 4.1c0-.6 0-1.2.2-1.8l.8-1.3c.3-.4.7-.7 1.2-.9.5-.2 1-.3 1.6-.3.8 0 1.5.2 2 .6.5.4.8 1 .9 1.6h.1v-2h3v10.4c0 .6-.2 1.2-.4 1.6-.2.5-.5 1-1 1.3-.4.3-.9.6-1.5.8-.5.2-1.2.3-1.9.3h-3.1V542h3.1c.5 0 1-.1 1.3-.4.3-.3.4-.7.4-1.2v-.1l.1-2.2h-.1c0 .7-.4 1.2-.9 1.6-.5.4-1.2.6-2 .6a3.5 3.5 0 
01-2.8-1.2l-.8-1.3c-.2-.6-.2-1.2-.2-1.8v-1.9zm3 1.8c0 .6.1 1 .5 1.3.3.3.8.5 1.3.5.6 0 1-.2 1.4-.5.4-.3.5-.8.5-1.3v-1.7c0-.5-.1-1-.5-1.3a2 2 0 00-1.4-.5 2 2 0 00-1.3.5c-.4.3-.5.7-.5 1.3v1.7zm10.5 7.3v-2.8h3.1v-13.2h-3.1v-2.8h6.1v18.8h-6.1z"/>
+      <path fill="#FFF" d="M836.4 381.2a6 6 0 01-2-.3l-1.5-.9c-.4-.3-.7-.8-1-1.3-.1-.5-.3-1.1-.3-1.8v-2.8c0-.7.2-1.2.4-1.8.2-.5.5-1 1-1.3.4-.4.9-.7 1.4-.9a5.9 5.9 0 013.8 0 4.4 4.4 0 013 3.6h-2.5c-.1-.5-.3-1-.8-1.3-.4-.3-.9-.4-1.5-.4-.7 0-1.3.2-1.7.5a2 2 0 00-.6 1.6v2.8c0 .7.2 1.2.6 1.6.4.3 1 .5 1.7.5.6 0 1.1-.1 1.5-.4.4-.4.7-.8.8-1.3h2.5c-.1.6-.3 1.2-.6 1.6a4.4 4.4 0 01-2.4 2l-1.8.3zm11.3 0c-.7 0-1.4-.1-2-.3-.5-.2-1-.5-1.5-1a4 4 0 01-1-1.3c-.1-.5-.3-1.1-.3-1.8v-2.7c0-.6.2-1.2.4-1.7a4 4 0 012.5-2.3 6 6 0 012-.3 6 6 0 011.9.3c.6.2 1 .5 1.5.9.4.4.7.8 1 1.4.2.5.3 1 .3 1.7v2.7c0 .7-.1 1.3-.3 1.8a3.9 3.9 0 01-2.5 2.3 6 6 0 01-2 .3zm-2.3-4.4c0 .7.2 1.2.7 1.6.4.4 1 .6 1.6.6.7 0 1.3-.2 1.7-.6.4-.4.6-.9.6-1.6v-2.7a2 2 0 00-.6-1.5c-.4-.4-1-.6-1.7-.6s-1.2.2-1.6.6a2 2 0 00-.7 1.5v2.7zm9.2-6.8h2.4v1.9h.1c.2-.7.5-1.2 1-1.6.5-.3 1.2-.5 2-.5 1.1 0 2 .4 2.6 1.1.7.7 1 1.7 1 3v7.1h-2.5v-6.9c0-.7-.2-1.2-.6-1.6a2 2 0 00-1.4-.5 2 2 0 00-1.5.6c-.4.3-.6.9-.6 1.6v6.8h-2.5v-11zm11 3.5v-2.3h3.1v-1.5a3 3 0 011-2.3c.6-.5 1.4-.8 2.5-.8h3.2v2.2h-3.1a1 1 0 00-.8.3 1 1 0 00-.3.7v1.4h4.2v2.3h-4.2v7.5h-2.5v-7.5h-3zm12.1 5.2h4v-6.4h-3.5V370h5.9v8.7h3.1v2.3h-9.5v-2.3zM881 367c0-.4.2-.7.4-1 .3-.2.6-.3 1-.3h.6c.4 0 .7.1 1 .4.3.2.4.5.4 1 0 .3-.1.6-.4.9-.3.2-.6.3-1 .3h-.5c-.5 0-.8 0-1-.3-.3-.3-.5-.6-.5-1zm7.8 7c0-.6 0-1.2.2-1.7l.8-1.3c.3-.4.7-.7 1.2-.9a4 4 0 011.6-.3c.8 0 1.5.2 2 .6.5.4.9 1 1 1.6h.1v-2h2.4v10.4c0 .6 0 1.1-.3 1.6-.2.5-.5 1-.9 1.3l-1.4.8c-.6.2-1.2.3-2 .3h-2.9v-2h3c.6 0 1.1-.3 1.5-.6.4-.4.5-.8.5-1.4v-.3l.1-2h-.1c-.1.6-.5 1.1-1 1.5-.6.4-1.2.6-2 .6-.6 0-1.1 0-1.6-.3a3.5 3.5 0 01-2-2.2c-.2-.5-.2-1.1-.2-1.7v-2zm2.5 1.9c0 .7.2 1.2.6 1.6.3.4.9.5 1.6.5.6 0 1.2-.1 1.6-.5.3-.4.5-1 .5-1.6v-1.8c0-.6-.2-1.2-.5-1.6-.4-.3-1-.5-1.6-.5-.7 0-1.3.2-1.6.5-.4.4-.6 1-.6 1.6v1.8zm11.8 3.3c0-.6.2-1 .5-1.4a2 2 0 011.4-.5c.6 0 1.1.2 1.5.5.3.4.5.8.5 1.4 0 .6-.2 1.1-.5 1.5-.4.3-.9.5-1.5.5-.5 0-1-.2-1.4-.5a2 2 0 01-.5-1.5zm13.5 2a6 6 0 
01-2-.3l-1.5-.9c-.4-.3-.7-.8-1-1.3-.1-.5-.3-1.1-.3-1.8v-2.8c0-.7.2-1.2.4-1.8.2-.5.5-1 1-1.3.4-.4.9-.7 1.4-.9a5.9 5.9 0 013.8 0 4.4 4.4 0 013 3.6h-2.5c0-.5-.3-1-.8-1.3-.4-.3-.9-.4-1.5-.4-.7 0-1.3.2-1.7.5a2 2 0 00-.6 1.6v2.8c0 .7.2 1.2.6 1.6.4.3 1 .5 1.7.5.6 0 1.1-.1 1.5-.4.5-.4.7-.8.8-1.3h2.5c0 .6-.3 1.2-.5 1.6a4.4 4.4 0 01-2.5 2l-1.8.3zm6.4-7.7v-2.3h3v-1.5a3 3 0 011-2.3c.6-.5 1.5-.8 2.5-.8h3.3v2.2h-3.2a1 1 0 00-.8.3 1 1 0 00-.2.7v1.4h4.2v2.3h-4.2v7.5h-2.6v-7.5h-3zm11.5.6c0-.7.1-1.3.3-1.8.2-.5.4-1 .8-1.3.3-.4.7-.7 1.2-.9a4 4 0 011.5-.3c.9 0 1.5.2 2 .6.6.4 1 1 1 1.6h.2v-2h2.4v10.4c0 .6 0 1.1-.3 1.6-.2.5-.5 1-1 1.3-.3.3-.8.6-1.4.8-.5.2-1.2.3-1.9.3h-2.9v-2h3c.6 0 1-.3 1.5-.6.3-.4.5-.8.5-1.4v-.3l.1-2h-.1c-.2.6-.5 1.1-1 1.5-.6.4-1.2.6-2 .6-.6 0-1.2 0-1.6-.3a3.5 3.5 0 01-2-2.2c-.2-.5-.3-1.1-.3-1.7v-2zm2.6 1.8c0 .7.2 1.2.5 1.6.4.4 1 .5 1.6.5.7 0 1.2-.1 1.6-.5.4-.4.6-1 .6-1.6v-1.8c0-.6-.2-1.2-.6-1.6-.4-.3-.9-.5-1.6-.5-.6 0-1.2.2-1.6.5-.3.4-.5 1-.5 1.6v1.8z"/>
+      <path fill="#3D4251" d="M85.4 430.6h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V446h-3.5v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-15.4zm15.7-1.7v-3.2h8.4V441c0 .6.1 1 .5 1.4.3.3.7.4 1.3.4h4.6v3.2H111a5 5 0 01-3.7-1.3 4.9 4.9 0 01-1.3-3.7V429h-5zm18 1.7h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.7 1.2a7.7 7.7 0 011.4 4.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8H119v-20.2zm3.5 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm42.6-4.7v3.2h-12.4v-3.2h12.4zm-12.4 6.9h12.4v3.2h-12.4v-3.2zm37.7-12.8v17h8.8v3.2H187v-20.2h3.5zm12 15.9c0-.8 0-1.5.3-2 .3-.7.7-1.2 1.3-1.6.5-.5 1.1-.8 1.8-1 .8-.3 1.6-.4 2.5-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2H203a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V446h-3.3v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.4-.4c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V439h-3.8c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm14-10.7h3.2v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V446H229v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-15.4zm16.3 5.7c0-1 .1-1.7.4-2.5l1-1.9a5.5 5.5 0 013.9-1.6c1.2 0 2.2.3 2.9.8.7.6 1.2 1.3 1.4 2.3h.2v-2.8h3.4v14.5c0 .9-.2 1.6-.5 2.3-.3.7-.7 1.3-1.3 1.8a6 6 0 01-2 1.2c-.8.2-1.7.4-2.7.4h-4v-3h4a3 3 0 002.2-.7c.5-.5.8-1.2.8-2v-3.3h-.1c-.2 1-.7 1.7-1.4 2.3-.8.5-1.7.8-2.9.8a5 5 0 01-5-3.5c-.2-.7-.3-1.6-.3-2.5v-2.6zm3.6 2.6c0 .9.2 1.6.8 2.2a3 3 0 002.2.8 3 3 0 002.2-.8 3 3 0 00.9-2.2v-2.5a3 3 0 00-.9-2.2 3 3 0 00-2.2-.9 3 3 0 00-2.2.9 3 3 0 00-.8 2.2v2.5zm17.2-8.3v9.6c0 2 1 3 2.8 3 1.9 0 2.8-1 2.8-3v-9.6h3.5v9.6c0 
2-.6 3.4-1.7 4.5a6.4 6.4 0 01-4.6 1.6c-2 0-3.5-.6-4.6-1.6a5.9 5.9 0 01-1.7-4.5v-9.6h3.5zm12.6 11c0-.7.2-1.4.4-2a5.6 5.6 0 013-2.5c.9-.3 1.7-.4 2.6-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2h-3.4a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V446h-3.3v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.5-.3c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V439h-3.8c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm13.5-5c0-1 .1-1.7.4-2.5l1-1.9a5.5 5.5 0 013.9-1.6c1.2 0 2.2.3 2.9.8.7.6 1.2 1.3 1.4 2.3h.2v-2.8h3.4v14.5c0 .9-.2 1.6-.5 2.3-.3.7-.7 1.3-1.3 1.8a6 6 0 01-2 1.2c-.8.2-1.7.4-2.7.4h-4v-3h4a3 3 0 002.2-.7c.5-.5.8-1.2.8-2v-3.3h-.1c-.2 1-.7 1.7-1.4 2.3-.8.5-1.7.8-2.9.8a5 5 0 01-5-3.5c-.2-.7-.3-1.6-.3-2.5v-2.6zm3.6 2.6c0 .9.2 1.6.8 2.2a3 3 0 002.2.8 3 3 0 002.2-.8 3 3 0 00.9-2.2v-2.5a3 3 0 00-.9-2.2 3 3 0 00-2.2-.9 3 3 0 00-2.2.9 3 3 0 00-.8 2.2v2.5zm13.3-2.6c0-.9.2-1.7.5-2.5.3-.7.7-1.3 1.3-1.9a6 6 0 012.1-1.2c.8-.2 1.8-.4 2.8-.4 1 0 2 .2 2.7.5.8.2 1.5.6 2.1 1.2.6.5 1 1.1 1.4 1.9.3.7.4 1.6.4 2.5v2.8H307v1c0 1 .3 1.8.8 2.4.6.6 1.4.9 2.4.9.7 0 1.4-.1 2-.4.5-.2.9-.6 1-1h3.5c-.1.6-.4 1.2-.8 1.7s-.9 1-1.5 1.3a7 7 0 01-1.9.9 8.7 8.7 0 01-5-.1c-.9-.3-1.6-.7-2.2-1.3-.6-.5-1-1.1-1.3-1.9-.3-.7-.5-1.6-.5-2.5v-3.9zm3.5.5h6.4v-.4c0-1-.3-1.9-.8-2.4-.6-.6-1.4-.9-2.4-.9s-1.8.3-2.4.9c-.5.5-.8 1.3-.8 2.3v.5zm17.3 6.7c0-.8.2-1.5.7-2 .5-.5 1.2-.7 2-.7s1.5.2 2 .7c.5.5.7 1.2.7 2 0 .9-.2 1.5-.7 2-.5.5-1.2.8-2 .8s-1.5-.3-2-.8-.7-1.1-.7-2z"/>
+      <path fill="#F03969" d="M337 435.5v-3.2h4.1v-2a4 4 0 011.4-3.3c.9-.8 2-1.2 3.5-1.2h4.5v3.1h-4.4c-.4 0-.8.1-1 .4-.3.3-.4.6-.4 1v2h5.8v3.2h-5.8V446H341v-10.5H337zm21.2-4.9v2.7h.2c.2-1 .7-1.7 1.4-2.2a5 5 0 012.9-.8 5 5 0 013.9 1.6 6 6 0 011.4 4.2v1.1h-3.7v-.8c0-1-.3-1.7-.8-2.3a3 3 0 00-2.3-.8c-.9 0-1.6.3-2 .8-.6.6-.8 1.4-.8 2.3v9.6h-3.5v-15.4h3.3zm19.2 15.7c-1 0-2-.2-2.8-.5a5.5 5.5 0 01-3.5-3.2c-.2-.7-.4-1.5-.4-2.4v-3.8c0-1 .2-1.7.5-2.5a5.5 5.5 0 013.4-3.1 8.3 8.3 0 015.5 0 5.5 5.5 0 013.5 3.2c.3.7.5 1.5.5 2.4v3.8c0 .9-.2 1.7-.5 2.5a5.5 5.5 0 01-3.4 3.1c-.8.3-1.8.5-2.8.5zm-3.2-6.1c0 1 .3 1.6.9 2.2.5.5 1.3.8 2.3.8 1 0 1.8-.3 2.3-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2c-.5-.6-1.3-.8-2.3-.8-1 0-1.8.2-2.3.8a3 3 0 00-.9 2.2v3.8zm13 5.8v-15.4h2.8v2.2h.3c0-.7.3-1.4.7-1.8.5-.5 1-.7 1.8-.7.7 0 1.3.2 1.7.7.4.4.7 1 .8 1.8h.2c.1-.7.4-1.4.8-1.8.4-.5 1-.7 1.8-.7 1 0 1.7.4 2.3 1 .6.8.9 1.7.9 3V446H398v-11.5c0-1.1-.4-1.7-1.3-1.7-.4 0-.8.1-1 .5-.2.3-.3.7-.3 1.3V446H393v-11.5c0-.5-.2-1-.4-1.3-.3-.3-.6-.4-1-.4-.5 0-.8.1-1 .5-.2.3-.3.7-.3 1.3V446h-3.1zm17 .7h13.5v3.2h-13.4v-3.2zm23.8-.4c-1 0-2-.2-2.8-.4a6 6 0 01-2-1.2c-.7-.6-1.1-1.2-1.4-2-.3-.7-.5-1.5-.5-2.4v-4c0-.9.2-1.7.5-2.4.3-.8.7-1.4 1.3-2a8.2 8.2 0 017.3-1.3 6.2 6.2 0 014.3 5.2h-3.5c-.1-.8-.5-1.3-1-1.8-.7-.4-1.4-.6-2.2-.6-1 0-1.8.2-2.4.8-.5.5-.8 1.2-.8 2.1v4c0 .9.3 1.6.8 2.1.6.6 1.4.8 2.4.8.8 0 1.5-.2 2.1-.6.6-.5 1-1 1-1.8h3.6c-.1.9-.4 1.6-.8 2.3a6.1 6.1 0 01-3.5 2.8c-.7.3-1.5.4-2.4.4zm16.6 0c-1 0-2-.2-2.8-.5a5.5 5.5 0 01-3.5-3.2c-.2-.7-.4-1.5-.4-2.4v-3.8c0-1 .2-1.7.5-2.5a5.5 5.5 0 013.4-3.1 8.3 8.3 0 015.5 0 5.5 5.5 0 013.5 3.2c.3.7.5 1.5.5 2.4v3.8c0 .9-.2 1.7-.5 2.5a5.5 5.5 0 01-3.4 3.1c-.8.3-1.8.5-2.8.5zm-3.2-6.1c0 1 .3 1.6.9 2.2.5.5 1.3.8 2.3.8 1 0 1.8-.3 2.3-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2c-.5-.6-1.3-.8-2.3-.8-1 0-1.8.2-2.3.8a3 3 0 00-.9 2.2v3.8zm13.6-9.6h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V446h-3.5v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 
1.3-.8 2.2v9.6H455v-15.4zm16.3 4.9v-3.2h4.2v-2a4 4 0 011.4-3.3c.9-.8 2-1.2 3.5-1.2h4.5v3.1h-4.4c-.4 0-.8.1-1 .4-.3.3-.4.6-.4 1v2h5.8v3.2h-5.8V446h-3.6v-10.5h-4.2zm17.6 7.3h5.5v-9h-4.8v-3.2h8.2v12.2h4.5v3.2h-13.4v-3.2zm4.5-16.4c0-.5.2-1 .6-1.3a2 2 0 011.4-.5h.7c.6 0 1.1.2 1.5.5.4.3.5.8.5 1.3 0 .6-.1 1-.5 1.4a2 2 0 01-1.5.5h-.7a2 2 0 01-1.4-.5c-.4-.4-.6-.8-.6-1.4zm11.6 9.9c0-1 .1-1.7.4-2.5l1-1.9a5.5 5.5 0 013.9-1.6c1.2 0 2.2.3 2.9.8.7.6 1.2 1.3 1.4 2.3h.2v-2.8h3.4v14.5c0 .9-.2 1.6-.5 2.3-.3.7-.7 1.3-1.3 1.8a6 6 0 01-2 1.2c-.8.2-1.7.4-2.7.4h-4v-3h4a3 3 0 002.2-.7c.5-.5.8-1.2.8-2v-3.3h-.1c-.2 1-.7 1.7-1.4 2.3-.8.5-1.7.8-2.9.8a5 5 0 01-5-3.5c-.2-.7-.3-1.6-.3-2.5v-2.6zm3.6 2.6c0 .9.2 1.6.8 2.2a3 3 0 002.2.8 3 3 0 002.2-.8 3 3 0 00.9-2.2v-2.5a3 3 0 00-.9-2.2 3 3 0 00-2.2-.9 3 3 0 00-2.2.9 3 3 0 00-.8 2.2v2.5z"/>
+      <path fill="#3D4251" d="M524.9 433.3c0-1.4.2-2.6.6-3.8a10.3 10.3 0 014.6-5.6c1-.7 2.3-1.2 3.7-1.6v3.5c-.9.2-1.6.5-2.3 1-.6.4-1.2 1-1.7 1.6a9.2 9.2 0 00-1.4 4.9v5.3a8 8 0 001.4 4.8c1 1.3 2.3 2.2 4 2.6v3.5c-2.9-.7-5-2-6.6-4a10.6 10.6 0 01-2.3-6.9v-5.3zm20.7 13c-1 0-2-.2-2.8-.4a6 6 0 01-2-1.2c-.7-.6-1.1-1.2-1.4-2-.3-.7-.5-1.5-.5-2.4v-4c0-.9.2-1.7.5-2.4.3-.8.7-1.4 1.3-2a8.2 8.2 0 017.3-1.3 6.2 6.2 0 014.3 5.2h-3.5c-.1-.8-.5-1.3-1-1.8-.7-.4-1.4-.6-2.2-.6-1 0-1.8.2-2.4.8-.5.5-.8 1.2-.8 2.1v4c0 .9.3 1.6.8 2.1.6.6 1.4.8 2.4.8.8 0 1.5-.2 2.1-.6.6-.5 1-1 1-1.8h3.6c-.1.9-.4 1.6-.8 2.3a6.1 6.1 0 01-3.5 2.8c-.7.3-1.5.4-2.4.4zm16.6 0c-1 0-2-.2-2.8-.5a5.5 5.5 0 01-3.5-3.2c-.2-.7-.4-1.5-.4-2.4v-3.8c0-1 .2-1.7.5-2.5a5.5 5.5 0 013.4-3.1 8.3 8.3 0 015.5 0 5.5 5.5 0 013.5 3.2c.3.7.5 1.5.5 2.4v3.8c0 .9-.2 1.7-.5 2.5a5.5 5.5 0 01-3.4 3.1c-.8.3-1.8.5-2.8.5zm-3.2-6.1c0 1 .3 1.6.9 2.2.5.5 1.3.8 2.3.8 1 0 1.8-.3 2.3-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2c-.5-.6-1.3-.8-2.3-.8-1 0-1.8.2-2.3.8a3 3 0 00-.9 2.2v3.8zm13.6-9.6h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V446h-3.5v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-15.4zm16.3 4.9v-3.2h4.2v-2a4 4 0 011.4-3.3c.9-.8 2-1.2 3.5-1.2h4.5v3.1h-4.4c-.4 0-.8.1-1 .4-.3.3-.4.6-.4 1v2h5.8v3.2h-5.8V446H593v-10.5H589zm17.6 7.3h5.5v-9h-4.8v-3.2h8.2v12.2h4.5v3.2h-13.4v-3.2zm4.5-16.4c0-.5.2-1 .6-1.3a2 2 0 011.4-.5h.7c.6 0 1.1.2 1.5.5.4.3.5.8.5 1.3 0 .6-.1 1-.5 1.4a2 2 0 01-1.5.5h-.7a2 2 0 01-1.4-.5c-.4-.4-.6-.8-.6-1.4zm11.6 9.9c0-1 .1-1.7.4-2.5l1-1.9a5.5 5.5 0 013.9-1.6c1.2 0 2.2.3 2.9.8.7.6 1.2 1.3 1.4 2.3h.2v-2.8h3.4v14.5c0 .9-.2 1.6-.5 2.3-.3.7-.7 1.3-1.3 1.8a6 6 0 01-2 1.2c-.8.2-1.7.4-2.7.4h-4v-3h4a3 3 0 002.2-.7c.5-.5.8-1.2.8-2v-3.3h-.1c-.2 1-.7 1.7-1.4 2.3-.8.5-1.7.8-2.9.8a5 5 0 01-5-3.5c-.2-.7-.3-1.6-.3-2.5v-2.6zm3.6 2.6c0 .9.2 1.6.8 2.2a3 3 0 002.2.8 3 3 0 002.2-.8 3 3 0 00.9-2.2v-2.5a3 3 0 00-.9-2.2 3 3 0 00-2.2-.9 3 3 0 00-2.2.9 3 3 0 00-.8 
2.2v2.5zm14.8-16.6c1.4.4 2.6 1 3.7 1.6a11 11 0 014.6 5.6c.4 1.2.6 2.4.6 3.8v5.3c0 2.8-.7 5-2.3 7a12.2 12.2 0 01-6.6 3.9V446c.9-.2 1.6-.6 2.3-1a6.6 6.6 0 002.7-3.8c.3-.8.4-1.7.4-2.6v-5.3c0-1-.1-1.9-.4-2.7a6.5 6.5 0 00-5-4.8v-3.5zM1134.4 433.6h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V449h-3.5v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-15.4zm15.7-1.7v-3.2h8.4V444c0 .6.1 1 .5 1.4.3.3.7.4 1.3.4h4.6v3.2h-4.9a5 5 0 01-3.7-1.3 4.9 4.9 0 01-1.3-3.7V432h-5zm18 1.7h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.6 1.2.5.6.9 1.2 1.1 2 .3.7.4 1.5.4 2.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8h-3.5v-20.2zm3.5 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm42.6-4.7v3.2h-12.4v-3.2h12.4zm-12.4 6.9h12.4v3.2h-12.4v-3.2zm37.7-12.8v17h8.8v3.2H1236v-20.2h3.5zm12 15.9c0-.8 0-1.5.3-2 .3-.7.7-1.2 1.3-1.6.5-.5 1.1-.8 1.8-1 .8-.3 1.6-.4 2.5-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2h-3.4a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V449h-3.3v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.4-.4c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V442h-3.8c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm14-10.7h3.2v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V449h-3.5v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-15.4zm16.3 5.7c0-1 .1-1.7.4-2.5l1-1.9a5.5 5.5 0 013.9-1.6c1.2 0 2.2.3 2.9.8.7.6 1.2 1.3 1.4 2.3h.2v-2.8h3.4v14.5c0 .9-.2 1.6-.5 2.3-.3.7-.7 1.3-1.3 1.8a6 6 0 01-2 1.2c-.8.2-1.7.4-2.7.4h-4v-3h4a3 3 0 002.2-.7c.5-.5.8-1.2.8-2v-3.3h-.1c-.2 1-.7 1.7-1.4 2.3-.8.5-1.7.8-2.9.8a5 5 0 
01-5-3.5c-.2-.7-.3-1.6-.3-2.5v-2.6zm3.6 2.6c0 .9.2 1.6.8 2.2a3 3 0 002.2.8 3 3 0 002.2-.8 3 3 0 00.9-2.2v-2.5a3 3 0 00-.9-2.2 3 3 0 00-2.2-.9 3 3 0 00-2.2.9 3 3 0 00-.8 2.2v2.5zm17.2-8.3v9.6c0 2 1 3 2.8 3 1.9 0 2.8-1 2.8-3v-9.6h3.5v9.6c0 2-.6 3.4-1.7 4.5a6.4 6.4 0 01-4.6 1.6c-2 0-3.5-.6-4.6-1.6a5.9 5.9 0 01-1.7-4.5v-9.6h3.5zm12.6 11c0-.7.2-1.4.4-2a5.6 5.6 0 013-2.5c.9-.3 1.7-.4 2.6-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2h-3.4a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V449h-3.3v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.5-.3c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V442h-3.8c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm13.5-5c0-1 .1-1.7.4-2.5l1-1.9a5.5 5.5 0 013.9-1.6c1.2 0 2.2.3 2.9.8.7.6 1.2 1.3 1.4 2.3h.2v-2.8h3.4v14.5c0 .9-.2 1.6-.5 2.3-.3.7-.7 1.3-1.3 1.8a6 6 0 01-2 1.2c-.8.2-1.7.4-2.7.4h-4v-3h4a3 3 0 002.2-.7c.5-.5.8-1.2.8-2v-3.3h-.1c-.2 1-.7 1.7-1.4 2.3-.8.5-1.7.8-2.9.8a5 5 0 01-5-3.5c-.2-.7-.3-1.6-.3-2.5v-2.6zm3.6 2.6c0 .9.2 1.6.8 2.2a3 3 0 002.2.8 3 3 0 002.2-.8 3 3 0 00.9-2.2v-2.5a3 3 0 00-.9-2.2 3 3 0 00-2.2-.9 3 3 0 00-2.2.9 3 3 0 00-.8 2.2v2.5zm13.3-2.6c0-.9.2-1.7.5-2.5.3-.7.7-1.3 1.3-1.9a6 6 0 012.1-1.2c.8-.2 1.8-.4 2.8-.4 1 0 2 .2 2.7.5.8.2 1.5.6 2.1 1.2.6.5 1 1.1 1.4 1.9.3.7.4 1.6.4 2.5v2.8h-9.8v1c0 1 .3 1.8.8 2.4.6.6 1.4.9 2.4.9.7 0 1.4-.1 2-.4.5-.2.9-.6 1-1h3.5c-.1.6-.4 1.2-.8 1.7s-.9 1-1.5 1.3a7 7 0 01-1.9.9 8.7 8.7 0 01-5-.1c-.9-.3-1.6-.7-2.2-1.3-.6-.5-1-1.1-1.3-1.9-.3-.7-.5-1.6-.5-2.5v-3.9zm3.5.5h6.4v-.4c0-1-.3-1.9-.8-2.4-.6-.6-1.4-.9-2.4-.9s-1.8.3-2.4.9c-.5.5-.8 1.3-.8 2.3v.5zm17.3 6.7c0-.8.2-1.5.7-2 .5-.5 1.2-.7 2-.7s1.5.2 2 .7c.5.5.7 1.2.7 2 0 .9-.2 1.5-.7 2-.5.5-1.2.8-2 .8s-1.5-.3-2-.8-.7-1.1-.7-2z"/>
+      <path fill="#3AC" d="M1386 438.5v-3.2h4.1v-2a4 4 0 011.4-3.3c.9-.8 2-1.2 3.5-1.2h4.5v3.1h-4.4c-.4 0-.8.1-1 .4-.3.3-.4.6-.4 1v2h5.8v3.2h-5.8V449h-3.6v-10.5h-4.2zm21.2-4.9v2.7h.2c.2-1 .7-1.7 1.4-2.2a5 5 0 012.9-.8 5 5 0 013.9 1.6 6 6 0 011.4 4.2v1.1h-3.7v-.8c0-1-.3-1.7-.8-2.3a3 3 0 00-2.3-.8c-.9 0-1.6.3-2 .8-.6.6-.8 1.4-.8 2.3v9.6h-3.5v-15.4h3.3zm19.2 15.7c-1 0-2-.2-2.8-.5a5.5 5.5 0 01-3.5-3.2c-.2-.7-.4-1.5-.4-2.4v-3.8c0-1 .2-1.7.5-2.5a5.5 5.5 0 013.4-3.1 8.3 8.3 0 015.5 0 5.5 5.5 0 013.5 3.2c.3.7.5 1.5.5 2.4v3.8c0 .9-.2 1.7-.5 2.5a5.5 5.5 0 01-3.4 3.1c-.8.3-1.8.5-2.8.5zm-3.2-6.1c0 1 .3 1.6.9 2.2.5.5 1.3.8 2.3.8 1 0 1.8-.3 2.3-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2c-.5-.6-1.3-.8-2.3-.8-1 0-1.8.2-2.3.8a3 3 0 00-.9 2.2v3.8zm13 5.8v-15.4h2.8v2.2h.3c0-.7.3-1.4.7-1.8.5-.5 1-.7 1.8-.7.7 0 1.3.2 1.7.7.4.4.7 1 .8 1.8h.2c.1-.7.4-1.4.8-1.8.4-.5 1-.7 1.8-.7 1 0 1.7.4 2.3 1 .6.8.9 1.7.9 3V449h-3.2v-11.5c0-1.1-.4-1.7-1.3-1.7-.4 0-.8.1-1 .5-.2.3-.3.7-.3 1.3V449h-2.5v-11.5c0-.5-.2-1-.4-1.3-.3-.3-.6-.4-1-.4-.5 0-.8.1-1 .5-.2.3-.3.7-.3 1.3V449h-3.1zm17 .7h13.5v3.2h-13.4v-3.2zm23.8-.4c-1 0-2-.2-2.8-.4a6 6 0 01-2-1.2c-.7-.6-1.1-1.2-1.4-2-.3-.7-.5-1.5-.5-2.4v-4c0-.9.2-1.7.5-2.4.3-.8.7-1.4 1.3-2a8.2 8.2 0 017.3-1.3 6.2 6.2 0 014.3 5.2h-3.5c-.1-.8-.5-1.3-1-1.8-.7-.4-1.4-.6-2.2-.6-1 0-1.8.2-2.4.8-.5.5-.8 1.2-.8 2.1v4c0 .9.3 1.6.8 2.1.6.6 1.4.8 2.4.8.8 0 1.5-.2 2.1-.6.6-.5 1-1 1-1.8h3.6c-.1.9-.4 1.6-.8 2.3a6.1 6.1 0 01-3.5 2.8c-.7.3-1.5.4-2.4.4zm16.6 0c-1 0-2-.2-2.8-.5a5.5 5.5 0 01-3.5-3.2c-.2-.7-.4-1.5-.4-2.4v-3.8c0-1 .2-1.7.5-2.5a5.5 5.5 0 013.4-3.1 8.3 8.3 0 015.5 0 5.5 5.5 0 013.5 3.2c.3.7.5 1.5.5 2.4v3.8c0 .9-.2 1.7-.5 2.5a5.5 5.5 0 01-3.4 3.1c-.8.3-1.8.5-2.8.5zm-3.2-6.1c0 1 .3 1.6.9 2.2.5.5 1.3.8 2.3.8 1 0 1.8-.3 2.3-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2c-.5-.6-1.3-.8-2.3-.8-1 0-1.8.2-2.3.8a3 3 0 00-.9 2.2v3.8zm13.6-9.6h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V449h-3.5v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 
0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-15.4zm16.3 4.9v-3.2h4.2v-2a4 4 0 011.4-3.3c.9-.8 2-1.2 3.5-1.2h4.5v3.1h-4.4c-.4 0-.8.1-1 .4-.3.3-.4.6-.4 1v2h5.8v3.2h-5.8V449h-3.6v-10.5h-4.2zm17.6 7.3h5.5v-9h-4.8v-3.2h8.2v12.2h4.5v3.2h-13.4v-3.2zm4.5-16.4c0-.5.2-1 .6-1.3a2 2 0 011.4-.5h.7c.6 0 1.1.2 1.5.5.4.3.5.8.5 1.3 0 .6-.1 1-.5 1.4a2 2 0 01-1.5.5h-.7a2 2 0 01-1.4-.5c-.4-.4-.6-.8-.6-1.4zm11.6 9.9c0-1 .1-1.7.4-2.5l1-1.9a5.5 5.5 0 013.9-1.6c1.2 0 2.2.3 2.9.8.7.6 1.2 1.3 1.4 2.3h.2v-2.8h3.4v14.5c0 .9-.2 1.6-.5 2.3-.3.7-.7 1.3-1.3 1.8a6 6 0 01-2 1.2c-.8.2-1.7.4-2.7.4h-4v-3h4a3 3 0 002.2-.7c.5-.5.8-1.2.8-2v-3.3h-.1c-.2 1-.7 1.7-1.4 2.3-.8.5-1.7.8-2.9.8a5 5 0 01-5-3.5c-.2-.7-.3-1.6-.3-2.5v-2.6zm3.6 2.6c0 .9.2 1.6.8 2.2a3 3 0 002.2.8 3 3 0 002.2-.8 3 3 0 00.9-2.2v-2.5a3 3 0 00-.9-2.2 3 3 0 00-2.2-.9 3 3 0 00-2.2.9 3 3 0 00-.8 2.2v2.5z"/>
+      <path fill="#3D4251" d="M1573.9 436.3c0-1.4.2-2.6.6-3.8a10.3 10.3 0 014.6-5.6c1-.7 2.3-1.2 3.7-1.6v3.5c-.9.2-1.6.5-2.3 1-.6.4-1.2 1-1.7 1.6a9.2 9.2 0 00-1.4 4.9v5.3a8 8 0 001.4 4.8c1 1.3 2.3 2.2 4 2.6v3.5c-2.9-.7-5-2-6.6-4a10.6 10.6 0 01-2.3-6.9v-5.3zm20.7 13c-1 0-2-.2-2.8-.4a6 6 0 01-2-1.2c-.7-.6-1.1-1.2-1.4-2-.3-.7-.5-1.5-.5-2.4v-4c0-.9.2-1.7.5-2.4.3-.8.7-1.4 1.3-2a8.2 8.2 0 017.3-1.3 6.2 6.2 0 014.3 5.2h-3.5c-.1-.8-.5-1.3-1-1.8-.7-.4-1.4-.6-2.2-.6-1 0-1.8.2-2.4.8-.5.5-.8 1.2-.8 2.1v4c0 .9.3 1.6.8 2.1.6.6 1.4.8 2.4.8.8 0 1.5-.2 2.1-.6.6-.5 1-1 1-1.8h3.6c-.1.9-.4 1.6-.8 2.3a6.1 6.1 0 01-3.5 2.8c-.7.3-1.5.4-2.4.4zm16.6 0c-1 0-2-.2-2.8-.5a5.5 5.5 0 01-3.5-3.2c-.2-.7-.4-1.5-.4-2.4v-3.8c0-1 .2-1.7.5-2.5a5.5 5.5 0 013.4-3.1 8.3 8.3 0 015.5 0 5.5 5.5 0 013.5 3.2c.3.7.5 1.5.5 2.4v3.8c0 .9-.2 1.7-.5 2.5a5.5 5.5 0 01-3.4 3.1c-.8.3-1.8.5-2.8.5zm-3.2-6.1c0 1 .3 1.6.9 2.2.5.5 1.3.8 2.3.8 1 0 1.8-.3 2.3-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2c-.5-.6-1.3-.8-2.3-.8-1 0-1.8.2-2.3.8a3 3 0 00-.9 2.2v3.8zm13.6-9.6h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V449h-3.5v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-15.4zm16.3 4.9v-3.2h4.2v-2a4 4 0 011.4-3.3c.9-.8 2-1.2 3.5-1.2h4.5v3.1h-4.4c-.4 0-.8.1-1 .4-.3.3-.4.6-.4 1v2h5.8v3.2h-5.8V449h-3.6v-10.5h-4.2zm17.6 7.3h5.5v-9h-4.8v-3.2h8.2v12.2h4.5v3.2h-13.4v-3.2zm4.5-16.4c0-.5.2-1 .6-1.3a2 2 0 011.4-.5h.7c.6 0 1.1.2 1.5.5.4.3.5.8.5 1.3 0 .6-.1 1-.5 1.4a2 2 0 01-1.5.5h-.7a2 2 0 01-1.4-.5c-.4-.4-.6-.8-.6-1.4zm11.6 9.9c0-1 .1-1.7.4-2.5l1-1.9a5.5 5.5 0 013.9-1.6c1.2 0 2.2.3 2.9.8.7.6 1.2 1.3 1.4 2.3h.2v-2.8h3.4v14.5c0 .9-.2 1.6-.5 2.3-.3.7-.7 1.3-1.3 1.8a6 6 0 01-2 1.2c-.8.2-1.7.4-2.7.4h-4v-3h4a3 3 0 002.2-.7c.5-.5.8-1.2.8-2v-3.3h-.1c-.2 1-.7 1.7-1.4 2.3-.8.5-1.7.8-2.9.8a5 5 0 01-5-3.5c-.2-.7-.3-1.6-.3-2.5v-2.6zm3.6 2.6c0 .9.2 1.6.8 2.2a3 3 0 002.2.8 3 3 0 002.2-.8 3 3 0 00.9-2.2v-2.5a3 3 0 00-.9-2.2 3 3 0 00-2.2-.9 3 3 0 00-2.2.9 3 3 0 00-.8 
2.2v2.5zm14.8-16.6c1.4.4 2.6 1 3.7 1.6a11 11 0 014.6 5.6c.4 1.2.6 2.4.6 3.8v5.3c0 2.8-.7 5-2.3 7a12.2 12.2 0 01-6.6 3.9V449c.9-.2 1.6-.6 2.3-1a6.6 6.6 0 002.7-3.8c.3-.8.4-1.7.4-2.6v-5.3c0-1-.1-1.9-.4-2.7a6.5 6.5 0 00-5-4.8v-3.5z"/>
+      <path fill="#FFF" d="M169.4 352.7c0 .6.3 1.2.7 1.7.4.5 1 .8 1.7 1V349l-.3-.1a8 8 0 01-3.9-2.2c-.8-1-1.3-2.2-1.3-3.7 0-1.6.5-2.8 1.5-3.8s2.3-1.5 4-1.7v-3.4h1.8v3.4c1.7.1 3 .7 4 1.6 1.1 1 1.7 2.3 1.8 3.8h-3.5c0-.6-.3-1.2-.7-1.6a3 3 0 00-1.6-.9v5.7l1 .2c1.7.5 3 1.3 3.8 2.3.8 1 1.3 2.3 1.3 3.8 0 .8-.2 1.5-.5 2.2a5 5 0 01-1.2 1.8 6 6 0 01-1.9 1.2c-.7.3-1.6.5-2.5.6v3.6h-1.8v-3.7a6.8 6.8 0 01-4.2-1.7c-1-1-1.6-2.3-1.7-3.8h3.5zm4.2-3.1v5.8c1-.1 1.6-.4 2-.8.5-.5.8-1.1.8-2a3 3 0 00-.7-1.9c-.5-.5-1.1-.9-2-1v-.1zm-4-6.7c0 1.3.8 2.2 2.2 2.8v-5.3c-.7.2-1.2.5-1.6.9-.4.4-.6 1-.6 1.6zm30 11.2h3.6c.1.4.4.8.9 1 .4.4 1 .5 1.7.5h1.1c1 0 1.6-.2 2-.5.5-.4.8-.8.8-1.4 0-1-.8-1.7-2.4-2l-2.2-.2c-1.7-.2-3-.7-3.9-1.4-.8-.8-1.2-1.8-1.2-3.2a4 4 0 011.5-3.4 7 7 0 014.4-1.2h1.1a7 7 0 014.1 1.1c1 .7 1.7 1.7 1.9 3h-3.6c-.1-.4-.4-.8-.8-1a3 3 0 00-1.6-.4H206c-1.7 0-2.5.6-2.5 1.7 0 1 .7 1.6 2 1.8l2.3.3c1.9.2 3.2.7 4.1 1.5.9.8 1.3 1.8 1.3 3.2 0 1.6-.5 2.7-1.6 3.5-1 .9-2.6 1.3-4.6 1.3h-1c-1.9 0-3.3-.4-4.4-1.1-1-.8-1.7-1.8-1.9-3.1zm17.2-11.5h3.4v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.7 1.2a7.7 7.7 0 011.4 4.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8h-3.5v-20.2zm3.6 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm12.6 5.3a4.3 4.3 0 011.6-3.6c.6-.5 1.2-.8 2-1 .7-.3 1.5-.4 2.4-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2h-3.4a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V358H243v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.5-.4c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V351H239c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm20.5 5c-1 0-2-.2-2.8-.4a6 6 0 01-2-1.2c-.7-.6-1.1-1.2-1.4-2-.3-.7-.5-1.5-.5-2.4v-4c0-.9.2-1.7.5-2.4.3-.8.7-1.4 1.3-2a8.2 8.2 0 017.3-1.3 6.2 6.2 
0 014.3 5.2h-3.5c-.1-.8-.5-1.3-1-1.8-.7-.4-1.4-.6-2.2-.6-1 0-1.8.2-2.4.8-.5.5-.8 1.2-.8 2.1v4c0 .9.3 1.6.8 2.1.6.6 1.4.8 2.4.8.8 0 1.5-.2 2.1-.6.6-.5 1-1 1-1.8h3.6c-.1.9-.4 1.6-.8 2.3a6.1 6.1 0 01-3.5 2.8c-.7.3-1.5.4-2.4.4zm9.2-15.7h3.8l3 8.2a7.6 7.6 0 01.5 2l.2 1.1h.2a11.5 11.5 0 01.3-2c0-.4.2-.8.3-1.1l2.8-8.2h3.7l-7.3 20.2H270l2.1-5.7-6-14.5zm33.9 0h4.2v-4.8h3.5v4.8h5.9v3.2h-5.9v7.4c0 .5.2 1 .5 1.2.2.3.6.4 1.2.4h3.9v3.2h-4.2c-1.5 0-2.7-.4-3.6-1.3-.8-.8-1.3-2-1.3-3.5v-7.4h-4.2v-3.2zm21.5 0v2.7h.2c.2-1 .7-1.7 1.4-2.2a5 5 0 012.9-.8 5 5 0 013.9 1.6 6 6 0 011.4 4.2v1.1h-3.7v-.8c0-1-.3-1.7-.8-2.3a3 3 0 00-2.3-.8c-.9 0-1.6.3-2 .8-.6.6-.8 1.4-.8 2.3v9.6h-3.5v-15.4h3.3zm12.2 11c0-.7.2-1.4.4-2a5.6 5.6 0 013-2.5c.9-.3 1.7-.4 2.6-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2h-3.4a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V358h-3.3v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.5-.3c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V351h-3.8c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm14.2 1.5h5.5v-9h-4.8v-3.2h8.2v12.2h4.5v3.2h-13.4v-3.2zm4.5-16.4c0-.5.2-1 .6-1.3a2 2 0 011.4-.5h.7c.6 0 1.1.2 1.5.5.4.3.5.8.5 1.3 0 .6-.1 1-.5 1.4a2 2 0 01-1.5.5h-.7a2 2 0 01-1.4-.5c-.4-.4-.6-.8-.6-1.4zm12 4.2h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V358h-3.5v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6H368v-15.4zm40.2 15.7c-1 0-2-.2-2.8-.4a6 6 0 01-2-1.2c-.7-.6-1.1-1.2-1.4-2-.3-.7-.5-1.5-.5-2.4v-4c0-.9.2-1.7.5-2.4.3-.8.7-1.4 1.3-2a8.2 8.2 0 017.3-1.3 6.2 6.2 0 014.3 5.2h-3.5c-.1-.8-.5-1.3-1-1.8-.7-.4-1.4-.6-2.2-.6-1 0-1.8.2-2.4.8-.5.5-.8 1.2-.8 2.1v4c0 .9.3 1.6.8 2.1.6.6 1.4.8 2.4.8.8 0 1.5-.2 2.1-.6.6-.5 1-1 1-1.8h3.6c-.1.9-.4 1.6-.8 2.3a6.1 6.1 0 01-3.5 2.8c-.7.3-1.5.4-2.4.4zm16.6 0c-1 0-2-.2-2.8-.5a5.5 5.5 0 01-3.5-3.2c-.2-.7-.4-1.5-.4-2.4v-3.8c0-1 .2-1.7.5-2.5a5.5 5.5 0 013.4-3.1 8.3 8.3 0 015.5 0 
5.5 5.5 0 013.5 3.1c.3.8.5 1.6.5 2.5v3.8c0 .9-.2 1.7-.5 2.5a5.5 5.5 0 01-3.4 3.1c-.8.3-1.8.5-2.8.5zm-3.2-6.1c0 1 .3 1.6.9 2.2.5.5 1.3.8 2.3.8 1 0 1.8-.3 2.3-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2c-.5-.6-1.3-.8-2.3-.8-1 0-1.8.2-2.3.8a3 3 0 00-.9 2.2v3.8zm13.6-9.6h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V358h-3.5v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-15.4zm16.3 4.9v-3.2h4.2v-2a4 4 0 011.4-3.3c.9-.8 2-1.2 3.5-1.2h4.5v3.1h-4.4c-.4 0-.8.1-1 .4-.3.3-.4.6-.4 1v2h5.8v3.2h-5.8V358h-3.6v-10.5h-4.2zm17.6 7.3h5.5v-9h-4.8v-3.2h8.2v12.2h4.5v3.2H469v-3.2zm4.5-16.4c0-.5.2-1 .6-1.3a2 2 0 011.4-.5h.7c.6 0 1.1.2 1.5.5.4.3.5.8.5 1.3 0 .6-.1 1-.5 1.4a2 2 0 01-1.5.5h-.7a2 2 0 01-1.4-.5c-.4-.4-.6-.8-.6-1.4zm11.6 9.9c0-1 .1-1.7.4-2.5l1-1.9a5.5 5.5 0 013.9-1.6c1.2 0 2.2.3 2.9.8.7.6 1.2 1.3 1.4 2.3h.2v-2.8h3.4v14.5c0 .9-.2 1.6-.5 2.3-.3.7-.7 1.3-1.3 1.8a6 6 0 01-2 1.2c-.8.2-1.7.4-2.7.4h-4v-3h4a3 3 0 002.2-.7c.5-.5.8-1.2.8-2v-3.3h-.1c-.2 1-.7 1.7-1.4 2.3-.8.5-1.7.8-2.9.8a5 5 0 01-5-3.5c-.2-.7-.3-1.6-.3-2.5v-2.6zm3.6 2.6c0 .9.2 1.6.8 2.2a3 3 0 002.2.8 3 3 0 002.2-.8 3 3 0 00.9-2.2v-2.5a3 3 0 00-.9-2.2 3 3 0 00-2.2-.9 3 3 0 00-2.2.9 3 3 0 00-.8 2.2v2.5zm17.3 4.6c0-.8.2-1.5.7-2 .5-.5 1.2-.7 2-.7s1.5.2 2 .7c.5.5.7 1.2.7 2 0 .9-.2 1.5-.7 2-.5.5-1.2.8-2 .8s-1.5-.3-2-.8-.7-1.1-.7-2zm19.7 2.8c-1 0-2-.2-2.8-.4a6 6 0 01-2-1.2c-.7-.6-1.1-1.2-1.4-2-.3-.7-.5-1.5-.5-2.4v-4c0-.9.2-1.7.5-2.4.3-.8.7-1.4 1.3-2a8.2 8.2 0 017.3-1.3 6.2 6.2 0 014.3 5.2H529c-.1-.8-.5-1.3-1-1.8-.7-.4-1.4-.6-2.2-.6-1 0-1.8.2-2.4.8-.5.5-.8 1.2-.8 2.1v4c0 .9.3 1.6.8 2.1.6.6 1.4.8 2.4.8.8 0 1.5-.2 2.1-.6.6-.5 1-1 1-1.8h3.6c-.1.9-.4 1.6-.8 2.3a6.1 6.1 0 01-3.5 2.8c-.7.3-1.5.4-2.4.4zm9.7-10.8v-3.2h4.2v-2a4 4 0 011.4-3.3c.9-.8 2-1.2 3.5-1.2h4.5v3.1h-4.4c-.4 0-.8.1-1 .4-.3.3-.4.6-.4 1v2h5.8v3.2h-5.8V358h-3.6v-10.5h-4.2zm17 .8c0-1 0-1.7.3-2.5l1-1.9a5.5 5.5 0 013.9-1.6c1.2 0 2.2.3 2.9.8.7.6 1.2 1.3 1.4 2.3h.2v-2.8h3.4v14.5c0 .9-.2 1.6-.5 
2.3-.3.7-.7 1.3-1.3 1.8a6 6 0 01-2 1.2c-.8.2-1.7.4-2.7.4h-4v-3h4a3 3 0 002.2-.7c.5-.5.8-1.2.8-2v-3.3h-.1c-.2 1-.7 1.7-1.4 2.3-.8.5-1.7.8-2.9.8a5 5 0 01-5-3.5c-.2-.7-.3-1.6-.3-2.5v-2.6zm3.5 2.6c0 .9.2 1.6.8 2.2a3 3 0 002.2.8 3 3 0 002.2-.8 3 3 0 00.9-2.2v-2.5a3 3 0 00-.9-2.2 3 3 0 00-2.2-.9 3 3 0 00-2.2.9 3 3 0 00-.8 2.2v2.5zM1237.2 345.6h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V361h-3.5v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-15.4zm15.7-1.7v-3.2h8.4V356c0 .6.1 1 .5 1.4.3.3.7.4 1.3.4h4.6v3.2h-4.9a5 5 0 01-3.7-1.3 4.9 4.9 0 01-1.3-3.7V344h-5zm18 1.7h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.7 1.2a7.7 7.7 0 011.4 4.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8h-3.5v-20.2zm3.5 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm42.6-4.7v3.2h-12.4v-3.2h12.4zm-12.4 6.9h12.4v3.2h-12.4v-3.2zm33 3.5h3.6c.1.4.4.8.9 1 .4.4 1 .5 1.7.5h1.1c1 0 1.6-.2 2-.5.5-.4.8-.8.8-1.4 0-1-.8-1.7-2.4-2l-2.2-.2c-1.7-.2-3-.7-3.9-1.4-.8-.8-1.2-1.8-1.2-3.2a4 4 0 011.5-3.4 7 7 0 014.4-1.2h1.1a7 7 0 014.1 1.1c1 .7 1.7 1.7 1.9 3h-3.6c-.1-.4-.4-.8-.8-1a3 3 0 00-1.6-.4h-1.1c-1.7 0-2.5.6-2.5 1.7 0 1 .7 1.6 2 1.8l2.3.3c1.9.2 3.2.7 4.1 1.5.9.8 1.3 1.8 1.3 3.2 0 1.6-.5 2.7-1.6 3.5-1 .9-2.6 1.3-4.6 1.3h-1c-1.9 0-3.3-.4-4.4-1.1-1-.8-1.7-1.8-1.9-3.1zm17.2-11.5h3.4v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.7 1.2a7.7 7.7 0 011.4 4.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8h-3.5v-20.2zm3.6 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm12.6 5.3a4.3 4.3 0 011.6-3.6c.6-.5 1.2-.8 2-1 .7-.3 1.5-.4 
2.4-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2h-3.4a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V361h-3.3v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.5-.4c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V354h-3.8c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm20.5 5c-1 0-2-.2-2.8-.4a6 6 0 01-2-1.2c-.7-.6-1.1-1.2-1.4-2-.3-.7-.5-1.5-.5-2.4v-4c0-.9.2-1.7.5-2.4.3-.8.7-1.4 1.3-2a8.2 8.2 0 017.3-1.3 6.2 6.2 0 014.3 5.2h-3.5c-.1-.8-.5-1.3-1-1.8-.7-.4-1.4-.6-2.2-.6-1 0-1.8.2-2.4.8-.5.5-.8 1.2-.8 2.1v4c0 .9.3 1.6.8 2.1.6.6 1.4.8 2.4.8.8 0 1.5-.2 2.1-.6.6-.5 1-1 1-1.8h3.6c-.1.9-.4 1.6-.8 2.3a6.1 6.1 0 01-3.5 2.8c-.7.3-1.5.4-2.4.4zm9.2-15.7h3.8l3 8.2a7.6 7.6 0 01.5 2l.2 1.1h.2a11.5 11.5 0 01.3-2c0-.4.2-.8.3-1.1l2.8-8.2h3.7l-7.3 20.2h-3.7l2.1-5.7-6-14.5zm21.5 13c0-.9.2-1.6.7-2 .5-.6 1.2-.8 2-.8s1.5.2 2 .7c.5.5.7 1.2.7 2 0 .9-.2 1.5-.7 2-.5.5-1.2.8-2 .8s-1.5-.3-2-.8-.7-1.1-.7-2zm12-14.7v-3.2h8.4V356c0 .6.1 1 .5 1.4.3.3.7.4 1.3.4h4.6v3.2h-4.9a5 5 0 01-3.7-1.3 4.9 4.9 0 01-1.3-3.7V344h-5zm24.3 17.4c-1 0-2-.2-2.8-.5a5.5 5.5 0 01-3.5-3.2c-.2-.7-.4-1.5-.4-2.4v-3.8c0-1 .2-1.7.5-2.5a5.5 5.5 0 013.4-3.1 8.3 8.3 0 015.5 0 5.5 5.5 0 013.5 3.1c.3.8.5 1.6.5 2.5v3.8c0 .9-.2 1.7-.5 2.5a5.5 5.5 0 01-3.4 3.1c-.8.3-1.8.5-2.8.5zm-3.2-6.1c0 1 .3 1.6.9 2.2.5.5 1.3.8 2.3.8 1 0 1.8-.3 2.3-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2c-.5-.6-1.3-.8-2.3-.8-1 0-1.8.2-2.3.8a3 3 0 00-.9 2.2v3.8zm13 1.5a4.3 4.3 0 011.6-3.6c.6-.5 1.2-.8 2-1 .7-.3 1.5-.4 2.4-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2h-3.4a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V361h-3.3v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.5-.4c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V354h-3.8c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm13.6-5c0-.9.1-1.7.4-2.5.2-.7.6-1.3 1-1.9.5-.5 1-.9 1.7-1.2.7-.2 1.4-.4 
2.2-.4 1.1 0 2 .3 2.8.8.7.5 1.1 1.2 1.3 2.2h.3v-1a8.7 8.7 0 01-.1-1.7v-4.8h3.5V361h-3.4v-2.7h-.3c-.2 1-.6 1.7-1.3 2.2a5.4 5.4 0 01-5 .4c-.6-.3-1.2-.7-1.6-1.3-.5-.5-.9-1.1-1.1-1.9a8 8 0 01-.4-2.5v-3.9zm3.5.1v3.7c0 1 .3 1.7.8 2.3a3 3 0 002.3.8c.9 0 1.6-.3 2.1-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2 3 3 0 00-2.1-.9 3 3 0 00-2.3.9 3 3 0 00-.8 2.2zm16.3-3.1c0-1.4.2-2.6.6-3.8a10.3 10.3 0 014.6-5.6c1-.7 2.3-1.2 3.7-1.6v3.5c-.9.2-1.6.5-2.3 1-.6.4-1.2 1-1.7 1.6a9.2 9.2 0 00-1.4 4.9v5.3a8 8 0 001.4 4.8c1 1.3 2.3 2.2 4 2.6v3.5c-2.9-.7-5-2-6.6-4a10.6 10.6 0 01-2.3-6.9v-5.3zm14.1-2.7h3.4v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.7 1.2a7.7 7.7 0 011.4 4.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8h-3.5v-20.2zm3.6 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm12.6 5.3a4.3 4.3 0 011.6-3.6c.6-.5 1.2-.8 2-1 .7-.3 1.5-.4 2.4-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2h-3.4a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V361h-3.3v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.5-.4c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V354h-3.8c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm13.2-10.7h4.2v-4.8h3.5v4.8h5.9v3.2h-5.9v7.4c0 .5.2 1 .4 1.2.3.3.7.4 1.3.4h3.9v3.2h-4.2c-1.5 0-2.7-.4-3.6-1.3-.8-.8-1.3-2-1.3-3.5v-7.4h-4.2v-3.2zm17.5-4.8h3.5v4.8a14.6 14.6 0 01-.2 2.4v.3h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.7.5 3.7 1.5.9 1 1.3 2.4 1.3 4.1V361h-3.5v-9.7a3 3 0 00-.7-2.2c-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-20.2zm18-3.5c1.4.4 2.6 1 3.7 1.6a11 11 0 014.6 5.6c.4 1.2.6 2.4.6 3.8v5.3c0 2.8-.7 5-2.3 7a12.2 12.2 0 01-6.6 3.9V361c.9-.2 1.6-.6 2.3-1a6.6 6.6 0 002.7-3.8c.3-.8.4-1.7.4-2.6v-5.3c0-1-.1-1.9-.4-2.7a6.5 6.5 0 
00-5-4.8v-3.5z"/>
+      <path fill="#3D4251" d="M136.8 516.6h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V532H146v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-15.4zm15.7-1.7v-3.2h8.4V527c0 .6.1 1 .5 1.4.3.3.7.4 1.3.4h4.6v3.2h-4.9a5 5 0 01-3.7-1.3 4.9 4.9 0 01-1.3-3.7V515h-5zm18 1.7h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.7 1.2a7.7 7.7 0 011.4 4.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8h-3.5v-20.2zm3.5 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm16.9 7.1c0-.8.2-1.5.7-2 .5-.5 1.2-.7 2-.7s1.5.2 2 .7c.5.5.7 1.2.7 2 0 .9-.2 1.5-.7 2-.5.5-1.2.8-2 .8s-1.5-.3-2-.8-.7-1.1-.7-2z"/>
+      <path fill="#F03969" d="M204.3 528.8h5.5v-9H205v-3.2h8.2v12.2h4.5v3.2h-13.4v-3.2zm4.5-16.4c0-.5.2-1 .6-1.3a2 2 0 011.4-.5h.7c.6 0 1.1.2 1.5.5.4.3.5.8.5 1.3 0 .6-.1 1-.5 1.4a2 2 0 01-1.5.5h-.7a2 2 0 01-1.4-.5c-.4-.4-.6-.8-.6-1.4zm12 4.2h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V532H230v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-15.4zm17.1 12.2h5.5v-9h-4.8v-3.2h8.2v12.2h4.5v3.2h-13.4v-3.2zm4.5-16.4c0-.5.2-1 .6-1.3a2 2 0 011.4-.5h.7c.6 0 1.1.2 1.5.5.4.3.5.8.5 1.3 0 .6-.1 1-.5 1.4a2 2 0 01-1.5.5h-.7a2 2 0 01-1.4-.5c-.4-.4-.6-.8-.6-1.4zm11.3 4.2h4.2v-4.8h3.5v4.8h5.9v3.2h-5.9v7.4c0 .5.2 1 .5 1.2.2.3.6.4 1.2.4h3.9v3.2h-4.2c-1.5 0-2.7-.4-3.6-1.3-.8-.8-1.3-2-1.3-3.5v-7.4h-4.2v-3.2zm17.8 12.2h5.5v-9h-4.8v-3.2h8.2v12.2h4.5v3.2h-13.4v-3.2zm4.5-16.4c0-.5.2-1 .6-1.3a2 2 0 011.4-.5h.7c.6 0 1.1.2 1.5.5.4.3.5.8.5 1.3 0 .6-.1 1-.5 1.4a2 2 0 01-1.5.5h-.7a2 2 0 01-1.4-.5c-.4-.4-.6-.8-.6-1.4zm11.4 15.3a4.3 4.3 0 011.6-3.6c.6-.5 1.2-.8 2-1 .7-.3 1.5-.4 2.4-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2H288a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V532h-3.3v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.5-.4c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V525h-3.8c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm12.8-12.4v-3.2h8.4V527c0 .6.1 1 .5 1.4.3.3.7.4 1.3.4h4.6v3.2h-4.9a5 5 0 01-3.7-1.3 4.9 4.9 0 01-1.3-3.7V515h-5zm18.2 14h5.5v-9.1h-4.8v-3.2h8.2v12.2h4.5v3.2h-13.4v-3.2zm4.5-16.5c0-.5.2-1 .6-1.3a2 2 0 011.4-.5h.7c.6 0 1.1.2 1.5.5.4.3.5.8.5 1.3 0 .6-.1 1-.5 1.4a2 2 0 01-1.5.5h-.7a2 2 0 01-1.4-.5c-.4-.4-.6-.8-.6-1.4zm12 16.3l8.4-9v-.2a8 8 0 01-.7 0 6.6 6.6 0 01-1.5.2h-6v-3.1H351v3.3l-8.5 9.1v.2a7.7 7.7 0 012.3-.3h6.5v3.1h-12.8v-3.3zm16.5-6.4c0-.9.2-1.7.5-2.5.3-.7.7-1.3 1.3-1.9a6 6 0 012.1-1.2c.8-.2 1.8-.4 2.8-.4 1 0 2 .2 2.7.5.8.2 1.5.6 2.1 1.2.6.5 1 1.1 1.4 1.9.3.7.4 1.6.4 
2.5v2.8h-9.8v1c0 1 .3 1.8.8 2.4.6.6 1.4.9 2.4.9.7 0 1.4-.1 2-.4.5-.2.9-.6 1-1h3.5c-.1.6-.4 1.2-.8 1.7s-.9 1-1.5 1.3a7 7 0 01-1.9.9 8.7 8.7 0 01-5-.1c-.9-.3-1.6-.7-2.2-1.3-.6-.5-1-1.1-1.3-1.9-.3-.7-.5-1.6-.5-2.5v-3.9zm3.5.5h6.4v-.4c0-1-.3-1.9-.8-2.4-.6-.6-1.4-.9-2.4-.9s-1.8.3-2.4.9c-.5.5-.8 1.3-.8 2.3v.5z"/>
+      <path fill="#3D4251" d="M374.7 519.3c0-1.4.2-2.6.6-3.8a10.3 10.3 0 014.6-5.6c1-.7 2.3-1.2 3.7-1.6v3.5c-.9.2-1.6.5-2.3 1-.6.4-1.2 1-1.7 1.6a9.2 9.2 0 00-1.4 4.9v5.3a8 8 0 001.4 4.8c1 1.3 2.3 2.2 4 2.6v3.5c-2.9-.7-5-2-6.6-4a10.6 10.6 0 01-2.3-6.9v-5.3zm13.7 3c0-1 .1-1.7.4-2.5l1-1.9a5.5 5.5 0 013.9-1.6c1.2 0 2.2.3 2.9.8.7.6 1.2 1.3 1.4 2.3h.2v-2.8h3.4v14.5c0 .9-.2 1.6-.5 2.3-.3.7-.7 1.3-1.3 1.8a6 6 0 01-2 1.2c-.8.2-1.7.4-2.7.4h-4v-3h4a3 3 0 002.2-.7c.5-.5.8-1.2.8-2v-3.3h-.1c-.2 1-.7 1.7-1.4 2.3-.8.5-1.7.8-2.9.8a5 5 0 01-5-3.5c-.2-.7-.3-1.6-.3-2.5v-2.6zm3.6 2.6c0 .9.2 1.6.8 2.2a3 3 0 002.2.8 3 3 0 002.2-.8 3 3 0 00.9-2.2v-2.5a3 3 0 00-.9-2.2 3 3 0 00-2.2-.9 3 3 0 00-2.2.9 3 3 0 00-.8 2.2v2.5zm13.3-2.6c0-.9.2-1.7.5-2.5.3-.7.7-1.3 1.3-1.9a6 6 0 012.1-1.2c.8-.2 1.8-.4 2.8-.4 1 0 2 .2 2.7.5.8.2 1.5.6 2.1 1.2.6.5 1 1.1 1.4 1.9.3.7.4 1.6.4 2.5v2.8h-9.8v1c0 1 .3 1.8.8 2.4.6.6 1.4.9 2.4.9.7 0 1.4-.1 2-.4.5-.2.9-.6 1-1h3.5c-.1.6-.4 1.2-.8 1.7s-.9 1-1.5 1.3a7 7 0 01-1.9.9 8.7 8.7 0 01-5-.1c-.9-.3-1.6-.7-2.2-1.3-.6-.5-1-1.1-1.3-1.9-.3-.7-.5-1.6-.5-2.5v-3.9zm3.5.5h6.4v-.4c0-1-.3-1.9-.8-2.4-.6-.6-1.4-.9-2.4-.9s-1.8.3-2.4.9c-.5.5-.8 1.3-.8 2.3v.5zm12.9-6.2h4.2v-4.8h3.5v4.8h5.9v3.2h-5.9v7.4c0 .5.2 1 .4 1.2.3.3.7.4 1.3.4h3.9v3.2h-4.2c-1.5 0-2.7-.4-3.6-1.3-.8-.8-1.3-2-1.3-3.5v-7.4h-4.2v-3.2zm17.2 16.1h13.4v3.2H439v-3.2zm16.8-10.4c0-.9.2-1.7.5-2.5.3-.7.7-1.3 1.3-1.9a6 6 0 012.1-1.2c.8-.2 1.8-.4 2.8-.4 1 0 2 .2 2.7.5.8.2 1.5.6 2.1 1.2.6.5 1 1.1 1.4 1.9.3.7.4 1.6.4 2.5v2.8h-9.8v1c0 1 .3 1.8.8 2.4.6.6 1.4.9 2.4.9.7 0 1.4-.1 2-.4.5-.2.9-.6 1-1h3.5c-.1.6-.4 1.2-.8 1.7s-.9 1-1.5 1.3a7 7 0 01-1.9.9 8.7 8.7 0 01-5-.1c-.9-.3-1.6-.7-2.2-1.3-.6-.5-1-1.1-1.3-1.9-.3-.7-.5-1.6-.5-2.5v-3.9zm3.5.5h6.4v-.4c0-1-.3-1.9-.8-2.4-.6-.6-1.4-.9-2.4-.9s-1.8.3-2.4.9c-.5.5-.8 1.3-.8 2.3v.5zm18 1.2l-5.1-7.4h4l2.5 3.9.4.7.1.3h.2l.1-.3.1-.3.3-.4 2.6-3.9h4l-5.2 7.4 5.5 8h-4l-2.9-4.3a8 8 0 01-.5-1.1h-.2l-.1.3a3.2 3.2 0 01-.4.7l-3 4.4h-3.9l5.5-8zm11.8 3.7a4.3 4.3 0 011.6-3.6c.6-.5 1.2-.8 2-1 .7-.3 1.5-.4 
2.4-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2h-3.4a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V532H499v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.5-.4c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V525H495c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm13.3 4.7v-15.4h2.8v2.2h.3c0-.7.3-1.4.7-1.8.5-.5 1-.7 1.8-.7.7 0 1.3.2 1.7.7.4.4.7 1 .8 1.8h.2c.1-.7.4-1.4.8-1.8.4-.5 1-.7 1.8-.7 1 0 1.7.4 2.3 1 .6.8.9 1.7.9 3V532h-3.2v-11.5c0-1.1-.4-1.7-1.3-1.7-.4 0-.8.1-1 .5-.2.3-.3.7-.3 1.3V532h-2.5v-11.5c0-.5-.2-1-.4-1.3-.3-.3-.6-.4-1-.4-.5 0-.8.1-1 .5-.2.3-.3.7-.3 1.3V532h-3.1zm17.4-15.4h3.4v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.6 1.2.5.6.9 1.2 1.1 2 .3.7.4 1.5.4 2.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8h-3.5v-20.2zm3.6 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm12-7.5v-3.2h8.5V527c0 .6.1 1 .5 1.4.3.3.7.4 1.3.4h4.6v3.2h-4.9a5 5 0 01-3.7-1.3 4.9 4.9 0 01-1.3-3.7V515h-5zm17.7 7.4c0-.9.2-1.7.5-2.5.3-.7.7-1.3 1.3-1.9a6 6 0 012.1-1.2c.8-.2 1.8-.4 2.8-.4 1 0 2 .2 2.7.5.8.2 1.5.6 2.1 1.2.6.5 1 1.1 1.4 1.9.3.7.4 1.6.4 2.5v2.8H560v1c0 1 .3 1.8.8 2.4.6.6 1.4.9 2.4.9.7 0 1.4-.1 2-.4.5-.2.9-.6 1-1h3.5c-.1.6-.4 1.2-.8 1.7s-.9 1-1.5 1.3a7 7 0 01-1.9.9 8.7 8.7 0 01-5-.1c-.9-.3-1.6-.7-2.2-1.3-.6-.5-1-1.1-1.3-1.9-.3-.7-.5-1.6-.5-2.5v-3.9zm3.5.5h6.4v-.4c0-1-.3-1.9-.8-2.4-.6-.6-1.4-.9-2.4-.9s-1.8.3-2.4.9c-.5.5-.8 1.3-.8 2.3v.5zm13.2 5.3h3.6c.1.4.4.8.9 1 .4.4 1 .5 1.7.5h1.1c1 0 1.6-.2 2-.5.5-.4.8-.8.8-1.4 0-1-.8-1.7-2.4-2l-2.2-.2c-1.7-.2-3-.7-3.9-1.4-.8-.8-1.2-1.8-1.2-3.2a4 4 0 011.5-3.4 7 7 0 014.4-1.2h1.1a7 7 0 014.1 1.1c1 .7 1.7 1.7 1.9 3H583c-.1-.4-.4-.8-.8-1a3 3 0 00-1.6-.4h-1.1c-1.7 0-2.5.6-2.5 1.7 0 1 .7 1.6 2 1.8l2.3.3c1.9.2 3.2.7 4.1 
1.5.9.8 1.3 1.8 1.3 3.2 0 1.6-.5 2.7-1.6 3.5-1 .9-2.6 1.3-4.6 1.3h-1c-1.9 0-3.3-.4-4.4-1.1-1-.8-1.7-1.8-1.9-3.1zm18.4-19.8c1.4.4 2.6 1 3.7 1.6a11 11 0 014.6 5.6c.4 1.2.6 2.4.6 3.8v5.3c0 2.8-.7 5-2.3 7a12.2 12.2 0 01-6.6 3.9V532c.9-.2 1.6-.6 2.3-1a6.6 6.6 0 002.7-3.8c.3-.8.4-1.7.4-2.6v-5.3c0-1-.1-1.9-.4-2.7a6.5 6.5 0 00-5-4.8v-3.5zM203 602.6h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V618h-3.5v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6H203v-15.4zm15.7-1.7v-3.2h8.4V613c0 .6.1 1 .5 1.4.3.3.7.4 1.3.4h4.6v3.2h-4.9a5 5 0 01-3.7-1.3 4.9 4.9 0 01-1.3-3.7V601h-5zm18 1.7h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.7 1.2a7.7 7.7 0 011.4 4.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8h-3.5v-20.2zm3.5 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm16.9 7.1c0-.8.2-1.5.7-2 .5-.5 1.2-.7 2-.7s1.5.2 2 .7c.5.5.7 1.2.7 2 0 .9-.2 1.5-.7 2-.5.5-1.2.8-2 .8s-1.5-.3-2-.8-.7-1.1-.7-2z"/>
+      <path fill="#F03969" d="M273.8 602.6v9.6c0 2 1 3 2.8 3 1.9 0 2.8-1 2.8-3v-9.6h3.5v9.6c0 2-.6 3.4-1.7 4.5a6.4 6.4 0 01-4.6 1.6c-2 0-3.5-.6-4.6-1.6a5.9 5.9 0 01-1.7-4.5v-9.6h3.5zm13.2 0h3.4v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.6 1.2.5.6.9 1.2 1.1 2 .3.7.4 1.5.4 2.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8H287v-20.2zm3.6 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm13 0c0-1 0-1.8.3-2.6.2-.7.6-1.3 1-1.9.5-.5 1-.9 1.7-1.2.7-.2 1.4-.4 2.2-.4 1.1 0 2 .3 2.8.8.7.5 1.1 1.2 1.3 2.2h.3v-1a8.7 8.7 0 01-.1-1.7v-4.8h3.5V618h-3.4v-2.7h-.3c-.2 1-.6 1.7-1.3 2.2a5.4 5.4 0 01-5 .4c-.6-.3-1.2-.7-1.6-1.3-.5-.5-.9-1.1-1.1-1.9a8 8 0 01-.4-2.5v-3.9zm3.4 0v3.7c0 1 .3 1.7.8 2.3a3 3 0 002.3.8c.9 0 1.6-.3 2.1-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2 3 3 0 00-2.1-.9 3 3 0 00-2.3.9 3 3 0 00-.8 2.2zm13 5.3a4.3 4.3 0 011.6-3.6c.6-.5 1.2-.8 2-1 .7-.3 1.5-.4 2.4-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2h-3.4a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V618H330v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.5-.4c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V611H326c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm13.2-10.7h4.2v-4.8h3.5v4.8h5.9v3.2h-5.9v7.4c0 .5.2 1 .5 1.2.2.3.6.4 1.2.4h3.9v3.2h-4.2c-1.5 0-2.7-.4-3.6-1.3-.8-.8-1.3-2-1.3-3.5v-7.4h-4.2v-3.2zm17.2 5.7c0-.9.2-1.7.5-2.5.3-.7.7-1.3 1.3-1.9a6 6 0 012.1-1.2c.8-.2 1.8-.4 2.8-.4 1 0 2 .2 2.7.5.8.2 1.5.6 2.1 1.2.6.5 1 1.1 1.4 1.9.3.7.4 1.6.4 2.5v2.8h-9.8v1c0 1 .3 1.8.8 2.4.6.6 1.4.9 2.4.9.7 0 1.4-.1 2-.4.5-.2.9-.6 1-1h3.5c-.1.6-.4 1.2-.8 1.7s-.9 1-1.5 1.3a7 7 0 01-1.9.9 8.7 8.7 0 
01-5-.1c-.9-.3-1.6-.7-2.2-1.3-.6-.5-1-1.1-1.3-1.9-.3-.7-.5-1.6-.5-2.5v-3.9zm3.5.5h6.4v-.4c0-1-.3-1.9-.8-2.4-.6-.6-1.4-.9-2.4-.9s-1.8.3-2.4.9c-.5.5-.8 1.3-.8 2.3v.5z"/>
+      <path fill="#3D4251" d="M373.7 605.3c0-1.4.2-2.6.6-3.8a10.3 10.3 0 014.6-5.6c1-.7 2.3-1.2 3.7-1.6v3.5c-.9.2-1.6.5-2.3 1-.6.4-1.2 1-1.7 1.6a9.2 9.2 0 00-1.4 4.9v5.3a8 8 0 001.4 4.8c1 1.3 2.3 2.2 4 2.6v3.5c-2.9-.7-5-2-6.6-4a10.6 10.6 0 01-2.3-6.9v-5.3zm13.8 3c0-.9.2-1.7.5-2.5.3-.7.7-1.3 1.3-1.9a6 6 0 012.1-1.2c.8-.2 1.8-.4 2.8-.4 1 0 2 .2 2.7.5.8.2 1.5.6 2.1 1.2.6.5 1 1.1 1.4 1.9.3.7.4 1.6.4 2.5v2.8H391v1c0 1 .3 1.8.8 2.4.6.6 1.4.9 2.4.9.7 0 1.4-.1 2-.4.5-.2.9-.6 1-1h3.5c-.1.6-.4 1.2-.8 1.7s-.9 1-1.5 1.3a7 7 0 01-1.9.9 8.7 8.7 0 01-5-.1c-.9-.3-1.6-.7-2.2-1.3-.6-.5-1-1.1-1.3-1.9-.3-.7-.5-1.6-.5-2.5v-3.9zm3.5.5h6.4v-.4c0-1-.3-1.9-.8-2.4-.6-.6-1.4-.9-2.4-.9s-1.8.3-2.4.9c-.5.5-.8 1.3-.8 2.3v.5zm18 1.2l-5.1-7.4h4l2.5 3.9.4.7.1.3h.2l.1-.3s0-.2.2-.3c0-.1 0-.3.2-.4l2.6-3.9h4L413 610l5.5 8h-4l-2.9-4.3a8 8 0 01-.5-1.1h-.2l-.1.3a3.2 3.2 0 01-.4.7l-3 4.4h-3.9l5.5-8zm11.8 3.7a4.3 4.3 0 011.6-3.6c.6-.5 1.2-.8 2-1 .7-.3 1.5-.4 2.4-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2h-3.4a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V618h-3.3v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.5-.4c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V611h-3.8c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm13.3 4.7v-15.4h2.8v2.2h.3c0-.7.3-1.4.7-1.8.5-.5 1-.7 1.8-.7.7 0 1.3.2 1.7.7.4.4.7 1 .8 1.8h.2c.1-.7.4-1.4.8-1.8.4-.5 1-.7 1.8-.7 1 0 1.7.4 2.3 1 .6.8.9 1.7.9 3V618h-3.2v-11.5c0-1.1-.4-1.7-1.3-1.7-.4 0-.8.1-1 .5-.2.3-.3.7-.3 1.3V618h-2.5v-11.5c0-.5-.2-1-.4-1.3-.3-.3-.6-.4-1-.4-.5 0-.8.1-1 .5-.2.3-.3.7-.3 1.3V618h-3.1zm17.4-15.4h3.4v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.7 1.2a7.7 7.7 0 011.4 4.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8H455v-20.2zm3.6 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 
00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm12-7.5v-3.2h8.5V613c0 .6.1 1 .5 1.4.3.3.7.4 1.3.4h4.6v3.2h-4.9a5 5 0 01-3.7-1.3 4.9 4.9 0 01-1.3-3.7V601h-5zm17.7 7.4c0-.9.2-1.7.5-2.5.3-.7.7-1.3 1.3-1.9a6 6 0 012.1-1.2c.8-.2 1.8-.4 2.8-.4 1 0 2 .2 2.7.5.8.2 1.5.6 2.1 1.2.6.5 1 1.1 1.4 1.9.3.7.4 1.6.4 2.5v2.8h-9.8v1c0 1 .3 1.8.8 2.4.6.6 1.4.9 2.4.9.7 0 1.4-.1 2-.4.5-.2.9-.6 1-1h3.5c-.1.6-.4 1.2-.8 1.7s-.9 1-1.5 1.3a7 7 0 01-1.9.9 8.7 8.7 0 01-5-.1c-.9-.3-1.6-.7-2.2-1.3-.6-.5-1-1.1-1.3-1.9-.3-.7-.5-1.6-.5-2.5v-3.9zm3.5.5h6.4v-.4c0-1-.3-1.9-.8-2.4-.6-.6-1.4-.9-2.4-.9s-1.8.3-2.4.9c-.5.5-.8 1.3-.8 2.3v.5zm13.2 5.3h3.6c.1.4.4.8.9 1 .4.4 1 .5 1.7.5h1.1c1 0 1.6-.2 2-.5.5-.4.8-.8.8-1.4 0-1-.8-1.7-2.4-2l-2.2-.2c-1.7-.2-3-.7-3.9-1.4-.8-.8-1.2-1.8-1.2-3.2a4 4 0 011.5-3.4 7 7 0 014.4-1.2h1.1a7 7 0 014.1 1.1c1 .7 1.7 1.7 1.9 3h-3.6c-.1-.4-.4-.8-.8-1a3 3 0 00-1.6-.4h-1.1c-1.7 0-2.5.6-2.5 1.7 0 1 .7 1.6 2 1.8l2.3.3c1.9.2 3.2.7 4.1 1.5.9.8 1.3 1.8 1.3 3.2 0 1.6-.5 2.7-1.6 3.5-1 .9-2.6 1.3-4.6 1.3h-1c-1.9 0-3.3-.4-4.4-1.1-1-.8-1.7-1.8-1.9-3.1zm18.4-19.8c1.4.4 2.6 1 3.7 1.6a11 11 0 014.6 5.6c.4 1.2.6 2.4.6 3.8v5.3c0 2.8-.7 5-2.3 7a12.2 12.2 0 01-6.6 3.9V618c.9-.2 1.6-.6 2.3-1a6.6 6.6 0 002.7-3.8c.3-.8.4-1.7.4-2.6v-5.3c0-1-.1-1.9-.4-2.7a6.5 6.5 0 00-5-4.8v-3.5z"/>
+      <g>
+        <path fill="#3D4251" d="M228.2 688.6h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V704h-3.5v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-15.4zm15.7-1.7v-3.2h8.4V699c0 .6.1 1 .5 1.4.3.3.7.4 1.3.4h4.6v3.2h-4.9a5 5 0 01-3.7-1.3 4.9 4.9 0 01-1.3-3.7V687h-5zm18 1.7h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.6 1.2.5.6.9 1.2 1.1 2 .3.7.4 1.5.4 2.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8h-3.5v-20.2zm3.5 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm16.9 7.1c0-.8.2-1.5.7-2 .5-.5 1.2-.7 2-.7s1.5.2 2 .7c.5.5.7 1.2.7 2 0 .9-.2 1.5-.7 2-.5.5-1.2.8-2 .8s-1.5-.3-2-.8-.7-1.1-.7-2z"/>
+        <path fill="#F03969" d="M294.7 688.6h4.2v-4.8h3.5v4.8h5.9v3.2h-5.9v7.4c0 .5.2 1 .5 1.2.2.3.6.4 1.2.4h3.9v3.2h-4.2c-1.5 0-2.7-.4-3.6-1.3-.8-.8-1.3-2-1.3-3.5v-7.4h-4.2v-3.2zm23.9 15.7c-1 0-2-.2-2.8-.5a5.5 5.5 0 01-3.5-3.2c-.2-.7-.4-1.5-.4-2.4v-3.8c0-1 .2-1.7.5-2.5a5.5 5.5 0 013.4-3.1 8.3 8.3 0 015.5 0 5.5 5.5 0 013.5 3.1c.3.8.5 1.6.5 2.5v3.8c0 .9-.2 1.7-.5 2.5a5.5 5.5 0 01-3.4 3.1c-.8.3-1.8.5-2.8.5zm-3.2-6.1c0 1 .3 1.6.9 2.2.5.5 1.3.8 2.3.8 1 0 1.8-.3 2.3-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2c-.5-.6-1.3-.8-2.3-.8-1 0-1.8.2-2.3.8a3 3 0 00-.9 2.2v3.8zm13.3 6.5H342v3.2h-13.4v-3.2zm16.8-10.4c0-.9.1-1.7.4-2.5.2-.7.6-1.3 1-1.9.5-.5 1-.9 1.7-1.2.7-.2 1.4-.4 2.2-.4 1.1 0 2 .3 2.8.8.7.5 1.1 1.2 1.3 2.2h.3v-1a8.7 8.7 0 01-.1-1.7v-4.8h3.5V704h-3.4v-2.7h-.3c-.2 1-.6 1.7-1.3 2.2a5.4 5.4 0 01-5 .4c-.6-.3-1.2-.7-1.6-1.3-.5-.5-.9-1.1-1.1-1.9a8 8 0 01-.4-2.5v-3.9zm3.5.1v3.7c0 1 .3 1.7.8 2.3a3 3 0 002.3.8c.9 0 1.6-.3 2.1-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2 3 3 0 00-2.1-.9 3 3 0 00-2.3.9 3 3 0 00-.8 2.2zm14 6.4h5.4v-9h-4.8v-3.2h8.2v12.2h4.5v3.2h-13.4v-3.2zm4.4-16.4c0-.5.2-1 .6-1.3a2 2 0 011.4-.5h.7c.6 0 1.1.2 1.5.5.4.3.5.8.5 1.3 0 .6-.1 1-.5 1.4a2 2 0 01-1.5.5h-.7a2 2 0 01-1.4-.5c-.4-.4-.6-.8-.6-1.4zm11.6 15.7h3.6c.1.4.4.8.9 1 .4.4 1 .5 1.7.5h1.1c1 0 1.6-.2 2-.5.5-.4.8-.8.8-1.4 0-1-.8-1.7-2.4-2l-2.2-.2c-1.7-.2-3-.7-3.9-1.4-.8-.8-1.2-1.8-1.2-3.2a4 4 0 011.5-3.4 7 7 0 014.4-1.2h1.1a7 7 0 014.1 1.1c1 .7 1.7 1.7 1.9 3h-3.6c-.1-.4-.4-.8-.8-1a3 3 0 00-1.6-.4h-1.1c-1.7 0-2.5.6-2.5 1.7 0 1 .7 1.6 2 1.8l2.3.3c1.9.2 3.2.7 4.1 1.5.9.8 1.3 1.8 1.3 3.2 0 1.6-.5 2.7-1.6 3.5-1 .9-2.6 1.3-4.6 1.3h-1c-1.9 0-3.3-.4-4.4-1.1-1-.8-1.7-1.8-1.9-3.1zm17.4-16.3h3.5v10.9h2.5l3.8-6.1h4l-4.8 7.5 4.8 7.9h-4l-3.8-6.3h-2.5v6.3h-3.5v-20.2z"/>
+        <path fill="#3D4251" d="M415.7 691.3c0-1.4.2-2.6.6-3.8a10.3 10.3 0 014.6-5.6c1-.7 2.3-1.2 3.7-1.6v3.5c-.9.2-1.6.5-2.3 1-.6.4-1.2 1-1.7 1.6a9.2 9.2 0 00-1.4 4.9v5.3a8 8 0 001.4 4.8c1 1.3 2.3 2.2 4 2.6v3.5c-2.9-.7-5-2-6.6-4a10.6 10.6 0 01-2.3-6.9v-5.3zm14.1-2.7h3.4v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.6 1.2.5.6.9 1.2 1.1 2 .3.7.4 1.5.4 2.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8h-3.5v-20.2zm3.6 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm12.6 5.3a4.3 4.3 0 011.6-3.6c.6-.5 1.2-.8 2-1 .7-.3 1.5-.4 2.4-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2h-3.4a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V704H456v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.5-.4c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V697H452c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm13.2-10.7h4.2v-4.8h3.5v4.8h5.9v3.2h-5.9v7.4c0 .5.2 1 .4 1.2.3.3.7.4 1.3.4h3.9v3.2h-4.2c-1.5 0-2.7-.4-3.6-1.3-.8-.8-1.3-2-1.3-3.5v-7.4h-4.2v-3.2zm17.5-4.8h3.5v4.8a14.6 14.6 0 01-.2 2.4v.3h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.7.5 3.7 1.5.9 1 1.3 2.4 1.3 4.1V704h-3.5v-9.7a3 3 0 00-.7-2.2c-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-20.2zm18-3.5c1.4.4 2.6 1 3.7 1.6a11 11 0 014.6 5.6c.4 1.2.6 2.4.6 3.8v5.3c0 2.8-.7 5-2.3 7a12.2 12.2 0 01-6.6 3.9V704c.9-.2 1.6-.6 2.3-1a6.6 6.6 0 002.7-3.8c.3-.8.4-1.7.4-2.6v-5.3c0-1-.1-1.9-.4-2.7a6.5 6.5 0 00-5-4.8v-3.5z"/>
+      </g>
+      <g>
+        <path fill="#3D4251" d="M1260.4 519.6h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.8.5 3.7 1.6a6 6 0 011.3 4V535h-3.5v-9.6c0-1-.2-1.7-.7-2.3-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-15.4zm15.7-1.7v-3.2h8.4V530c0 .6.1 1 .5 1.4.3.3.7.4 1.3.4h4.6v3.2h-4.9a5 5 0 01-3.7-1.3 4.9 4.9 0 01-1.3-3.7V518h-5zm18 1.7h3.3v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.6 1.2.5.6.9 1.2 1.1 2 .3.7.4 1.5.4 2.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8h-3.5v-20.2zm3.5 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm16.9 7.1c0-.8.2-1.5.7-2 .5-.5 1.2-.7 2-.7s1.5.2 2 .7c.5.5.7 1.2.7 2 0 .9-.2 1.5-.7 2-.5.5-1.2.8-2 .8s-1.5-.3-2-.8-.7-1.1-.7-2z"/>
+        <path fill="#3AC" d="M1327.1 524.5v-3.2h4.2v-2a4 4 0 011.4-3.3c.9-.8 2-1.2 3.5-1.2h4.5v3.1h-4.4c-.4 0-.8.1-1 .4-.3.3-.4.6-.4 1v2h5.8v3.2h-5.8V535h-3.6v-10.5h-4.2zm21.3-4.9v2.7h.2c.2-1 .7-1.7 1.4-2.2a5 5 0 012.9-.8 5 5 0 013.9 1.6 6 6 0 011.4 4.2v1.1h-3.7v-.8c0-1-.3-1.7-.8-2.3a3 3 0 00-2.3-.8c-.9 0-1.6.3-2 .8-.6.6-.8 1.4-.8 2.3v9.6h-3.5v-15.4h3.3zm19.2 15.7c-1 0-2-.2-2.8-.5a5.5 5.5 0 01-3.5-3.2c-.2-.7-.4-1.5-.4-2.4v-3.8c0-1 .2-1.7.5-2.5a5.5 5.5 0 013.4-3.1 8.3 8.3 0 015.5 0 5.5 5.5 0 013.5 3.1c.3.8.5 1.6.5 2.5v3.8c0 .9-.2 1.7-.5 2.5a5.5 5.5 0 01-3.4 3.1c-.8.3-1.8.5-2.8.5zm-3.2-6.1c0 1 .3 1.6.9 2.2.5.5 1.3.8 2.3.8 1 0 1.8-.3 2.3-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2c-.5-.6-1.3-.8-2.3-.8-1 0-1.8.2-2.3.8a3 3 0 00-.9 2.2v3.8zm13 5.8v-15.4h2.8v2.2h.3c0-.7.3-1.4.7-1.8.5-.5 1-.7 1.8-.7.7 0 1.3.2 1.7.7.4.4.7 1 .8 1.8h.2c.1-.7.4-1.4.8-1.8.4-.5 1-.7 1.8-.7 1 0 1.7.4 2.3 1 .6.8.9 1.7.9 3V535h-3.2v-11.5c0-1.1-.4-1.7-1.3-1.7-.4 0-.8.1-1 .5-.2.3-.3.7-.3 1.3V535h-2.5v-11.5c0-.5-.2-1-.4-1.3-.3-.3-.6-.4-1-.4-.5 0-.8.1-1 .5-.2.3-.3.7-.3 1.3V535h-3.1zm17 .7h13.5v3.2h-13.4v-3.2zm17-10.4c0-.9 0-1.7.3-2.5.2-.7.6-1.3 1-1.9.5-.5 1-.9 1.7-1.2.7-.2 1.4-.4 2.2-.4 1.1 0 2 .3 2.8.8.7.5 1.1 1.2 1.3 2.2h.3v-1a8.7 8.7 0 01-.1-1.7v-4.8h3.5V535h-3.4v-2.7h-.3c-.2 1-.6 1.7-1.3 2.2a5.4 5.4 0 01-5 .4c-.6-.3-1.2-.7-1.6-1.3-.5-.5-.9-1.1-1.1-1.9a8 8 0 01-.4-2.5v-3.9zm3.4.1v3.7c0 1 .3 1.7.8 2.3a3 3 0 002.3.8c.9 0 1.6-.3 2.1-.8a3 3 0 00.9-2.2v-3.8a3 3 0 00-.9-2.2 3 3 0 00-2.1-.9 3 3 0 00-2.3.9 3 3 0 00-.8 2.2zm14 6.4h5.4v-9h-4.8v-3.2h8.2v12.2h4.5v3.2h-13.4v-3.2zm4.4-16.4c0-.5.2-1 .6-1.3a2 2 0 011.4-.5h.7c.6 0 1.1.2 1.5.5.4.3.5.8.5 1.3 0 .6-.1 1-.5 1.4a2 2 0 01-1.5.5h-.7a2 2 0 01-1.4-.5c-.4-.4-.6-.8-.6-1.4zm11.6 15.7h3.6c.1.4.4.8.9 1 .4.4 1 .5 1.7.5h1.1c1 0 1.6-.2 2-.5.5-.4.8-.8.8-1.4 0-1-.8-1.7-2.4-2l-2.2-.2c-1.7-.2-3-.7-3.9-1.4-.8-.8-1.2-1.8-1.2-3.2a4 4 0 011.5-3.4 7 7 0 014.4-1.2h1.1a7 7 0 014.1 1.1c1 .7 1.7 1.7 1.9 3h-3.6c-.1-.4-.4-.8-.8-1a3 3 0 00-1.6-.4h-1.1c-1.7 0-2.5.6-2.5 1.7 0 1 .7 1.6 2 
1.8l2.3.3c1.9.2 3.2.7 4.1 1.5.9.8 1.3 1.8 1.3 3.2 0 1.6-.5 2.7-1.6 3.5-1 .9-2.6 1.3-4.6 1.3h-1c-1.9 0-3.3-.4-4.4-1.1-1-.8-1.7-1.8-1.9-3.1zm17.4-16.3h3.5v10.9h2.5l3.8-6.1h4l-4.8 7.5 4.8 7.9h-4l-3.8-6.3h-2.5v6.3h-3.5v-20.2z"/>
+        <path fill="#3D4251" d="M1481.5 522.3c0-1.4.2-2.6.6-3.8a10.3 10.3 0 014.6-5.6c1-.7 2.3-1.2 3.7-1.6v3.5c-.9.2-1.6.5-2.3 1-.6.4-1.2 1-1.7 1.6a9.2 9.2 0 00-1.4 4.9v5.3a8 8 0 001.4 4.8c1 1.3 2.3 2.2 4 2.6v3.5c-2.9-.7-5-2-6.6-4a10.6 10.6 0 01-2.3-6.9v-5.3zm14.1-2.7h3.4v2.7h.3c.2-1 .6-1.7 1.3-2.2a5.5 5.5 0 015-.4c.6.3 1.2.7 1.6 1.2.5.6.9 1.2 1.1 2 .3.7.4 1.5.4 2.4v3.9a8 8 0 01-.4 2.5c-.2.8-.6 1.4-1 2-.5.5-1 .9-1.7 1.2-.7.2-1.4.4-2.2.4-1.1 0-2-.3-2.8-.8a3.4 3.4 0 01-1.3-2.2h-.3v.4l.1.6a28.8 28.8 0 000 1.7v4.8h-3.5v-20.2zm3.6 5.8v3.8c0 .9.2 1.6.8 2.2.5.5 1.2.8 2.1.8a3 3 0 002.3-.8c.5-.6.8-1.4.8-2.3v-3.7a3 3 0 00-.8-2.2 3 3 0 00-2.3-.9c-.9 0-1.6.3-2.1.9a3 3 0 00-.8 2.2zm12.6 5.3a4.3 4.3 0 011.6-3.6c.6-.5 1.2-.8 2-1 .7-.3 1.5-.4 2.4-.4h3.9v-1.3a2 2 0 00-.8-1.7c-.5-.4-1.3-.6-2.2-.6a4 4 0 00-2 .4 2 2 0 00-1 1.2h-3.4a4.6 4.6 0 012-3.2c.6-.4 1.3-.7 2-.9a9.6 9.6 0 015 0 6 6 0 012.1 1 4.6 4.6 0 011.8 3.8V535h-3.3v-3h-.3c-.1 1-.7 1.8-1.5 2.4-1 .6-2 .9-3.3.9-1.5 0-2.7-.4-3.6-1.3-1-.8-1.4-2-1.4-3.3zm3.5-.4c0 .7.3 1.2.7 1.6a3 3 0 002 .6c1 0 2-.2 2.6-.7.7-.5 1-1.2 1-2V528h-3.8c-.7 0-1.4.2-1.8.6a2 2 0 00-.7 1.7zm13.2-10.7h4.2v-4.8h3.5v4.8h5.9v3.2h-5.9v7.4c0 .5.2 1 .5 1.2.2.3.6.4 1.2.4h3.9v3.2h-4.2c-1.5 0-2.7-.4-3.6-1.3-.8-.8-1.3-2-1.3-3.5v-7.4h-4.2v-3.2zm17.5-4.8h3.5v4.8a14.6 14.6 0 01-.2 2.4v.3h.3c.2-1 .6-1.7 1.3-2.2.7-.5 1.6-.8 2.8-.8 1.5 0 2.7.5 3.7 1.5.9 1 1.3 2.4 1.3 4.1V535h-3.5v-9.7a3 3 0 00-.7-2.2c-.5-.5-1.2-.8-2.1-.8-.9 0-1.6.3-2.1.9-.5.5-.8 1.3-.8 2.2v9.6h-3.5v-20.2zm18-3.5c1.4.4 2.6 1 3.7 1.6a11 11 0 014.6 5.6c.4 1.2.6 2.4.6 3.8v5.3c0 2.8-.7 5-2.3 7a12.2 12.2 0 01-6.6 3.9V535c.9-.2 1.6-.6 2.3-1a6.6 6.6 0 002.7-3.8c.3-.8.4-1.7.4-2.6v-5.3c0-1-.1-1.9-.4-2.7a6.5 6.5 0 00-5-4.8v-3.5z"/>
+      </g>
+      <path fill="#FFF" d="M323 161.2l-1.8-.3c-.5-.2-1-.4-1.3-.8-.4-.3-.7-.7-.9-1.2a4 4 0 01-.3-1.6v-2.5c0-.6.1-1.1.3-1.6.2-.5.5-.9.9-1.2a4 4 0 011.3-.8 5.3 5.3 0 013.4 0 4 4 0 012.7 3.2h-2.2c-.1-.4-.3-.8-.7-1.1-.4-.3-.8-.4-1.4-.4-.6 0-1.1.1-1.5.5-.3.3-.5.8-.5 1.4v2.5c0 .6.2 1 .5 1.4.4.4.9.5 1.5.5.5 0 1-.1 1.4-.4.4-.3.6-.7.7-1.1h2.2c0 .5-.2 1-.5 1.4a4 4 0 01-2.2 1.8 5 5 0 01-1.6.3zm10.2 0l-1.8-.3a4 4 0 01-1.3-.8l-.9-1.2a4 4 0 01-.3-1.6v-2.5c0-.6.1-1.1.3-1.6a3.5 3.5 0 012.2-2l1.8-.3 1.8.3c.5.2 1 .4 1.3.8.4.3.7.7.9 1.2.2.5.3 1 .3 1.6v2.5l-.3 1.6a3.5 3.5 0 01-2.2 2c-.5.2-1.1.3-1.8.3zm-2-4c0 .7.1 1.1.5 1.5.4.3.9.5 1.5.5a2 2 0 001.5-.5c.4-.4.6-.8.6-1.4v-2.5c0-.6-.2-1-.6-1.4a2 2 0 00-1.5-.5 2 2 0 00-1.5.5c-.4.4-.6.8-.6 1.4v2.5zm8.2-6.1h2.1v1.7h.2c.1-.6.4-1 .8-1.4a3 3 0 011.8-.5c1 0 1.8.3 2.4 1 .6.7.9 1.5.9 2.6v6.5h-2.3v-6.2c0-.6-.1-1-.5-1.4-.3-.4-.7-.5-1.3-.5-.6 0-1 .1-1.4.5a2 2 0 00-.5 1.4v6.2h-2.2v-9.9zm10 3.1v-2h2.7v-1.3c0-.9.3-1.6.8-2.1.6-.5 1.4-.8 2.3-.8h3v2h-3a1 1 0 00-.6.3c-.2.1-.2.3-.2.6v1.3h3.7v2h-3.7v6.8H352v-6.8h-2.7zm10.8 4.7h3.5v-5.8h-3v-2h5.2v7.8h2.9v2.1h-8.6v-2zm2.9-10.5c0-.3.1-.6.3-.8.3-.3.6-.4 1-.4h.4c.4 0 .7.1 1 .4.2.2.3.5.3.8 0 .4-.1.7-.3.9-.3.2-.6.3-1 .3h-.4c-.4 0-.7-.1-1-.3-.2-.2-.3-.5-.3-.9zm7 6.4c0-.6 0-1.1.2-1.6.1-.5.4-.9.7-1.2a3 3 0 011-.8l1.5-.3a3 3 0 011.8.6c.5.3.8.8.9 1.4h.1v-1.8h2.2v9.3c0 .6-.1 1-.3 1.5-.2.5-.5.9-.8 1.2l-1.3.7c-.5.2-1.1.3-1.7.3h-2.7v-2h2.7c.5 0 1 0 1.3-.4.4-.3.5-.7.5-1.3v-.2l.1-1.9h-.1c-.1.6-.4 1.1-1 1.5a3 3 0 01-1.7.5c-.6 0-1 0-1.4-.3a3.2 3.2 0 01-1.8-2c-.2-.4-.3-1-.3-1.5v-1.7zm2.2 1.6c0 .6.2 1 .5 1.4.4.4.9.5 1.5.5s1-.1 1.4-.5c.4-.3.5-.8.5-1.4v-1.6c0-.6-.1-1-.5-1.4a2 2 0 00-1.4-.5 2 2 0 00-1.5.5 2 2 0 00-.5 1.4v1.6zm10.7 3c0-.5.1-1 .4-1.3.3-.3.8-.4 1.3-.4s1 .1 1.3.4c.3.3.4.8.4 1.3s-.1 1-.4 1.3c-.3.3-.8.5-1.3.5s-1-.2-1.3-.5c-.3-.3-.4-.8-.4-1.3zm12.1 1.8l-1.8-.3c-.5-.2-1-.4-1.3-.8-.4-.3-.7-.7-.9-1.2a4 4 0 01-.3-1.6v-2.5c0-.6.1-1.1.3-1.6.2-.5.5-.9.9-1.2a4 4 0 011.3-.8 5.3 5.3 0 013.4 0 4 4 0 012.7 
3.2h-2.2c-.1-.4-.3-.8-.7-1-.4-.4-.8-.5-1.4-.5-.6 0-1.1.1-1.5.5-.3.3-.5.8-.5 1.4v2.5c0 .6.2 1 .5 1.4.4.4.9.5 1.5.5.5 0 1-.1 1.4-.4.4-.3.6-.7.7-1.1h2.2c0 .5-.2 1-.5 1.4a4 4 0 01-2.2 1.8 5 5 0 01-1.6.3zm5.8-7v-2h2.7v-1.3c0-.9.3-1.6.8-2 .6-.6 1.4-.9 2.3-.9h3v2h-3a1 1 0 00-.6.3c-.2.1-.2.3-.2.6v1.3h3.7v2H406v6.8h-2.3v-6.8h-2.7zm10.3.6c0-.6.1-1.1.3-1.6.1-.5.4-.9.7-1.2a3 3 0 011-.8l1.5-.3a3 3 0 011.8.6c.5.3.8.8.9 1.4h.1v-1.8h2.2v9.3c0 .6-.1 1-.3 1.5-.2.5-.5.9-.8 1.2l-1.3.7c-.5.2-1.1.3-1.7.3h-2.7v-2h2.7c.5 0 1 0 1.3-.4.4-.3.5-.7.5-1.3v-.2l.1-1.9h-.1c-.1.6-.4 1.1-1 1.5a3 3 0 01-1.7.5c-.6 0-1 0-1.4-.3a3.2 3.2 0 01-1.8-2c-.2-.4-.3-1-.3-1.5v-1.7zm2.3 1.6c0 .6.2 1 .5 1.4.4.4.9.5 1.5.5s1-.1 1.4-.5c.4-.3.5-.8.5-1.4v-1.6c0-.6-.1-1-.5-1.4a2 2 0 00-1.4-.5 2 2 0 00-1.5.5 2 2 0 00-.5 1.4v1.6zM323.7 253v4.5H315V282h-5.4v-24.5h-8.8v-4.4h22.9zm-1.2 29v-20.5h2.9c.5 0 .9 0 1 .3.3.1.4.5.5 1l.3 2.4a9.7 9.7 0 012.5-3c1-.7 2.1-1.1 3.4-1.1 1 0 1.8.2 2.5.7l-.7 3.7c0 .2 0 .4-.2.5l-.6.2-.8-.2-1.4-.1c-1 0-1.9.3-2.6.8-.7.6-1.4 1.4-1.9 2.5V282h-5zm30.7 0H351c-.5 0-.8 0-1.1-.2-.3-.2-.5-.4-.6-.9l-.4-1.4a17 17 0 01-3.1 2.1 8 8 0 01-1.7.5 10.2 10.2 0 01-4.4-.1l-2-1a6.2 6.2 0 01-1.7-4.2c.1-.8.3-1.6.7-2.3.4-.8 1.1-1.5 2-2a13 13 0 013.8-1.6c1.6-.4 3.6-.6 5.9-.7V269c0-1.3-.3-2.3-.9-3-.5-.7-1.4-1-2.5-1a6.1 6.1 0 00-3.4 1l-1 .5-1.2.3c-.3 0-.6 0-.8-.2a2 2 0 01-.6-.7l-1-1.6c2.4-2.1 5.3-3.2 8.6-3.2 1.2 0 2.3.2 3.2.6a6.8 6.8 0 014 4.1c.3 1 .4 2 .4 3.2v13zm-9.6-3l1.4-.2a5.3 5.3 0 002.3-1.1l1.1-1v-3.5a27 27 0 00-3.6.4c-1 .2-1.8.4-2.3.7a3 3 0 00-1.3 1c-.2.4-.4.8-.4 1.2 0 .9.3 1.5.8 1.9.5.3 1.2.5 2 .5zm18.6-17.5V282h-5v-20.5h5zm.8-6l-.3 1.2a3.3 3.3 0 01-1.7 1.6 3.1 3.1 0 01-3.4-.6l-.7-1a3.1 3.1 0 01.7-3.4 3.2 3.2 0 012.2-1l1.2.3a3.3 3.3 0 011.7 1.7c.2.3.3.8.3 1.2zm3.7 26.5v-20.5h3c.7 0 1 .3 1.3.9l.3 1.6a11.3 11.3 0 012.8-2 7.4 7.4 0 013.6-.8c1 0 2 .1 3 .5.8.4 1.5 1 2.1 1.6a7 7 0 011.3 2.5c.3 1 .5 2 .5 3.1V282h-5v-13c0-1.3-.2-2.3-.8-3-.6-.7-1.5-1-2.6-1-.9 0-1.7.2-2.4.6-.8.4-1.5.9-2.2 
1.6V282h-4.9zm26.9-20.5V282h-5v-20.5h5zm.7-6c0 .4 0 .8-.2 1.2a3.3 3.3 0 01-1.7 1.7 3.1 3.1 0 01-3.5-.7c-.3-.3-.5-.6-.6-1a3.1 3.1 0 01.7-3.4 3.2 3.2 0 012.1-1c.5 0 .9.1 1.3.3a3.3 3.3 0 011.7 1.7l.2 1.2zm3.8 26.5v-20.5h3c.6 0 1 .3 1.2.9l.4 1.6a11.3 11.3 0 012.7-2 7.4 7.4 0 013.6-.8c1.1 0 2.1.1 3 .5.9.4 1.6 1 2.2 1.6a7 7 0 011.3 2.5c.3 1 .4 2 .4 3.1V282H411v-13c0-1.3-.3-2.3-.9-3-.6-.7-1.4-1-2.6-1-.8 0-1.6.2-2.4.6-.7.4-1.4.9-2.1 1.6V282h-5zm28.7-20.9c.9 0 1.7.1 2.5.3.7.2 1.4.4 2 .8h6v1.8c0 .3 0 .6-.3.7-.1.2-.4.3-.8.4l-1.8.3a6.4 6.4 0 01.4 2.4 6.1 6.1 0 01-2.3 4.8c-.7.6-1.5 1-2.5 1.3a10.4 10.4 0 01-5.5.3c-.6.4-1 .8-1 1.3 0 .4.3.7.6.9l1.5.4 2.1.2a44 44 0 014.9.4l2.1.8c.6.4 1.1.9 1.5 1.5.4.6.6 1.4.6 2.4 0 .9-.3 1.8-.7 2.6a7 7 0 01-2 2.3c-.8.7-1.8 1.2-3 1.6a16.2 16.2 0 01-8.3.2 9.5 9.5 0 01-2.9-1.2c-.7-.5-1.3-1.1-1.7-1.8a4 4 0 01-.5-2c0-1 .3-1.8.9-2.4a6 6 0 012.4-1.6c-.5-.3-1-.6-1.2-1.1-.3-.5-.5-1-.5-1.8 0-.3 0-.6.2-1 0-.3.3-.6.5-1l.8-.8c.3-.3.7-.6 1.2-.8-1-.6-1.9-1.3-2.5-2.3a6 6 0 01-.9-3.2 6.1 6.1 0 012.4-4.9c.7-.6 1.5-1 2.5-1.3 1-.3 2.1-.5 3.3-.5zm5.4 21.8c0-.4-.1-.7-.4-1l-1-.6-1.4-.3a26.7 26.7 0 00-3.7-.2l-1.9-.2c-.5.3-1 .7-1.3 1.1-.4.4-.5 1-.5 1.5 0 .3 0 .7.2 1 .2.3.5.6.9.8l1.5.5 2.3.2c1 0 1.7 0 2.4-.2s1.2-.3 1.7-.6c.4-.2.7-.5 1-.9l.2-1zm-5.4-11.7c.6 0 1.1 0 1.6-.2l1.1-.7.7-1c.2-.5.2-1 .2-1.4 0-1-.3-1.8-.9-2.4-.6-.6-1.5-1-2.7-1-1.2 0-2.1.4-2.7 1-.7.6-1 1.4-1 2.4 0 .5.1 1 .3 1.3a3 3 0 001.8 1.8c.5.2 1 .2 1.6.2zM1352.8 273.7V285h-5.3v-29h8.8c2 0 3.6.3 5 .7 1.5.4 2.6 1 3.5 1.7 1 .7 1.6 1.6 2 2.6.4 1 .6 2.2.6 3.4a8.7 8.7 0 01-1.7 5.1c-.5.7-1.2 1.3-2 1.9-.7.5-1.6 1-2.7 1.3.7.3 1.3.9 1.8 1.6l7.2 10.7h-4.8c-.5 0-.9 0-1.2-.3-.3-.2-.6-.4-.8-.8l-6.1-9.2c-.2-.4-.5-.6-.8-.8l-1.2-.2h-2.3zm0-3.9h3.4c1 0 1.9 0 2.6-.3l2-1a5.5 5.5 0 001.4-3.6c0-1.6-.5-2.7-1.5-3.5-1-.8-2.4-1.2-4.4-1.2h-3.5v9.6zm23.8-5.3v13c0 1.3.3 2.2.9 3 .6.6 1.4 1 2.6 1 .8 0 1.6-.2 2.4-.6.7-.4 1.4-1 2.1-1.6v-14.8h5V285h-3c-.7 0-1.1-.3-1.3-.9l-.4-1.6-1.3 1.1a8 8 0 01-5 1.7c-1.1 0-2.1-.2-3-.6a6 6 0 01-2.2-1.6 7 7 0 
01-1.3-2.4c-.3-1-.4-2-.4-3.2v-13h5zm17.1 20.5v-20.5h3c.7 0 1.1.3 1.3.9l.3 1.6a11.3 11.3 0 012.8-2 7.4 7.4 0 013.6-.8c1.1 0 2 .1 3 .5.8.4 1.5 1 2.1 1.6a7 7 0 011.3 2.5c.3 1 .5 2 .5 3.1V285h-5v-13c0-1.3-.2-2.3-.8-3-.6-.7-1.5-1-2.6-1-.9 0-1.7.2-2.4.6-.8.4-1.5.9-2.1 1.6V285h-5zm28.4.3c-1.8 0-3.2-.5-4.1-1.5-1-1-1.5-2.4-1.5-4.2v-11.4h-2a1 1 0 01-.7-.3c-.2-.2-.3-.4-.3-.8v-2l3.3-.5 1-5.6c0-.2.2-.4.4-.6l.7-.2h2.6v6.5h5.4v3.5h-5.4v11.1c0 .6.1 1.1.5 1.5.3.4.7.5 1.2.5h.8a4.2 4.2 0 001-.5l.3-.1h.4l.3.4 1.5 2.4c-.8.6-1.6 1-2.5 1.4-1 .3-2 .4-3 .4zm12.9-20.8V285h-5v-20.5h5zm.7-6c0 .4 0 .8-.2 1.2a3.3 3.3 0 01-1.7 1.7 3.1 3.1 0 01-3.5-.7l-.7-1a3.1 3.1 0 01.7-3.4 3.2 3.2 0 012.2-1c.5 0 .9.1 1.3.3a3.3 3.3 0 011.7 1.7l.2 1.2zm3.7 26.5v-20.5h3c.7 0 1.1.3 1.3.9l.3 1.5 1.2-1.1a7.3 7.3 0 012.7-1.4c.5-.2 1-.2 1.7-.2 1.3 0 2.3.3 3.2 1 .8.7 1.4 1.6 1.8 2.8a6.3 6.3 0 012.8-3l1.8-.6a8.8 8.8 0 015 .3c.9.3 1.6.8 2.2 1.5.6.7 1 1.5 1.4 2.4.3 1 .5 2.1.5 3.3V285h-5v-13c0-1.4-.3-2.3-.8-3-.6-.7-1.4-1-2.6-1-.5 0-1 0-1.4.3a3.4 3.4 0 00-1.9 2c-.2.4-.3 1-.3 1.6V285h-5v-13c0-1.4-.2-2.4-.7-3-.6-.7-1.4-1-2.5-1a4 4 0 00-2 .5c-.6.4-1.2.9-1.7 1.5v15h-5zm41-20.8c1.4 0 2.6.2 3.7.6a7.8 7.8 0 014.6 4.7 11 11 0 01.6 4.9c0 .2 0 .4-.2.6l-.3.3h-13.2c.1 2.2.7 3.7 1.7 4.7s2.2 1.5 3.9 1.5c.8 0 1.5-.1 2-.3l1.6-.6 1.1-.6a2 2 0 011-.3l.5.1.4.3 1.4 1.8-1.8 1.7a10.3 10.3 0 01-4.4 1.5l-2.3.2a11 11 0 01-4-.7 9 9 0 01-3.1-2.1c-1-1-1.7-2-2.2-3.4a13.3 13.3 0 010-8.8 9.2 9.2 0 015-5.4c1.2-.5 2.6-.7 4-.7zm.2 3.5c-1.4 0-2.6.4-3.4 1.2a6 6 0 00-1.5 3.5h9.2c0-.7 0-1.3-.2-1.8-.2-.6-.5-1-.8-1.5-.4-.4-.8-.8-1.4-1-.5-.3-1.2-.4-2-.4zM1382.6 66.5l-.6-.6-.2-.7.1-.7c0-.3.2-.5.4-.7l.9-.9.6-.5.7-.1c.2 0 .5 0 .7.2a2.5 2.5 0 011.2 1 2 2 0 01-.1 2l-.8-.6c.1-.2.2-.4.1-.7a1 1 0 00-.3-.6 1 1 0 00-.7-.3c-.2 0-.5.1-.7.3l-.8 1c-.2.1-.3.4-.3.6l.4.7.6.3c.3 0 .5 0 .6-.2l.8.7a1.9 1.9 0 01-2 .2c-.2 0-.4-.2-.6-.4zm3.5 3.4l-.5-.7-.2-.7.1-.7c0-.3.2-.5.4-.7l.8-.9.6-.4a1.7 1.7 0 011.5 0l.7.5.5.7c.2.2.2.4.3.7l-.1.7-.5.7-.8.8a2 2 0 01-.6.5 1.7 1.7 0 01-1.4 
0l-.8-.5zm.6-2c-.2.1-.3.4-.3.6 0 .3.2.5.4.7a.9.9 0 001.4 0l.7-.9c.2-.2.3-.4.3-.7a1 1 0 00-.3-.6 1 1 0 00-.7-.3c-.3 0-.5 0-.7.3l-.8.8zm5 .5l.7.7-.6.6c.3-.2.6-.2.8-.2.3 0 .6.2.8.4.4.4.5.7.5 1.2 0 .4-.2.8-.5 1.2l-2.2 2.2-.7-.7 2-2.2.3-.6c0-.3-.1-.5-.3-.7a.8.8 0 00-.7-.2 1 1 0 00-.6.3l-2 2.2-.8-.7 3.2-3.5zm2.4 4.3l.7-.7 1 1 .4-.6c.2-.3.6-.4 1-.4.3 0 .6.2 1 .5l1 1-.6.6-1-.9a.5.5 0 00-.4-.1l-.3.1-.4.5 1.3 1.2-.6.7-1.4-1.2-2.2 2.3-.8-.7 2.3-2.4-1-.9zm2.3 5.2l1.2 1.2 1.9-2-1-1 .6-.8 1.8 1.8-2.5 2.7 1 1-.7.7-3-2.8.7-.8zm4.4-2.7l.4-.2c.2 0 .3 0 .5.2l.1.1c.2.2.2.3.2.5l-.1.4-.4.2-.5-.2-.1-.2a.6.6 0 01-.2-.4l.1-.4zm.4 4.5l.6-.5.6-.2.7.1c.2 0 .4.2.5.4.3.2.5.5.5.8l-.1.8.6-.6.7.7-3 3.2-.6.5-.7.1c-.2 0-.4 0-.7-.2-.2 0-.4-.2-.6-.4l-1-.9.6-.7 1 1 .6.2.6-.3.7-.7a1 1 0 01-.8.2c-.3 0-.6-.2-.8-.4l-.4-.6a1.5 1.5 0 010-1.3c0-.2.2-.4.4-.6l.6-.6zm.2 1.3a1 1 0 00-.2.7c0 .2 0 .4.3.6.2.2.4.3.7.3.2 0 .4 0 .6-.3l.5-.6.3-.6a1 1 0 00-.3-.7 1 1 0 00-.7-.3 1 1 0 00-.6.4l-.6.5zm2.8 4.6l.6-.3c.2 0 .4 0 .6.2l.3.6-.3.6-.6.3c-.2 0-.4 0-.6-.3a.8.8 0 01-.3-.5c0-.3.1-.5.3-.6zm3.7 4.6l-.6-.7-.2-.7.1-.7.4-.7.9-.9.6-.5.7-.1c.2 0 .5 0 .7.2a2.5 2.5 0 011.2 1 2 2 0 010 2l-.9-.6c.1-.2.2-.4.1-.7a1 1 0 00-.3-.6 1 1 0 00-.7-.3l-.6.3-.9 1c-.2.1-.3.4-.2.6 0 .2 0 .5.3.7l.6.3c.3 0 .5 0 .6-.2l.8.7a1.9 1.9 0 01-2 .2l-.6-.3zm4.3-.6l.6-.7 1 .9.4-.5c.3-.3.6-.4 1-.4s.7.1 1 .4l1 1-.6.7-1-1a.5.5 0 00-.3 0h-.3l-.4.5 1.3 1.3-.6.7-1.4-1.3-2.2 2.4-.8-.7 2.2-2.4-1-.9zm3.4 3.6l.6-.5.7-.2h.6l.6.5c.3.2.4.5.5.7 0 .3 0 .6-.2.8l.6-.5.8.7-3 3.2-.6.5-.7.1c-.2 0-.5 0-.7-.2l-.7-.4-1-1 .7-.6 1 .9c.1.2.3.3.6.3l.6-.3.7-.7a1 1 0 01-.9.2c-.2 0-.5-.2-.8-.5l-.4-.5a1.5 1.5 0 010-1.3l.5-.6.5-.6zm.3 1.3a1 1 0 00-.3.7c0 .2.1.4.4.6l.6.3c.3 0 .5-.1.7-.3l.5-.6.3-.6a1 1 0 00-.4-.7 1 1 0 00-.6-.3 1 1 0 00-.7.3l-.5.6z"/>
+    </g>
+  </g>
+</svg>
diff --git a/website/docs/usage/101/_pipelines.md b/website/docs/usage/101/_pipelines.md
index 9a63ee42d..f43219f41 100644
--- a/website/docs/usage/101/_pipelines.md
+++ b/website/docs/usage/101/_pipelines.md
@@ -32,7 +32,7 @@ the [config](/usage/training#config):
 
 ```ini
 [nlp]
-pipeline = ["tagger", "parser", "ner"]
+pipeline = ["tok2vec", "tagger", "parser", "ner"]
 ```
 
 import Accordion from 'components/accordion.js'
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index b1cf2723b..334ed03bd 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -167,8 +167,8 @@ the binary data:
 ```python
 ### spacy.load under the hood
 lang = "en"
-pipeline = ["tagger", "parser", "ner"]
-data_path = "path/to/en_core_web_sm/en_core_web_sm-2.0.0"
+pipeline = ["tok2vec", "tagger", "parser", "ner"]
+data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
 
 cls = spacy.util.get_lang_class(lang)  # 1. Get Language class, e.g. English
 nlp = cls()                            # 2. Initialize it
@@ -197,9 +197,9 @@ list of human-readable component names.
 
 ```python
 print(nlp.pipeline)
-# [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
+# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
 print(nlp.pipe_names)
-# ['tagger', 'parser', 'ner']
+# ['tok2vec', 'tagger', 'parser', 'ner']
 ```
 
 ### Built-in pipeline components {#built-in}
@@ -1126,12 +1126,12 @@ For some use cases, it makes sense to also overwrite additional methods to
 customize how the model is updated from examples, how it's initialized, how the
 loss is calculated and to add evaluation scores to the training output.
 
-| Name                                 | Description                                                                                                                                                                                                                                                                                                        |
-| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| [`update`](/api/pipe#update)         | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                |
-| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided.                                                                                                                 |
-| [`get_loss`](/api/pipe#get_loss)     | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                      |
-| [`score`](/api/pipe#score)           | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_socre_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
+| Name                                 | Description                                                                                                                                                                                                                                                                                                                                   |
+| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`update`](/api/pipe#update)         | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                                           |
+| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
+| [`get_loss`](/api/pipe#get_loss)     | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                                                 |
+| [`score`](/api/pipe#score)           | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score.                            |
 
 <Infobox title="Custom trainable components and models" emoji="📖">
 
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 4c75ad771..c0658a58c 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -6,8 +6,9 @@ menu:
   - ['Introduction', 'basics']
   - ['Quickstart', 'quickstart']
   - ['Config System', 'config']
-  # - ['Data Utilities', 'data']
+  - ['Custom Training', 'config-custom']
   - ['Custom Functions', 'custom-functions']
+  - ['Data Utilities', 'data']
   - ['Parallel Training', 'parallel-training']
   - ['Internal API', 'api']
 ---
@@ -122,7 +123,7 @@ treebank.
 
 </Project>
 
-## Training config {#config}
+## Training config system {#config}
 
 Training config files include all **settings and hyperparameters** for training
 your pipeline. Instead of providing lots of arguments on the command line, you
@@ -177,6 +178,7 @@ sections of a config file are:
 | `system`      | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `training`    | Settings and controls for the training and evaluation process.                                                                                                  |
 | `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining).                                                |
+| `initialize`  | Data resources and arguments passed to components when [`nlp.initialize`](/api/language#initialize) is called before training (but not at runtime).             |
 
 <Infobox title="Config format and settings" emoji="📖">
 
@@ -190,6 +192,20 @@ available for the different architectures are documented with the
 
 </Infobox>
 
+### Config lifecycle at runtime and training {#config-lifecycle}
+
+A pipeline's `config.cfg` is considered the "single source of truth", both at
+**training** and **runtime**. Under the hood,
+[`Language.from_config`](/api/language#from_config) takes care of constructing
+the `nlp` object using the settings defined in the config. An `nlp` object's
+config is available as [`nlp.config`](/api/language#config) and it includes all
+information about the pipeline, as well as the settings used to train and
+initialize it.
+
+![Illustration of pipeline lifecycle](../images/lifecycle.svg)
+
+<!-- TODO: explain lifecycle and initialization -->
+
 ### Overwriting config settings on the command line {#config-overrides}
 
 The config system means that you can define all settings **in one place** and in
@@ -233,6 +249,61 @@ defined in the config file.
 $ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
 ```
 
+### Using variable interpolation {#config-interpolation}
+
+Another very useful feature of the config system is that it supports variable
+interpolation for both **values and sections**. This means that you only need to
+define a setting once and can reference it across your config using the
+`${section.value}` syntax. In this example, the value of `seed` is reused within
+the `[training]` block, and the whole block of `[training.optimizer]` is reused
+in `[pretraining]` and will become `pretraining.optimizer`.
+
+```ini
+### config.cfg (excerpt) {highlight="5,18"}
+[system]
+seed = 0
+
+[training]
+seed = ${system.seed}
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 1e-8
+
+[pretraining]
+optimizer = ${training.optimizer}
+```
+
+You can also use variables inside strings. In that case, it works just like
+f-strings in Python. If the value of a variable is not a string, it's converted
+to a string.
+
+```ini
+[paths]
+version = 5
+root = "/Users/you/data"
+train = "${paths.root}/train_${paths.version}.spacy"
+# Result: /Users/you/data/train_5.spacy
+```
+
+<Infobox title="Tip: Override variables on the CLI" emoji="💡">
+
+If you need to change certain values between training runs, you can define them
+once, reference them as variables and then [override](#config-overrides) them on
+the CLI. For example, `--paths.root /other/root` will change the value of `root`
+in the block `[paths]` and the change will be reflected across all other values
+that reference this variable.
+
+</Infobox>
+
+## Customizing the pipeline and training {#config-custom}
+
 ### Defining pipeline components {#config-components}
 
 You typically train a [pipeline](/usage/processing-pipelines) of **one or more
@@ -353,59 +424,6 @@ stop = 1000
 compound = 1.001
 ```
 
-### Using variable interpolation {#config-interpolation}
-
-Another very useful feature of the config system is that it supports variable
-interpolation for both **values and sections**. This means that you only need to
-define a setting once and can reference it across your config using the
-`${section.value}` syntax. In this example, the value of `seed` is reused within
-the `[training]` block, and the whole block of `[training.optimizer]` is reused
-in `[pretraining]` and will become `pretraining.optimizer`.
-
-```ini
-### config.cfg (excerpt) {highlight="5,18"}
-[system]
-seed = 0
-
-[training]
-seed = ${system.seed}
-
-[training.optimizer]
-@optimizers = "Adam.v1"
-beta1 = 0.9
-beta2 = 0.999
-L2_is_weight_decay = true
-L2 = 0.01
-grad_clip = 1.0
-use_averages = false
-eps = 1e-8
-
-[pretraining]
-optimizer = ${training.optimizer}
-```
-
-You can also use variables inside strings. In that case, it works just like
-f-strings in Python. If the value of a variable is not a string, it's converted
-to a string.
-
-```ini
-[paths]
-version = 5
-root = "/Users/you/data"
-train = "${paths.root}/train_${paths.version}.spacy"
-# Result: /Users/you/data/train_5.spacy
-```
-
-<Infobox title="Tip: Override variables on the CLI" emoji="💡">
-
-If you need to change certain values between training runs, you can define them
-once, reference them as variables and then [override](#config-overrides) them on
-the CLI. For example, `--paths.root /other/root` will change the value of `root`
-in the block `[paths]` and the change will be reflected across all other values
-that reference this variable.
-
-</Infobox>
-
 ### Model architectures {#model-architectures}
 
 > #### 💡 Model type annotations
@@ -506,17 +524,7 @@ still look good.
 
 </Accordion>
 
-<!--
-## Data Utilities {#data-utilities}
-
-* spacy convert
-* The [corpora] block
-* Custom corpus class
-* Minibatching
-* Data augmentation
--->
-
-## Custom Functions {#custom-functions}
+## Custom functions {#custom-functions}
 
 Registered functions in the training config files can refer to built-in
 implementations, but you can also plug in fully **custom implementations**. All
@@ -763,7 +771,96 @@ start = 2
 factor = 1.005
 ```
 
-#### Example: Custom data reading and batching {#custom-code-readers-batchers}
+### Defining custom architectures {#custom-architectures}
+
+Built-in pipeline components such as the tagger or named entity recognizer are
+constructed with default neural network [models](/api/architectures). You can
+change the model architecture entirely by implementing your own custom models
+and providing those in the config when creating the pipeline component. See the
+documentation on [layers and model architectures](/usage/layers-architectures)
+for more details.
+
+> ```ini
+> ### config.cfg
+> [components.tagger]
+> factory = "tagger"
+>
+> [components.tagger.model]
+> @architectures = "custom_neural_network.v1"
+> output_width = 512
+> ```
+
+```python
+### functions.py
+from typing import List
+from thinc.types import Floats2d
+from thinc.api import Model
+import spacy
+from spacy.tokens import Doc
+
+@spacy.registry.architectures("custom_neural_network.v1")
+def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
+    return create_model(output_width)
+```
+
+## Data utilities {#data}
+
+spaCy includes various features and utilities to make it easy to train from your
+own data. If you have training data in a standard format like `.conll` or
+`.conllu`, the easiest way to convert it for use with spaCy is to run
+[`spacy convert`](/api/cli#convert) and pass it a file and an output directory:
+
+```cli
+$ python -m spacy convert ./train.gold.conll ./corpus
+```
+
+<Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡">
+
+Training workflows often consist of multiple steps, from preprocessing the data
+all the way to packaging and deploying the trained model.
+[spaCy projects](/usage/projects) let you define all steps in one file, manage
+data assets, track changes and share your end-to-end processes with your team.
+
+</Infobox>
+
+### Working with corpora {#data-corpora}
+
+> #### Example
+>
+> ```ini
+> [corpora]
+>
+> [corpora.train]
+> @readers = "spacy.Corpus.v1"
+> path = ${paths.train}
+> gold_preproc = false
+> max_length = 0
+> limit = 0
+> augmenter = null
+>
+> [training]
+> train_corpus = "corpora.train"
+> ```
+
+The [`[corpora]`](/api/data-formats#config-corpora) block in your config lets
+you define **data resources** to use for training, evaluation, pretraining or
+any other custom workflows. `corpora.train` and `corpora.dev` are used as
+conventions within spaCy's default configs, but you can also define any other
+custom blocks. Each section in the corpora config should resolve to a
+[`Corpus`](/api/corpus) – for example, using spaCy's built-in
+[corpus reader](/api/top-level#readers) that takes a path to a binary `.spacy`
+file. The `train_corpus` and `dev_corpus` fields in the
+[`[training]`](/api/data-formats#config-training) block specify where to find
+the corpus in your config. This makes it easy to **swap out** different corpora
+by only changing a single config setting.
+
+Instead of making `[corpora]` a block with multiple subsections for each portion
+of the data, you can also use a single function that returns a dictionary of
+corpora, keyed by corpus name, e.g. `"train"` and `"dev"`. This can be
+especially useful if you need to split a single file into corpora for training
+and evaluation, without loading the same file twice.
+
+### Custom data reading and batching {#custom-code-readers-batchers}
 
 Some use-cases require **streaming in data** or manipulating datasets on the
 fly, rather than generating all data beforehand and storing it to file. Instead
@@ -859,37 +956,11 @@ def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Examp
     return create_filtered_batches
 ```
 
-### Defining custom architectures {#custom-architectures}
-
-Built-in pipeline components such as the tagger or named entity recognizer are
-constructed with default neural network [models](/api/architectures). You can
-change the model architecture entirely by implementing your own custom models
-and providing those in the config when creating the pipeline component. See the
-documentation on [layers and model architectures](/usage/layers-architectures)
-for more details.
-
-> ```ini
-> ### config.cfg
-> [components.tagger]
-> factory = "tagger"
->
-> [components.tagger.model]
-> @architectures = "custom_neural_network.v1"
-> output_width = 512
-> ```
-
-```python
-### functions.py
-from typing import List
-from thinc.types import Floats2d
-from thinc.api import Model
-import spacy
-from spacy.tokens import Doc
-
-@spacy.registry.architectures("custom_neural_network.v1")
-def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
-    return create_model(output_width)
-```
+<!-- TODO:
+* Custom corpus class
+* Minibatching
+* Data augmentation
+-->
 
 ## Parallel & distributed training with Ray {#parallel-training}
 
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 44f902cd5..179a8fb55 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -123,13 +123,14 @@ training run, with no hidden defaults, making it easy to rerun your experiments
 and track changes. You can use the
 [quickstart widget](/usage/training#quickstart) or the `init config` command to
 get started. Instead of providing lots of arguments on the command line, you
-only need to pass your `config.cfg` file to `spacy train`.
-
+only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train).
 Training config files include all **settings and hyperparameters** for training
 your pipeline. Some settings can also be registered **functions** that you can
 swap out and customize, making it easy to implement your own custom models and
 architectures.
 
+![Illustration of pipeline lifecycle](../images/lifecycle.svg)
+
 <Infobox title="Details & Documentation" emoji="📖" list>
 
 - **Usage:** [Training pipelines and models](/usage/training)
@@ -723,7 +724,7 @@ nlp = spacy.blank("en")
 
 Because pipeline components are now added using their string names, you won't
 have to instantiate the [component classes](/api/#architecture-pipeline)
-directly anynore. To configure the component, you can now use the `config`
+directly anymore. To configure the component, you can now use the `config`
 argument on [`nlp.add_pipe`](/api/language#add_pipe).
 
 > #### config.cfg (excerpt)

From e58dca302889164230d5bfaf3761e252136824d6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 30 Sep 2020 16:52:27 +0200
Subject: [PATCH 297/516] Add read_labels

---
 spacy/training/corpus.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 85079f41c..8be56b9e6 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -43,6 +43,15 @@ def create_jsonl_reader(
     return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
 
 
+@util.registry.readers("spacy.read_labels.v1")
+def read_labels(path: Path, *, require: bool=False):
+    # Deliberately not given a generic name: this reader is only meant for label
+    # files, and "require" defaults to False so a missing file returns None.
+    if not require and not path.exists():
+        return None
+    return srsly.read_json(path)
+
+
 def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
     path = util.ensure_path(path)
     if not path.is_dir() and path.parts[-1].endswith(file_type):

From 59294e91aa7b5cade545be4ada36ee0bc400f8bd Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 29 Sep 2020 21:33:55 +0200
Subject: [PATCH 298/516] Restore the 'jsonl' arg for init vectors

The lexemes.jsonl file is still used in our English vectors, and it may
be required by users as well. I think it's worth supporting the option.
---
 spacy/cli/init_pipeline.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index f241133ca..8befe1fd2 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -21,6 +21,7 @@ def init_vectors_cli(
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file")
     # fmt: on
 ):
     """Convert word vectors for use with spaCy. Will export an nlp object that
@@ -30,6 +31,13 @@ def init_vectors_cli(
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
+    if jsonl_loc is not None:
+        lex_attrs = srsly.read_jsonl(jsonl_loc)
+        for attrs in lex_attrs:
+            if "settings" in attrs:
+                continue
+            lexeme = nlp.vocab[attrs["orth"]]
+            lexeme.set_attrs(**attrs)
     convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
     msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)

From 51282989642f1ecfb768c76e7f1738f8d459fc6b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 30 Sep 2020 20:18:45 +0200
Subject: [PATCH 299/516] Add missing augmenter

---
 spacy/default_config.cfg | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index ea4f26255..6bd1ed24d 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -53,6 +53,7 @@ gold_preproc = false
 max_length = 0
 # Limitation on number of training examples
 limit = 0
+augmenter = null
 
 # Training hyper-parameters and additional features.
 [training]

From a103ab5f1a038ccbd668e5e33d0bee2dabd75b4e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 30 Sep 2020 23:03:47 +0200
Subject: [PATCH 300/516] Update augmenter lookups and docs

---
 spacy/errors.py                       |  6 ++++
 spacy/tests/training/test_training.py | 14 ++++++--
 spacy/training/augment.py             | 51 ++++++++++++++++++++-------
 website/docs/api/corpus.md            |  6 ++--
 website/docs/api/top-level.md         | 32 ++++++++++++++++-
 website/docs/usage/training.md        | 44 ++++++++++++++++++++---
 6 files changed, 131 insertions(+), 22 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 233ff29bd..4ba51f669 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -477,6 +477,12 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E912 = ("No orth_variants lookups table for data augmentation available for "
+            "language '{lang}'. If orth_variants are available in "
+            "spacy-lookups-data, make sure the package is installed and the "
+            "table is loaded in the [initialize.lookups] block of your config. "
+            "Alternatively, you can provide your own Lookups object with a "
+            "table orth_variants as the argument 'lookups' of the augmenter.")
     E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
             "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 81e533a5a..af3fe63c2 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -7,6 +7,7 @@ from spacy.training.converters import json_to_docs
 from spacy.training.augment import create_orth_variants_augmenter
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
+from spacy.lookups import Lookups
 from spacy.util import get_words_and_spaces, minibatch
 from thinc.api import compounding
 import pytest
@@ -492,13 +493,20 @@ def test_roundtrip_docs_to_docbin(doc):
 @pytest.mark.filterwarnings("ignore::UserWarning")
 def test_make_orth_variants(doc):
     nlp = English()
+    orth_variants = {
+        "single": [
+            {"tags": ["NFP"], "variants": ["…", "..."]},
+            {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
+        ]
+    }
+    lookups = Lookups()
+    lookups.add_table("orth_variants", orth_variants)
+    augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
     with make_tempdir() as tmpdir:
         output_file = tmpdir / "roundtrip.spacy"
         DocBin(docs=[doc]).to_disk(output_file)
         # due to randomness, test only that this runs with no errors for now
-        reader = Corpus(
-            output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)
-        )
+        reader = Corpus(output_file, augmenter=augmenter)
         list(reader(nlp))
 
 
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 95662eafa..176530a1c 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -1,30 +1,50 @@
-from typing import Callable
+from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING
 import random
 import itertools
 import copy
 from functools import partial
-from ..util import registry
+
+from ..util import registry, logger
 from ..tokens import Doc
+from .example import Example
+from ..lookups import Lookups
+from ..errors import Errors
 
-
-@registry.augmenters("spacy.dont_augment.v1")
-def create_null_augmenter():
-    return dont_augment
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
 
 
 @registry.augmenters("spacy.orth_variants.v1")
-def create_orth_variants_augmenter(level: float, lower: float) -> Callable:
+def create_orth_variants_augmenter(
+    level: float, lower: float, lookups: Optional[Lookups] = None,
+) -> Callable[["Language", Example], Iterator[Example]]:
     """Create a data augmentation callback that uses orth-variant replacement.
     The callback can be added to a corpus or other data iterator during training.
     """
-    return partial(orth_variants_augmenter, level=level, lower=lower)
+    return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups)
 
 
-def dont_augment(nlp, example):
+def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
     yield example
 
 
-def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float = 0.0):
+def orth_variants_augmenter(
+    nlp: "Language",
+    example: Example,
+    *,
+    level: float = 0.0,
+    lower: float = 0.0,
+    lookups: Optional[Lookups] = None,
+) -> Iterator[Example]:
+    table_name = "orth_variants"
+    if lookups is not None:
+        orth_variants = lookups.get_table(table_name, {})
+        logger.debug("Using data augmentation orth variants from provided lookups")
+    else:
+        orth_variants = nlp.vocab.lookups.get_table(table_name, {})
+        logger.debug("Using data augmentation orth variants from default vocab lookups")
+        if not orth_variants:
+            raise ValueError(Errors.E912.format(lang=nlp.lang))
     if random.random() >= level:
         yield example
     else:
@@ -37,6 +57,7 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
                 nlp,
                 raw_text,
                 orig_dict["token_annotation"],
+                orth_variants,
                 lower=raw_text is not None and random.random() < lower,
             )
             if variant_text:
@@ -49,9 +70,15 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
             yield example.from_dict(doc, orig_dict)
 
 
-def make_orth_variants(nlp, raw, token_dict, *, lower: bool = False):
+def make_orth_variants(
+    nlp: "Language",
+    raw: str,
+    token_dict: Dict[str, List[str]],
+    orth_variants: Dict[str, list],
+    *,
+    lower: bool = False,
+) -> Tuple[str, Dict[str, List[str]]]:
     orig_token_dict = copy.deepcopy(token_dict)
-    orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
     words = token_dict.get("words", [])
diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md
index 37f24819d..58006a19b 100644
--- a/website/docs/api/corpus.md
+++ b/website/docs/api/corpus.md
@@ -7,9 +7,11 @@ new: 3
 ---
 
 This class manages annotated corpora and can be used for training and
-development datasets in the [DocBin](/api/docbin) (`.spacy`) format. To
+development datasets in the [`DocBin`](/api/docbin) (`.spacy`) format. To
 customize the data loading during training, you can register your own
-[data readers and batchers](/usage/training#custom-code-readers-batchers).
+[data readers and batchers](/usage/training#custom-code-readers-batchers). Also
+see the usage guide on [data utilities](/usage/training#data) for more details
+and examples.
 
 ## Config and implementation {#config}
 
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 7f1b1ed7f..da24593e6 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -7,7 +7,8 @@ menu:
   - ['Loggers', 'loggers']
   - ['Readers', 'readers']
   - ['Batchers', 'batchers']
-  - ['Data & Alignment', 'gold']
+  - ['Augmenters', 'augmenters']
+  - ['Training & Alignment', 'gold']
   - ['Utility Functions', 'util']
 ---
 
@@ -313,6 +314,7 @@ factories.
 | Registry name     | Description                                                                                                                                                                                                                                        |
 | ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `architectures`   | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`.                                                                           |
+| `augmenters`      | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators.                                                                                                                       |
 | `batchers`        | Registry for training and evaluation [data batchers](#batchers).                                                                                                                                                                                   |
 | `callbacks`       | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training.                                                                                                                             |
 | `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points).                                                                             |
@@ -618,6 +620,34 @@ sequences in the batch.
 | `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~                                                                                                                                     |
 | `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                                                                     |
 
+## Augmenters {#augmenters source="spacy/training/augment.py" new="3"}
+
+<!-- TODO: intro, explain data augmentation concept -->
+
+### orth_variants {#orth_variants tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [corpora.train.augmenter]
+> @augmenters = "spacy.orth_variants.v1"
+> level = 0.0
+> lower = 0.0
+> lookups = null
+> ```
+
+Create a data augmentation callback that uses orth-variant replacement. The
+callback can be added to a corpus or other data iterator during training. This
+is especially useful for punctuation and case replacement, to help generalize
+beyond corpora that don't have smart quotes, or only have smart quotes, etc.
+
+| Name        | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `level`     | ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| `lower`     | ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| `lookups`   | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ |
+| **RETURNS** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~                                                                                                                                                                                                                                                                                                                                                   |
+
 ## Training data and alignment {#gold source="spacy/training"}
 
 ### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index c0658a58c..51aa82618 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -805,15 +805,30 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
 
 ## Data utilities {#data}
 
-spaCy includes various features and utilities to make it easy to train from your
-own data. If you have training data in a standard format like `.conll` or
-`.conllu`, the easiest way to convert it for use with spaCy is to run
-[`spacy convert`](/api/cli#convert) and pass it a file and an output directory:
+spaCy includes various features and utilities to make it easy to train models
+using your own data, manage training and evaluation corpora, convert existing
+annotations and configure data augmentation strategies for more robust models.
+
+### Converting existing corpora and annotations {#data-convert}
+
+If you have training data in a standard format like `.conll` or `.conllu`, the
+easiest way to convert it for use with spaCy is to run
+[`spacy convert`](/api/cli#convert) and pass it a file and an output directory.
+By default, the command will pick the converter based on the file extension.
 
 ```cli
 $ python -m spacy convert ./train.gold.conll ./corpus
 ```
 
+> #### 💡 Tip: Converting from Prodigy
+>
+> If you're using the [Prodigy](https://prodi.gy) annotation tool to create
+> training data, you can run the
+> [`data-to-spacy` command](https://prodi.gy/docs/recipes#data-to-spacy) to
+> merge and export multiple datasets for use with
+> [`spacy train`](/api/cli#train). Different types of annotations on the same
+> text will be combined, giving you one corpus to train multiple components.
+
 <Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡">
 
 Training workflows often consist of multiple steps, from preprocessing the data
@@ -823,6 +838,27 @@ data assets, track changes and share your end-to-end processes with your team.
 
 </Infobox>
 
+The binary `.spacy` format is a serialized [`DocBin`](/api/docbin) containing
+one or more [`Doc`](/api/doc) objects. It's extremely **efficient in
+storage**, especially when packing multiple documents together. You can also
+create `Doc` objects manually, so you can write your own custom logic to convert
+and store existing annotations for use in spaCy.
+
+```python
+### Training data from Doc objects {highlight="6-9"}
+import spacy
+from spacy.tokens import Doc, DocBin
+
+nlp = spacy.blank("en")
+docbin = DocBin(nlp.vocab)
+words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
+spaces = [True, True, True, True, True, True, True, False]
+ents = [("ORG", 0, 1), ("GPE", 5, 6)]
+doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
+docbin.add(doc)
+docbin.to_disk("./train.spacy")
+```
+
 ### Working with corpora {#data-corpora}
 
 > #### Example

From 6f29f68f694d183b58ff8091d473e909231b52ec Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 30 Sep 2020 23:48:47 +0200
Subject: [PATCH 301/516] Update errors and make Tokenizer.initialize args less
 strict

---
 spacy/errors.py           | 20 +++++---------------
 spacy/lang/zh/__init__.py |  8 +++-----
 2 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index f8fb7dd8b..1263796b3 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -554,7 +554,10 @@ class Errors:
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
             "component.")
-    E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.")
+    E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
+            "spacy-lookups-data. If you want to initialize a blank nlp object, "
+            "make sure you have the spacy-lookups-data package installed or "
+            "remove the [initialize.lookups] block from your config.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -674,20 +677,7 @@ class Errors:
     E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
              "loaded. Provide the name of a pretrained model or the path to "
              "a model and initialize the pipeline:\n\n"
-             'config = {\n'
-             '    "nlp": {\n'
-             '        "tokenizer": {\n'
-             '            "@tokenizers": "spacy.zh.ChineseTokenizer",\n'
-             '            "segmenter": "pkuseg",\n'
-             '        }\n'
-             '    },\n'
-             '    "initialize": {"tokenizer": {\n'
-             '            "pkuseg_model": "default", # or /path/to/model\n'
-             '        }\n'
-             '    },\n'
-             '}\n'
-             'nlp = Chinese.from_config(config)\n'
-             'nlp.initialize()')
+             'nlp.tokenizer.initialize(pkuseg_model="default")')
     E1001 = ("Target token outside of matched span for match with tokens "
              "'{span}' and offset '{index}' matched by patterns '{patterns}'.")
     E1002 = ("Span index out of range.")
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index ecabb6555..858f41f65 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -56,9 +56,7 @@ def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
 
 class ChineseTokenizer(DummyTokenizer):
     def __init__(
-        self,
-        nlp: Language,
-        segmenter: Segmenter = Segmenter.char,
+        self, nlp: Language, segmenter: Segmenter = Segmenter.char,
     ):
         self.vocab = nlp.vocab
         if isinstance(segmenter, Segmenter):
@@ -80,9 +78,9 @@ class ChineseTokenizer(DummyTokenizer):
 
     def initialize(
         self,
-        get_examples: Callable[[], Iterable[Example]],
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
-        nlp: Optional[Language],
+        nlp: Optional[Language] = None,
         pkuseg_model: Optional[str] = None,
         pkuseg_user_dict: str = "default",
     ):

From 4b6afd36114fbe1871f17998f9e3f4ec0e116f0f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 30 Sep 2020 23:49:29 +0200
Subject: [PATCH 302/516] Remove English [initialize] default block for now to
 get tests to pass

---
 spacy/lang/en/__init__.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index f4ea10f9c..cc01f1aea 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -9,21 +9,9 @@ from .punctuation import TOKENIZER_INFIXES
 from .lemmatizer import EnglishLemmatizer
 from ...language import Language
 from ...lookups import Lookups
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class EnglishDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS

From 512197293020cc5252e3af67a5a5123df099617e Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 1 Oct 2020 09:20:09 +0200
Subject: [PATCH 303/516] add types of Tok2Vec embedding layers

---
 spacy/ml/models/tok2vec.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index fec478e21..63e79bf95 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -93,7 +93,7 @@ def build_Tok2Vec_model(
 @registry.architectures.register("spacy.MultiHashEmbed.v1")
 def MultiHashEmbed(
     width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
-):
+) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
     attributes using hash embedding, concatenates the results, and passes it
     through a feed-forward subnetwork to build a mixed representations.
@@ -166,7 +166,7 @@ def MultiHashEmbed(
 @registry.architectures.register("spacy.CharacterEmbed.v1")
 def CharacterEmbed(
     width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
-):
+) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
     each word, taken from the beginning and end of the word equally. Padding is

From 6787e56315880a6d1049852a02a819cb8e3665df Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 1 Oct 2020 09:21:00 +0200
Subject: [PATCH 304/516] print debugging warning before raising error if model
 not properly initialized

---
 spacy/language.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index c1d2df026..f161b2877 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -970,7 +970,8 @@ class Language:
                 raise ValueError(Errors.E003.format(component=type(proc), name=name))
             try:
                 doc = proc(doc, **component_cfg.get(name, {}))
-            except KeyError:
+            except KeyError as e:
+                warnings.warn(str(e))
                 raise ValueError(Errors.E109.format(name=name)) from None
             if doc is None:
                 raise ValueError(Errors.E005.format(name=name))

From 44160cd52fd054dea9829313fdf75876e11890b5 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 1 Oct 2020 10:41:19 +0200
Subject: [PATCH 305/516] Tidy up [ci skip]

---
 spacy/cli/train.py           | 4 +---
 spacy/training/initialize.py | 8 --------
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index aede0e8f4..57a88159d 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,15 +1,13 @@
 from typing import Optional
 from pathlib import Path
 from wasabi import msg
-from thinc.api import Config
 import typer
 import logging
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu
-from ..language import Language
 from ..training.loop import train
-from ..training.initialize import init_nlp, must_reinitialize
+from ..training.initialize import init_nlp
 from .. import util
 
 
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index e248cf314..d64f211c4 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -9,7 +9,6 @@ import gzip
 import zipfile
 import tqdm
 
-from .loop import create_before_to_disk_callback
 from ..lookups import Lookups
 from ..vectors import Vectors
 from ..errors import Errors
@@ -39,7 +38,6 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     dot_names = [T["train_corpus"], T["dev_corpus"]]
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     optimizer = T["optimizer"]
-    before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
     # Sourced components that require resume_training
@@ -55,15 +53,9 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     # Verify the config after calling 'initialize' to ensure labels
     # are properly initialized
     verify_config(nlp)
-    nlp = before_to_disk(nlp)
     return nlp
 
 
-def must_reinitialize(train_config: Config, init_config: Config) -> bool:
-    # TODO: do this better and more fine-grained
-    return train_config.interpolate().to_str() == init_config.interpolate().to_str()
-
-
 def init_vocab(
     nlp: "Language",
     *,

From 0a8a124a6e0457f3bb9e49b95dbb0fa78be8a06e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 1 Oct 2020 12:15:53 +0200
Subject: [PATCH 306/516] Update docs [ci skip]

---
 spacy/cli/init_pipeline.py                    |  4 +-
 website/docs/api/cli.md                       | 43 +++++++++----------
 website/docs/api/data-formats.md              |  8 ++--
 website/docs/api/top-level.md                 | 31 +++++++------
 website/docs/usage/embeddings-transformers.md |  2 +-
 website/docs/usage/linguistic-features.md     | 18 ++++----
 website/docs/usage/v3.md                      |  2 +-
 website/meta/type-annotations.json            |  2 +
 8 files changed, 59 insertions(+), 51 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 8befe1fd2..e2935184c 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -19,13 +19,13 @@ def init_vectors_cli(
     output_dir: Path = Arg(..., help="Pipeline output directory"),
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
+    jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
-    jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file")
     # fmt: on
 ):
     """Convert word vectors for use with spaCy. Will export an nlp object that
-    you can use in the [initialize.vocab] block of your config to initialize
+    you can use in the [initialize] block of your config to initialize
     a model with vectors.
     """
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 66e26f11f..347ce1683 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -170,38 +170,37 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff]
 | `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                          |
 | **CREATES**            | Complete and auto-filled config file for training.                                                                                  |
 
-### init vocab {#init-vocab new="3" tag="command"}
+### init vectors {#init-vectors new="3" tag="command"}
 
-Create a blank pipeline directory from raw data, like word frequencies, Brown
-clusters and word vectors. Note that in order to populate the vocabulary, you
-need to pass in a JSONL-formatted
-[vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional
-`id` values that correspond to the vectors table. Just loading in vectors will
-not automatically populate the vocab.
+Convert [word vectors](/usage/linguistic-features#vectors-similarity) for use
+with spaCy. Will export an `nlp` object that you can use in the
+[`[initialize]`](/api/data-formats#config-initialize) block of your config to
+initialize a model with vectors. See the usage guide on
+[static vectors](/usage/embeddings-transformers#static-vectors) for details on
+how to use vectors in your model.
 
 <Infobox title="New in v3.0" variant="warning" id="init-model">
 
-This command was previously called `init-model`.
+This functionality was previously available as part of the `init-model` command.
 
 </Infobox>
 
 ```cli
-$ python -m spacy init vocab [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] [--vectors-name] [--meta-name] [--base]
+$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--lexemes-jsonl] [--verbose]
 ```
 
-| Name                                                    | Description                                                                                                                                                                                                                                                                         |
-| ------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lang`                                                  | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~                                                                                                                                                                |
-| `output_dir`                                            | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                                               |
-| `--jsonl-loc`, `-j`                                     | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~                                                                                                                                         |
-| `--vectors-loc`, `-v`                                   | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Optional[Path] \(option)~~ |
-| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                                  |
-| `--prune-vectors`, `-V`                                 | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~                                                                                                                                                                                     |
-| `--vectors-name`, `-vn`                                 | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~                                                                                                                                                                   |
-| `--meta-name`, `-mn`                                    | Optional name of the package for the pipeline meta. ~~Optional[str] \(option)~~                                                                                                                                                                                                     |
-| `--base`, `-b`                                          | Optional name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers). ~~Optional[str] \(option)~~                                                                                                                                         |
-| `--help`, `-h`                                          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                                          |
-| **CREATES**                                             | A spaCy pipeline directory containing the vocab and vectors.                                                                                                                                                                                                                        |
+| Name                    | Description                                                                                                                                                                                                                                                         |
+| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `lang`                  | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~                                                                                                                                                |
+| `vectors_loc`           | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
+| `output_dir`            | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               |
+| `--lexemes-jsonl`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~                                                                                                                         |
+| `--truncate`, `-t`      | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  |
+| `--prune`, `-p`         | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~                                                                                                                                                                     |
+| `--name`, `-n`          | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~                                                                                                                                                   |
+| `--verbose`, `-V`       | Print additional information and explanations. ~~bool (flag)~~                                                                                                                                                                                                      |
+| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                          |
+| **CREATES**             | A spaCy pipeline directory containing the vocab and vectors.                                                                                                                                                                                                        |
 
 ## convert {#convert tag="command"}
 
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index cac98c70a..825d95def 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -246,7 +246,7 @@ without requiring them at runtime when you load the trained pipeline back in.
 | `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                                                                                                                                                                                                                                |
 | `lookups`      | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~                                                                                                                                                                                                                                                       |
 | `tokenizer`    | Additional arguments passed to the `initialize` method of the specified tokenizer. Can be used for languages like Chinese that depend on dictionaries or trained models for tokenization. If type annotations are available on the method, the config will be validated against them. The `initialize` method will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Any]~~ |
-| `vectors`      | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~                                                                                                                                                                                                                                               |
+| `vectors`      | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vectors`](/api/cli#init-vectors). Defaults to `null`. ~~Optional[str]~~                                                                                                                                                                                                                                           |
 | `vocab_data`   | Path to JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) to initialize vocabulary. ~~Optional[str]~~                                                                                                                                                                                                                                                                                           |
 
 ## Training data {#training}
@@ -274,8 +274,8 @@ Typically, the extension for these binary files is `.spacy`, and they are used
 as input format for specifying a [training corpus](/api/corpus) and for spaCy's
 CLI [`train`](/api/cli#train) command. The built-in
 [`convert`](/api/cli#convert) command helps you convert spaCy's previous
-[JSON format](#json-input) to the new binary format. It also supports
-conversion of the `.conllu` format used by the
+[JSON format](#json-input) to the new binary format. It also supports conversion
+of the `.conllu` format used by the
 [Universal Dependencies corpora](https://github.com/UniversalDependencies).
 
 ### JSON training format {#json-input tag="deprecated"}
@@ -455,7 +455,7 @@ example = Example.from_dict(doc, gold_dict)
 ## Lexical data for vocabulary {#vocab-jsonl new="2"}
 
 To populate a pipeline's vocabulary, you can use the
-[`spacy init vocab`](/api/cli#init-vocab) command and load in a
+[`spacy init vectors`](/api/cli#init-vectors) command and load in a
 [newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
 lexical entry per line via the `--jsonl-loc` option. The first line defines the
 language and vocabulary settings. All other lines are expected to be JSON
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index da24593e6..c16983c78 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -372,7 +372,7 @@ results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
 using one of the built-in loggers listed here, you can also
 [implement your own](/usage/training#custom-logging).
 
-#### ConsoleLogger {#ConsoleLogger tag="registered function"}
+#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
 
 > #### Example config
 >
@@ -418,7 +418,7 @@ start decreasing across epochs.
 
  </Accordion>
 
-#### WandbLogger {#WandbLogger tag="registered function"}
+#### spacy.WandbLogger.v1 {#WandbLogger tag="registered function"}
 
 > #### Installation
 >
@@ -480,7 +480,7 @@ with your own registered function in the
 [`@readers` registry](/api/top-level#registry) to customize the data loading and
 streaming.
 
-### Corpus {#corpus}
+### spacy.Corpus.v1 {#corpus tag="registered function"}
 
 The `Corpus` reader manages annotated corpora and can be used for training and
 development datasets in the [DocBin](/api/docbin) (`.spacy`) format. Also see
@@ -507,8 +507,9 @@ the [`Corpus`](/api/corpus) class.
 | `max_length`    | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~                                                                                                                                      |
 | `limit`         | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                                                                                                                                                                          |
 | `augmenter`     | Apply some simply data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
+| **CREATES**     | The corpus reader. ~~Corpus~~                                                                                                                                                                                                                                                            |
 
-### JsonlReader {#jsonlreader}
+### spacy.JsonlReader.v1 {#jsonlreader tag="registered function"}
 
 Create [`Example`](/api/example) objects from a JSONL (newline-delimited JSON)
 file of texts keyed by `"text"`. Can be used to read the raw text corpus for
@@ -535,6 +536,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
 | `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~       |
 | `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~        |
 | `limit`      | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                  |
+| **CREATES**  | The corpus reader. ~~JsonlTexts~~                                                                                                |
 
 ## Batchers {#batchers source="spacy/training/batchers.py" new="3"}
 
@@ -550,7 +552,7 @@ Instead of using one of the built-in batchers listed here, you can also
 [implement your own](/usage/training#custom-code-readers-batchers), which may or
 may not use a custom schedule.
 
-### batch_by_words {#batch_by_words tag="registered function"}
+### spacy.batch_by_words.v1 {#batch_by_words tag="registered function"}
 
 Create minibatches of roughly a given number of words. If any examples are
 longer than the specified batch length, they will appear in a batch by
@@ -576,8 +578,9 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
 | `tolerance`        | What percentage of the size to allow batches to exceed. ~~float~~                                                                                                                       |
 | `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~                                                                                                     |
 | `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                 |
+| **CREATES**        | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                     |
 
-### batch_by_sequence {#batch_by_sequence tag="registered function"}
+### spacy.batch_by_sequence.v1 {#batch_by_sequence tag="registered function"}
 
 > #### Example config
 >
@@ -594,8 +597,9 @@ Create a batcher that creates batches of the specified size.
 | ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `size`       | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
 | `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                 |
+| **CREATES**  | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                     |
 
-### batch_by_padded {#batch_by_padded tag="registered function"}
+### spacy.batch_by_padded.v1 {#batch_by_padded tag="registered function"}
 
 > #### Example config
 >
@@ -619,20 +623,21 @@ sequences in the batch.
 | `buffer`           | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. ~~int~~ |
 | `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~                                                                                                                                     |
 | `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                                                                     |
+| **CREATES**        | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                                                                         |
 
 ## Augmenters {#augmenters source="spacy/training/augment.py" new="3"}
 
 <!-- TODO: intro, explain data augmentation concept -->
 
-### orth_variants {#orth_variants tag="registered function"}
+### spacy.orth_variants.v1 {#orth_variants tag="registered function"}
 
 > #### Example config
 >
 > ```ini
 > [corpora.train.augmenter]
 > @augmenters = "spacy.orth_variants.v1"
-> level = 0.0
-> lower = 0.0
+> level = 0.1
+> lower = 0.5
 > lookups = null
 > ```
 
@@ -643,10 +648,10 @@ beyond corpora that don't have smart quotes, or only have smart quotes etc.
 
 | Name        | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
 | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `level`     | ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-| `lower`     | ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| `level`     | The percentage of texts that will be augmented. ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| `lower`     | The percentage of texts that will be lowercased. ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
 | `lookups`   | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ |
-| **RETURNS** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~                                                                                                                                                                                                                                                                                                                                                   |
+| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~                                                                                                                                                                                                                                                                                                                                                   |
 
 ## Training data and alignment {#gold source="spacy/training"}
 
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 59b4ba465..d5c7ee93a 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -496,7 +496,7 @@ algorithms for learning word vector tables. You can train a word vectors table
 using tools such as [Gensim](https://radimrehurek.com/gensim/),
 [FastText](https://fasttext.cc/) or
 [GloVe](https://nlp.stanford.edu/projects/glove/), or download existing
-pretrained vectors. The [`init vocab`](/api/cli#init-vocab) command lets you
+pretrained vectors. The [`init vectors`](/api/cli#init-vectors) command lets you
 convert vectors for use with spaCy and will give you a directory you can load or
 refer to in your [training configs](/usage/training#config).
 
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index d9a894398..25b6c2fac 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1834,10 +1834,12 @@ word vector libraries output an easy-to-read text-based format, where each line
 consists of the word followed by its vector. For everyday use, we want to
 convert the vectors into a binary format that loads faster and takes up less
 space on disk. The easiest way to do this is the
-[`init vocab`](/api/cli#init-vocab) command-line utility. This will output a
+[`init vectors`](/api/cli#init-vectors) command-line utility. This will output a
 blank spaCy pipeline in the directory `/tmp/la_vectors_wiki_lg`, giving you
 access to some nice Latin vectors. You can then pass the directory path to
-[`spacy.load`](/api/top-level#spacy.load).
+[`spacy.load`](/api/top-level#spacy.load) or use it in the
+[`[initialize]`](/api/data-formats#config-initialize) block of your config when you
+[train](/usage/training) a model.
 
 > #### Usage example
 >
@@ -1850,7 +1852,7 @@ access to some nice Latin vectors. You can then pass the directory path to
 
 ```cli
 $ wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz
-$ python -m spacy init vocab en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz
+$ python -m spacy init vectors en cc.la.300.vec.gz /tmp/la_vectors_wiki_lg
 ```
 
 <Accordion title="How to optimize vector coverage" id="custom-vectors-coverage" spaced>
@@ -1858,9 +1860,9 @@ $ python -m spacy init vocab en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.
 To help you strike a good balance between coverage and memory usage, spaCy's
 [`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same
 row** of the table. If you're using the
-[`spacy init vocab`](/api/cli#init-vocab) command to create a vocabulary,
-pruning the vectors will be taken care of automatically if you set the
-`--prune-vectors` flag. You can also do it manually in the following steps:
+[`spacy init vectors`](/api/cli#init-vectors) command to create a vocabulary,
+pruning the vectors will be taken care of automatically if you set the `--prune`
+flag. You can also do it manually in the following steps:
 
 1. Start with a **word vectors package** that covers a huge vocabulary. For
    instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg)
@@ -1905,12 +1907,12 @@ the two words.
 In the example above, the vector for "Shore" was removed and remapped to the
 vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to
 the vector of "leaving", which is identical. If you're using the
-[`init vocab`](/api/cli#init-vocab) command, you can set the `--prune-vectors`
+[`init vectors`](/api/cli#init-vectors) command, you can set the `--prune`
 option to easily reduce the size of the vectors as you add them to a spaCy
 pipeline:
 
 ```cli
-$ python -m spacy init vocab en /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000
+$ python -m spacy init vectors en la.300d.vec.tgz /tmp/la_vectors_web_md --prune 10000
 ```
 
 This will create a blank spaCy pipeline with vectors for the first 10,000 words
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 179a8fb55..36840f0f3 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -564,7 +564,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump`                                              | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk)                                                                                                                               |
 | `Matcher.pipe`, `PhraseMatcher.pipe`                                                         | not needed                                                                                                                                                                                                               |
 | `gold.offsets_from_biluo_tags`, `gold.spans_from_biluo_tags`, `gold.biluo_tags_from_offsets` | [`training.biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets), [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), [`training.offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) |
-| `spacy init-model`                                                                           | [`spacy init vocab`](/api/cli#init-vocab)                                                                                                                                                                                |
+| `spacy init-model`                                                                           | [`spacy init vectors`](/api/cli#init-vectors)                                                                                                                                                                            |
 | `spacy debug-data`                                                                           | [`spacy debug data`](/api/cli#debug-data)                                                                                                                                                                                |
 | `spacy profile`                                                                              | [`spacy debug profile`](/api/cli#debug-profile)                                                                                                                                                                          |
 | `spacy link`, `util.set_data_path`, `util.get_data_path`                                     | not needed, symlinks are deprecated                                                                                                                                                                                      |
diff --git a/website/meta/type-annotations.json b/website/meta/type-annotations.json
index 79d4d357d..43a524e93 100644
--- a/website/meta/type-annotations.json
+++ b/website/meta/type-annotations.json
@@ -23,6 +23,8 @@
     "PhraseMatcher": "/api/phrasematcher",
     "TransformerData": "/api/transformer#transformerdata",
     "FullTransformerBatch": "/api/transformer#fulltransformerbatch",
+    "Corpus": "/api/corpus",
+    "JsonlTexts": "/api/corpus#jsonltexts",
     "LexemeC": "/api/cython-structs#lexemec",
     "TokenC": "/api/cython-structs#tokenc",
     "Config": "https://thinc.ai/docs/api-config#config",

From 3243ddac8f699a69ce2e4e39ae80c62cfd30ad12 Mon Sep 17 00:00:00 2001
From: Yohei Tamura <tamuhey@gmail.com>
Date: Thu, 1 Oct 2020 21:01:52 +0900
Subject: [PATCH 307/516] Fix/span.sent (#6083)

* add fail test

* fix test

* fix span.sent

* Remove incorrect implicit check

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/tests/doc/test_span.py | 20 +++++++++++++++++---
 spacy/tokens/span.pyx        |  5 ++---
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 107078df9..df41aedf5 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -174,19 +174,25 @@ def test_spans_by_character(doc):
     assert span1.end_char == span2.end_char
     assert span2.label_ == "GPE"
 
-    span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict")
+    span2 = doc.char_span(
+        span1.start_char, span1.end_char, label="GPE", alignment_mode="strict"
+    )
     assert span1.start_char == span2.start_char
     assert span1.end_char == span2.end_char
     assert span2.label_ == "GPE"
 
     # alignment mode "contract"
-    span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract")
+    span2 = doc.char_span(
+        span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
+    )
     assert span1.start_char == span2.start_char
     assert span1.end_char == span2.end_char
     assert span2.label_ == "GPE"
 
     # alignment mode "expand"
-    span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand")
+    span2 = doc.char_span(
+        span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand"
+    )
     assert span1.start_char == span2.start_char
     assert span1.end_char == span2.end_char
     assert span2.label_ == "GPE"
@@ -318,3 +324,11 @@ def test_span_boundaries(doc):
         _ = span[-5]
     with pytest.raises(IndexError):
         _ = span[5]
+
+
+def test_sent(en_tokenizer):
+    doc = en_tokenizer("Check span.sent raises error if doc is not sentencized.")
+    span = doc[1:3]
+    assert not span.doc.is_sentenced
+    with pytest.raises(ValueError):
+        span.sent
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 29b87fa8d..cf0775bae 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -391,8 +391,6 @@ cdef class Span:
         """RETURNS (Span): The sentence span that the span is a part of."""
         if "sent" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["sent"](self)
-        # This should raise if not parsed / no custom sentence boundaries
-        self.doc.sents
         # Use `sent_start` token attribute to find sentence boundaries
         cdef int n = 0
         if self.doc.is_sentenced:
@@ -402,13 +400,14 @@ cdef class Span:
                 start += -1
             # Find end of the sentence
             end = self.end
-            n = 0
             while end < self.doc.length and self.doc.c[end].sent_start != 1:
                 end += 1
                 n += 1
                 if n >= self.doc.length:
                     break
             return self.doc[start:end]
+        else:
+            raise ValueError(Errors.E030)
 
     @property
     def ents(self):

From df98d3ef9fe6eb94f75aabb5c05e766bda7b822b Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 1 Oct 2020 16:21:49 +0200
Subject: [PATCH 308/516] Update import from collections.abc (#6174)

---
 spacy/training/example.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index f2c78203a..ca93b6464 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,4 +1,4 @@
-from collections import Iterable as IterableInstance
+from collections.abc import Iterable as IterableInstance
 import warnings
 import numpy
 from murmurhash.mrmr cimport hash64

From 73538782a0c3c15d113adec391acc8f7d8b28026 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 1 Oct 2020 16:22:18 +0200
Subject: [PATCH 309/516] Switch Doc.__init__(ents=) to IOB tags (#6173)

* Switch Doc.__init__(ents=) to IOB tags

* Fix check for "-"

* Allow "" or None as missing IOB tag
---
 spacy/tests/doc/test_doc_api.py               | 47 ++++++++++++++++
 spacy/tests/doc/test_retokenize_merge.py      | 18 ++++++-
 spacy/tests/doc/test_to_json.py               |  2 +-
 spacy/tests/regression/test_issue3001-3500.py |  2 +-
 spacy/tests/test_scorer.py                    |  4 +-
 spacy/tests/training/test_training.py         |  7 ++-
 spacy/tokens/doc.pyx                          | 54 ++++++++++++++++---
 7 files changed, 119 insertions(+), 15 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index b4b853701..55a1c1ad2 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -533,5 +533,52 @@ def test_doc_ents_setter():
     assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
     vocab = Vocab()
     ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
+    ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
     doc = Doc(vocab, words=words, ents=ents)
     assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
+
+
+def test_doc_init_iob():
+    """Test ents validation/normalization in Doc.__init__"""
+    words = ["a", "b", "c", "d", "e"]
+    ents = ["O"] * len(words)
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert doc.ents == ()
+
+    ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-PERSON"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    ents = ["B-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 3
+
+    # None is missing
+    ents = ["B-PERSON", "I-PERSON", "O", None, "I-GPE"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    # empty tag is missing
+    ents = ["", "B-PERSON", "O", "B-PERSON", "I-PERSON"]
+    doc = Doc(Vocab(), words=words, ents=ents)
+    assert len(doc.ents) == 2
+
+    # invalid IOB
+    ents = ["Q-PERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # no dash
+    ents = ["OPERSON", "I-PERSON", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # no ent type
+    ents = ["O", "B-", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
+
+    # not strings or None
+    ents = [0, "B-", "O", "I-PERSON", "I-GPE"]
+    with pytest.raises(ValueError):
+        doc = Doc(Vocab(), words=words, ents=ents)
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index 806c4b46f..ab186b062 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -201,6 +201,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
     heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
     tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
     ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
+    ents = ["O"] * len(heads)
+    ents[0] = "B-PERSON"
+    ents[1] = "I-PERSON"
+    ents[10] = "B-GPE"
+    ents[13] = "B-PERSON"
+    ents[14] = "I-PERSON"
     # fmt: on
     tokens = en_tokenizer(text)
     doc = Doc(
@@ -269,7 +275,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
     # if there is a parse, span.root provides default values
     words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
     heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
-    ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)]
+    ents = ["O"] * len(words)
+    ents[3] = "B-ent-de"
+    ents[4] = "I-ent-de"
+    ents[5] = "B-ent-fg"
+    ents[6] = "I-ent-fg"
     deps = ["dep"] * len(words)
     en_vocab.strings.add("ent-de")
     en_vocab.strings.add("ent-fg")
@@ -292,7 +302,11 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
     # check that B is preserved if span[start] is B
     words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
     heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
-    ents = [("ent-de", 3, 5), ("ent-de", 5, 7)]
+    ents = ["O"] * len(words)
+    ents[3] = "B-ent-de"
+    ents[4] = "I-ent-de"
+    ents[5] = "B-ent-de"
+    ents[6] = "I-ent-de"
     deps = ["dep"] * len(words)
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
     with doc.retokenize() as retokenizer:
diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py
index c9bcafcfa..9abe5779d 100644
--- a/spacy/tests/doc/test_to_json.py
+++ b/spacy/tests/doc/test_to_json.py
@@ -9,7 +9,7 @@ def doc(en_vocab):
     tags = ["VBP", "NN", "NN"]
     heads = [0, 0, 0]
     deps = ["ROOT", "dobj", "dobj"]
-    ents = [("ORG", 1, 2)]
+    ents = ["O", "B-ORG", "O"]
     return Doc(
         en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
     )
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index 6fc42e83f..01f58ae77 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -59,7 +59,7 @@ def test_issue3012(en_vocab):
     words = ["This", "is", "10", "%", "."]
     tags = ["DT", "VBZ", "CD", "NN", "."]
     pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
-    ents = [("PERCENT", 2, 4)]
+    ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
     doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
     assert doc.has_annotation("TAG")
     expected = ("10", "NUM", "CD", "PERCENT")
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 89864d579..187aa1b52 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -184,7 +184,7 @@ def test_ner_per_type(en_vocab):
         doc = Doc(
             en_vocab,
             words=input_.split(" "),
-            ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
+            ents=["B-CARDINAL", "O", "B-CARDINAL"],
         )
         entities = offsets_to_biluo_tags(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
@@ -209,7 +209,7 @@ def test_ner_per_type(en_vocab):
         doc = Doc(
             en_vocab,
             words=input_.split(" "),
-            ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
+            ents=["B-ORG", "O", "O", "O", "O", "B-GPE", "B-ORG", "O", "O", "O"],
         )
         entities = offsets_to_biluo_tags(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index af3fe63c2..28a411e6d 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -30,7 +30,12 @@ def doc(en_vocab):
     heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
     deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
     lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
-    ents = (("PERSON", 0, 2), ("LOC", 5, 7), ("GPE", 8, 9))
+    ents = ["O"] * len(words)
+    ents[0] = "B-PERSON"
+    ents[1] = "I-PERSON"
+    ents[5] = "B-LOC"
+    ents[6] = "I-LOC"
+    ents[8] = "B-GPE"
     cats = {"TRAVEL": 1.0, "BAKING": 0.0}
     # fmt: on
     doc = Doc(
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index b4027f87e..29fbb6076 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -213,8 +213,9 @@ cdef class Doc:
         sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
             the same length as words, to assign as token.is_sent_start. Will be
             overridden by heads if heads is provided. Defaults to None.
-        ents (Optional[List[Tuple[Union[str, int], int, int]]]): A list of
-            (label, start, end) tuples to assign as doc.ents. Defaults to None.
+        ents (Optional[List[str]]): A list of unicode strings, of the same
+            length as words, as IOB tags to assign as token.ent_iob and
+            token.ent_type. Defaults to None.
 
         DOCS: https://nightly.spacy.io/api/doc#init
         """
@@ -275,16 +276,55 @@ cdef class Doc:
                     sent_starts[i] = -1
                 elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]:
                     sent_starts[i] = 0
+        ent_iobs = None
+        ent_types = None
+        if ents is not None:
+            iob_strings = Token.iob_strings()
+            # make valid IOB2 out of IOB1 or IOB2
+            for i, ent in enumerate(ents):
+                if ent is "":
+                    ents[i] = None
+                elif ent is not None and not isinstance(ent, str):
+                    raise ValueError(Errors.E177.format(tag=ent))
+                if i < len(ents) - 1:
+                    # OI -> OB
+                    if (ent is None or ent.startswith("O")) and \
+                            (ents[i+1] is not None and ents[i+1].startswith("I")):
+                        ents[i+1] = "B" + ents[i+1][1:]
+                    # B-TYPE1 I-TYPE2 or I-TYPE1 I-TYPE2 -> B/I-TYPE1 B-TYPE2
+                    if ent is not None and ents[i+1] is not None and \
+                            (ent.startswith("B") or ent.startswith("I")) and \
+                            ents[i+1].startswith("I") and \
+                            ent[1:] != ents[i+1][1:]:
+                        ents[i+1] = "B" + ents[i+1][1:]
+            ent_iobs = []
+            ent_types = []
+            for ent in ents:
+                if ent is None:
+                    ent_iobs.append(iob_strings.index(""))
+                    ent_types.append("")
+                elif ent == "O":
+                    ent_iobs.append(iob_strings.index(ent))
+                    ent_types.append("")
+                else:
+                    if len(ent) < 3 or ent[1] != "-":
+                        raise ValueError(Errors.E177.format(tag=ent))
+                    ent_iob, ent_type = ent.split("-", 1) 
+                    if ent_iob not in iob_strings:
+                        raise ValueError(Errors.E177.format(tag=ent))
+                    ent_iob = iob_strings.index(ent_iob)
+                    ent_iobs.append(ent_iob)
+                    ent_types.append(ent_type)
         headings = []
         values = []
-        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts]
-        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START]
+        annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts, ent_iobs, ent_types]
+        possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START, ENT_IOB, ENT_TYPE]
         for a, annot in enumerate(annotations):
             if annot is not None:
                 if len(annot) != len(words):
                     raise ValueError(Errors.E189)
                 headings.append(possible_headings[a])
-                if annot is not heads and annot is not sent_starts:
+                if annot is not heads and annot is not sent_starts and annot is not ent_iobs:
                     values.extend(annot)
         for value in values:
             self.vocab.strings.add(value)
@@ -296,7 +336,7 @@ cdef class Doc:
             j = 0
             for annot in annotations:
                 if annot:
-                    if annot is heads or annot is sent_starts:
+                    if annot is heads or annot is sent_starts or annot is ent_iobs:
                         for i in range(len(words)):
                             if attrs.ndim == 1:
                                 attrs[i] = annot[i]
@@ -317,8 +357,6 @@ cdef class Doc:
                                 attrs[i, j] = self.vocab.strings[annot[i]]
                     j += 1
             self.from_array(headings, attrs)
-        if ents is not None:
-            self.ents = ents
 
     @property
     def _(self):

From a22215f427308c3d311b2e1de7fe0e690ed78215 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 1 Oct 2020 16:22:48 +0200
Subject: [PATCH 310/516] Add FeatureExtractor from Thinc (#6170)

* move featureextractor from Thinc

* Update website/docs/api/architectures.md

Co-authored-by: Ines Montani <ines@ines.io>

* Update website/docs/api/architectures.md

Co-authored-by: Ines Montani <ines@ines.io>

Co-authored-by: Ines Montani <ines@ines.io>
---
 spacy/ml/featureextractor.py                  | 25 +++++++++++++++++++
 spacy/ml/models/textcat.py                    |  3 ++-
 spacy/ml/models/tok2vec.py                    |  8 +++---
 website/docs/api/architectures.md             | 24 +++++++++++++++---
 website/docs/usage/embeddings-transformers.md |  3 ++-
 5 files changed, 54 insertions(+), 9 deletions(-)
 create mode 100644 spacy/ml/featureextractor.py

diff --git a/spacy/ml/featureextractor.py b/spacy/ml/featureextractor.py
new file mode 100644
index 000000000..dcf212628
--- /dev/null
+++ b/spacy/ml/featureextractor.py
@@ -0,0 +1,25 @@
+from typing import List, Union, Callable, Tuple
+from thinc.types import Ints2d, Doc
+from thinc.api import Model, registry
+
+
+
+@registry.layers("spacy.FeatureExtractor.v1")
+def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
+    return Model("extract_features", forward, attrs={"columns": columns})
+
+
+def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]:
+    columns = model.attrs["columns"]
+    features: List[Ints2d] = []
+    for doc in docs:
+        if hasattr(doc, "to_array"):
+            attrs = doc.to_array(columns)
+        else:
+            attrs = doc.doc.to_array(columns)[doc.start : doc.end]
+        if attrs.ndim == 1:
+            attrs = attrs.reshape((attrs.shape[0], 1))
+        features.append(model.ops.asarray2i(attrs, dtype="uint64"))
+
+    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
+    return features, backprop
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 16293cda4..1117b4fde 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
 from thinc.api import HashEmbed, with_array, with_cpu, uniqued
-from thinc.api import Relu, residual, expand_window, FeatureExtractor
+from thinc.api import Relu, residual, expand_window
 
 from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
 from ...util import registry
 from ..extract_ngrams import extract_ngrams
 from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 
 
 @registry.architectures.register("spacy.TextCatCNN.v1")
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index fec478e21..95f9c66df 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -1,14 +1,14 @@
 from typing import Optional, List
-from thinc.api import chain, clone, concatenate, with_array, with_padded
-from thinc.api import Model, noop, list2ragged, ragged2list
-from thinc.api import FeatureExtractor, HashEmbed
-from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 from thinc.types import Floats2d
+from thinc.api import chain, clone, concatenate, with_array, with_padded
+from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
+from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 
 from ...tokens import Doc
 from ...util import registry
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
+from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
 from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
 
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 3e6fbb283..5cee45ba5 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -144,9 +144,9 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
 a feed-forward subnetwork to build mixed representations. The features used are
-the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions
-depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained
-static vectors can also be incorporated into the concatenated representation.
+the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a
+[FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static
+vectors can also be incorporated into the concatenated representation.
 
 | Name                      | Description                                                                                                                                                                                                       |
 | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@@ -291,6 +291,24 @@ on [static vectors](/usage/embeddings-transformers#static-vectors) for details.
 | `key_attr`  | Defaults to `"ORTH"`. ~~str~~                                                                                                                                                                                           |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~                                                                                                                                                          |
 
+### spacy.FeatureExtractor.v1 {#FeatureExtractor}
+
+> #### Example config
+>
+> ```ini
+> [model]
+> @layers = "spacy.FeatureExtractor.v1"
+> columns = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
+> ```
+
+Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
+of feature names to extract, which should refer to token attributes.
+
+| Name        |  Description                                                             |
+| ----------- | ------------------------------------------------------------------------ |
+| `columns`   | The token attributes to extract. ~~List[Union[int, str]]~~               |
+| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |
+
 ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
 
 The following architectures are provided by the package
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index d5c7ee93a..1b78b8dc5 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -585,8 +585,9 @@ vectors, but combines them via summation with a smaller table of learned
 embeddings.
 
 ```python
-from thinc.api import add, chain, remap_ids, Embed, FeatureExtractor
+from thinc.api import add, chain, remap_ids, Embed
 from spacy.ml.staticvectors import StaticVectors
+from spacy.ml.featureextractor import FeatureExtractor
 from spacy.util import registry
 
 @registry.architectures("my_example.MyEmbedding.v1")

From 27cbffff1bda2c41e20cee90591118dc9abb6592 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 1 Oct 2020 16:23:42 +0200
Subject: [PATCH 311/516] Minor edit to CoNLL-U converter (#6172)

This doesn't make a difference given how the `merged_morph` values
override the `morph` values for all the final docs, but could have led
to unexpected bugs in the future if the converter is modified.
---
 spacy/training/converters/conllu_to_docs.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py
index 18a2b6a93..2e6084ae5 100644
--- a/spacy/training/converters/conllu_to_docs.py
+++ b/spacy/training/converters/conllu_to_docs.py
@@ -207,6 +207,7 @@ def conllu_sentence_to_doc(
         pos=poses,
         deps=deps,
         lemmas=lemmas,
+        morphs=morphs,
         heads=heads,
     )
     for i in range(len(doc)):

From 7f68f4bd92b1bbafe3b4bb8c91d1da08818b06ab Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 1 Oct 2020 16:44:17 +0200
Subject: [PATCH 312/516] Hide jsonl_loc on init vectors and tidy up [ci skip]

---
 spacy/cli/init_pipeline.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index e2935184c..62d9096d9 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -7,6 +7,7 @@ import srsly
 
 from .. import util
 from ..training.initialize import init_nlp, convert_vectors
+from ..language import Language
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu
 
@@ -19,9 +20,9 @@ def init_vectors_cli(
     output_dir: Path = Arg(..., help="Pipeline output directory"),
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
-    jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
     # fmt: on
 ):
     """Convert word vectors for use with spaCy. Will export an nlp object that
@@ -32,12 +33,7 @@ def init_vectors_cli(
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
     if jsonl_loc is not None:
-        lex_attrs = srsly.read_jsonl(jsonl_loc)
-        for attrs in lex_attrs:
-            if "settings" in attrs:
-                continue
-            lexeme = nlp.vocab[attrs["orth"]]
-            lexeme.set_attrs(**attrs)
+        update_lexemes(nlp, jsonl_loc)
     convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
     msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)
@@ -48,6 +44,16 @@ def init_vectors_cli(
     )
 
 
+def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
+    # Mostly used for backwards-compatibility and may be removed in the future
+    lex_attrs = srsly.read_jsonl(jsonl_loc)
+    for attrs in lex_attrs:
+        if "settings" in attrs:
+            continue
+        lexeme = nlp.vocab[attrs["orth"]]
+        lexeme.set_attrs(**attrs)
+
+
 @init_cli.command(
     "nlp",
     context_settings={"allow_extra_args": True, "ignore_unknown_options": True},

From 1328c9fd1452fc16f42fb4ee6516e53ca055a4db Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 1 Oct 2020 16:59:22 +0200
Subject: [PATCH 313/516] consistently use --code instead of --code-path

---
 website/docs/api/cli.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index a6cb41e5e..ade62e3db 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -256,7 +256,7 @@ fixed. To auto-fill a partial config and save the result, you can use the
 [`init fill-config`](/api/cli#init-fill-config) command.
 
 ```cli
-$ python -m spacy debug config [config_path] [--code-path] [--show-functions] [--show-variables] [overrides]
+$ python -m spacy debug config [config_path] [--code] [--show-functions] [--show-variables] [overrides]
 ```
 
 > #### Example
@@ -399,7 +399,7 @@ File       /path/to/thinc/thinc/schedules.py (line 91)
 | Name                     | Description                                                                                                                                                                                                                    |
 | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`            | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~                                                                                                    |
-| `--code-path`, `-c`      | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                                           |
+| `--code`, `-c`           | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                                           |
 | `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~                                             |
 | `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ |
 | `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                     |
@@ -1162,7 +1162,7 @@ examples, see the usage guide on
 [integration](/usage/projects#ray).
 
 ```cli
-$ python -m spacy ray train [config_path] [--code-path] [--output] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
+$ python -m spacy ray train [config_path] [--code] [--output] [--n-workers] [--address] [--gpu-id] [--verbose] [overrides]
 ```
 
 > #### Example

From f2627157c85aeda99969df8b1bf9730539d43c1f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 1 Oct 2020 17:38:17 +0200
Subject: [PATCH 314/516] Update docs [ci skip]

---
 spacy/cli/init_pipeline.py           |  2 +-
 spacy/pipeline/tagger.pyx            |  3 ++
 spacy/pipeline/textcat.py            |  3 ++
 spacy/training/corpus.py             |  2 +-
 website/docs/api/cli.md              | 58 +++++++++++++++++++++-------
 website/docs/api/data-formats.md     | 25 ++++++------
 website/docs/api/dependencyparser.md | 32 ++++++++++-----
 website/docs/api/doc.md              | 34 ++++++++--------
 website/docs/api/entitylinker.md     |  2 +-
 website/docs/api/entityrecognizer.md | 29 ++++++++++----
 website/docs/api/language.md         |  2 +-
 website/docs/api/morphologizer.md    | 25 +++++++++---
 website/docs/api/pipe.md             |  2 +-
 website/docs/api/tagger.md           | 27 +++++++++----
 website/docs/api/textcategorizer.md  | 27 +++++++++----
 website/docs/api/top-level.md        | 26 +++++++++++++
 website/docs/usage/training.md       | 20 +++++++++-
 17 files changed, 235 insertions(+), 84 deletions(-)

diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 62d9096d9..1c0233539 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -95,7 +95,7 @@ def init_labels_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
-    """Generate a JSON file for labels in the data. This helps speed up the
+    """Generate JSON files for the labels in the data. This helps speed up the
     training process, since spaCy won't have to preprocess the data to
     extract the labels."""
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index a4f9d395f..37ad42b88 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -268,6 +268,9 @@ class Tagger(Pipe):
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects..
         nlp (Language): The current nlp object the component is part of.
+        labels: The labels to add to the component, typically generated by the
+            `init labels` command. If no labels are provided, the get_examples
+            callback is used to extract the labels from the data.
 
         DOCS: https://nightly.spacy.io/api/tagger#initialize
         """
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index c5b8b615b..a092d960f 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -355,6 +355,9 @@ class TextCategorizer(Pipe):
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
         nlp (Language): The current nlp object the component is part of.
+        labels: The labels to add to the component, typically generated by the
+            `init labels` command. If no labels are provided, the get_examples
+            callback is used to extract the labels from the data.
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
         """
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 9d8e4ff5c..57787cf76 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -46,7 +46,7 @@ def create_jsonl_reader(
 
 
 @util.registry.readers("spacy.read_labels.v1")
-def read_labels(path: Path, *, require: bool=False):
+def read_labels(path: Path, *, require: bool = False):
     # I decided not to give this a generic name, because I don't want people to
     # use it for arbitrary stuff, as I want this require arg with default False.
     if not require and not path.exists():
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 347ce1683..436582780 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -186,21 +186,53 @@ This functionality was previously available as part of the command `init-model`.
 </Infobox>
 
 ```cli
-$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--lexemes-jsonl] [--verbose]
+$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
 ```
 
-| Name                    | Description                                                                                                                                                                                                                                                         |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lang`                  | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~                                                                                                                                                |
-| `vectors_loc`           | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
-| `output_dir`            | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               |
-| `--lexemes-jsonl`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~                                                                                                                         |
-| `--truncate`, `-t`      | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  |
-| `--prune`, `-p`         | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~                                                                                                                                                                     |
-| `--name`, `-n`          | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~                                                                                                                                                   |
-| `--verbose`, `-V`       | Print additional information and explanations. ~~bool (flag)~~                                                                                                                                                                                                      |
-| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                          |
-| **CREATES**             | A spaCy pipeline directory containing the vocab and vectors.                                                                                                                                                                                                        |
+| Name               | Description                                                                                                                                                                                                                                                         |
+| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `lang`             | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~                                                                                                                                                |
+| `vectors_loc`      | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ |
+| `output_dir`       | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               |
+| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  |
+| `--prune`, `-p`    | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~                                                                                                                                                                     |
+| `--name`, `-n`     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~                                                                                                                                                   |
+| `--verbose`, `-V`  | Print additional information and explanations. ~~bool (flag)~~                                                                                                                                                                                                      |
+| `--help`, `-h`     | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                          |
+| **CREATES**        | A spaCy pipeline directory containing the vocab and vectors.                                                                                                                                                                                                        |
+
+### init labels {#init-labels new="3" tag="command"}
+
+Generate JSON files for the labels in the data. This helps speed up the training
+process, since spaCy won't have to preprocess the data to extract the labels.
+After generating the labels, you can provide them to components that accept a
+`labels` argument on initialization via the
+[`[initialize]`](/api/data-formats#config-initialize) block of your config.
+
+> #### Example config
+>
+> ```ini
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+```cli
+$ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [--gpu-id] [overrides]
+```
+
+| Name              | Description                                                                                                                                                                                |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~                                                                |
+| `output_path`     | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~                                                                                       |
+| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~       |
+| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                               |
+| `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                 |
+| `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                 |
+| overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
+| **CREATES**       | One JSON file of labels per component in the output directory.                                                                                                                             |
 
 ## convert {#convert tag="command"}
 
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 825d95def..22a0076cd 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -238,8 +238,6 @@ without requiring them at runtime when you load the trained pipeline back in.
 > data_path = "/path/to/component_data"
 > ```
 
-<!-- TODO: -->
-
 | Name           | Description                                                                                                                                                                                                                                                                                                                                                                                                    |
 | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `components`   | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~                                                                      |
@@ -454,15 +452,20 @@ example = Example.from_dict(doc, gold_dict)
 
 ## Lexical data for vocabulary {#vocab-jsonl new="2"}
 
-To populate a pipeline's vocabulary, you can use the
-[`spacy init vectors`](/api/cli#init-vectors) command and load in a
-[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
-lexical entry per line via the `--jsonl-loc` option. The first line defines the
-language and vocabulary settings. All other lines are expected to be JSON
-objects describing an individual lexeme. The lexical attributes will be then set
-as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab`
-command outputs a ready-to-use spaCy pipeline with a `Vocab` containing the
-lexical data.
+This data file can be provided via the `vocab_data` setting in the
+`[initialize]` block of the training config to pre-define the lexical data to
+initialize the `nlp` object's vocabulary with. The file should contain one
+lexical entry per line. The first line defines the language and vocabulary
+settings. All other lines are expected to be JSON objects describing an
+individual lexeme. The lexical attributes will then be set as attributes on
+spaCy's [`Lexeme`](/api/lexeme#attributes) object.
+
+> #### Example config
+>
+> ```ini
+> [initialize]
+> vocab_data = "/path/to/vocab-data.jsonl"
+> ```
 
 ```python
 ### First line
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index ad627b006..ea4b779c7 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -21,8 +21,9 @@ non-projective parses.
 The parser is trained using an **imitation learning objective**. It follows the
 actions predicted by the current weights, and at each state, determines which
 actions are compatible with the optimal parse that could be reached from the
-current state. The weights are updated such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note
-that more than one action may be optimal for a given state.
+current state. The weights are updated such that the scores assigned to the set
+of optimal actions are increased, while scores assigned to other actions are
+decreased. Note that more than one action may be optimal for a given state.
 
 ## Config and implementation {#config}
 
@@ -139,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## DependencyParser.initialize {#initialize tag="method"}
+## DependencyParser.initialize {#initialize tag="method" new="3"}
 
 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -148,7 +149,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
 
 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
 
@@ -162,12 +166,22 @@ This method was previously called `begin_training`.
 > parser = nlp.add_pipe("parser")
 > parser.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.parser]
+>
+> [initialize.components.parser.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/parser.json"
+> ```
 
-| Name           | Description                                                                                                                           |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
-| _keyword-only_ |                                                                                                                                       |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
+| Name           | Description                                                                                                                                                                                                                                                                                                         |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
+| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
 
 ## DependencyParser.predict {#predict tag="method"}
 
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 43d968c3a..d511dc889 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -31,21 +31,21 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 > doc = Doc(nlp.vocab, words=words, spaces=spaces)
 > ```
 
-| Name                                     | Description                                                                                                                                                                                                       |
-| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`                                  | A storage container for lexical types. ~~Vocab~~                                                                                                                                                                  |
-| `words`                                  | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                                                |
-| `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~                      |
-| _keyword-only_                           |                                                                                                                                                                                                                   |
-| `user\_data`                             | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                                                |
-| `tags` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                             |
-| `pos` <Tag variant="new">3</Tag>         | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                             |
-| `morphs` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                           |
-| `lemmas` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                           |
-| `heads` <Tag variant="new">3</Tag>       | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~                |
-| `deps` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                                             |
-| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~                 |
-| `ents` <Tag variant="new">3</Tag>        | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ |
+| Name                                     | Description                                                                                                                                                                                        |
+| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`                                  | A storage container for lexical types. ~~Vocab~~                                                                                                                                                   |
+| `words`                                  | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                                 |
+| `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~       |
+| _keyword-only_                           |                                                                                                                                                                                                    |
+| `user\_data`                             | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                                 |
+| `tags` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                              |
+| `pos` <Tag variant="new">3</Tag>         | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                              |
+| `morphs` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
+| `lemmas` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
+| `heads` <Tag variant="new">3</Tag>       | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
+| `deps` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                              |
+| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]]~~ |
+| `ents` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~                                                                   |
 
 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
 
@@ -503,7 +503,9 @@ invalidated, although they may accidentally continue to work.
 Mark a span for merging. The `attrs` will be applied to the resulting token (if
 they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
 underlying lexeme (if they're context-independent lexical attributes like
-`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values.
+`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided
+using the `"_"` key and specifying a dictionary that maps attribute names to
+values.
 
 > #### Example
 >
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index b3c3f20f5..169a175e2 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## EntityLinker.initialize {#initialize tag="method"}
+## EntityLinker.initialize {#initialize tag="method" new="3"}
 
 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 65dcfc17c..5fbd0b229 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -43,7 +43,7 @@ architectures and their arguments and hyperparameters.
 
 | Setting                       | Description                                                                                                                                                                                                                                         |
 | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `moves`                       | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~                                                                                                                                       |
+| `moves`                       | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~                                                                                                                                     |
 | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
 | `model`                       | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~                                                 |
 
@@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## EntityRecognizer.initialize {#initialize tag="method"}
+## EntityRecognizer.initialize {#initialize tag="method" new="3"}
 
 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -138,7 +138,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
 
 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
 
@@ -152,12 +155,22 @@ This method was previously called `begin_training`.
 > ner = nlp.add_pipe("ner")
 > ner.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
 
-| Name           | Description                                                                                                                           |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
-| _keyword-only_ |                                                                                                                                       |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
+| Name           | Description                                                                                                                                                                                                                                                                                                         |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
+| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
 
 ## EntityRecognizer.predict {#predict tag="method"}
 
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index d8d3b3edc..9f0612b2b 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -202,7 +202,7 @@ more efficient than processing texts one-by-one.
 | `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~                                                                                                               |
 | **YIELDS**                                 | Documents in the order of the original text. ~~Doc~~                                                                                                                |
 
-## Language.initialize {#initialize tag="method"}
+## Language.initialize {#initialize tag="method" new="3"}
 
 Initialize the pipeline for training and return an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers). Under the hood, it uses the
diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md
index 68e096ab7..50e2bb33a 100644
--- a/website/docs/api/morphologizer.md
+++ b/website/docs/api/morphologizer.md
@@ -126,7 +126,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
 
 > #### Example
 >
@@ -134,12 +137,22 @@ by [`Language.initialize`](/api/language#initialize).
 > morphologizer = nlp.add_pipe("morphologizer")
 > morphologizer.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.morphologizer]
+>
+> [initialize.components.morphologizer.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/morphologizer.json"
+> ```
 
-| Name           | Description                                                                                                                           |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
-| _keyword-only_ |                                                                                                                                       |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
+| Name           | Description                                                                                                                                                                                                                                                                                                         |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
+| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
 
 ## Morphologizer.predict {#predict tag="method"}
 
diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index 385ad7ec9..4f5ac6f61 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## Pipe.initialize {#initialize tag="method"}
+## Pipe.initialize {#initialize tag="method" new="3"}
 
 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md
index ff9763e61..d7c56be67 100644
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## Tagger.initialize {#initialize tag="method"}
+## Tagger.initialize {#initialize tag="method" new="3"}
 
 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -121,7 +121,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
 
 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
 
@@ -135,12 +138,22 @@ This method was previously called `begin_training`.
 > tagger = nlp.add_pipe("tagger")
 > tagger.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.tagger]
+>
+> [initialize.components.tagger.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/tagger.json"
+> ```
 
-| Name           | Description                                                                                                                           |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
-| _keyword-only_ |                                                                                                                                       |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
+| Name           | Description                                                                                                                                                                                                                                                                                                         |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
+| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[list]~~ |
 
 ## Tagger.predict {#predict tag="method"}
 
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index f279189f6..dd8c81040 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## TextCategorizer.initialize {#initialize tag="method"}
+## TextCategorizer.initialize {#initialize tag="method" new="3"}
 
 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -134,7 +134,10 @@ training data or a representative sample. Initialization includes validating the
 network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize).
+by [`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
 
 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
 
@@ -148,12 +151,22 @@ This method was previously called `begin_training`.
 > textcat = nlp.add_pipe("textcat")
 > textcat.initialize(lambda: [], nlp=nlp)
 > ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.textcat]
+>
+> [initialize.components.textcat.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/textcat.json"
+> ```
 
-| Name           | Description                                                                                                                           |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
-| _keyword-only_ |                                                                                                                                       |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
+| Name           | Description                                                                                                                                                                                                                                                                                                         |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
+| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
+| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
 
 ## TextCategorizer.predict {#predict tag="method"}
 
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index c16983c78..68d7a3039 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -538,6 +538,32 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
 | `limit`      | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                  |
 | **CREATES**  | The corpus reader. ~~JsonlTexts~~                                                                                                |
 
+### spacy.read_labels.v1 {#read_labels tag="registered function"}
+
+Read a JSON-formatted labels file generated with
+[`init labels`](/api/cli#init-labels). Typically used in the
+[`[initialize]`](/api/data-formats#config-initialize) block of the training
+config to speed up the model initialization process and provide pre-generated
+label sets.
+
+> #### Example config
+>
+> ```ini
+> [initialize.components]
+>
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+| Name        | Description                                                                                                                                                                                                               |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path`      | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~                                                                                                                                |
+| `require`   | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
+| **CREATES** | The labels data read from the file, or `None` if the file doesn't exist and `require` is `False`.                                                                                                                         |
+
 ## Batchers {#batchers source="spacy/training/batchers.py" new="3"}
 
 A data batcher implements a batching strategy that essentially turns a stream of
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 51aa82618..c6c05ac5b 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -204,7 +204,19 @@ initialize it.
 
 ![Illustration of pipeline lifecycle](../images/lifecycle.svg)
 
-<!-- TODO: explain lifecycle and initialization -->
+At runtime spaCy will only use the `[nlp]` and `[components]` blocks of the
+config and load all data, including tokenization rules, model weights and other
+resources from the pipeline directory. The `[training]` block contains the
+settings for training the model and is only used during training. Similarly, the
+`[initialize]` block defines how the initial `nlp` object should be set up
+before training and whether it should be initialized with vectors or pretrained
+tok2vec weights, or any other data needed by the components.
+
+The initialization settings are only loaded and used when
+[`nlp.initialize`](/api/language#initialize) is called (typically right before
+training). This allows you to set up your pipeline using local data resources
+and custom functions, and preserve the information in your config – but without
+requiring it to be available at runtime.
 
 ### Overwriting config settings on the command line {#config-overrides}
 
@@ -803,6 +815,10 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
     return create_model(output_width)
 ```
 
+<!-- TODO:
+### Customizing the initialization {#initialization}
+-->
+
 ## Data utilities {#data}
 
 spaCy includes various features and utilities to make it easy to train models
@@ -853,7 +869,7 @@ nlp = spacy.blank("en")
 docbin = DocBin(nlp.vocab)
 words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
 spaces = [True, True, True, True, True, True, True, False]
-ents = [("ORG", 0, 1), ("GPE", 5, 6)]
+ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
 doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
 docbin.add(doc)
 docbin.to_disk("./train.spacy")

From b6b73a3ca8c3352b74c6f4ae89338a018a52f092 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 1 Oct 2020 17:45:29 +0200
Subject: [PATCH 315/516] Update docs [ci skip]

---
 website/docs/usage/v3.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 36840f0f3..4ce57af01 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -104,7 +104,6 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
 >
 > ```ini
 > [training]
-> vectors = null
 > accumulate_gradient = 3
 >
 > [training.optimizer]
@@ -430,6 +429,8 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config)                        | Updated helpers for loading a pipeline's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config).                                                                     |
 | [`util.get_installed_models`](/api/top-level#util.get_installed_models)                                                         | Names of all pipeline packages installed in the environment.                                                                                                                                     |
 | [`init config`](/api/cli#init-config), [`init fill-config`](/api/cli#init-fill-config), [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training).                                                                                                   |
+| [`init vectors`](/api/cli#init-vectors)                                                                                         | Convert word vectors for use with spaCy.                                                                                                                                                         |
+| [`init labels`](/api/cli#init-labels)                                                                                           | Generate JSON files for the labels in the data to speed up training.                                                                                                                             |
 | [`project`](/api/cli#project)                                                                                                   | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects).                                                                                                       |
 | [`ray`](/api/cli#ray)                                                                                                           | Suite of CLI commands for parallel training with [Ray](https://ray.io/), provided by the [`spacy-ray`](https://github.com/explosion/spacy-ray) extension package.                                |
 

From 1700c8541e4f0b696b52c37f1a15eb1b1c5be17e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 1 Oct 2020 17:57:16 +0200
Subject: [PATCH 316/516] Increment version [ci skip]

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index fbe772d25..8c64c2d76 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a26"
+__version__ = "3.0.0a27"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From d48ddd6c9aa983f922d3f310eeba6a272d4c8cbd Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 1 Oct 2020 21:54:33 +0200
Subject: [PATCH 317/516] Remove default initialize lookups

---
 spacy/lang/da/__init__.py      | 12 ------------
 spacy/lang/de/__init__.py      | 12 ------------
 spacy/lang/el/__init__.py      | 12 ------------
 spacy/lang/id/__init__.py      | 12 ------------
 spacy/lang/lb/__init__.py      | 12 ------------
 spacy/lang/pt/__init__.py      | 12 ------------
 spacy/lang/ru/__init__.py      | 12 ------------
 spacy/lang/sr/__init__.py      | 12 ------------
 spacy/lang/ta/__init__.py      | 12 ------------
 spacy/lang/th/__init__.py      |  7 -------
 spacy/tests/parser/test_ner.py |  1 -
 11 files changed, 116 deletions(-)

diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index 7128338af..8cac30b26 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class DanishDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index 99c161961..b645d3480 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class GermanDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 818405842..1a7b19914 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -9,21 +9,9 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIX
 from .lemmatizer import GreekLemmatizer
 from ...lookups import Lookups
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class GreekDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index 46bef57ca..87373551c 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -4,21 +4,9 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class IndonesianDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py
index ead5f5d10..da6fe55d7 100644
--- a/spacy/lang/lb/__init__.py
+++ b/spacy/lang/lb/__init__.py
@@ -3,21 +3,9 @@ from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class LuxembourgishDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 1c95c11d9..0447099f0 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -3,21 +3,9 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class PortugueseDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 857e197e9..6436ae0c7 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -7,21 +7,9 @@ from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
 from ...lookups import Lookups
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class RussianDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index 5da19c6f3..165e54975 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -2,21 +2,9 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class SerbianDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py
index 7a5a3ac8f..ac5fc7124 100644
--- a/spacy/lang/ta/__init__.py
+++ b/spacy/lang/ta/__init__.py
@@ -1,21 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
-from ...util import load_config_from_str
-
-
-DEFAULT_CONFIG = """
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
-"""
 
 
 class TamilDefaults(Language.Defaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index 834fe1871..219c50c1a 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -10,13 +10,6 @@ DEFAULT_CONFIG = """
 
 [nlp.tokenizer]
 @tokenizers = "spacy.th.ThaiTokenizer"
-
-[initialize]
-
-[initialize.lookups]
-@misc = "spacy.LookupsDataLoader.v1"
-lang = ${nlp.lang}
-tables = ["lexeme_norm"]
 """
 
 
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 78a20c1e8..b657ae2e8 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -339,7 +339,6 @@ def test_ner_warns_no_lookups(caplog):
     nlp.vocab.lookups = Lookups()
     assert not len(nlp.vocab.lookups)
     nlp.add_pipe("ner")
-    nlp.config["initialize"]["lookups"] = None
     with caplog.at_level(logging.DEBUG):
         nlp.initialize()
         assert "W033" in caplog.text

From da30701cd1a351285bc7c865e28fd12f8beb0482 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 1 Oct 2020 21:58:11 +0200
Subject: [PATCH 318/516] Increment version [ci skip]

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 8c64c2d76..18fc77184 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a27"
+__version__ = "3.0.0a28"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 684a77870b478228dbb3d5ab45a2798ef83c9b1a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 1 Oct 2020 22:17:26 +0200
Subject: [PATCH 319/516] Allow CharacterEmbed to specify feature

---
 spacy/ml/models/tok2vec.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index fec478e21..888dc9caa 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -1,4 +1,4 @@
-from typing import Optional, List
+from typing import Optional, List, Union
 from thinc.api import chain, clone, concatenate, with_array, with_padded
 from thinc.api import Model, noop, list2ragged, ragged2list
 from thinc.api import FeatureExtractor, HashEmbed
@@ -165,7 +165,8 @@ def MultiHashEmbed(
 
 @registry.architectures.register("spacy.CharacterEmbed.v1")
 def CharacterEmbed(
-    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
+    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool,
+    feature: Union[int, str]="NORM"
 ):
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
@@ -183,7 +184,8 @@ def CharacterEmbed(
     also concatenated on, and the result is then passed through a feed-forward
     network to construct a single vector to represent the information.
 
-    width (int): The width of the output vector and the NORM hash embedding.
+    feature (int or str): An attribute to embed, to concatenate with the characters.
+    width (int): The width of the output vector and the feature embedding.
     rows (int): The number of rows in the NORM hash embedding table.
     nM (int): The dimensionality of the character embeddings. Recommended values
         are between 16 and 64.
@@ -193,12 +195,15 @@ def CharacterEmbed(
     also_use_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
+    feature = intify_attr(feature)
+    if feature is None:
+        raise ValueError("Invalid feature: Must be a token attribute.")
     if also_use_static_vectors:
         model = chain(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
                 chain(
-                    FeatureExtractor([NORM]),
+                    FeatureExtractor([feature]),
                     list2ragged(),
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),
@@ -214,7 +219,7 @@ def CharacterEmbed(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
                 chain(
-                    FeatureExtractor([NORM]),
+                    FeatureExtractor([feature]),
                     list2ragged(),
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),

From b854bca15c0e4cf62d2e1c0f896dc1e6a454c099 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 1 Oct 2020 22:17:58 +0200
Subject: [PATCH 320/516] Default to LOWER in character embed

---
 spacy/ml/models/tok2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 888dc9caa..907a7a293 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -166,7 +166,7 @@ def MultiHashEmbed(
 @registry.architectures.register("spacy.CharacterEmbed.v1")
 def CharacterEmbed(
     width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool,
-    feature: Union[int, str]="NORM"
+    feature: Union[int, str]="LOWER"
 ):
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for

From 86c3ec9c2b3ad28797b26fb75b808bf573087b35 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 1 Oct 2020 22:21:46 +0200
Subject: [PATCH 321/516] Refactor Token morph setting (#6175)

* Refactor Token morph setting

* Remove `Token.morph_`
* Add `Token.set_morph()`
  * `0` resets `token.c.morph` to unset
  * Any other values are passed to `Morphology.add`

* Add token.morph setter to set from MorphAnalysis
---
 spacy/errors.py                             |  3 ++
 spacy/pipeline/morphologizer.pyx            |  4 +-
 spacy/tests/doc/test_array.py               |  6 +--
 spacy/tests/doc/test_doc_api.py             | 38 +++++++++++++++----
 spacy/tests/doc/test_morphanalysis.py       | 42 ++++++++++-----------
 spacy/tests/doc/test_retokenize_merge.py    |  4 +-
 spacy/tests/doc/test_retokenize_split.py    |  4 +-
 spacy/tests/matcher/test_matcher_api.py     | 24 ++++++------
 spacy/tests/matcher/test_phrase_matcher.py  |  4 +-
 spacy/tests/pipeline/test_attributeruler.py | 30 +++++++--------
 spacy/tests/pipeline/test_morphologizer.py  |  4 +-
 spacy/tests/test_scorer.py                  |  6 +--
 spacy/tests/training/test_new_example.py    |  2 +-
 spacy/tests/training/test_training.py       |  4 +-
 spacy/tokens/_serialize.py                  |  2 +-
 spacy/tokens/doc.pyx                        |  2 +-
 spacy/tokens/token.pyx                      | 26 ++++++-------
 spacy/training/example.pyx                  |  2 +-
 spacy/training/gold_io.pyx                  |  2 +-
 19 files changed, 118 insertions(+), 91 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 1c934d188..5236992e9 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -710,6 +710,9 @@ class Errors:
              "options: {modes}")
     E1012 = ("Entity spans and blocked/missing/outside spans should be "
              "provided to doc.set_ents as lists of `Span` objects.")
+    E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
+             "token itself. To set the morph from this MorphAnalysis, set from "
+             "the string value with: `token.set_morph(str(other_morph))`.")
 
 
 @add_codes
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 60ad10a2b..ab0554692 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -149,7 +149,7 @@ class Morphologizer(Tagger):
         for example in get_examples():
             for i, token in enumerate(example.reference):
                 pos = token.pos_
-                morph = token.morph_
+                morph = str(token.morph)
                 # create and add the combined morph+POS label
                 morph_dict = Morphology.feats_to_dict(morph)
                 if pos:
@@ -167,7 +167,7 @@ class Morphologizer(Tagger):
             gold_array = []
             for i, token in enumerate(example.reference):
                 pos = token.pos_
-                morph = token.morph_
+                morph = str(token.morph)
                 morph_dict = Morphology.feats_to_dict(morph)
                 if pos:
                     morph_dict[self.POS_FEAT] = pos
diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py
index 9c050f740..ef54c581c 100644
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@@ -46,9 +46,9 @@ def test_doc_array_morph(en_vocab):
     words = ["Eat", "blue", "ham"]
     morph = ["Feat=V", "Feat=J", "Feat=N"]
     doc = Doc(en_vocab, words=words, morphs=morph)
-    assert morph[0] == doc[0].morph_
-    assert morph[1] == doc[1].morph_
-    assert morph[2] == doc[2].morph_
+    assert morph[0] == str(doc[0].morph)
+    assert morph[1] == str(doc[1].morph)
+    assert morph[2] == str(doc[2].morph)
 
     feats_array = doc.to_array((ORTH, MORPH))
     assert feats_array[0][1] == doc[0].morph.key
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 55a1c1ad2..e3e056685 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -319,15 +319,13 @@ def test_doc_from_array_morph(en_vocab):
     words = ["I", "live", "in", "New", "York", "."]
     morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
     # fmt: on
-    doc = Doc(en_vocab, words=words)
-    for i, morph in enumerate(morphs):
-        doc[i].morph_ = morph
+    doc = Doc(en_vocab, words=words, morphs=morphs)
     attrs = [MORPH]
     arr = doc.to_array(attrs)
     new_doc = Doc(en_vocab, words=words)
     new_doc.from_array(attrs, arr)
-    assert [t.morph_ for t in new_doc] == morphs
-    assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc]
+    assert [str(t.morph) for t in new_doc] == morphs
+    assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]
 
 
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
@@ -423,7 +421,7 @@ def test_has_annotation(en_vocab):
 
     doc[0].tag_ = "A"
     doc[0].pos_ = "X"
-    doc[0].morph_ = "Feat=Val"
+    doc[0].set_morph("Feat=Val")
     doc[0].lemma_ = "a"
     doc[0].dep_ = "dep"
     doc[0].head = doc[1]
@@ -435,7 +433,7 @@ def test_has_annotation(en_vocab):
 
     doc[1].tag_ = "A"
     doc[1].pos_ = "X"
-    doc[1].morph_ = ""
+    doc[1].set_morph("")
     doc[1].lemma_ = "a"
     doc[1].dep_ = "dep"
     doc.ents = [Span(doc, 0, 2, label="HELLO")]
@@ -538,6 +536,32 @@ def test_doc_ents_setter():
     assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
 
 
+def test_doc_morph_setter(en_tokenizer, de_tokenizer):
+    doc1 = en_tokenizer("a b")
+    doc1b = en_tokenizer("c d")
+    doc2 = de_tokenizer("a b")
+
+    # unset values can be copied
+    doc1[0].morph = doc1[1].morph
+    assert doc1[0].morph.key == 0
+    assert doc1[1].morph.key == 0
+
+    # morph values from the same vocab can be copied
+    doc1[0].set_morph("Feat=Val")
+    doc1[1].morph = doc1[0].morph
+    assert doc1[0].morph == doc1[1].morph
+
+    # ... also across docs
+    doc1b[0].morph = doc1[0].morph
+    assert doc1[0].morph == doc1b[0].morph
+
+    doc2[0].set_morph("Feat2=Val2")
+
+    # the morph value must come from the same vocab
+    with pytest.raises(ValueError):
+        doc1[0].morph = doc2[0].morph
+
+
 def test_doc_init_iob():
     """Test ents validation/normalization in Doc.__init__"""
     words = ["a", "b", "c", "d", "e"]
diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py
index f378ce042..56c80dd66 100644
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@@ -4,13 +4,13 @@ import pytest
 @pytest.fixture
 def i_has(en_tokenizer):
     doc = en_tokenizer("I has")
-    doc[0].morph_ = {"PronType": "prs"}
-    doc[1].morph_ = {
+    doc[0].set_morph({"PronType": "prs"})
+    doc[1].set_morph({
         "VerbForm": "fin",
         "Tense": "pres",
         "Number": "sing",
         "Person": "three",
-    }
+    })
 
     return doc
 
@@ -47,20 +47,20 @@ def test_morph_get(i_has):
 def test_morph_set(i_has):
     assert i_has[0].morph.get("PronType") == ["prs"]
     # set by string
-    i_has[0].morph_ = "PronType=unk"
+    i_has[0].set_morph("PronType=unk")
     assert i_has[0].morph.get("PronType") == ["unk"]
     # set by string, fields are alphabetized
-    i_has[0].morph_ = "PronType=123|NounType=unk"
-    assert i_has[0].morph_ == "NounType=unk|PronType=123"
+    i_has[0].set_morph("PronType=123|NounType=unk")
+    assert str(i_has[0].morph) == "NounType=unk|PronType=123"
     # set by dict
-    i_has[0].morph_ = {"AType": "123", "BType": "unk"}
-    assert i_has[0].morph_ == "AType=123|BType=unk"
+    i_has[0].set_morph({"AType": "123", "BType": "unk"})
+    assert str(i_has[0].morph) == "AType=123|BType=unk"
     # set by string with multiple values, fields and values are alphabetized
-    i_has[0].morph_ = "BType=c|AType=b,a"
-    assert i_has[0].morph_ == "AType=a,b|BType=c"
+    i_has[0].set_morph("BType=c|AType=b,a")
+    assert str(i_has[0].morph) == "AType=a,b|BType=c"
     # set by dict with multiple values, fields and values are alphabetized
-    i_has[0].morph_ = {"AType": "b,a", "BType": "c"}
-    assert i_has[0].morph_ == "AType=a,b|BType=c"
+    i_has[0].set_morph({"AType": "b,a", "BType": "c"})
+    assert str(i_has[0].morph) == "AType=a,b|BType=c"
 
 
 def test_morph_str(i_has):
@@ -72,25 +72,25 @@ def test_morph_property(tokenizer):
     doc = tokenizer("a dog")
 
     # set through token.morph_
-    doc[0].morph_ = "PronType=prs"
-    assert doc[0].morph_ == "PronType=prs"
+    doc[0].set_morph("PronType=prs")
+    assert str(doc[0].morph) == "PronType=prs"
     assert doc.to_array(["MORPH"])[0] != 0
 
     # unset with token.morph
-    doc[0].morph = 0
+    doc[0].set_morph(0)
     assert doc.to_array(["MORPH"])[0] == 0
 
     # empty morph is equivalent to "_"
-    doc[0].morph_ = ""
-    assert doc[0].morph_ == ""
+    doc[0].set_morph("")
+    assert str(doc[0].morph) == ""
     assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
 
     # "_" morph is also equivalent to empty morph
-    doc[0].morph_ = "_"
-    assert doc[0].morph_ == ""
+    doc[0].set_morph("_")
+    assert str(doc[0].morph) == ""
     assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
 
     # set through existing hash with token.morph
     tokenizer.vocab.strings.add("Feat=Val")
-    doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
-    assert doc[0].morph_ == "Feat=Val"
+    doc[0].set_morph(tokenizer.vocab.strings.add("Feat=Val"))
+    assert str(doc[0].morph) == "Feat=Val"
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index ab186b062..cb886545a 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -21,11 +21,11 @@ def test_doc_retokenize_merge(en_tokenizer):
     assert doc[4].text == "the beach boys"
     assert doc[4].text_with_ws == "the beach boys "
     assert doc[4].tag_ == "NAMED"
-    assert doc[4].morph_ == "Number=Plur"
+    assert str(doc[4].morph) == "Number=Plur"
     assert doc[5].text == "all night"
     assert doc[5].text_with_ws == "all night"
     assert doc[5].tag_ == "NAMED"
-    assert doc[5].morph_ == "Number=Plur"
+    assert str(doc[5].morph) == "Number=Plur"
 
 
 def test_doc_retokenize_merge_children(en_tokenizer):
diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index 4d4b170f9..238e36d59 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -27,11 +27,11 @@ def test_doc_retokenize_split(en_vocab):
     assert doc[0].text == "Los"
     assert doc[0].head.text == "Angeles"
     assert doc[0].idx == 0
-    assert doc[0].morph_ == "Number=Sing"
+    assert str(doc[0].morph) == "Number=Sing"
     assert doc[1].idx == 3
     assert doc[1].text == "Angeles"
     assert doc[1].head.text == "start"
-    assert doc[1].morph_ == "Number=Sing"
+    assert str(doc[1].morph) == "Number=Sing"
     assert doc[2].text == "start"
     assert doc[2].head.text == "."
     assert doc[3].text == "."
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 627110cdd..77b09f376 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -236,13 +236,13 @@ def test_matcher_subset_value_operator(en_vocab):
     matcher.add("M", [pattern])
     doc = Doc(en_vocab, words=["a", "b", "c"])
     assert len(matcher(doc)) == 3
-    doc[0].morph_ = "Feat=Val"
+    doc[0].set_morph("Feat=Val")
     assert len(matcher(doc)) == 3
-    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    doc[0].set_morph("Feat=Val|Feat2=Val2")
     assert len(matcher(doc)) == 3
-    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
     assert len(matcher(doc)) == 2
-    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
     assert len(matcher(doc)) == 2
 
     # IS_SUBSET acts like "IN" for attrs other than MORPH
@@ -268,11 +268,11 @@ def test_matcher_superset_value_operator(en_vocab):
     matcher.add("M", [pattern])
     doc = Doc(en_vocab, words=["a", "b", "c"])
     assert len(matcher(doc)) == 0
-    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    doc[0].set_morph("Feat=Val|Feat2=Val2")
     assert len(matcher(doc)) == 0
-    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
     assert len(matcher(doc)) == 1
-    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
     assert len(matcher(doc)) == 1
 
     # IS_SUPERSET with more than one value only matches for MORPH
@@ -310,9 +310,9 @@ def test_matcher_morph_handling(en_vocab):
     doc = Doc(en_vocab, words=["a", "b", "c"])
     assert len(matcher(doc)) == 0
 
-    doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
+    doc[0].set_morph("Feat2=Val2|Feat1=Val1")
     assert len(matcher(doc)) == 2
-    doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
+    doc[0].set_morph("Feat1=Val1|Feat2=Val2")
     assert len(matcher(doc)) == 2
 
     # multiple values are split
@@ -324,9 +324,9 @@ def test_matcher_morph_handling(en_vocab):
     doc = Doc(en_vocab, words=["a", "b", "c"])
     assert len(matcher(doc)) == 0
 
-    doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
+    doc[0].set_morph("Feat2=Val2,Val3|Feat1=Val1")
     assert len(matcher(doc)) == 1
-    doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
+    doc[0].set_morph("Feat1=Val1,Val3|Feat2=Val2")
     assert len(matcher(doc)) == 2
 
 
@@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
     doc2 = Doc(en_vocab, words=["Test"])
     doc2[0].tag_ = "TAG"
     doc2[0].pos_ = "X"
-    doc2[0].morph_ = "Feat=Val"
+    doc2[0].set_morph("Feat=Val")
     doc2[0].lemma_ = "LEMMA"
     doc3 = Doc(en_vocab, words=["Test"])
     # DEP requires DEP
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 522356ffc..1b81fd780 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -190,7 +190,7 @@ def test_phrase_matcher_validation(en_vocab):
     doc2 = Doc(en_vocab, words=["Test"])
     doc2[0].tag_ = "TAG"
     doc2[0].pos_ = "X"
-    doc2[0].morph_ = "Feat=Val"
+    doc2[0].set_morph("Feat=Val")
     doc3 = Doc(en_vocab, words=["Test"])
     matcher = PhraseMatcher(en_vocab, validate=True)
     with pytest.warns(UserWarning):
@@ -217,7 +217,7 @@ def test_attr_pipeline_checks(en_vocab):
     doc2 = Doc(en_vocab, words=["Test"])
     doc2[0].tag_ = "TAG"
     doc2[0].pos_ = "X"
-    doc2[0].morph_ = "Feat=Val"
+    doc2[0].set_morph("Feat=Val")
     doc2[0].lemma_ = "LEMMA"
     doc3 = Doc(en_vocab, words=["Test"])
     # DEP requires DEP
diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py
index b9e5894dd..5773127af 100644
--- a/spacy/tests/pipeline/test_attributeruler.py
+++ b/spacy/tests/pipeline/test_attributeruler.py
@@ -69,9 +69,9 @@ def test_attributeruler_init(nlp, pattern_dicts):
         a.add(**p)
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
 
@@ -81,9 +81,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
     nlp.remove_pipe("attribute_ruler")
@@ -94,9 +94,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     )
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
 
@@ -106,9 +106,9 @@ def test_attributeruler_score(nlp, pattern_dicts):
     nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
-    assert doc[2].morph_ == "Case=Nom|Number=Plur"
+    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
-    assert doc[3].morph_ == "Case=Nom|Number=Sing"
+    assert str(doc[3].morph) == "Case=Nom|Number=Sing"
 
     dev_examples = [
         Example.from_dict(
@@ -150,10 +150,10 @@ def test_attributeruler_tag_map(nlp, tag_map):
     for i in range(len(doc)):
         if i == 4:
             assert doc[i].pos_ == "PUNCT"
-            assert doc[i].morph_ == "PunctType=peri"
+            assert str(doc[i].morph) == "PunctType=peri"
         else:
             assert doc[i].pos_ == ""
-            assert doc[i].morph_ == ""
+            assert str(doc[i].morph) == ""
 
 
 def test_attributeruler_morph_rules(nlp, morph_rules):
@@ -168,11 +168,11 @@ def test_attributeruler_morph_rules(nlp, morph_rules):
     for i in range(len(doc)):
         if i != 2:
             assert doc[i].pos_ == ""
-            assert doc[i].morph_ == ""
+            assert str(doc[i].morph) == ""
         else:
             assert doc[2].pos_ == "DET"
             assert doc[2].lemma_ == "a"
-            assert doc[2].morph_ == "Case=Nom"
+            assert str(doc[2].morph) == "Case=Nom"
 
 
 def test_attributeruler_indices(nlp):
@@ -194,14 +194,14 @@ def test_attributeruler_indices(nlp):
     for i in range(len(doc)):
         if i == 1:
             assert doc[i].lemma_ == "was"
-            assert doc[i].morph_ == "Case=Nom|Number=Sing"
+            assert str(doc[i].morph) == "Case=Nom|Number=Sing"
         elif i == 2:
             assert doc[i].lemma_ == "the"
-            assert doc[i].morph_ == "Case=Nom|Number=Plur"
+            assert str(doc[i].morph) == "Case=Nom|Number=Plur"
         elif i == 3:
             assert doc[i].lemma_ == "cat"
         else:
-            assert doc[i].morph_ == ""
+            assert str(doc[i].morph) == ""
     # raises an error when trying to modify a token outside of the match
     a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
     with pytest.raises(ValueError):
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 5d605f4e6..af81129c0 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -91,7 +91,7 @@ def test_overfitting_IO():
     doc = nlp(test_text)
     gold_morphs = ["Feat=N", "Feat=V", "", ""]
     gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
-    assert [t.morph_ for t in doc] == gold_morphs
+    assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags
 
     # Also test the results are still the same after IO
@@ -99,5 +99,5 @@ def test_overfitting_IO():
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
-        assert [t.morph_ for t in doc2] == gold_morphs
+        assert [str(t.morph) for t in doc2] == gold_morphs
         assert [t.pos_ for t in doc2] == gold_pos_tags
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 187aa1b52..039f3d4d8 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -76,7 +76,7 @@ def tagged_doc():
     for i in range(len(tags)):
         doc[i].tag_ = tags[i]
         doc[i].pos_ = pos[i]
-        doc[i].morph_ = morphs[i]
+        doc[i].set_morph(morphs[i])
         if i > 0:
             doc[i].is_sent_start = False
     return doc
@@ -242,7 +242,7 @@ def test_tag_score(tagged_doc):
     gold = {
         "tags": [t.tag_ for t in tagged_doc],
         "pos": [t.pos_ for t in tagged_doc],
-        "morphs": [t.morph_ for t in tagged_doc],
+        "morphs": [str(t.morph) for t in tagged_doc],
         "sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc],
     }
     example = Example.from_dict(tagged_doc, gold)
@@ -259,7 +259,7 @@ def test_tag_score(tagged_doc):
     tags[0] = "NN"
     pos = [t.pos_ for t in tagged_doc]
     pos[1] = "X"
-    morphs = [t.morph_ for t in tagged_doc]
+    morphs = [str(t.morph) for t in tagged_doc]
     morphs[1] = "Number=sing"
     morphs[2] = "Number=plur"
     gold = {
diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py
index 81207b640..06db86a12 100644
--- a/spacy/tests/training/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@@ -113,7 +113,7 @@ def test_Example_from_dict_with_morphology(annots):
     predicted = Doc(vocab, words=annots["words"])
     example = Example.from_dict(predicted, annots)
     for i, token in enumerate(example.reference):
-        assert token.morph_ == annots["morphs"][i]
+        assert str(token.morph) == annots["morphs"][i]
 
 
 @pytest.mark.parametrize(
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 28a411e6d..405801f62 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -460,7 +460,7 @@ def test_roundtrip_docs_to_docbin(doc):
     idx = [t.idx for t in doc]
     tags = [t.tag_ for t in doc]
     pos = [t.pos_ for t in doc]
-    morphs = [t.morph_ for t in doc]
+    morphs = [str(t.morph) for t in doc]
     lemmas = [t.lemma_ for t in doc]
     deps = [t.dep_ for t in doc]
     heads = [t.head.i for t in doc]
@@ -482,7 +482,7 @@ def test_roundtrip_docs_to_docbin(doc):
     assert idx == [t.idx for t in reloaded_example.reference]
     assert tags == [t.tag_ for t in reloaded_example.reference]
     assert pos == [t.pos_ for t in reloaded_example.reference]
-    assert morphs == [t.morph_ for t in reloaded_example.reference]
+    assert morphs == [str(t.morph) for t in reloaded_example.reference]
     assert lemmas == [t.lemma_ for t in reloaded_example.reference]
     assert deps == [t.dep_ for t in reloaded_example.reference]
     assert heads == [t.head.i for t in reloaded_example.reference]
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 2d4e9af9d..ed283a86b 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -101,7 +101,7 @@ class DocBin:
             self.strings.add(token.text)
             self.strings.add(token.tag_)
             self.strings.add(token.lemma_)
-            self.strings.add(token.morph_)
+            self.strings.add(str(token.morph))
             self.strings.add(token.dep_)
             self.strings.add(token.ent_type_)
             self.strings.add(token.ent_kb_id_)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 29fbb6076..9dfa6e714 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1248,7 +1248,7 @@ cdef class Doc:
         for token in self:
             strings.add(token.tag_)
             strings.add(token.lemma_)
-            strings.add(token.morph_)
+            strings.add(str(token.morph))
             strings.add(token.dep_)
             strings.add(token.ent_type_)
             strings.add(token.ent_kb_id_)
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 239de4559..8099abd92 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -215,20 +215,20 @@ cdef class Token:
         def __get__(self):
             return MorphAnalysis.from_id(self.vocab, self.c.morph)
 
-        def __set__(self, attr_t morph):
-            if morph == 0:
-                self.c.morph = morph
-            elif morph in self.vocab.strings:
-                self.morph_ = self.vocab.strings[morph]
-            else:
-                raise ValueError(Errors.E1009.format(val=morph))
+        def __set__(self, MorphAnalysis morph):
+            # Check that the morph has the same vocab
+            if self.vocab != morph.vocab:
+                raise ValueError(Errors.E1013)
+            self.c.morph = morph.c.key
 
-    property morph_:
-        def __get__(self):
-            return str(MorphAnalysis.from_id(self.vocab, self.c.morph))
-
-        def __set__(self, features):
-            cdef hash_t key = self.vocab.morphology.add(features)
+    def set_morph(self, features):
+        cdef hash_t key
+        if features is 0:
+            self.c.morph = 0
+        else:
+            if isinstance(features, int):
+                features = self.vocab.strings[features]
+            key = self.vocab.morphology.add(features)
             self.c.morph = key
 
     @property
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index ca93b6464..f6225135c 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -226,7 +226,7 @@ cdef class Example:
                 "TAG": [t.tag_ for t in self.reference],
                 "LEMMA": [t.lemma_ for t in self.reference],
                 "POS": [t.pos_ for t in self.reference],
-                "MORPH": [t.morph_ for t in self.reference],
+                "MORPH": [str(t.morph) for t in self.reference],
                 "HEAD": [t.head.i for t in self.reference],
                 "DEP": [t.dep_ for t in self.reference],
                 "SENT_START": [int(bool(t.is_sent_start)) for t in self.reference]
diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx
index 8b9f5ab2b..8fb6b8565 100644
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@@ -44,7 +44,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
                 if include_annotation["POS"]:
                     json_token["pos"] = token.pos_
                 if include_annotation["MORPH"]:
-                    json_token["morph"] = token.morph_
+                    json_token["morph"] = str(token.morph)
                 if include_annotation["LEMMA"]:
                     json_token["lemma"] = token.lemma_
                 if include_annotation["DEP"]:

From 5762876dcc1be42e982ad989335f9a485a7c3be3 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 1 Oct 2020 22:27:37 +0200
Subject: [PATCH 322/516] Update default config [ci skip]

---
 spacy/default_config.cfg | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 6bd1ed24d..d7fc46ea0 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -2,7 +2,6 @@
 train = null
 dev = null
 vectors = null
-vocab_data = null
 init_tok2vec = null
 
 [system]
@@ -11,8 +10,13 @@ gpu_allocator = null
 
 [nlp]
 lang = null
+# List of pipeline component names, in order. The names should correspond to
+# components defined in the [components] block
 pipeline = []
+# Components that are loaded but disabled by default
 disabled = []
+# Optional callbacks to modify the nlp object before it's initialized, after
+# it's created and after the pipeline has been set up
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@@ -20,6 +24,7 @@ after_pipeline_creation = null
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
 
+# The pipeline components and their models
 [components]
 
 # Readers for corpora like dev and train.
@@ -38,8 +43,7 @@ max_length = 0
 limit = 0
 # Apply some simply data augmentation, where we replace tokens with variations.
 # This is especially useful for punctuation and case replacement, to help
-# generalize beyond corpora that don't have smart-quotes, or only have smart
-# quotes, etc.
+# generalize beyond corpora that lack smart quotes or only have smart quotes.
 augmenter = null
 
 [corpora.dev]
@@ -53,6 +57,7 @@ gold_preproc = false
 max_length = 0
 # Limitation on number of training examples
 limit = 0
+# Optional callback for data augmentation
 augmenter = null
 
 # Training hyper-parameters and additional features.
@@ -102,17 +107,18 @@ use_averages = false
 eps = 1e-8
 learn_rate = 0.001
 
-# The 'initialize' step is run before training or pretraining. Components and
-# the tokenizer can each define their own arguments via their .initialize
-# methods that are populated by the config. This lets them gather resources like
-# lookup tables and build label sets, construct vocabularies, etc.
+# These settings are used when nlp.initialize() is called (typically before
+# training or pretraining). Components and the tokenizer can each define their
+# own arguments via their initialize methods that are populated by the config.
+# This lets them gather data resources, build label sets etc.
 [initialize]
-vocab_data = ${paths.vocab_data}
-lookups = null
 vectors = ${paths.vectors}
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
+# Data and lookups for vocabulary
+vocab_data = null
+lookups = null
 # Arguments passed to the tokenizer's initialize method
 tokenizer = {}
-# Arguments passed to the initialize methods of the components (keyed by component name)
+# Arguments for initialize methods of the components (keyed by component)
 components = {}

From 50162b8726641248802bfa43d4d63ca26f8efd09 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 1 Oct 2020 22:27:45 +0200
Subject: [PATCH 323/516] Try to work around Sharp build issue [ci skip]

---
 website/gatsby-config.js | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/website/gatsby-config.js b/website/gatsby-config.js
index c1a2f9ab9..4650711ac 100644
--- a/website/gatsby-config.js
+++ b/website/gatsby-config.js
@@ -1,6 +1,11 @@
 const autoprefixer = require('autoprefixer')
 const path = require('path')
 
+// https://florian.ec/blog/gatsby-build-netlify-segmentation-fault/
+const sharp = require('sharp')
+sharp.cache(false)
+sharp.simd(false)
+
 // Markdown plugins
 const wrapSectionPlugin = require('./src/plugins/remark-wrap-section.js')
 const customAttrsPlugin = require('./src/plugins/remark-custom-attrs.js')

From 300e5a9928fd226dfddbf7d5c22558f696bfa1af Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 1 Oct 2020 23:05:55 +0200
Subject: [PATCH 324/516] Avoid relying on NORM in default v3 models (#6176)

* Allow CharacterEmbed to specify feature

* Default to LOWER in character embed

* Update tok2vec

* Use LOWER, not NORM
---
 spacy/ml/models/tok2vec.py | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 95f9c66df..120e9b02c 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -1,4 +1,4 @@
-from typing import Optional, List
+from typing import Optional, List, Union
 from thinc.types import Floats2d
 from thinc.api import chain, clone, concatenate, with_array, with_padded
 from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
@@ -10,7 +10,7 @@ from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
+from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
 
 
 @registry.architectures.register("spacy.Tok2VecListener.v1")
@@ -98,7 +98,7 @@ def MultiHashEmbed(
     attributes using hash embedding, concatenates the results, and passes it
     through a feed-forward subnetwork to build a mixed representations.
 
-    The features used are the NORM, PREFIX, SUFFIX and SHAPE, which can have
+    The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
     varying definitions depending on the Vocab of the Doc object passed in.
     Vectors from pretrained static vectors can also be incorporated into the
     concatenated representation.
@@ -115,7 +115,7 @@ def MultiHashEmbed(
     also_use_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
-    cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+    cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
     seed = 7
 
     def make_hash_embed(feature):
@@ -123,7 +123,7 @@ def MultiHashEmbed(
         seed += 1
         return HashEmbed(
             width,
-            rows if feature == NORM else rows // 2,
+            rows if feature == LOWER else rows // 2,
             column=cols.index(feature),
             seed=seed,
             dropout=0.0,
@@ -131,13 +131,13 @@ def MultiHashEmbed(
 
     if also_embed_subwords:
         embeddings = [
-            make_hash_embed(NORM),
+            make_hash_embed(LOWER),
             make_hash_embed(PREFIX),
             make_hash_embed(SUFFIX),
             make_hash_embed(SHAPE),
         ]
     else:
-        embeddings = [make_hash_embed(NORM)]
+        embeddings = [make_hash_embed(LOWER)]
     concat_size = width * (len(embeddings) + also_use_static_vectors)
     if also_use_static_vectors:
         model = chain(
@@ -165,7 +165,8 @@ def MultiHashEmbed(
 
 @registry.architectures.register("spacy.CharacterEmbed.v1")
 def CharacterEmbed(
-    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
+    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool,
+    feature: Union[int, str]="LOWER"
 ):
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
@@ -179,12 +180,13 @@ def CharacterEmbed(
     of being in an arbitrary position depending on the word length.
 
     The characters are embedded in a embedding table with a given number of rows,
-    and the vectors concatenated. A hash-embedded vector of the NORM of the word is
+    and the vectors concatenated. A hash-embedded vector of the LOWER of the word is
     also concatenated on, and the result is then passed through a feed-forward
     network to construct a single vector to represent the information.
 
-    width (int): The width of the output vector and the NORM hash embedding.
-    rows (int): The number of rows in the NORM hash embedding table.
+    feature (int or str): An attribute to embed, to concatenate with the characters.
+    width (int): The width of the output vector and the feature embedding.
+    rows (int): The number of rows in the feature hash embedding table.
     nM (int): The dimensionality of the character embeddings. Recommended values
         are between 16 and 64.
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values
@@ -193,12 +195,15 @@ def CharacterEmbed(
     also_use_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
+    feature = intify_attr(feature)
+    if feature is None:
+        raise ValueError("Invalid feature: Must be a token attribute.")
     if also_use_static_vectors:
         model = chain(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
                 chain(
-                    FeatureExtractor([NORM]),
+                    FeatureExtractor([feature]),
                     list2ragged(),
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),
@@ -214,7 +219,7 @@ def CharacterEmbed(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
                 chain(
-                    FeatureExtractor([NORM]),
+                    FeatureExtractor([feature]),
                     list2ragged(),
                     with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
                 ),

From 6b94cee4687e70514fc30f8295bf13ea3fd2c194 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 2 Oct 2020 01:11:19 +0200
Subject: [PATCH 325/516] Fix docs [ci skip]

---
 website/docs/api/top-level.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 68d7a3039..22de0ea83 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -269,11 +269,11 @@ If a setting is not present in the options, the default value will be used.
 > displacy.serve(doc, style="ent", options=options)
 > ```
 
-| Name                                    | Description                                                                                                                                                                                                                               |
-| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `ents`                                  | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~                                                                                                                                                      |
-| `colors`                                | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~                                                                                                                                               |
-| `template` <Tag variant="new">2.2</Tag> | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ |
+| Name                                    | Description                                                                                                                                                                                                                                 |
+| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `ents`                                  | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~                                                                                                                                                        |
+| `colors`                                | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~                                                                                                                                                 |
+| `template` <Tag variant="new">2.2</Tag> | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ |
 
 By default, displaCy comes with colors for all entity types used by
 [spaCy's trained pipelines](/models). If you're using custom entity types, you

From e59ecb12c0d6298c75b713ad9cc2f4a1a1682227 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 2 Oct 2020 01:12:30 +0200
Subject: [PATCH 326/516] Auto-format

---
 spacy/ml/featureextractor.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/ml/featureextractor.py b/spacy/ml/featureextractor.py
index dcf212628..3d189008a 100644
--- a/spacy/ml/featureextractor.py
+++ b/spacy/ml/featureextractor.py
@@ -9,7 +9,9 @@ def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[In
     return Model("extract_features", forward, attrs={"columns": columns})
 
 
-def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]:
+def forward(
+    model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
+) -> Tuple[List[Ints2d], Callable]:
     columns = model.attrs["columns"]
     features: List[Ints2d] = []
     for doc in docs:

From af282ae73259dc966bc741de632ee5cab41633a9 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 2 Oct 2020 01:12:34 +0200
Subject: [PATCH 327/516] Fix import

---
 spacy/ml/featureextractor.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/ml/featureextractor.py b/spacy/ml/featureextractor.py
index 3d189008a..ed2918f02 100644
--- a/spacy/ml/featureextractor.py
+++ b/spacy/ml/featureextractor.py
@@ -1,7 +1,8 @@
 from typing import List, Union, Callable, Tuple
-from thinc.types import Ints2d, Doc
+from thinc.types import Ints2d
 from thinc.api import Model, registry
 
+from ..tokens import Doc
 
 
 @registry.layers("spacy.FeatureExtractor.v1")

From 01c1538c720f529f433163d495c351ecbd13ccc2 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 2 Oct 2020 01:36:06 +0200
Subject: [PATCH 328/516] Integrate file readers

---
 pyproject.toml                                |   2 +-
 requirements.txt                              |   4 +-
 setup.cfg                                     |   6 +-
 spacy/default_config_pretraining.cfg          |   2 +-
 spacy/errors.py                               |   6 -
 spacy/tests/training/test_training.py         |   6 +-
 spacy/training/augment.py                     |  40 +++---
 spacy/training/corpus.py                      |  10 +-
 spacy/util.py                                 |   4 -
 website/docs/api/corpus.md                    |  16 +--
 website/docs/api/data-formats.md              |   4 +-
 website/docs/api/top-level.md                 | 115 ++++++++++++------
 website/docs/usage/embeddings-transformers.md |   2 +-
 website/meta/type-annotations.json            |   2 +-
 14 files changed, 126 insertions(+), 93 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e88ba7db9..611a95d27 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a42,<8.0.0a50",
+    "thinc>=8.0.0a43,<8.0.0a50",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index 064efed42..44dad38e3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,12 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a42,<8.0.0a50
+thinc>=8.0.0a43,<8.0.0a50
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
-srsly>=2.1.0,<3.0.0
+srsly>=2.3.0,<3.0.0
 catalogue>=2.0.1,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy
diff --git a/setup.cfg b/setup.cfg
index 36ab64bd9..7a3a2cb30 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,16 +34,16 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a42,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a42,<8.0.0a50
+    thinc>=8.0.0a43,<8.0.0a50
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
-    srsly>=2.1.0,<3.0.0
+    srsly>=2.3.0,<3.0.0
     catalogue>=2.0.1,<2.1.0
     typer>=0.3.0,<0.4.0
     pathy
diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg
index 4011159a4..66987171a 100644
--- a/spacy/default_config_pretraining.cfg
+++ b/spacy/default_config_pretraining.cfg
@@ -34,7 +34,7 @@ learn_rate = 0.001
 [corpora]
 
 [corpora.pretrain]
-@readers = "spacy.JsonlReader.v1"
+@readers = "spacy.JsonlCorpus.v1"
 path = ${paths.raw_text}
 min_length = 5
 max_length = 500
diff --git a/spacy/errors.py b/spacy/errors.py
index 5236992e9..881a697f6 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -477,12 +477,6 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
-    E912 = ("No orth_variants lookups table for data augmentation available for "
-            "language '{lang}'. If orth_variants are available in "
-            "spacy-lookups-data, make sure the package is installed and the "
-            "table is loaded in the [initialize.lookups] block of your config. "
-            "Alternatively, you can provide your own Lookups object with a "
-            "table orth_variants as the argument 'lookuos' of the augmenter.")
     E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
             "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 405801f62..c53042ef1 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -504,9 +504,9 @@ def test_make_orth_variants(doc):
             {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
         ]
     }
-    lookups = Lookups()
-    lookups.add_table("orth_variants", orth_variants)
-    augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
+    augmenter = create_orth_variants_augmenter(
+        level=0.2, lower=0.5, orth_variants=orth_variants
+    )
     with make_tempdir() as tmpdir:
         output_file = tmpdir / "roundtrip.spacy"
         DocBin(docs=[doc]).to_disk(output_file)
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 176530a1c..8965c5457 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -1,27 +1,43 @@
-from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING
+from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING
 import random
 import itertools
 import copy
 from functools import partial
+from pydantic import BaseModel, StrictStr
 
 from ..util import registry, logger
 from ..tokens import Doc
 from .example import Example
-from ..lookups import Lookups
-from ..errors import Errors
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
 
 
+class OrthVariantsSingle(BaseModel):
+    tags: List[StrictStr]
+    variants: List[StrictStr]
+
+
+class OrthVariantsPaired(BaseModel):
+    tags: List[StrictStr]
+    variants: List[List[StrictStr]]
+
+
+class OrthVariants(BaseModel):  # schema of the augmenter's orth_variants argument
+    paired: List[OrthVariantsPaired] = []  # no paired variants by default
+    single: List[OrthVariantsSingle] = []  # no single variants by default
+
+
 @registry.augmenters("spacy.orth_variants.v1")
 def create_orth_variants_augmenter(
-    level: float, lower: float, lookups: Optional[Lookups] = None,
+    level: float, lower: float, orth_variants: OrthVariants,
 ) -> Callable[["Language", Example], Iterator[Example]]:
     """Create a data augmentation callback that uses orth-variant replacement.
     The callback can be added to a corpus or other data iterator during training.
     """
-    return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups)
+    return partial(
+        orth_variants_augmenter, orth_variants=orth_variants, level=level, lower=lower
+    )
 
 
 def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
@@ -31,20 +47,11 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
 def orth_variants_augmenter(
     nlp: "Language",
     example: Example,
+    orth_variants: dict,
     *,
     level: float = 0.0,
     lower: float = 0.0,
-    lookups: Optional[Lookups] = None,
 ) -> Iterator[Example]:
-    table_name = "orth_variants"
-    if lookups is not None:
-        orth_variants = lookups.get_table(table_name, {})
-        logger.debug("Using data augmentation orth variants from provided lookups")
-    else:
-        orth_variants = nlp.vocab.lookups.get_table(table_name, {})
-        logger.debug("Using data augmentation orth variants from default vocab lookups")
-        if not orth_variants:
-            raise ValueError(Errors.E912.format(lang=nlp.lang))
     if random.random() >= level:
         yield example
     else:
@@ -74,13 +81,14 @@ def make_orth_variants(
     nlp: "Language",
     raw: str,
     token_dict: Dict[str, List[str]],
-    orth_variants: Dict[str, list],
+    orth_variants: Dict[str, List[Dict[str, List[str]]]],
     *,
     lower: bool = False,
 ) -> Tuple[str, Dict[str, List[str]]]:
     orig_token_dict = copy.deepcopy(token_dict)
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
+    logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
     words = token_dict.get("words", [])
     tags = token_dict.get("tags", [])
     # keep unmodified if words or tags are not defined
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 57787cf76..b3ff30e66 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -38,11 +38,11 @@ def create_docbin_reader(
     )
 
 
-@util.registry.readers("spacy.JsonlReader.v1")
+@util.registry.readers("spacy.JsonlCorpus.v1")
 def create_jsonl_reader(
     path: Path, min_length: int = 0, max_length: int = 0, limit: int = 0
 ) -> Callable[["Language"], Iterable[Doc]]:
-    return JsonlTexts(path, min_length=min_length, max_length=max_length, limit=limit)
+    return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit)
 
 
 @util.registry.readers("spacy.read_labels.v1")
@@ -193,7 +193,7 @@ class Corpus:
                             break
 
 
-class JsonlTexts:
+class JsonlCorpus:
     """Iterate Doc objects from a file or directory of jsonl
     formatted raw text files.
 
@@ -206,7 +206,7 @@ class JsonlTexts:
     limit (int): Limit corpus to a subset of examples, e.g. for debugging.
         Defaults to 0, which indicates no limit.
 
-    DOCS: https://nightly.spacy.io/api/corpus#jsonltexts
+    DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus
     """
 
     file_type = "jsonl"
@@ -230,7 +230,7 @@ class JsonlTexts:
         nlp (Language): The current nlp object.
         YIELDS (Example): The example objects.
 
-        DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call
+        DOCS: https://nightly.spacy.io/api/corpus#jsonlcorpus-call
         """
         for loc in walk_corpus(self.path, ".jsonl"):
             records = srsly.read_jsonl(loc)
diff --git a/spacy/util.py b/spacy/util.py
index 8a96ba4fe..f234927d6 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -103,10 +103,6 @@ class registry(thinc.registry):
     cli = catalogue.create("spacy", "cli", entry_points=True)
 
 
-# We want json loading in the registry, so manually register srsly.read_json.
-registry.readers("srsly.read_json.v0", srsly.read_json)
-
-
 class SimpleFrozenDict(dict):
     """Simplified implementation of a frozen dict, mainly used as default
     function or method argument (for arguments that should default to empty
diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md
index 58006a19b..986c6f458 100644
--- a/website/docs/api/corpus.md
+++ b/website/docs/api/corpus.md
@@ -100,7 +100,7 @@ Yield examples from the data.
 | `nlp`      | The current `nlp` object. ~~Language~~ |
 | **YIELDS** | The examples. ~~Example~~              |
 
-## JsonlTexts {#jsonltexts tag="class"}
+## JsonlCorpus {#jsonlcorpus tag="class"}
 
 Iterate Doc objects from a file or directory of JSONL (newline-delimited JSON)
 formatted raw text files. Can be used to read the raw text corpus for language
@@ -126,22 +126,22 @@ file.
 {"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."}
 ```
 
-### JsonlTexts.\_\init\_\_ {#jsonltexts-init tag="method"}
+### JsonlCorpus.\_\_init\_\_ {#jsonlcorpus-init tag="method"}
 
 Initialize the reader.
 
 > #### Example
 >
 > ```python
-> from spacy.training import JsonlTexts
+> from spacy.training import JsonlCorpus
 >
-> corpus = JsonlTexts("./data/texts.jsonl")
+> corpus = JsonlCorpus("./data/texts.jsonl")
 > ```
 >
 > ```ini
 > ### Example config
 > [corpora.pretrain]
-> @readers = "spacy.JsonlReader.v1"
+> @readers = "spacy.JsonlCorpus.v1"
 > path = "corpus/raw_text.jsonl"
 > min_length = 0
 > max_length = 0
@@ -156,17 +156,17 @@ Initialize the reader.
 | `max_length`   | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~        |
 | `limit`        | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                  |
 
-### JsonlTexts.\_\_call\_\_ {#jsonltexts-call tag="method"}
+### JsonlCorpus.\_\_call\_\_ {#jsonlcorpus-call tag="method"}
 
 Yield examples from the data.
 
 > #### Example
 >
 > ```python
-> from spacy.training import JsonlTexts
+> from spacy.training import JsonlCorpus
 > import spacy
 >
-> corpus = JsonlTexts("./texts.jsonl")
+> corpus = JsonlCorpus("./texts.jsonl")
 > nlp = spacy.blank("en")
 > data = corpus(nlp)
 > ```
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 22a0076cd..c1b9bfef4 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -135,7 +135,7 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
 > path = ${paths:dev}
 >
 > [corpora.pretrain]
-> @readers = "spacy.JsonlReader.v1"
+> @readers = "spacy.JsonlCorpus.v1"
 > path = ${paths.raw}
 >
 > [corpora.my_custom_data]
@@ -146,7 +146,7 @@ This section defines a **dictionary** mapping of string keys to functions. Each
 function takes an `nlp` object and yields [`Example`](/api/example) objects. By
 default, the two keys `train` and `dev` are specified and each refer to a
 [`Corpus`](/api/top-level#Corpus). When pretraining, an additional `pretrain`
-section is added that defaults to a [`JsonlReader`](/api/top-level#JsonlReader).
+section is added that defaults to a [`JsonlCorpus`](/api/top-level#jsonlcorpus).
 You can also register custom functions that return a callable.
 
 | Name       | Description                                                                                                                                                                 |
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 22de0ea83..876006774 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -327,7 +327,7 @@ factories.
 | `losses`          | Registry for functions that create [losses](https://thinc.ai/docs/api-loss).                                                                                                                                                                       |
 | `misc`            | Registry for miscellaneous functions that return data assets, knowledge bases or anything else you may need.                                                                                                                                       |
 | `optimizers`      | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers).                                                                                                                                                             |
-| `readers`         | Registry for training and evaluation data readers like [`Corpus`](/api/corpus).                                                                                                                                                                    |
+| `readers`         | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus).                                                                                                                                   |
 | `schedules`       | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules).                                                                                                                                                               |
 | `tokenizers`      | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable.                                                                   |
 
@@ -470,7 +470,65 @@ logging the results.
 
 </Project>
 
-## Readers {#readers source="spacy/training/corpus.py" new="3"}
+## Readers {#readers}
+
+### File readers {#file-readers source="github.com/explosion/srsly" new="3"}
+
+The following file readers are provided by our serialization library
+[`srsly`](https://github.com/explosion/srsly). All registered functions take one
+argument `path`, pointing to the file path to load.
+
+> #### Example config
+>
+> ```ini
+> [corpora.train.augmenter.orth_variants]
+> @readers = "srsly.read_json.v1"
+> path = "corpus/en_orth_variants.json"
+> ```
+
+| Name                    | Description                                           |
+| ----------------------- | ----------------------------------------------------- |
+| `srsly.read_json.v1`    | Read data from a JSON file.                           |
+| `srsly.read_jsonl.v1`   | Read data from a JSONL (newline-delimited JSON) file. |
+| `srsly.read_yaml.v1`    | Read data from a YAML file.                           |
+| `srsly.read_msgpack.v1` | Read data from a binary MessagePack file.             |
+
+<Infobox title="Important note" variant="warning">
+
+Since the file readers expect a local path, you should only use them in config
+blocks that are **not executed at runtime** – for example, in `[training]` and
+`[corpora]` (to load data or resources like data augmentation tables) or in
+`[initialize]` (to pass data to pipeline components).
+
+</Infobox>
+
+#### spacy.read_labels.v1 {#read_labels tag="registered function"}
+
+Read a JSON-formatted labels file generated with
+[`init labels`](/api/cli#init-labels). Typically used in the
+[`[initialize]`](/api/data-formats#config-initialize) block of the training
+config to speed up the model initialization process and provide pre-generated
+label sets.
+
+> #### Example config
+>
+> ```ini
+> [initialize.components]
+>
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+| Name        | Description                                                                                                                                                                                                               |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path`      | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~                                                                                                                                |
+| `require`   | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
+| **CREATES** | The labels read from the file, or `None` if it doesn't exist and `require` is `False`.                                                                                                                                    |
+
+### Corpus readers {#corpus-readers source="spacy/training/corpus.py" new="3"}
 
 Corpus readers are registered functions that load data and return a function
 that takes the current `nlp` object and yields [`Example`](/api/example) objects
@@ -480,7 +538,7 @@ with your own registered function in the
 [`@readers` registry](/api/top-level#registry) to customize the data loading and
 streaming.
 
-### spacy.Corpus.v1 {#corpus tag="registered function"}
+#### spacy.Corpus.v1 {#corpus tag="registered function"}
 
 The `Corpus` reader manages annotated corpora and can be used for training and
 development datasets in the [DocBin](/api/docbin) (`.spacy`) format. Also see
@@ -509,12 +567,12 @@ the [`Corpus`](/api/corpus) class.
 | `augmenter`     | Apply some simply data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ |
 | **CREATES**     | The corpus reader. ~~Corpus~~                                                                                                                                                                                                                                                            |
 
-### spacy.JsonlReader.v1 {#jsonlreader tag="registered function"}
+#### spacy.JsonlCorpus.v1 {#jsonlcorpus tag="registered function"}
 
 Create [`Example`](/api/example) objects from a JSONL (newline-delimited JSON)
 file of texts keyed by `"text"`. Can be used to read the raw text corpus for
 language model [pretraining](/usage/embeddings-transformers#pretraining) from a
-JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
+JSONL file. Also see the [`JsonlCorpus`](/api/corpus#jsonlcorpus) class.
 
 > #### Example config
 >
@@ -523,7 +581,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
 > pretrain = "corpus/raw_text.jsonl"
 >
 > [corpora.pretrain]
-> @readers = "spacy.JsonlReader.v1"
+> @readers = "spacy.JsonlCorpus.v1"
 > path = ${paths.pretrain}
 > min_length = 0
 > max_length = 0
@@ -536,33 +594,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
 | `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~       |
 | `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~        |
 | `limit`      | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                  |
-| **CREATES**  | The corpus reader. ~~JsonlTexts~~                                                                                                |
-
-### spacy.read_labels.v1 {#read_labels tag="registered function"}
-
-Read a JSON-formatted labels file generated with
-[`init labels`](/api/cli#init-labels). Typically used in the
-[`[initialize]`](/api/data-formats#config-initialize) block of the training
-config to speed up the model initialization process and provide pre-generated
-label sets.
-
-> #### Example config
->
-> ```ini
-> [initialize.components]
->
-> [initialize.components.ner]
->
-> [initialize.components.ner.labels]
-> @readers = "spacy.read_labels.v1"
-> path = "corpus/labels/ner.json"
-> ```
-
-| Name        | Description                                                                                                                                                                                                               |
-| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `path`      | The path to the labels file generated with [`init labels`](/api/cli#init-labels). ~~Path~~                                                                                                                                |
-| `require`   | Whether to require the file to exist. If set to `False` and the labels file doesn't exist, the loader will return `None` and the `initialize` method will extract the labels from the data. Defaults to `False`. ~~bool~~ |
-| **CREATES** | The                                                                                                                                                                                                                       |
+| **CREATES**  | The corpus reader. ~~JsonlCorpus~~                                                                                               |
 
 ## Batchers {#batchers source="spacy/training/batchers.py" new="3"}
 
@@ -664,7 +696,10 @@ sequences in the batch.
 > @augmenters = "spacy.orth_variants.v1"
 > level = 0.1
 > lower = 0.5
-> lookups = null
+>
+> [corpora.train.augmenter.orth_variants]
+> @readers = "srsly.read_json.v1"
+> path = "corpus/en_orth_variants.json"
 > ```
 
 Create a data augmentation callback that uses orth-variant replacement. The
@@ -672,12 +707,12 @@ callback can be added to a corpus or other data iterator during training. This
 is especially useful for punctuation and case replacement, to help generalize
 beyond corpora that don't have smart quotes, or only have smart quotes etc.
 
-| Name        | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `level`     | The percentage of texts that will be augmented. ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-| `lower`     | The percentage of texts that will be lowercased. ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
-| `lookups`   | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ |
-| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~                                                                                                                                                                                                                                                                                                                                                   |
+| Name            | Description                                                                                                                                                                                                                                                                                               |
+| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `level`         | The percentage of texts that will be augmented. ~~float~~                                                                                                                                                                                                                                                 |
+| `lower`         | The percentage of texts that will be lowercased. ~~float~~                                                                                                                                                                                                                                                |
+| `orth_variants` | A dictionary containing the single and paired orth variants. Typically loaded from a JSON file. See [`en_orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. ~~Dict[str, List[Dict[str, List[Union[str, List[str]]]]]]~~ |
+| **CREATES**     | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~                                                                                                                              |
 
 ## Training data and alignment {#gold source="spacy/training"}
 
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 1b78b8dc5..c615097d6 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -622,7 +622,7 @@ that are familiar from the training block: the `[pretraining.batcher]`,
 `[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
 expect the same types of objects, although for pretraining your corpus does not
 need to have any annotations, so you will often use a different reader, such as
-the [`JsonlReader`](/api/top-level#jsonlreader).
+the [`JsonlCorpus`](/api/top-level#jsonlcorpus).
 
 > #### Raw text format
 >
diff --git a/website/meta/type-annotations.json b/website/meta/type-annotations.json
index 43a524e93..acbc88ae2 100644
--- a/website/meta/type-annotations.json
+++ b/website/meta/type-annotations.json
@@ -24,7 +24,7 @@
     "TransformerData": "/api/transformer#transformerdata",
     "FullTransformerBatch": "/api/transformer#fulltransformerbatch",
     "Corpus": "/api/corpus",
-    "JsonlTexts": "/api/corpus#jsonltexts",
+    "JsonlCorpus": "/api/corpus#jsonlcorpus",
     "LexemeC": "/api/cython-structs#lexemec",
     "TokenC": "/api/cython-structs#tokenc",
     "Config": "https://thinc.ai/docs/api-config#config",

From 568768643e62dbc00662dd64f33b8919de6e4b13 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 2 Oct 2020 01:50:13 +0200
Subject: [PATCH 329/516] Increment version [ci skip]

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 18fc77184..acf386ace 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a28"
+__version__ = "3.0.0a29"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 77e08c398f7242f62d8c25cb6814e057b2786bb3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 2 Oct 2020 08:25:15 +0200
Subject: [PATCH 330/516] Switch reset value for set_morph to None

---
 spacy/tokens/token.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 8099abd92..322c9a54c 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -223,7 +223,7 @@ cdef class Token:
 
     def set_morph(self, features):
         cdef hash_t key
-        if features is 0:
+        if features is None:
             self.c.morph = 0
         else:
             if isinstance(features, int):

From 65dfaa4f4b94a2602bb5af74677d7edae6d88ff6 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 2 Oct 2020 08:33:43 +0200
Subject: [PATCH 331/516] Also accept MorphAnalysis in set_morph

---
 spacy/tokens/token.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 322c9a54c..2075c3cc8 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -225,6 +225,8 @@ cdef class Token:
         cdef hash_t key
         if features is None:
             self.c.morph = 0
+        elif isinstance(features, MorphAnalysis):
+            self.morph = features
         else:
             if isinstance(features, int):
                 features = self.vocab.strings[features]

From fd09e6b140c1334f6fc110f32dec8d2f93c927b1 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 2 Oct 2020 08:48:28 +0200
Subject: [PATCH 332/516] Update docs for Token.morph / Token.set_morph

---
 website/docs/api/token.md                 | 156 ++++++++++++----------
 website/docs/usage/linguistic-features.md |  13 +-
 2 files changed, 92 insertions(+), 77 deletions(-)

diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index 068a1d2d2..b3bb63d6c 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -172,6 +172,25 @@ Get a neighboring token.
 | `i`         | The relative position of the token to get. Defaults to `1`. ~~int~~ |
 | **RETURNS** | The token at position `self.doc[self.i+i]`. ~~Token~~               |
 
+## Token.set_morph {#set_morph tag="method"}
+
+Set the morphological analysis from a UD FEATS string, hash value of a UD FEATS
+string, features dict or `MorphAnalysis`. The value `None` can be used to reset
+the morph to an unset state.
+
+> #### Example
+>
+> ```python
+> doc = nlp("Give it back! He pleaded.")
+> doc[0].set_morph("Mood=Imp|VerbForm=Fin")
+> assert "Mood=Imp" in doc[0].morph
+> assert doc[0].morph.get("Mood") == ["Imp"]
+> ```
+
+| Name       | Description                                                                       |
+| ---------- | --------------------------------------------------------------------------------- |
+| `features` | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~ |
+
 ## Token.is_ancestor {#is_ancestor tag="method" model="parser"}
 
 Check whether this token is a parent, grandparent, etc. of another in the
@@ -392,74 +411,73 @@ The L2 norm of the token's vector representation.
 
 ## Attributes {#attributes}
 
-| Name                                         | Description                                                                                                                                                                                                                                                            |
-| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `doc`                                        | The parent document. ~~Doc~~                                                                                                                                                                                                                                           |
-| `lex` <Tag variant="new">3</Tag>             | The underlying lexeme. ~~Lexeme~~                                                                                                                                                                                                                                      |
-| `sent` <Tag variant="new">2.0.12</Tag>       | The sentence span that this token is a part of. ~~Span~~                                                                                                                                                                                                               |
-| `text`                                       | Verbatim text content. ~~str~~                                                                                                                                                                                                                                         |
-| `text_with_ws`                               | Text content, with trailing space character if present. ~~str~~                                                                                                                                                                                                        |
-| `whitespace_`                                | Trailing space character if present. ~~str~~                                                                                                                                                                                                                           |
-| `orth`                                       | ID of the verbatim text content. ~~int~~                                                                                                                                                                                                                               |
-| `orth_`                                      | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~                                                                                                                                                    |
-| `vocab`                                      | The vocab object of the parent `Doc`. ~~vocab~~                                                                                                                                                                                                                        |
-| `tensor` <Tag variant="new">2.1.7</Tag>      | The tokens's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                                                                                                                                                                     |
-| `head`                                       | The syntactic parent, or "governor", of this token. ~~Token~~                                                                                                                                                                                                          |
-| `left_edge`                                  | The leftmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                    |
-| `right_edge`                                 | The rightmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                   |
-| `i`                                          | The index of the token within the parent document. ~~int~~                                                                                                                                                                                                             |
-| `ent_type`                                   | Named entity type. ~~int~~                                                                                                                                                                                                                                             |
-| `ent_type_`                                  | Named entity type. ~~str~~                                                                                                                                                                                                                                             |
-| `ent_iob`                                    | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~                                                                                   |
-| `ent_iob_`                                   | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~                                                                                    |
-| `ent_kb_id` <Tag variant="new">2.2</Tag>     | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~                                                                                                                                                                             |
-| `ent_kb_id_` <Tag variant="new">2.2</Tag>    | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~                                                                                                                                                                             |
-| `ent_id`                                     | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~                                                                                                                                          |
-| `ent_id_`                                    | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~                                                                                                                                          |
-| `lemma`                                      | Base form of the token, with no inflectional suffixes. ~~int~~                                                                                                                                                                                                         |
-| `lemma_`                                     | Base form of the token, with no inflectional suffixes. ~~str~~                                                                                                                                                                                                         |
-| `norm`                                       | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~int~~                                                                                                 |
-| `norm_`                                      | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~                                                                                                 |
-| `lower`                                      | Lowercase form of the token. ~~int~~                                                                                                                                                                                                                                   |
-| `lower_`                                     | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~                                                                                                                                                                                          |
+| Name                                         | Description                                                                                                                                                                                                                                                           |
+| -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `doc`                                        | The parent document. ~~Doc~~                                                                                                                                                                                                                                          |
+| `lex` <Tag variant="new">3</Tag>             | The underlying lexeme. ~~Lexeme~~                                                                                                                                                                                                                                     |
+| `sent` <Tag variant="new">2.0.12</Tag>       | The sentence span that this token is a part of. ~~Span~~                                                                                                                                                                                                              |
+| `text`                                       | Verbatim text content. ~~str~~                                                                                                                                                                                                                                        |
+| `text_with_ws`                               | Text content, with trailing space character if present. ~~str~~                                                                                                                                                                                                       |
+| `whitespace_`                                | Trailing space character if present. ~~str~~                                                                                                                                                                                                                          |
+| `orth`                                       | ID of the verbatim text content. ~~int~~                                                                                                                                                                                                                              |
+| `orth_`                                      | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~                                                                                                                                                   |
+| `vocab`                                      | The vocab object of the parent `Doc`. ~~vocab~~                                                                                                                                                                                                                       |
+| `tensor` <Tag variant="new">2.1.7</Tag>      | The tokens's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~                                                                                                                                                                                                    |
+| `head`                                       | The syntactic parent, or "governor", of this token. ~~Token~~                                                                                                                                                                                                         |
+| `left_edge`                                  | The leftmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                   |
+| `right_edge`                                 | The rightmost token of this token's syntactic descendants. ~~Token~~                                                                                                                                                                                                  |
+| `i`                                          | The index of the token within the parent document. ~~int~~                                                                                                                                                                                                            |
+| `ent_type`                                   | Named entity type. ~~int~~                                                                                                                                                                                                                                            |
+| `ent_type_`                                  | Named entity type. ~~str~~                                                                                                                                                                                                                                            |
+| `ent_iob`                                    | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~                                                                                  |
+| `ent_iob_`                                   | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~                                                                                   |
+| `ent_kb_id` <Tag variant="new">2.2</Tag>     | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~                                                                                                                                                                            |
+| `ent_kb_id_` <Tag variant="new">2.2</Tag>    | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~                                                                                                                                                                            |
+| `ent_id`                                     | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~                                                                                                                                         |
+| `ent_id_`                                    | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~                                                                                                                                         |
+| `lemma`                                      | Base form of the token, with no inflectional suffixes. ~~int~~                                                                                                                                                                                                        |
+| `lemma_`                                     | Base form of the token, with no inflectional suffixes. ~~str~~                                                                                                                                                                                                        |
+| `norm`                                       | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~int~~                                                                                                |
+| `norm_`                                      | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~                                                                                                |
+| `lower`                                      | Lowercase form of the token. ~~int~~                                                                                                                                                                                                                                  |
+| `lower_`                                     | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~                                                                                                                                                                                         |
 | `shape`                                      | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
 | `shape_`                                     | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
-| `prefix`                                     | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~                                                                                                                                                                             |
-| `prefix_`                                    | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~                                                                                                                                                                                           |
-| `suffix`                                     | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~                                                                                                                                                                               |
-| `suffix_`                                    | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~                                                                                                                                                                                               |
-| `is_alpha`                                   | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~                                                                                                                                                                        |
-| `is_ascii`                                   | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~                                                                                                                                                            |
-| `is_digit`                                   | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~                                                                                                                                                                                       |
-| `is_lower`                                   | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~                                                                                                                                                                                              |
-| `is_upper`                                   | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~                                                                                                                                                                                              |
-| `is_title`                                   | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~                                                                                                                                                                                              |
-| `is_punct`                                   | Is the token punctuation? ~~bool~~                                                                                                                                                                                                                                     |
-| `is_left_punct`                              | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~                                                                                                                                                                                                            |
-| `is_right_punct`                             | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~                                                                                                                                                                                                           |
-| `is_space`                                   | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~                                                                                                                                                                        |
-| `is_bracket`                                 | Is the token a bracket? ~~bool~~                                                                                                                                                                                                                                       |
-| `is_quote`                                   | Is the token a quotation mark? ~~bool~~                                                                                                                                                                                                                                |
-| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the token a currency symbol? ~~bool~~                                                                                                                                                                                                                               |
-| `like_url`                                   | Does the token resemble a URL? ~~bool~~                                                                                                                                                                                                                                |
-| `like_num`                                   | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~                                                                                                                                                                                             |
-| `like_email`                                 | Does the token resemble an email address? ~~bool~~                                                                                                                                                                                                                     |
-| `is_oov`                                     | Does the token have a word vector? ~~bool~~                                                                                                                                                                                                                            |
-| `is_stop`                                    | Is the token part of a "stop list"? ~~bool~~                                                                                                                                                                                                                           |
-| `pos`                                        | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~                                                                                                                                                 |
-| `pos_`                                       | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~                                                                                                                                                 |
-| `tag`                                        | Fine-grained part-of-speech. ~~int~~                                                                                                                                                                                                                                   |
-| `tag_`                                       | Fine-grained part-of-speech. ~~str~~                                                                                                                                                                                                                                   |
-| `morph` <Tag variant="new">3</Tag>           | Morphological analysis. ~~MorphAnalysis~~                                                                                                                                                                                                                              |
-| `morph_` <Tag variant="new">3</Tag>          | Morphological analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~                                                                                                                     |
-| `dep`                                        | Syntactic dependency relation. ~~int~~                                                                                                                                                                                                                                 |
-| `dep_`                                       | Syntactic dependency relation. ~~str~~                                                                                                                                                                                                                                 |
-| `lang`                                       | Language of the parent document's vocabulary. ~~int~~                                                                                                                                                                                                                  |
-| `lang_`                                      | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                  |
-| `prob`                                       | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                        |
-| `idx`                                        | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                  |
-| `sentiment`                                  | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                         |
-| `lex_id`                                     | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                   |
-| `rank`                                       | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                   |
-| `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                              |
-| `_`                                          | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                                                                                                                                          |
+| `prefix`                                     | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~                                                                                                                                                                            |
+| `prefix_`                                    | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~                                                                                                                                                                                          |
+| `suffix`                                     | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~                                                                                                                                                                              |
+| `suffix_`                                    | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~                                                                                                                                                                                              |
+| `is_alpha`                                   | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~                                                                                                                                                                       |
+| `is_ascii`                                   | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~                                                                                                                                                           |
+| `is_digit`                                   | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~                                                                                                                                                                                      |
+| `is_lower`                                   | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~                                                                                                                                                                                             |
+| `is_upper`                                   | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~                                                                                                                                                                                             |
+| `is_title`                                   | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~                                                                                                                                                                                             |
+| `is_punct`                                   | Is the token punctuation? ~~bool~~                                                                                                                                                                                                                                    |
+| `is_left_punct`                              | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~                                                                                                                                                                                                           |
+| `is_right_punct`                             | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~                                                                                                                                                                                                          |
+| `is_space`                                   | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~                                                                                                                                                                       |
+| `is_bracket`                                 | Is the token a bracket? ~~bool~~                                                                                                                                                                                                                                      |
+| `is_quote`                                   | Is the token a quotation mark? ~~bool~~                                                                                                                                                                                                                               |
+| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the token a currency symbol? ~~bool~~                                                                                                                                                                                                                              |
+| `like_url`                                   | Does the token resemble a URL? ~~bool~~                                                                                                                                                                                                                               |
+| `like_num`                                   | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~                                                                                                                                                                                            |
+| `like_email`                                 | Does the token resemble an email address? ~~bool~~                                                                                                                                                                                                                    |
+| `is_oov`                                     | Does the token have a word vector? ~~bool~~                                                                                                                                                                                                                           |
+| `is_stop`                                    | Is the token part of a "stop list"? ~~bool~~                                                                                                                                                                                                                          |
+| `pos`                                        | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~                                                                                                                                                |
+| `pos_`                                       | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~                                                                                                                                                |
+| `tag`                                        | Fine-grained part-of-speech. ~~int~~                                                                                                                                                                                                                                  |
+| `tag_`                                       | Fine-grained part-of-speech. ~~str~~                                                                                                                                                                                                                                  |
+| `morph` <Tag variant="new">3</Tag>           | Morphological analysis. ~~MorphAnalysis~~                                                                                                                                                                                                                             |
+| `dep`                                        | Syntactic dependency relation. ~~int~~                                                                                                                                                                                                                                |
+| `dep_`                                       | Syntactic dependency relation. ~~str~~                                                                                                                                                                                                                                |
+| `lang`                                       | Language of the parent document's vocabulary. ~~int~~                                                                                                                                                                                                                 |
+| `lang_`                                      | Language of the parent document's vocabulary. ~~str~~                                                                                                                                                                                                                 |
+| `prob`                                       | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~                                                                                                                                                       |
+| `idx`                                        | The character offset of the token within the parent document. ~~int~~                                                                                                                                                                                                 |
+| `sentiment`                                  | A scalar value indicating the positivity or negativity of the token. ~~float~~                                                                                                                                                                                        |
+| `lex_id`                                     | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                  |
+| `rank`                                       | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~                                                                                                                                                                  |
+| `cluster`                                    | Brown cluster ID. ~~int~~                                                                                                                                                                                                                                             |
+| `_`                                          | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~                                                                                                                                         |
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 25b6c2fac..7b9aaa0b9 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -56,16 +56,13 @@ create a surface form. Here are some examples:
 
 Morphological features are stored in the [`MorphAnalysis`](/api/morphanalysis)
 under `Token.morph`, which allows you to access individual morphological
-features. The attribute `Token.morph_` provides the morphological analysis in
-the Universal Dependencies
-[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
-format.
+features.
 
 > #### 📝 Things to try
 >
 > 1. Change "I" to "She". You should see that the morphological features change
 >    and express that it's a pronoun in the third person.
-> 2. Inspect `token.morph_` for the other tokens.
+> 2. Inspect `token.morph` for the other tokens.
 
 ```python
 ### {executable="true"}
@@ -75,7 +72,7 @@ nlp = spacy.load("en_core_web_sm")
 print("Pipeline:", nlp.pipe_names)
 doc = nlp("I was reading the paper.")
 token = doc[0]  # 'I'
-print(token.morph_)  # 'Case=Nom|Number=Sing|Person=1|PronType=Prs'
+print(token.morph)  # 'Case=Nom|Number=Sing|Person=1|PronType=Prs'
 print(token.morph.get("PronType"))  # ['Prs']
 ```
 
@@ -91,7 +88,7 @@ import spacy
 
 nlp = spacy.load("de_core_news_sm")
 doc = nlp("Wo bist du?") # English: 'Where are you?'
-print(doc[2].morph_)  # 'Case=Nom|Number=Sing|Person=2|PronType=Prs'
+print(doc[2].morph)  # 'Case=Nom|Number=Sing|Person=2|PronType=Prs'
 print(doc[2].pos_) # 'PRON'
 ```
 
@@ -117,7 +114,7 @@ import spacy
 
 nlp = spacy.load("en_core_web_sm")
 doc = nlp("Where are you?")
-print(doc[2].morph_)  # 'Case=Nom|Person=2|PronType=Prs'
+print(doc[2].morph)  # 'Case=Nom|Person=2|PronType=Prs'
 print(doc[2].pos_)  # 'PRON'
 ```
 

From 3908fff8994f43d23eed1d6cdbb3d37d46bbc3b4 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 2 Oct 2020 09:07:55 +0200
Subject: [PATCH 333/516] Remove tag map sidebar

---
 website/docs/usage/training.md | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index c6c05ac5b..a7c23baa7 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -1124,17 +1124,6 @@ a dictionary with keyword arguments specifying the annotations, like `tags` or
 annotations, the model can be updated to learn a sentence of three words with
 their assigned part-of-speech tags.
 
-> #### About the tag map
->
-> The tag map is part of the vocabulary and defines the annotation scheme. If
-> you're training a new pipeline, this will let you map the tags present in the
-> treebank you train on to spaCy's tag scheme:
->
-> ```python
-> tag_map = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}}
-> vocab = Vocab(tag_map=tag_map)
-> ```
-
 ```python
 words = ["I", "like", "stuff"]
 tags = ["NOUN", "VERB", "NOUN"]

From 7670df04dd4838d55573abe32a734e51f78648f3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 2 Oct 2020 10:09:03 +0200
Subject: [PATCH 334/516] Update Chinese usage docs

---
 website/docs/usage/models.md | 50 +++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md
index 9b686c947..5e9bd688f 100644
--- a/website/docs/usage/models.md
+++ b/website/docs/usage/models.md
@@ -85,7 +85,8 @@ import the `MultiLanguage` class directly, or call
 
 ### Chinese language support {#chinese new=2.3}
 
-The Chinese language class supports three word segmentation options:
+The Chinese language class supports three word segmentation options, `char`,
+`jieba` and `pkuseg`:
 
 > ```python
 > from spacy.lang.zh import Chinese
@@ -95,11 +96,12 @@ The Chinese language class supports three word segmentation options:
 >
 > # Jieba
 > cfg = {"segmenter": "jieba"}
-> nlp = Chinese(meta={"tokenizer": {"config": cfg}})
+> nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
 >
 > # PKUSeg with "default" model provided by pkuseg
-> cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
-> nlp = Chinese(meta={"tokenizer": {"config": cfg}})
+> cfg = {"segmenter": "pkuseg"}
+> nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
+> nlp.tokenizer.initialize(pkuseg_model="default")
 > ```
 
 1. **Character segmentation:** Character segmentation is the default
@@ -116,41 +118,34 @@ The Chinese language class supports three word segmentation options:
 <Infobox variant="warning">
 
 In spaCy v3.0, the default Chinese word segmenter has switched from Jieba to
-character segmentation. Also note that
-[`pkuseg`](https://github.com/lancopku/pkuseg-python) doesn't yet ship with
-pre-compiled wheels for Python 3.8. If you're running Python 3.8, you can
-install it from our fork and compile it locally:
-
-```bash
-$ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip
-```
+character segmentation.
 
 </Infobox>
 
 <Accordion title="Details on spaCy's Chinese API">
 
-The `meta` argument of the `Chinese` language class supports the following
-following tokenizer config settings:
+The `initialize` method for the Chinese tokenizer class supports the following
+config settings for loading pkuseg models:
 
-| Name               | Description                                                                                                     |
-| ------------------ | --------------------------------------------------------------------------------------------------------------- |
-| `segmenter`        | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`. ~~str~~                                        |
-| `pkuseg_model`     | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~ |
-| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. ~~str~~    |
+| Name               | Description                                                                                                                           |
+| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------- |
+| `pkuseg_model`     | Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~                                                  |
+| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`. ~~str~~ |
 
 ```python
 ### Examples
+# Initialize the pkuseg tokenizer
+cfg = {"segmenter": "pkuseg"}
+nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
+
 # Load "default" model
-cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
-nlp = Chinese(config={"tokenizer": {"config": cfg}})
+nlp.tokenizer.initialize(pkuseg_model="default")
 
 # Load local model
-cfg = {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"}
-nlp = Chinese(config={"tokenizer": {"config": cfg}})
+nlp.tokenizer.initialize(pkuseg_model="/path/to/pkuseg_model")
 
 # Override the user directory
-cfg = {"segmenter": "pkuseg", "pkuseg_model": "default", "pkuseg_user_dict": "/path"}
-nlp = Chinese(config={"tokenizer": {"config": cfg}})
+nlp.tokenizer.initialize(pkuseg_model="default", pkuseg_user_dict="/path/to/user_dict")
 ```
 
 You can also modify the user dictionary on-the-fly:
@@ -185,8 +180,11 @@ from spacy.lang.zh import Chinese
 
 # Train pkuseg model
 pkuseg.train("train.utf8", "test.utf8", "/path/to/pkuseg_model")
+
 # Load pkuseg model in spaCy Chinese tokenizer
-nlp = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True}}})
+cfg = {"segmenter": "pkuseg"}
+nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
+nlp.tokenizer.initialize(pkuseg_model="/path/to/pkuseg_model")
 ```
 
 </Accordion>

From 351f352cdc7ffe2d6c41675e45c1d75ec84180c8 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 2 Oct 2020 10:12:44 +0200
Subject: [PATCH 335/516] Update Japanese docs and pin for sudachipy

---
 setup.cfg                    | 2 +-
 website/docs/usage/models.md | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 36ab64bd9..babe5fe8b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -84,7 +84,7 @@ cuda102 =
     cupy-cuda102>=5.0.0b4,<9.0.0
 # Language tokenizers with external dependencies
 ja =
-    sudachipy>=0.4.5
+    sudachipy>=0.4.9
     sudachidict_core>=20200330
 ko =
     natto-py==0.9.0
diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md
index 5e9bd688f..6792f691c 100644
--- a/website/docs/usage/models.md
+++ b/website/docs/usage/models.md
@@ -199,20 +199,19 @@ nlp.tokenizer.initialize(pkuseg_model="/path/to/pkuseg_model")
 >
 > # Load SudachiPy with split mode B
 > cfg = {"split_mode": "B"}
-> nlp = Japanese(meta={"tokenizer": {"config": cfg}})
+> nlp = Japanese.from_config({"nlp": {"tokenizer": cfg}})
 > ```
 
 The Japanese language class uses
 [SudachiPy](https://github.com/WorksApplications/SudachiPy) for word
 segmentation and part-of-speech tagging. The default Japanese language class and
-the provided Japanese pipelines use SudachiPy split mode `A`. The `meta`
-argument of the `Japanese` language class can be used to configure the split
-mode to `A`, `B` or `C`.
+the provided Japanese pipelines use SudachiPy split mode `A`. The tokenizer
+config can be used to configure the split mode to `A`, `B` or `C`.
 
 <Infobox variant="warning">
 
 If you run into errors related to `sudachipy`, which is currently under active
-development, we suggest downgrading to `sudachipy==0.4.5`, which is the version
+development, we suggest downgrading to `sudachipy==0.4.9`, which is the version
 used for training the current [Japanese pipelines](/models/ja).
 
 </Infobox>

From f83dfe62dadfce31697989d4c078500ae941a244 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 2 Oct 2020 10:17:26 +0200
Subject: [PATCH 336/516] Fix test

---
 spacy/tests/doc/test_morphanalysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py
index 56c80dd66..b44b13d4c 100644
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@@ -77,7 +77,7 @@ def test_morph_property(tokenizer):
     assert doc.to_array(["MORPH"])[0] != 0
 
     # unset with token.morph
-    doc[0].set_morph(0)
+    doc[0].set_morph(None)
     assert doc.to_array(["MORPH"])[0] == 0
 
     # empty morph is equivalent to "_"

From acc391c2a841936b44e91a243f39ae864d661400 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 2 Oct 2020 11:05:59 +0200
Subject: [PATCH 337/516] remove redundant str() call

---
 spacy/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/util.py b/spacy/util.py
index 1cc7abf57..d919b161e 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1070,7 +1070,7 @@ def import_file(name: str, loc: Union[str, Path]) -> ModuleType:
     RETURNS: The loaded module.
     """
     loc = str(loc)
-    spec = importlib.util.spec_from_file_location(name, str(loc))
+    spec = importlib.util.spec_from_file_location(name, loc)
     module = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(module)
     return module

From c41a4332e4f21627db1a7c5e057c3cfd70f5fea7 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 2 Oct 2020 11:37:56 +0200
Subject: [PATCH 338/516] Add test for custom data augmentation

---
 spacy/tests/training/test_training.py | 35 ++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index c53042ef1..7d41c8908 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -7,11 +7,11 @@ from spacy.training.converters import json_to_docs
 from spacy.training.augment import create_orth_variants_augmenter
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
-from spacy.lookups import Lookups
 from spacy.util import get_words_and_spaces, minibatch
 from thinc.api import compounding
 import pytest
 import srsly
+import random
 
 from ..util import make_tempdir
 
@@ -515,6 +515,39 @@ def test_make_orth_variants(doc):
         list(reader(nlp))
 
 
+@pytest.mark.filterwarnings("ignore::UserWarning")
+def test_custom_data_augmentation(doc):
+    def create_spongebob_augmenter(randomize: bool = False):
+        def augment(nlp, example):
+            text = example.text
+            if randomize:
+                ch = [c.lower() if random.random() < 0.5 else c.upper() for c in text]
+            else:
+                ch = [c.lower() if i % 2 else c.upper() for i, c in enumerate(text)]
+            example_dict = example.to_dict()
+            doc = nlp.make_doc("".join(ch))
+            example_dict["token_annotation"]["ORTH"] = [t.text for t in doc]
+            yield example
+            yield example.from_dict(doc, example_dict)
+
+        return augment
+
+    nlp = English()
+    with make_tempdir() as tmpdir:
+        output_file = tmpdir / "roundtrip.spacy"
+        DocBin(docs=[doc]).to_disk(output_file)
+        reader = Corpus(output_file, augmenter=create_spongebob_augmenter())
+        corpus = list(reader(nlp))
+    orig_text = "Sarah 's sister flew to Silicon Valley via London . "
+    augmented = "SaRaH 's sIsTeR FlEw tO SiLiCoN VaLlEy vIa lOnDoN . "
+    assert corpus[0].text == orig_text
+    assert corpus[0].reference.text == orig_text
+    assert corpus[0].predicted.text == orig_text
+    assert corpus[1].text == augmented
+    assert corpus[1].reference.text == augmented
+    assert corpus[1].predicted.text == augmented
+
+
 @pytest.mark.skip("Outdated")
 @pytest.mark.parametrize(
     "tokens_a,tokens_b,expected",

From 32cdc1c4f45148e16b5b166b7c7b50077679cb47 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 2 Oct 2020 11:38:03 +0200
Subject: [PATCH 339/516] Update docs [ci skip]

---
 website/docs/api/top-level.md  |   6 +-
 website/docs/usage/training.md | 125 ++++++++++++++++++++++++++++++++-
 2 files changed, 129 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 876006774..a65a279a9 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -685,7 +685,11 @@ sequences in the batch.
 
 ## Augmenters {#augmenters source="spacy/training/augment.py" new="3"}
 
-<!-- TODO: intro, explain data augmentation concept -->
+Data augmentation is the process of applying small modifications to the training
+data. It can be especially useful for punctuation and case replacement – for
+example, if your corpus only uses smart quotes and you want to include
+variations using regular quotes, or to make the model less sensitive to
+capitalization by including a mix of capitalized and lowercase examples. See the [usage guide](/usage/training#data-augmentation) for details and examples.
 
 ### spacy.orth_variants.v1 {#orth_variants tag="registered function"}
 
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index c6c05ac5b..5c584cfd3 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -1011,9 +1011,132 @@ def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Examp
 <!-- TODO:
 * Custom corpus class
 * Minibatching
-* Data augmentation
 -->
 
+### Data augmentation {#data-augmentation}
+
+Data augmentation is the process of applying small **modifications** to the
+training data. It can be especially useful for punctuation and case replacement
+– for example, if your corpus only uses smart quotes and you want to include
+variations using regular quotes, or to make the model less sensitive to
+capitalization by including a mix of capitalized and lowercase examples.
+
+The easiest way to use data augmentation during training is to provide an
+`augmenter` to the training corpus, e.g. in the `[corpora.train]` section of
+your config. The built-in [`orth_variants`](/api/top-level#orth_variants)
+augmenter creates a data augmentation callback that uses orth-variant
+replacement.
+
+```ini
+### config.cfg (excerpt) {highlight="8,14"}
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+gold_preproc = false
+max_length = 0
+limit = 0
+
+[corpora.train.augmenter]
+@augmenters = "spacy.orth_variants.v1"
+# Percentage of texts that will be augmented / lowercased
+level = 0.1
+lower = 0.5
+
+[corpora.train.augmenter.orth_variants]
+@readers = "srsly.read_json.v1"
+path = "corpus/orth_variants.json"
+```
+
+The `orth_variants` argument lets you pass in a dictionary of replacement rules,
+typically loaded from a JSON file. There are two types of orth variant rules:
+`"single"` for single tokens that should be replaced (e.g. hyphens) and
+`"paired"` for pairs of tokens (e.g. quotes).
+
+<!-- prettier-ignore -->
+```json
+### orth_variants.json
+{
+  "single": [{ "tags": ["NFP"], "variants": ["…", "..."] }],
+  "paired": [{ "tags": ["``", "''"], "variants": [["'", "'"], ["‘", "’"]] }]
+}
+```
+
+<Accordion title="Full examples for English and German" spaced>
+
+```json
+https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json
+```
+
+```json
+https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/de_orth_variants.json
+```
+
+</Accordion>
+
+<Infobox title="Important note" variant="warning">
+
+When adding data augmentation, keep in mind that it typically only makes sense
+to apply it to the **training corpus**, not the development data.
+
+</Infobox>
+
+#### Writing custom data augmenters {#data-augmentation-custom}
+
+Using the [`@spacy.augmenters`](/api/top-level#registry) registry, you can also
+register your own data augmentation callbacks. The callback should be a function
+that takes the current `nlp` object and a training [`Example`](/api/example) and
+yields `Example` objects. Keep in mind that the augmenter should yield **all
+examples** you want to use in your corpus, not only the augmented examples
+(unless you want to augment all examples).
+
+Here's an example of a custom augmentation callback that produces text variants
+in ["SpOnGeBoB cAsE"](https://knowyourmeme.com/memes/mocking-spongebob). The
+registered function takes one argument `randomize` that can be set via the
+config and decides whether the uppercase/lowercase transformation is applied
+randomly or not. The augmenter yields two `Example` objects: the original
+example and the augmented example.
+
+> #### config.cfg
+>
+> ```ini
+> [corpora.train.augmenter]
+> @augmenters = "spongebob_augmenter.v1"
+> randomize = false
+> ```
+
+```python
+import spacy
+import random
+
+@spacy.registry.augmenters("spongebob_augmenter.v1")
+def create_augmenter(randomize: bool = False):
+    def augment(nlp, example):
+        text = example.text
+        if randomize:
+            # Randomly uppercase/lowercase characters
+            chars = [c.lower() if random.random() < 0.5 else c.upper() for c in text]
+        else:
+            # Uppercase followed by lowercase
+            chars = [c.lower() if i % 2 else c.upper() for i, c in enumerate(text)]
+        # Create augmented training example
+        example_dict = example.to_dict()
+        doc = nlp.make_doc("".join(chars))
+        example_dict["token_annotation"]["ORTH"] = [t.text for t in doc]
+        # Original example followed by augmented example
+        yield example
+        yield example.from_dict(doc, example_dict)
+
+    return augment
+```
+
+An easy way to create modified `Example` objects is to use the
+[`Example.from_dict`](/api/example#from_dict) method with a new reference
+[`Doc`](/api/doc) created from the modified text. In this case, only the
+capitalization changes, so only the `ORTH` values of the tokens will be
+different between the original and augmented examples.
+
+<!-- TODO: mention alignment -->
+
 ## Parallel & distributed training with Ray {#parallel-training}
 
 > #### Installation

From df06f7a7921533fd6dd05b8ff17d8cb46867c603 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 2 Oct 2020 13:24:33 +0200
Subject: [PATCH 340/516] Update docs [ci skip]

---
 website/docs/api/language.md               | 10 +--
 website/docs/api/lemmatizer.md             |  3 +-
 website/docs/api/token.md                  |  4 +-
 website/docs/api/tokenizer.md              | 15 ++---
 website/docs/api/top-level.md              |  2 +-
 website/docs/usage/models.md               | 74 ++++++++++++++++------
 website/docs/usage/processing-pipelines.md |  4 ++
 website/docs/usage/saving-loading.md       |  2 +-
 website/docs/usage/spacy-101.md            |  4 +-
 website/docs/usage/training.md             | 14 ++--
 10 files changed, 88 insertions(+), 44 deletions(-)

diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 9f0612b2b..6257199c9 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -8,8 +8,8 @@ source: spacy/language.py
 Usually you'll load this once per process as `nlp` and pass the instance around
 your application. The `Language` class is created when you call
 [`spacy.load`](/api/top-level#spacy.load) and contains the shared vocabulary and
-[language data](/usage/adding-languages), optional binary weights, e.g. provided
-by a [trained pipeline](/models), and the
+[language data](/usage/linguistic-features#language-data), optional binary
+weights, e.g. provided by a [trained pipeline](/models), and the
 [processing pipeline](/usage/processing-pipelines) containing components like
 the tagger or parser that are called on a document in order. You can also add
 your own processing pipeline components that take a `Doc` object, modify it and
@@ -210,7 +210,9 @@ settings defined in the [`[initialize]`](/api/data-formats#config-initialize)
 config block to set up the vocabulary, load in vectors and tok2vec weights and
 pass optional arguments to the `initialize` methods implemented by pipeline
 components or the tokenizer. This method is typically called automatically when
-you run [`spacy train`](/api/cli#train).
+you run [`spacy train`](/api/cli#train). See the usage guide on the
+[config lifecycle](/usage/training#config-lifecycle) and
+[initialization](/usage/training#initialization) for details.
 
 `get_examples` should be a function that returns an iterable of
 [`Example`](/api/example) objects. The data examples can either be the full
@@ -928,7 +930,7 @@ Serialize the current state to a binary string.
 
 Load state from a binary string. Note that this method is commonly used via the
 subclasses like `English` or `German` to make language-specific functionality
-like the [lexical attribute getters](/usage/adding-languages#lex-attrs)
+like the [lexical attribute getters](/usage/linguistic-features#language-data)
 available to the loaded object.
 
 > #### Example
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index 3693429c4..f980756e5 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -130,8 +130,7 @@ applied to the `Doc` in order.
 ## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
 
 Lemmatize a token using a lookup-based approach. If no lemma is found, the
-original string is returned. Languages can provide a
-[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.
+original string is returned.
 
 | Name        | Description                                         |
 | ----------- | --------------------------------------------------- |
diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index b3bb63d6c..e7e66e931 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -437,8 +437,8 @@ The L2 norm of the token's vector representation.
 | `ent_id_`                                    | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~                                                                                                                                         |
 | `lemma`                                      | Base form of the token, with no inflectional suffixes. ~~int~~                                                                                                                                                                                                        |
 | `lemma_`                                     | Base form of the token, with no inflectional suffixes. ~~str~~                                                                                                                                                                                                        |
-| `norm`                                       | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~int~~                                                                                                |
-| `norm_`                                      | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~                                                                                                |
+| `norm`                                       | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~int~~                                                                                                    |
+| `norm_`                                      | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/linguistic-features#language-data). ~~str~~                                                                                                    |
 | `lower`                                      | Lowercase form of the token. ~~int~~                                                                                                                                                                                                                                  |
 | `lower_`                                     | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~                                                                                                                                                                                         |
 | `shape`                                      | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md
index 8ea5a1f65..8809c10bc 100644
--- a/website/docs/api/tokenizer.md
+++ b/website/docs/api/tokenizer.md
@@ -22,9 +22,8 @@ like punctuation and special case rules from the
 
 ## Tokenizer.\_\_init\_\_ {#init tag="method"}
 
-Create a `Tokenizer` to create `Doc` objects given unicode text. For examples
-of how to construct a custom tokenizer with different tokenization rules, see
-the
+Create a `Tokenizer` to create `Doc` objects given unicode text. For examples of
+how to construct a custom tokenizer with different tokenization rules, see the
 [usage documentation](https://spacy.io/usage/linguistic-features#native-tokenizers).
 
 > #### Example
@@ -87,7 +86,7 @@ Tokenize a stream of texts.
 | ------------ | ------------------------------------------------------------------------------------ |
 | `texts`      | A sequence of unicode texts. ~~Iterable[str]~~                                       |
 | `batch_size` | The number of texts to accumulate in an internal buffer. Defaults to `1000`. ~~int~~ |
-| **YIELDS**   | The tokenized `Doc` objects, in order. ~~Doc~~                                         |
+| **YIELDS**   | The tokenized `Doc` objects, in order. ~~Doc~~                                       |
 
 ## Tokenizer.find_infix {#find_infix tag="method"}
 
@@ -121,10 +120,10 @@ if no suffix rules match.
 ## Tokenizer.add_special_case {#add_special_case tag="method"}
 
 Add a special-case tokenization rule. This mechanism is also used to add custom
-tokenizer exceptions to the language data. See the usage guide on
-[adding languages](/usage/adding-languages#tokenizer-exceptions) and
-[linguistic features](/usage/linguistic-features#special-cases) for more details
-and examples.
+tokenizer exceptions to the language data. See the usage guide on the
+[language data](/usage/linguistic-features#language-data) and
+[tokenizer special cases](/usage/linguistic-features#special-cases) for more
+details and examples.
 
 > #### Example
 >
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index a65a279a9..d7273b651 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -827,7 +827,7 @@ utilities.
 ### util.get_lang_class {#util.get_lang_class tag="function"}
 
 Import and load a `Language` class. Allows lazy-loading
-[language data](/usage/adding-languages) and importing languages using the
+[language data](/usage/linguistic-features#language-data) and importing languages using the
 two-letter language code. To add a language code for a custom language class,
 you can register it using the [`@registry.languages`](/api/top-level#registry)
 decorator.
diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md
index 6792f691c..dc41385f2 100644
--- a/website/docs/usage/models.md
+++ b/website/docs/usage/models.md
@@ -30,7 +30,7 @@ import QuickstartModels from 'widgets/quickstart-models.js'
 ## Language support {#languages}
 
 spaCy currently provides support for the following languages. You can help by
-[improving the existing language data](/usage/adding-languages#language-data)
+improving the existing [language data](/usage/linguistic-features#language-data)
 and extending the tokenization patterns.
 [See here](https://github.com/explosion/spaCy/issues/3056) for details on how to
 contribute to development.
@@ -83,55 +83,81 @@ To train a pipeline using the neutral multi-language class, you can set
 import the `MultiLanguage` class directly, or call
 [`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.
 
-### Chinese language support {#chinese new=2.3}
+### Chinese language support {#chinese new="2.3"}
 
 The Chinese language class supports three word segmentation options, `char`,
-`jieba` and `pkuseg`:
+`jieba` and `pkuseg`.
 
+> #### Manual setup
+>
 > ```python
 > from spacy.lang.zh import Chinese
 >
 > # Character segmentation (default)
 > nlp = Chinese()
->
 > # Jieba
 > cfg = {"segmenter": "jieba"}
 > nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
->
 > # PKUSeg with "default" model provided by pkuseg
 > cfg = {"segmenter": "pkuseg"}
 > nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
 > nlp.tokenizer.initialize(pkuseg_model="default")
 > ```
 
-1. **Character segmentation:** Character segmentation is the default
-   segmentation option. It's enabled when you create a new `Chinese` language
-   class or call `spacy.blank("zh")`.
-2. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word
-   segmentation with the tokenizer option `{"segmenter": "jieba"}`.
-3. **PKUSeg**: As of spaCy v2.3.0, support for
-   [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support
-   better segmentation for Chinese OntoNotes and the provided
-   [Chinese pipelines](/models/zh). Enable PKUSeg with the tokenizer option
-   `{"segmenter": "pkuseg"}`.
+```ini
+### config.cfg
+[nlp.tokenizer]
+@tokenizers = "spacy.zh.ChineseTokenizer"
+segmenter = "char"
+```
 
-<Infobox variant="warning">
+| Segmenter | Description                                                                                                                                                                                                                                                                                |
+| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `char`    | **Character segmentation:** Character segmentation is the default segmentation option. It's enabled when you create a new `Chinese` language class or call `spacy.blank("zh")`.                                                                                                            |
+| `jieba`   | **Jieba:** to use [Jieba](https://github.com/fxsjy/jieba) for word segmentation, you can set the option `segmenter` to `"jieba"`.                                                                                                                                                          |
+| `pkuseg`  | **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support better segmentation for Chinese OntoNotes and the provided [Chinese pipelines](/models/zh). Enable PKUSeg by setting tokenizer option `segmenter` to `"pkuseg"`. |
 
-In spaCy v3.0, the default Chinese word segmenter has switched from Jieba to
-character segmentation.
+<Infobox title="Changed in v3.0" variant="warning">
+
+In v3.0, the default word segmenter has switched from Jieba to character
+segmentation. Because the `pkuseg` segmenter depends on a model that can be
+loaded from a file, the model is loaded on
+[initialization](/usage/training#config-lifecycle) (typically before training).
+This ensures that your packaged Chinese model doesn't depend on a local path at
+runtime.
 
 </Infobox>
 
 <Accordion title="Details on spaCy's Chinese API">
 
 The `initialize` method for the Chinese tokenizer class supports the following
-config settings for loading pkuseg models:
+config settings for loading `pkuseg` models:
 
 | Name               | Description                                                                                                                           |
 | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `pkuseg_model`     | Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~                                                  |
 | `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`. ~~str~~ |
 
+The initialization settings are typically provided in the
+[training config](/usage/training#config) and the data is loaded in before
+training and serialized with the model. This allows you to load the data from a
+local path and save out your pipeline and config, without requiring the same
+local path at runtime. See the usage guide on the
+[config lifecycle](/usage/training#config-lifecycle) for more background on
+this.
+
+```ini
+### config.cfg
+[initialize]
+
+[initialize.tokenizer]
+pkuseg_model = "/path/to/model"
+pkuseg_user_dict = "default"
+```
+
+You can also initialize the tokenizer for a blank language class by calling its
+`initialize` method:
+
 ```python
 ### Examples
 # Initialize the pkuseg tokenizer
@@ -191,12 +217,13 @@ nlp.tokenizer.initialize(pkuseg_model="/path/to/pkuseg_model")
 
 ### Japanese language support {#japanese new=2.3}
 
+> #### Manual setup
+>
 > ```python
 > from spacy.lang.ja import Japanese
 >
 > # Load SudachiPy with split mode A (default)
 > nlp = Japanese()
->
 > # Load SudachiPy with split mode B
 > cfg = {"split_mode": "B"}
 > nlp = Japanese.from_config({"nlp": {"tokenizer": cfg}})
@@ -208,6 +235,13 @@ segmentation and part-of-speech tagging. The default Japanese language class and
 the provided Japanese pipelines use SudachiPy split mode `A`. The tokenizer
 config can be used to configure the split mode to `A`, `B` or `C`.
 
+```ini
+### config.cfg
+[nlp.tokenizer]
+@tokenizers = "spacy.ja.JapaneseTokenizer"
+split_mode = "A"
+```
+
 <Infobox variant="warning">
 
 If you run into errors related to `sudachipy`, which is currently under active
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 334ed03bd..c98bd08bc 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -895,6 +895,10 @@ the name. Registered functions can also take **arguments** by the way that can
 be defined in the config as well – you can read more about this in the docs on
 [training with custom code](/usage/training#custom-code).
 
+### Initializing components with data {#initialization}
+
+<!-- TODO: -->
+
 ### Python type hints and pydantic validation {#type-hints new="3"}
 
 spaCy's configs are powered by our machine learning library Thinc's
diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index 06fb18591..f8a5eea2a 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -291,7 +291,7 @@ installed in the same environment – that's it.
 | Entry point                                                                    | Description                                                                                                                                                                                                                                              |
 | ------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | [`spacy_factories`](#entry-points-components)                                  | Group of entry points for pipeline component factories, keyed by component name. Can be used to expose custom components defined by another package.                                                                                                     |
-| [`spacy_languages`](#entry-points-languages)                                   | Group of entry points for custom [`Language` subclasses](/usage/adding-languages), keyed by language shortcut.                                                                                                                                           |
+| [`spacy_languages`](#entry-points-languages)                                   | Group of entry points for custom [`Language` subclasses](/usage/linguistic-features#language-data), keyed by language shortcut.                                                                                                                          |
 | `spacy_lookups` <Tag variant="new">2.2</Tag>                                   | Group of entry points for custom [`Lookups`](/api/lookups), including lemmatizer data. Used by spaCy's [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) package.                                                                  |
 | [`spacy_displacy_colors`](#entry-points-displacy) <Tag variant="new">2.2</Tag> | Group of entry points of custom label colors for the [displaCy visualizer](/usage/visualizers#ent). The key name doesn't matter, but it should point to a dict of labels and color values. Useful for custom models that predict different entity types. |
 
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index cd1b2cb0c..5d7c7d7a5 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -200,7 +200,7 @@ import Tokenization101 from 'usage/101/\_tokenization.md'
 To learn more about how spaCy's tokenization rules work in detail, how to
 **customize and replace** the default tokenizer and how to **add
 language-specific data**, see the usage guides on
-[adding languages](/usage/adding-languages) and
+[language data](/usage/linguistic-features#language-data) and
 [customizing the tokenizer](/usage/linguistic-features#tokenization).
 
 </Infobox>
@@ -479,7 +479,7 @@ find a "Suggest edits" link at the bottom of each page that points you to the
 source.
 
 Another way of getting involved is to help us improve the
-[language data](/usage/adding-languages#language-data) – especially if you
+[language data](/usage/linguistic-features#language-data) – especially if you
 happen to speak one of the languages currently in
 [alpha support](/usage/models#languages). Even adding simple tokenizer
 exceptions, stop words or lemmatizer data can make a big difference. It will
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 54daa6a15..1dd57fd4a 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -216,7 +216,9 @@ The initialization settings are only loaded and used when
 [`nlp.initialize`](/api/language#initialize) is called (typically right before
 training). This allows you to set up your pipeline using local data resources
 and custom functions, and preserve the information in your config – but without
-requiring it to be available at runtime
+requiring it to be available at runtime. You can also use this mechanism to
+provide data paths to custom pipeline components and custom tokenizers – see the
+section on [custom initialization](#initialization) for details.
 
 ### Overwriting config settings on the command line {#config-overrides}
 
@@ -815,9 +817,9 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
     return create_model(output_width)
 ```
 
-<!-- TODO:
 ### Customizing the initialization {#initialization}
--->
+
+<!-- TODO: -->
 
 ## Data utilities {#data}
 
@@ -1135,7 +1137,11 @@ An easy way to create modified `Example` objects is to use the
 capitalization changes, so only the `ORTH` values of the tokens will be
 different between the original and augmented examples.
 
-<!-- TODO: mention alignment -->
+Note that if your data augmentation strategy involves changing the tokenization
+(for instance, removing or adding tokens) and your training examples include
+token-based annotations like the dependency parse or entity labels, you'll need
+to take care to adjust the `Example` object so its annotations match and remain
+valid.
 
 ## Parallel & distributed training with Ray {#parallel-training}
 

From 22158dc24a8f78775d82060767788cdafc392aac Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 2 Oct 2020 15:06:16 +0200
Subject: [PATCH 341/516] Add morphologizer to quickstart template

---
 spacy/cli/templates/quickstart_training.jinja | 31 ++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 69dac0aa1..3bd237b0a 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -37,6 +37,22 @@ tokenizer_config = {"use_fast": true}
 window = 128
 stride = 96
 
+{% if "morphologizer" in components %}
+[components.morphologizer]
+factory = "morphologizer"
+
+[components.morphologizer.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.morphologizer.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.morphologizer.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{%- endif %}
+
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
@@ -166,6 +182,19 @@ depth = {{ 4 if optimize == "efficiency" else 8 }}
 window_size = 1
 maxout_pieces = 3
 
+{% if "morphologizer" in components %}
+[components.morphologizer]
+factory = "morphologizer"
+
+[components.morphologizer.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.morphologizer.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{%- endif %}
+
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
@@ -257,7 +286,7 @@ no_output_layer = false
 {% endif %}
 
 {% for pipe in components %}
-{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %}
+{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "entity_linker"] %}
 {# Other components defined by the user: we just assume they're factories #}
 [components.{{ pipe }}]
 factory = "{{ pipe }}"

From f0b30aedade0d9b3cebc7cb7fabd905b9eecd52d Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 2 Oct 2020 15:42:36 +0200
Subject: [PATCH 342/516] Make lemmatizers use initialize logic (#6182)

* Make lemmatizer use initialize logic and tidy up

* Fix typo

* Raise for uninitialized tables
---
 spacy/errors.py                         |  15 +-
 spacy/lang/bn/__init__.py               |  14 +-
 spacy/lang/el/__init__.py               |  14 +-
 spacy/lang/en/__init__.py               |  14 +-
 spacy/lang/fa/__init__.py               |  14 +-
 spacy/lang/fr/__init__.py               |  14 +-
 spacy/lang/nb/__init__.py               |  14 +-
 spacy/lang/nl/__init__.py               |  15 +-
 spacy/lang/pl/__init__.py               |  13 +-
 spacy/lang/ru/__init__.py               |  13 +-
 spacy/lang/sv/__init__.py               |  14 +-
 spacy/lang/uk/__init__.py               |  13 +-
 spacy/pipeline/lemmatizer.py            | 182 ++++++++++++------------
 spacy/tests/lang/test_lemmatizers.py    |  23 ++-
 spacy/tests/pipeline/test_lemmatizer.py |  86 +++++------
 website/docs/api/lemmatizer.md          |  59 ++++++--
 16 files changed, 236 insertions(+), 281 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 881a697f6..4edd1cbae 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -477,6 +477,8 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
+            "for mode '{mode}'. Required tables: {tables}. Found: {found}.")
     E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
             "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "
@@ -556,10 +558,10 @@ class Errors:
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
             "component.")
-    E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
-            "spacy-lookups-data. If you want to initialize a blank nlp object, "
-            "make sure you have the spacy-lookups-data package installed or "
-            "remove the [initialize.lookups] block from your config.")
+    E955 = ("Can't find table(s) {table} for language '{lang}' in "
+            "spacy-lookups-data. Make sure you have the package installed or "
+            "provide your own lookup tables if no default lookups are available "
+            "for your language.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -685,9 +687,8 @@ class Errors:
     E1002 = ("Span index out of range.")
     E1003 = ("Unsupported lemmatizer mode '{mode}'.")
     E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
-             "Required tables '{tables}', found '{found}'. If you are not "
-             "providing custom lookups, make sure you have the package "
-             "spacy-lookups-data installed.")
+             "Required tables: {tables}. Found: {found}. Maybe you forgot to "
+             "call nlp.initialize() to load in the data?")
     E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
              "'{chunk}'. Tokenizer exceptions are only allowed to specify "
              "`ORTH` and `NORM`.")
diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index 923e29a17..879229888 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -24,18 +23,11 @@ class Bengali(Language):
 @Bengali.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Bengali"]
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 1a7b19914..53069334e 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .lemmatizer import GreekLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -29,18 +28,11 @@ class Greek(Language):
 @Greek.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return GreekLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Greek"]
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index cc01f1aea..3a3ebeefd 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES
 from .lemmatizer import EnglishLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class EnglishDefaults(Language.Defaults):
@@ -27,18 +26,11 @@ class English(Language):
 @English.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["English"]
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index f3a6635dc..77ee3bca3 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -27,18 +26,11 @@ class Persian(Language):
 @Persian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Persian"]
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index 72e641d1f..1e0011fba 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .lemmatizer import FrenchLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -32,18 +31,11 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["French"]
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index 9672dfd6e..62d7707f3 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -27,18 +26,11 @@ class Norwegian(Language):
 @Norwegian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Norwegian"]
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index 15b6b9de2..a3591f1bf 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
@@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -29,18 +27,11 @@ class Dutch(Language):
 @Dutch.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return DutchLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Dutch"]
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 573dbc6f9..f7be8a6c2 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -34,18 +34,11 @@ class Polish(Language):
 @Polish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "lookups": None},
+    default_config={"model": None, "mode": "pos_lookup"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return PolishLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Polish"]
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 6436ae0c7..1d59ca043 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class RussianDefaults(Language.Defaults):
@@ -23,17 +22,11 @@ class Russian(Language):
 @Russian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Russian"]
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index ea314f487..2490eb9ec 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -30,18 +29,11 @@ class Swedish(Language):
 @Swedish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Swedish"]
diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index 006a1cf7f..73c065379 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import UkrainianLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class UkrainianDefaults(Language.Defaults):
@@ -24,17 +23,11 @@ class Ukrainian(Language):
 @Ukrainian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Ukrainian"]
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 391769604..9be596868 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -1,26 +1,25 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
+from typing import Tuple
 from thinc.api import Model
+from pathlib import Path
 
 from .pipe import Pipe
 from ..errors import Errors
 from ..language import Language
+from ..training import Example
 from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
 from ..training import validate_examples
+from ..util import logger, SimpleFrozenList
 from .. import util
 
 
 @Language.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={
-        "model": None,
-        "mode": "lookup",
-        "lookups": None,
-        "overwrite": False,
-    },
+    default_config={"model": None, "mode": "lookup", "overwrite": False},
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
@@ -28,13 +27,9 @@ def make_lemmatizer(
     model: Optional[Model],
     name: str,
     mode: str,
-    lookups: Optional[Lookups],
     overwrite: bool = False,
 ):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(
-        nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
-    )
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 
 
 class Lemmatizer(Pipe):
@@ -46,59 +41,19 @@ class Lemmatizer(Pipe):
     """
 
     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         """Returns the lookups configuration settings for a given mode for use
         in Lemmatizer.load_lookups.
 
         mode (str): The lemmatizer mode.
-        RETURNS (dict): The lookups configuration settings for this mode.
-
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
+        RETURNS (Tuple[List[str], List[str]]): The required and optional
+            lookup tables for this mode.
         """
         if mode == "lookup":
-            return {
-                "required_tables": ["lemma_lookup"],
-            }
+            return (["lemma_lookup"], [])
         elif mode == "rule":
-            return {
-                "required_tables": ["lemma_rules"],
-                "optional_tables": ["lemma_exc", "lemma_index"],
-            }
-        return {}
-
-    @classmethod
-    def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
-        """Load and validate lookups tables. If the provided lookups is None,
-        load the default lookups tables according to the language and mode
-        settings. Confirm that all required tables for the language and mode
-        are present.
-
-        lang (str): The language code.
-        mode (str): The lemmatizer mode.
-        lookups (Lookups): The provided lookups, may be None if the default
-            lookups should be loaded.
-        RETURNS (Lookups): The Lookups object.
-
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
-        """
-        config = cls.get_lookups_config(mode)
-        required_tables = config.get("required_tables", [])
-        optional_tables = config.get("optional_tables", [])
-        if lookups is None:
-            lookups = load_lookups(lang=lang, tables=required_tables)
-            optional_lookups = load_lookups(
-                lang=lang, tables=optional_tables, strict=False
-            )
-            for table in optional_lookups.tables:
-                lookups.set_table(table, optional_lookups.get_table(table))
-        for table in required_tables:
-            if table not in lookups:
-                raise ValueError(
-                    Errors.E1004.format(
-                        mode=mode, tables=required_tables, found=lookups.tables
-                    )
-                )
-        return lookups
+            return (["lemma_rules"], ["lemma_exc", "lemma_index"])
+        return ([], [])
 
     def __init__(
         self,
@@ -107,7 +62,6 @@ class Lemmatizer(Pipe):
         name: str = "lemmatizer",
         *,
         mode: str = "lookup",
-        lookups: Optional[Lookups] = None,
         overwrite: bool = False,
     ) -> None:
         """Initialize a Lemmatizer.
@@ -116,9 +70,6 @@ class Lemmatizer(Pipe):
         model (Model): A model (not yet implemented).
         name (str): The component name. Defaults to "lemmatizer".
         mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
-        lookups (Lookups): The lookups object containing the (optional) tables
-            such as "lemma_rules", "lemma_index", "lemma_exc" and
-            "lemma_lookup". Defaults to None
         overwrite (bool): Whether to overwrite existing lemmas. Defaults to
             `False`.
 
@@ -128,8 +79,9 @@ class Lemmatizer(Pipe):
         self.model = model
         self.name = name
         self._mode = mode
-        self.lookups = lookups if lookups is not None else Lookups()
+        self.lookups = Lookups()
         self.overwrite = overwrite
+        self._validated = False
         if self.mode == "lookup":
             self.lemmatize = self.lookup_lemmatize
         elif self.mode == "rule":
@@ -153,12 +105,56 @@ class Lemmatizer(Pipe):
 
         DOCS: https://nightly.spacy.io/api/lemmatizer#call
         """
+        if not self._validated:
+            self._validate_tables(Errors.E1004)
         for token in doc:
             if self.overwrite or token.lemma == 0:
                 token.lemma_ = self.lemmatize(token)[0]
         return doc
 
-    def pipe(self, stream, *, batch_size=128):
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        lookups: Optional[Lookups] = None,
+    ):
+        """Initialize the lemmatizer and load in data.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        lookups (Lookups): The lookups object containing the (optional) tables
+            such as "lemma_rules", "lemma_index", "lemma_exc" and
+            "lemma_lookup". Defaults to None.
+        """
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        if lookups is None:
+            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
+            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+            optional_lookups = load_lookups(
+                lang=self.vocab.lang, tables=optional_tables, strict=False
+            )
+            for table in optional_lookups.tables:
+                lookups.set_table(table, optional_lookups.get_table(table))
+        self.lookups = lookups
+        self._validate_tables(Errors.E1004)
+
+    def _validate_tables(self, error_message: str = Errors.E912) -> None:
+        """Check that the lookups are correct for the current mode."""
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        for table in required_tables:
+            if table not in self.lookups:
+                raise ValueError(
+                    error_message.format(
+                        mode=self.mode,
+                        tables=required_tables,
+                        found=self.lookups.tables,
+                    )
+                )
+        self._validated = True
+
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -263,7 +259,7 @@ class Lemmatizer(Pipe):
         """
         return False
 
-    def score(self, examples, **kwargs) -> Dict[str, Any]:
+    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples.
 
         examples (Iterable[Example]): The examples to score.
@@ -274,58 +270,66 @@ class Lemmatizer(Pipe):
         validate_examples(examples, "Lemmatizer.score")
         return Scorer.score_token_attr(examples, "lemma", **kwargs)
 
-    def to_disk(self, path, *, exclude=tuple()):
-        """Save the current state to a directory.
+    def to_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ):
+        """Serialize the pipe to disk.
 
-        path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist.
-        exclude (list): String names of serialization fields to exclude.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
 
-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
         """
         serialize = {}
         serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["lookups"] = lambda p: self.lookups.to_disk(p)
         util.to_disk(path, serialize, exclude)
 
-    def from_disk(self, path, *, exclude=tuple()):
-        """Loads state from a directory. Modifies the object in place and
-        returns it.
+    def from_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from disk. Modifies the object in place and returns it.
 
-        path (unicode or Path): A path to a directory.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The modified `Vocab` object.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The modified Lemmatizer object.
 
-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
         """
         deserialize = {}
         deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
         util.from_disk(path, deserialize, exclude)
+        self._validate_tables()
+        return self
 
-    def to_bytes(self, *, exclude=tuple()) -> bytes:
-        """Serialize the current state to a binary string.
+    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
+        """Serialize the pipe to a bytestring.
 
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized form of the `Vocab` object.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (bytes): The serialized object.
 
-        DOCS: https://nightly.spacy.io/api/vocab#to_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
         """
         serialize = {}
         serialize["vocab"] = self.vocab.to_bytes
         serialize["lookups"] = self.lookups.to_bytes
         return util.to_bytes(serialize, exclude)
 
-    def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
-        """Load state from a binary string.
+    def from_bytes(
+        self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from a bytestring.
 
-        bytes_data (bytes): The data to load from.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The `Vocab` object.
+        bytes_data (bytes): The serialized pipe.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The loaded Lemmatizer.
 
-        DOCS: https://nightly.spacy.io/api/vocab#from_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
         """
         deserialize = {}
         deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
         util.from_bytes(bytes_data, deserialize, exclude)
+        self._validate_tables()
+        return self
diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py
index 6e7f82341..5f45664eb 100644
--- a/spacy/tests/lang/test_lemmatizers.py
+++ b/spacy/tests/lang/test_lemmatizers.py
@@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd):
     @registry.misc("lemmatizer_init_lookups")
     def lemmatizer_init_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups
 
-    """Test that languages can be initialized."""
+    # Test that languages can be initialized
     nlp = get_lang_class(lang)()
-    nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    assert not lemmatizer.lookups.tables
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "lemmatizer_init_lookups"}
+    }
+    with pytest.raises(ValueError):
+        nlp("x")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    doc = nlp("x")
     # Check for stray print statements (see #3342)
-    doc = nlp("test")  # noqa: F841
     captured = capfd.readouterr()
     assert not captured.out
+    assert doc[0].lemma_ == "y"
+
+    # Test initialization by calling .initialize() directly
+    nlp = get_lang_class(lang)()
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    lemmatizer.initialize(lookups=lemmatizer_init_lookups())
+    assert nlp("x")[0].lemma_ == "y"
diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py
index 05e15bc16..d37c87059 100644
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@@ -8,61 +8,52 @@ from ..util import make_tempdir
 
 @pytest.fixture
 def nlp():
-    return English()
-
-
-@pytest.fixture
-def lemmatizer(nlp):
     @registry.misc("cope_lookups")
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups
 
-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
-    return lemmatizer
+    nlp = English()
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "cope_lookups"}
+    }
+    return nlp
 
 
 def test_lemmatizer_init(nlp):
-    @registry.misc("cope_lookups")
-    def cope_lookups():
-        lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
-        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
-        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
-        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
-        return lookups
-
-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     assert isinstance(lemmatizer.lookups, Lookups)
+    assert not lemmatizer.lookups.tables
     assert lemmatizer.mode == "lookup"
+    with pytest.raises(ValueError):
+        nlp("test")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "cope"
     # replace any tables from spacy-lookups-data
     lemmatizer.lookups = Lookups()
-    doc = nlp("coping")
     # lookup with no tables sets text as lemma
-    assert doc[0].lemma_ == "coping"
-
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "coped"
     nlp.remove_pipe("lemmatizer")
-
-    @registry.misc("empty_lookups")
-    def empty_lookups():
-        return Lookups()
-
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     with pytest.raises(ValueError):
-        nlp.add_pipe(
-            "lemmatizer",
-            config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
-        )
+        # Can't initialize without required tables
+        lemmatizer.initialize(lookups=Lookups())
+    lookups = Lookups()
+    lookups.add_table("lemma_lookup", {})
+    lemmatizer.initialize(lookups=lookups)
 
 
-def test_lemmatizer_config(nlp, lemmatizer):
+def test_lemmatizer_config(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     doc = nlp.make_doc("coping")
     doc[0].pos_ = "VERB"
     assert doc[0].lemma_ == ""
@@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp, lemmatizer):
     assert doc[0].lemma_ == "cope"
 
 
-def test_lemmatizer_serialize(nlp, lemmatizer):
-    @registry.misc("cope_lookups")
+def test_lemmatizer_serialize(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups
 
     nlp2 = English()
-    lemmatizer2 = nlp2.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
+    lemmatizer2.initialize(lookups=cope_lookups())
     lemmatizer2.from_bytes(lemmatizer.to_bytes())
     assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
     assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
@@ -100,9 +92,9 @@ def test_lemmatizer_serialize(nlp, lemmatizer):
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
-        doc2 = nlp2.make_doc("coping")
-        doc2[0].pos_ = "VERB"
-        assert doc2[0].lemma_ == ""
-        doc2 = lemmatizer(doc2)
-        assert doc2[0].text == "coping"
-        assert doc2[0].lemma_ == "cope"
+    doc2 = nlp2.make_doc("coping")
+    doc2[0].pos_ = "VERB"
+    assert doc2[0].lemma_ == ""
+    doc2 = lemmatizer(doc2)
+    assert doc2[0].text == "coping"
+    assert doc2[0].lemma_ == "cope"
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index f980756e5..27ea04432 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -48,12 +48,11 @@ data format used by the lookup and rule-based lemmatizers, see
 > nlp.add_pipe("lemmatizer", config=config)
 > ```
 
-| Setting     | Description                                                                                                                                                                                                                                                                         |
-| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `mode`      | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~                                                                                                                                                                                                   |
-| `lookups`   | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
-| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                 |
-| `model`     | **Not yet implemented:** the model to use. ~~Model~~                                                                                                                                                                                                                                |
+| Setting     | Description                                                                       |
+| ----------- | --------------------------------------------------------------------------------- |
+| `mode`      | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
+| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~               |
+| `model`     | **Not yet implemented:** the model to use. ~~Model~~                              |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
@@ -76,15 +75,14 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).
 
-| Name           | Description                                                                                                                                                    |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`        | The shared vocabulary. ~~Vocab~~                                                                                                                               |
-| `model`        | **Not yet implemented:** The model to use. ~~Model~~                                                                                                           |
-| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                                                            |
-| _keyword-only_ |                                                                                                                                                                |
-| mode           | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~                                                                              |
-| lookups        | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ |
-| overwrite      | Whether to overwrite existing lemmas. ~~bool~                                                                                                                  |
+| Name           | Description                                                                                         |
+| -------------- | --------------------------------------------------------------------------------------------------- |
+| `vocab`        | The shared vocabulary. ~~Vocab~~                                                                    |
+| `model`        | **Not yet implemented:** The model to use. ~~Model~~                                                |
+| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ |                                                                                                     |
+| mode           | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~                   |
+| overwrite      | Whether to overwrite existing lemmas. ~~bool~~                                                      |
 
 ## Lemmatizer.\_\_call\_\_ {#call tag="method"}
 
@@ -127,6 +125,37 @@ applied to the `Doc` in order.
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
+## Lemmatizer.initialize {#initialize tag="method"}
+
+Initialize the lemmatizer and load any data resources. This method is typically
+called by [`Language.initialize`](/api/language#initialize) and lets you
+customize arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config. The loading only happens during initialization, typically before
+training. At runtime, all data is loaded from disk.
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer.initialize(lookups=lookups)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.lemmatizer]
+>
+> [initialize.components.lemmatizer.lookups]
+> @misc = "load_my_lookups.v1"
+> ```
+
+| Name           | Description                                                                                                                                                                                                                                                                         |
+| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Defaults to `None`. ~~Optional[Callable[[], Iterable[Example]]]~~                                                                                                                 |
+| _keyword-only_ |                                                                                                                                                                                                                                                                                     |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                |
+| `lookups`      | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
+
 ## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
 
 Lemmatize a token using a lookup-based approach. If no lemma is found, the

From 09dcb75076e39eca904e54c21e22c25491a82a02 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Fri, 2 Oct 2020 15:43:32 +0200
Subject: [PATCH 343/516] small UX fix for DocBin (#6167)

* add informative warning when messing up store_user_data DocBin flags

* add informative warning when messing up store_user_data DocBin flags

* cleanup test

* rename to patterns_path
---
 spacy/errors.py                             |  2 +-
 spacy/tests/serialize/test_serialize_doc.py | 20 +++++++++++++
 spacy/tokens/_serialize.py                  | 31 +++++++++++++++------
 website/docs/api/docbin.md                  |  2 +-
 4 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 4edd1cbae..dbb25479d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -419,7 +419,7 @@ class Errors:
     E164 = ("x is neither increasing nor decreasing: {}.")
     E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
             "that case.")
-    E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
+    E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
             "Current DocBin: {current}\nOther DocBin: {other}")
     E169 = ("Can't find module: {module}")
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py
index 4a976fc02..8b6adb83b 100644
--- a/spacy/tests/serialize/test_serialize_doc.py
+++ b/spacy/tests/serialize/test_serialize_doc.py
@@ -1,3 +1,6 @@
+import pytest
+from spacy.tokens.doc import Underscore
+
 import spacy
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
@@ -86,3 +89,20 @@ def test_serialize_doc_bin_unknown_spaces(en_vocab):
     assert re_doc1.text == "that 's "
     assert not re_doc2.has_unknown_spaces
     assert re_doc2.text == "that's"
+
+
+@pytest.mark.parametrize(
+    "writer_flag,reader_flag,reader_value", [(True, True, "bar"), (True, False, "bar"), (False, True, "nothing"), (False, False, "nothing")]
+)
+def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value):
+    """Test that custom extensions are correctly serialized in DocBin."""
+    Doc.set_extension("foo", default="nothing")
+    doc = Doc(en_vocab, words=["hello", "world"])
+    doc._.foo = "bar"
+    doc_bin_1 = DocBin(store_user_data=writer_flag)
+    doc_bin_1.add(doc)
+    doc_bin_bytes = doc_bin_1.to_bytes()
+    doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes)
+    doc_2 = list(doc_bin_2.get_docs(en_vocab))[0]
+    assert doc_2._.foo == reader_value
+    Underscore.doc_extensions = {}
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index ed283a86b..11eb75821 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -58,7 +58,7 @@ class DocBin:
 
         attrs (Iterable[str]): List of attributes to serialize. 'orth' and
             'spacy' are always serialized, so they're not required.
-        store_user_data (bool): Whether to include the `Doc.user_data`.
+        store_user_data (bool): Whether to write the `Doc.user_data` to bytes/file.
         docs (Iterable[Doc]): Docs to add.
 
         DOCS: https://nightly.spacy.io/api/docbin#init
@@ -106,11 +106,12 @@ class DocBin:
             self.strings.add(token.ent_type_)
             self.strings.add(token.ent_kb_id_)
         self.cats.append(doc.cats)
-        if self.store_user_data:
-            self.user_data.append(srsly.msgpack_dumps(doc.user_data))
+        self.user_data.append(srsly.msgpack_dumps(doc.user_data))
 
     def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
         """Recover Doc objects from the annotations, using the given vocab.
+        Note that the user data of each doc will be read (if available) and returned,
+        regardless of the setting of 'self.store_user_data'.
 
         vocab (Vocab): The shared vocab.
         YIELDS (Doc): The Doc objects.
@@ -129,7 +130,7 @@ class DocBin:
             doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
             doc = doc.from_array(self.attrs, tokens)
             doc.cats = self.cats[i]
-            if self.store_user_data:
+            if i < len(self.user_data) and self.user_data[i] is not None:
                 user_data = srsly.msgpack_loads(self.user_data[i], use_list=False)
                 doc.user_data.update(user_data)
             yield doc
@@ -137,21 +138,31 @@ class DocBin:
     def merge(self, other: "DocBin") -> None:
         """Extend the annotations of this DocBin with the annotations from
         another. Will raise an error if the pre-defined attrs of the two
-        DocBins don't match.
+        DocBins don't match, or if they differ in whether or not to store
+        user data.
 
         other (DocBin): The DocBin to merge into the current bin.
 
         DOCS: https://nightly.spacy.io/api/docbin#merge
         """
         if self.attrs != other.attrs:
-            raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs))
+            raise ValueError(
+                Errors.E166.format(param="attrs", current=self.attrs, other=other.attrs)
+            )
+        if self.store_user_data != other.store_user_data:
+            raise ValueError(
+                Errors.E166.format(
+                    param="store_user_data",
+                    current=self.store_user_data,
+                    other=other.store_user_data,
+                )
+            )
         self.tokens.extend(other.tokens)
         self.spaces.extend(other.spaces)
         self.strings.update(other.strings)
         self.cats.extend(other.cats)
         self.flags.extend(other.flags)
-        if self.store_user_data:
-            self.user_data.extend(other.user_data)
+        self.user_data.extend(other.user_data)
 
     def to_bytes(self) -> bytes:
         """Serialize the DocBin's annotations to a bytestring.
@@ -200,8 +211,10 @@ class DocBin:
         self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
         self.cats = msg["cats"]
         self.flags = msg.get("flags", [{} for _ in lengths])
-        if self.store_user_data and "user_data" in msg:
+        if "user_data" in msg:
             self.user_data = list(msg["user_data"])
+        else:
+            self.user_data = [None] * len(self)
         for tokens in self.tokens:
             assert len(tokens.shape) == 2, tokens.shape  # this should never happen
         return self
diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md
index 03aff2f6e..3625ed790 100644
--- a/website/docs/api/docbin.md
+++ b/website/docs/api/docbin.md
@@ -47,7 +47,7 @@ Create a `DocBin` object to hold serialized annotations.
 | Argument          | Description                                                                                                                                                                                                                                                                                         |
 | ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `attrs`           | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. ~~Iterable[str]~~ |
-| `store_user_data` | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. ~~bool~~                                                                                                                                                                                 |
+| `store_user_data` | Whether to write the `Doc.user_data` and the values of custom extension attributes to file/bytes. Defaults to `False`. ~~bool~~                                                                                                                                                                     |
 | `docs`            | `Doc` objects to add on initialization. ~~Iterable[Doc]~~                                                                                                                                                                                                                                           |
 
 ## DocBin.\_\len\_\_ {#len tag="method"}

From 62ccd5c4dfbcfbf8248b00696eebc97427444e8a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 2 Oct 2020 16:37:21 +0200
Subject: [PATCH 344/516] Relax model meta performance schema (#6185)

Allow more embedded per_x in `ModelMetaSchema`
---
 spacy/schemas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/schemas.py b/spacy/schemas.py
index 1125fa7da..591b7e134 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -282,7 +282,7 @@ class ModelMetaSchema(BaseModel):
     sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
     vectors: Dict[str, Any] = Field({}, title="Included word vectors")
     labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
-    performance: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy and speed numbers")
+    performance: Dict[str, Union[float, Dict[str, Union[float, dict]]]] = Field({}, title="Accuracy and speed numbers")
     spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
     # fmt: on
 

From 6965cdf16dd043913a815781ef77e90d565f6073 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 2 Oct 2020 17:26:21 +0200
Subject: [PATCH 345/516] Fix comment

---
 spacy/ml/models/tok2vec.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 84b54f029..120e9b02c 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -186,11 +186,7 @@ def CharacterEmbed(
 
     feature (int or str): An attribute to embed, to concatenate with the characters.
     width (int): The width of the output vector and the feature embedding.
-<<<<<<< HEAD
-    rows (int): The number of rows in the NORM hash embedding table.
-=======
     rows (int): The number of rows in the LOWER hash embedding table.
->>>>>>> 300e5a9928fd226dfddbf7d5c22558f696bfa1af
     nM (int): The dimensionality of the character embeddings. Recommended values
         are between 16 and 64.
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values

From fb48de349cd588d601d7c9bdb072f8a51a848694 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 2 Oct 2020 20:31:14 +0200
Subject: [PATCH 346/516] bwd compat for pipe.begin_training

---
 spacy/errors.py   | 4 +++-
 spacy/language.py | 6 +++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index dbb25479d..2c076db52 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -85,7 +85,9 @@ class Warnings:
             "attribute or operator.")
 
     # TODO: fix numbering after merging develop into master
-    W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
+    W089 = ("The 'begin_training' method has been renamed to 'initialize', "
+            "for calls to 'nlp' as well as for the individual pipeline "
+            "components.")
     W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
diff --git a/spacy/language.py b/spacy/language.py
index 14b9f4eb0..36cd251f3 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1207,7 +1207,11 @@ class Language:
             )
             self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
         for name, proc in self.pipeline:
-            if hasattr(proc, "initialize"):
+            # backwards compatibility for older components
+            if hasattr(proc, "begin_training"):
+                warnings.warn(Warnings.W089, DeprecationWarning)
+                proc.begin_training(get_examples, pipeline=self.pipeline, sgd=self._optimizer)
+            elif hasattr(proc, "initialize"):
                 p_settings = I["components"].get(name, {})
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name

From 3589a64d44efad29a340b13b505cc47a7fe2c797 Mon Sep 17 00:00:00 2001
From: Stanislav Schmidt <Stannislav@users.noreply.github.com>
Date: Fri, 2 Oct 2020 21:00:11 +0200
Subject: [PATCH 347/516] Change type of texts argument in pipe to iterable
 (#6186)

* Change type of texts argument in pipe to iterable

* Add contributor agreement
---
 .github/contributors/Stannislav.md | 106 +++++++++++++++++++++++++++++
 spacy/language.py                  |   2 +-
 2 files changed, 107 insertions(+), 1 deletion(-)
 create mode 100644 .github/contributors/Stannislav.md

diff --git a/.github/contributors/Stannislav.md b/.github/contributors/Stannislav.md
new file mode 100644
index 000000000..899d6b09b
--- /dev/null
+++ b/.github/contributors/Stannislav.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Stanislav Schmidt    |
+| Company name (if applicable)   | Blue Brain Project   |
+| Title or role (if applicable)  | ML Engineer          |
+| Date                           | 2020-10-02           |
+| GitHub username                | Stannislav           |
+| Website (optional)             |                      |
diff --git a/spacy/language.py b/spacy/language.py
index e9d195453..ee46da3c1 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -751,7 +751,7 @@ class Language(object):
     ):
         """Process texts as a stream, and yield `Doc` objects in order.
 
-        texts (iterator): A sequence of texts to process.
+        texts (iterable): A sequence of texts to process.
         as_tuples (bool): If set to True, inputs should be a sequence of
             (text, context) tuples. Output will then be a sequence of
             (doc, context) tuples. Defaults to False.

From 52e4586ec11bf6ef13680cf80c5bdc33499be2c1 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 11:13:00 +0200
Subject: [PATCH 348/516] Add transformers to extras_require [ci skip]

---
 setup.cfg | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/setup.cfg b/setup.cfg
index 963ce60ca..7192ba9d4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -66,6 +66,8 @@ console_scripts =
 [options.extras_require]
 lookups =
     spacy_lookups_data==1.0.0rc0
+transformers =
+    spacy_transformers>=1.0.0a17,<1.0.0
 cuda =
     cupy>=5.0.0b4,<9.0.0
 cuda80 =

From eb9b3ff9c5a2bc779412d85e77e840b5049e4209 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 11:35:42 +0200
Subject: [PATCH 349/516] Update install docs and quickstarts [ci skip]

---
 website/docs/usage/index.md               |  60 ++++---
 website/src/components/quickstart.js      |  74 ++++----
 website/src/styles/quickstart.module.sass |  36 +++-
 website/src/widgets/quickstart-install.js | 208 +++++++++++++---------
 website/src/widgets/quickstart-models.js  | 132 ++++++++------
 5 files changed, 309 insertions(+), 201 deletions(-)

diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index ad2614175..e0a4fdb07 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -8,10 +8,7 @@ menu:
   - ['Changelog', 'changelog']
 ---
 
-spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**,
-**macOS/OS X** and **Windows**. The latest spaCy releases are available over
-[pip](https://pypi.python.org/pypi/spacy) and
-[conda](https://anaconda.org/conda-forge/spacy).
+## Quickstart {hidden="true"}
 
 > #### 📖 Looking for the old docs?
 >
@@ -19,21 +16,22 @@ spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**,
 > website to [**v2.spacy.io**](https://v2.spacy.io/docs). To see what's changed
 > and how to migrate, see the guide on [v3.0 guide](/usage/v3).
 
-## Quickstart {hidden="true"}
-
 import QuickstartInstall from 'widgets/quickstart-install.js'
 
-<QuickstartInstall title="Quickstart" id="quickstart" />
+<QuickstartInstall id="quickstart" />
 
 ## Installation instructions {#installation}
 
+spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**,
+**macOS/OS X** and **Windows**. The latest spaCy releases are available over
+[pip](https://pypi.python.org/pypi/spacy) and
+[conda](https://anaconda.org/conda-forge/spacy).
+
 ### pip {#pip}
 
 Using pip, spaCy releases are available as source packages and binary wheels.
-
-```bash
-$ pip install -U spacy
-```
+Before you install spaCy and its dependencies, make sure that your `pip`,
+`setuptools` and `wheel` are up to date.
 
 > #### Download pipelines
 >
@@ -47,16 +45,10 @@ $ pip install -U spacy
 > >>> nlp = spacy.load("en_core_web_sm")
 > ```
 
-<Infobox variant="warning">
-
-To install additional data tables for lemmatization you can run
-`pip install spacy[lookups]` or install
-[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
-separately. The lookups package is needed to provide normalization and
-lemmatization data for new models and to lemmatize in languages that don't yet
-come with trained pipelines and aren't powered by third-party libraries.
-
-</Infobox>
+```bash
+$ pip install -U pip setuptools wheel
+$ pip install -U spacy
+```
 
 When using pip it is generally recommended to install packages in a virtual
 environment to avoid modifying system state:
@@ -64,9 +56,28 @@ environment to avoid modifying system state:
 ```bash
 $ python -m venv .env
 $ source .env/bin/activate
+$ pip install -U pip setuptools wheel
 $ pip install spacy
 ```
 
+spaCy also lets you install extra dependencies by specifying the following
+keywords in brackets, e.g. `spacy[ja]` or `spacy[lookups,transformers]` (with
+multiple comma-separated extras). See the `[options.extras_require]` section in
+spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
+
+> #### Example
+>
+> ```bash
+> $ pip install spacy[lookups,transformers]
+> ```
+
+| Name             | Description                                                                                                                                                                                                                                                    |
+| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `lookups`        | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
+| `transformers`   | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                    |
+| `cuda`, ...      | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                |
+| `ja`, `ko`, `th` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages).                                                                                                                                                        |
+
 ### conda {#conda}
 
 Thanks to our great community, we've been able to re-add conda support. You can
@@ -112,10 +123,9 @@ $ python -m spacy validate
 ### Run spaCy with GPU {#gpu new="2.0.14"}
 
 As of v2.0, spaCy comes with neural network models that are implemented in our
-machine learning library, [Thinc](https://github.com/explosion/thinc). For GPU
-support, we've been grateful to use the work of Chainer's
-[CuPy](https://cupy.chainer.org) module, which provides a numpy-compatible
-interface for GPU arrays.
+machine learning library, [Thinc](https://thinc.ai). For GPU support, we've been
+grateful to use the work of Chainer's [CuPy](https://cupy.chainer.org) module,
+which provides a numpy-compatible interface for GPU arrays.
 
 spaCy can be installed on GPU by specifying `spacy[cuda]`, `spacy[cuda90]`,
 `spacy[cuda91]`, `spacy[cuda92]`, `spacy[cuda100]`, `spacy[cuda101]` or
diff --git a/website/src/components/quickstart.js b/website/src/components/quickstart.js
index 64f828c2f..e47e02e35 100644
--- a/website/src/components/quickstart.js
+++ b/website/src/components/quickstart.js
@@ -24,6 +24,7 @@ const Quickstart = ({
     rawContent = null,
     id = 'quickstart',
     setters = {},
+    showDropdown = {},
     hidePrompts,
     small,
     codeLang,
@@ -107,6 +108,8 @@ const Quickstart = ({
                     }) => {
                         // Optional function that's called with the value
                         const setterFunc = setters[id] || (() => {})
+                        // Check if dropdown should be shown
+                        const dropdownGetter = showDropdown[id] || (() => true)
                         return (
                             <div key={id} data-quickstart-group={id} className={classes.group}>
                                 <style data-quickstart-style={id} scoped>
@@ -123,37 +126,6 @@ const Quickstart = ({
                                     )}
                                 </div>
                                 <div className={classes.fields}>
-                                    {!!dropdown.length && (
-                                        <select
-                                            defaultValue={defaultValue}
-                                            className={classes.select}
-                                            onChange={({ target }) => {
-                                                const value = target.value
-                                                if (value != other) {
-                                                    setterFunc(value)
-                                                    setOther(id, false)
-                                                } else {
-                                                    setterFunc('')
-                                                    setOther(id, true)
-                                                }
-                                            }}
-                                        >
-                                            {dropdown.map(({ id, title }) => (
-                                                <option key={id} value={id}>
-                                                    {title}
-                                                </option>
-                                            ))}
-                                            {other && <option value={other}>{other}</option>}
-                                        </select>
-                                    )}
-                                    {other && otherState[id] && (
-                                        <input
-                                            type="text"
-                                            className={classes.textInput}
-                                            placeholder="Type here..."
-                                            onChange={({ target }) => setterFunc(target.value)}
-                                        />
-                                    )}
                                     {options.map(option => {
                                         const optionType = multiple ? 'checkbox' : 'radio'
                                         const checkedForId = checked[id] || []
@@ -179,7 +151,10 @@ const Quickstart = ({
                                                     type={optionType}
                                                     className={classNames(
                                                         classes.input,
-                                                        classes[optionType]
+                                                        classes[optionType],
+                                                        {
+                                                            [classes.long]: options.length >= 4,
+                                                        }
                                                     )}
                                                     name={id}
                                                     id={`quickstart-${option.id}`}
@@ -209,6 +184,41 @@ const Quickstart = ({
                                             </Fragment>
                                         )
                                     })}
+                                    <span className={classes.fieldExtra}>
+                                        {!!dropdown.length && (
+                                            <select
+                                                defaultValue={defaultValue}
+                                                className={classNames(classes.select, {
+                                                    [classes.selectHidden]: !dropdownGetter(),
+                                                })}
+                                                onChange={({ target }) => {
+                                                    const value = target.value
+                                                    if (value != other) {
+                                                        setterFunc(value)
+                                                        setOther(id, false)
+                                                    } else {
+                                                        setterFunc('')
+                                                        setOther(id, true)
+                                                    }
+                                                }}
+                                            >
+                                                {dropdown.map(({ id, title }) => (
+                                                    <option key={id} value={id}>
+                                                        {title}
+                                                    </option>
+                                                ))}
+                                                {other && <option value={other}>{other}</option>}
+                                            </select>
+                                        )}
+                                        {other && otherState[id] && (
+                                            <input
+                                                type="text"
+                                                className={classes.textInput}
+                                                placeholder="Type here..."
+                                                onChange={({ target }) => setterFunc(target.value)}
+                                            />
+                                        )}
+                                    </span>
                                 </div>
                             </div>
                         )
diff --git a/website/src/styles/quickstart.module.sass b/website/src/styles/quickstart.module.sass
index 91dd19f85..a08d6bcb6 100644
--- a/website/src/styles/quickstart.module.sass
+++ b/website/src/styles/quickstart.module.sass
@@ -36,22 +36,37 @@
 
 .label
     cursor: pointer
-    border: 1px solid var(--color-subtle)
-    border-radius: var(--border-radius)
     display: inline-block
-    padding: 0.65rem 1.25rem
-    margin: 0 0.5rem 0.75rem 0
+    padding: 0.35rem 0.5rem 0.25rem 0
+    margin: 0 1rem 0.75rem 0
     font-size: var(--font-size-xs)
     font-weight: bold
-    background: var(--color-back)
 
     &:hover
         background: var(--color-subtle-light)
 
-    .input:focus + &
+    .input:focus +
         border: 1px solid var(--color-theme)
         outline: none
 
+    .radio + &
+        margin: 0 0 0.75rem 0
+        border-radius: 0
+        border-width: 1px 0 1px 1px
+        border-style: solid
+        border-color: var(--color-subtle)
+        background: var(--color-back)
+        padding: 0.65rem 1.25rem
+
+        &:nth-child(2)  // first child is checkbox
+            border-top-left-radius: var(--border-radius)
+            border-bottom-left-radius: var(--border-radius)
+
+        &:nth-last-child(2)  // last child is additional container
+            border-top-right-radius: var(--border-radius)
+            border-bottom-right-radius: var(--border-radius)
+            border-right-width: 1px
+
     .radio:checked + &
         color: var(--color-back)
         border-color: var(--color-theme)
@@ -64,9 +79,10 @@
         height: 20px
         border: 1px solid var(--color-subtle)
         vertical-align: middle
-        margin-right: 1rem
+        margin-right: 0.5rem
         cursor: pointer
         border-radius: var(--border-radius)
+        background: var(--color-back)
 
     .checkbox:checked + &:before
         // Embed "check" icon here for simplicity
@@ -74,6 +90,9 @@
         background-size: contain
         border-color: var(--color-theme)
 
+.field-extra:not(:empty):not(:first-child)
+    margin-left: 1rem
+
 .legend
     color: var(--color-dark)
     padding: 0.75rem 0
@@ -93,6 +112,9 @@
     font-size: var(--font-size-sm)
     background: var(--color-back)
 
+.select-hidden
+    display: none
+
 .text-input
     border: 1px solid var(--color-subtle)
     border-radius: var(--border-radius)
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index eb98cb1fc..741973945 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -1,9 +1,20 @@
-import React from 'react'
+import React, { useState } from 'react'
 import { StaticQuery, graphql } from 'gatsby'
 
 import { Quickstart, QS } from '../components/quickstart'
 import { repo } from '../components/util'
 
+const DEFAULT_HARDWARE = 'cpu'
+const DEFAULT_CUDA = 'cuda100'
+const CUDA = {
+    '8.0': 'cuda80',
+    '9.0': 'cuda90',
+    '9.1': 'cuda91',
+    '9.2': 'cuda92',
+    '10.0': 'cuda100',
+    '10.1': 'cuda101',
+    '10.2': 'cuda102',
+}
 const DATA = [
     {
         id: 'os',
@@ -23,6 +34,16 @@ const DATA = [
             { id: 'source', title: 'from source' },
         ],
     },
+    {
+        id: 'hardware',
+        title: 'Hardware',
+        options: [
+            { id: 'cpu', title: 'CPU', checked: DEFAULT_HARDWARE === 'cpu' },
+            { id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE == 'gpu' },
+        ],
+        dropdown: Object.keys(CUDA).map(id => ({ id: CUDA[id], title: `CUDA ${id}` })),
+        defaultValue: DEFAULT_CUDA,
+    },
     {
         id: 'config',
         title: 'Configuration',
@@ -30,100 +51,115 @@ const DATA = [
         options: [
             {
                 id: 'venv',
-                title: 'virtualenv',
+                title: 'virtual env',
                 help: 'Use a virtual environment and install spaCy into a user directory',
             },
-        ],
-    },
-    {
-        id: 'addition',
-        title: 'Additions',
-        multiple: true,
-        options: [
             {
-                id: 'transformers',
-                title: 'Transformers',
-                help: 'Use transformers like BERT to train your spaCy pipelines',
-            },
-            {
-                id: 'lookups',
-                title: 'Lemmatizer data',
-                help: 'Install additional lookup tables and rules for lemmatization',
+                id: 'train',
+                title: 'train models',
+                help:
+                    'Check this if you plan to train your own models with spaCy to install extra dependencies and data resources',
             },
         ],
     },
 ]
 
-const QuickstartInstall = ({ id, title }) => (
-    <StaticQuery
-        query={query}
-        render={({ site }) => {
-            const { nightly, languages } = site.siteMetadata
-            const models = languages.filter(({ models }) => models !== null)
-            const data = [
-                ...DATA,
-                {
-                    id: 'models',
-                    title: 'Trained Pipelines',
-                    multiple: true,
-                    options: models.map(({ code, name }) => ({ id: code, title: name })),
-                },
-            ]
-            return (
-                <Quickstart data={data} title={title} id={id}>
-                    <QS config="venv">python -m venv .env</QS>
-                    <QS config="venv" os="mac">
-                        source .env/bin/activate
-                    </QS>
-                    <QS config="venv" os="linux">
-                        source .env/bin/activate
-                    </QS>
-                    <QS config="venv" os="windows">
-                        .env\Scripts\activate
-                    </QS>
-                    <QS package="pip">pip install -U spacy</QS>
-                    <QS package="conda">conda install -c conda-forge spacy</QS>
-                    <QS package="source">
-                        git clone https://github.com/{repo}
-                        {nightly ? ` --branch develop` : ''}
-                    </QS>
-                    <QS package="source">cd spaCy</QS>
-                    <QS package="source" os="linux">
-                        export PYTHONPATH=`pwd`
-                    </QS>
-                    <QS package="source" os="windows">
-                        set PYTHONPATH=C:\path\to\spaCy
-                    </QS>
-                    <QS package="source">pip install -r requirements.txt</QS>
-                    <QS addition="transformers" package="pip">
-                        pip install -U spacy-transformers
-                    </QS>
-                    <QS addition="transformers" package="source">
-                        pip install -U spacy-transformers
-                    </QS>
-                    <QS addition="transformers" package="conda">
-                        conda install -c conda-forge spacy-transformers
-                    </QS>
-                    <QS addition="lookups" package="pip">
-                        pip install -U spacy-lookups-data
-                    </QS>
-                    <QS addition="lookups" package="source">
-                        pip install -U spacy-lookups-data
-                    </QS>
-                    <QS addition="lookups" package="conda">
-                        conda install -c conda-forge spacy-lookups-data
-                    </QS>
-                    <QS package="source">python setup.py build_ext --inplace</QS>
-                    {models.map(({ code, models: modelOptions }) => (
-                        <QS models={code} key={code}>
-                            python -m spacy download {modelOptions[0]}
+const QuickstartInstall = ({ id, title }) => {
+    const [train, setTrain] = useState(false)
+    const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
+    const [cuda, setCuda] = useState(DEFAULT_CUDA)
+    const setters = {
+        hardware: v => (Array.isArray(v) ? setHardware(v[0]) : setCuda(v)),
+        config: v => setTrain(v.includes('train')),
+    }
+    const showDropdown = {
+        hardware: () => hardware === 'gpu',
+    }
+    const pipExtras = [hardware === 'gpu' && cuda, train && 'transformers', train && 'lookups']
+        .filter(e => e)
+        .join(',')
+    return (
+        <StaticQuery
+            query={query}
+            render={({ site }) => {
+                const { nightly, languages } = site.siteMetadata
+                const pkg = nightly ? 'spacy-nightly' : 'spacy'
+                const models = languages.filter(({ models }) => models !== null)
+                const data = [
+                    ...DATA,
+                    {
+                        id: 'models',
+                        title: 'Trained Pipelines',
+                        multiple: true,
+                        options: models
+                            .sort((a, b) => a.name.localeCompare(b.name))
+                            .map(({ code, name }) => ({ id: code, title: name })),
+                    },
+                ]
+                return (
+                    <Quickstart
+                        data={data}
+                        title={title}
+                        id={id}
+                        setters={setters}
+                        showDropdown={showDropdown}
+                    >
+                        <QS config="venv">python -m venv .env</QS>
+                        <QS config="venv" os="mac">
+                            source .env/bin/activate
                         </QS>
-                    ))}
-                </Quickstart>
-            )
-        }}
-    />
-)
+                        <QS config="venv" os="linux">
+                            source .env/bin/activate
+                        </QS>
+                        <QS config="venv" os="windows">
+                            .env\Scripts\activate
+                        </QS>
+                        <QS package="pip">pip install -U pip setuptools wheel</QS>
+                        <QS package="source">pip install -U pip setuptools wheel</QS>
+                        <QS package="pip">
+                            pip install -U {pkg}
+                            {pipExtras && `[${pipExtras}]`}
+                            {nightly ? ' --pre' : ''}
+                        </QS>
+                        <QS package="conda">conda install -c conda-forge spacy</QS>
+                        <QS package="conda" hardware="gpu">
+                            conda install -c conda-forge cupy
+                        </QS>
+                        <QS package="source">
+                            git clone https://github.com/{repo}
+                            {nightly ? ` --branch develop` : ''}
+                        </QS>
+                        <QS package="source">cd spaCy</QS>
+                        <QS package="source" os="linux">
+                            export PYTHONPATH=`pwd`
+                        </QS>
+                        <QS package="source" os="windows">
+                            set PYTHONPATH=C:\path\to\spaCy
+                        </QS>
+                        <QS package="source">pip install -r requirements.txt</QS>
+                        <QS package="source">python setup.py build_ext --inplace</QS>
+                        <QS package="source" config="train">
+                            pip install -e '.[{pipExtras}]'
+                        </QS>
+
+                        <QS config="train" package="conda">
+                            conda install -c conda-forge spacy-transformers
+                        </QS>
+                        <QS config="train" package="conda">
+                            conda install -c conda-forge spacy-lookups-data
+                        </QS>
+
+                        {models.map(({ code, models: modelOptions }) => (
+                            <QS models={code} key={code}>
+                                python -m spacy download {modelOptions[0]}
+                            </QS>
+                        ))}
+                    </Quickstart>
+                )
+            }}
+        />
+    )
+}
 
 export default QuickstartInstall
 
diff --git a/website/src/widgets/quickstart-models.js b/website/src/widgets/quickstart-models.js
index d19ff9e52..ffd1b3df9 100644
--- a/website/src/widgets/quickstart-models.js
+++ b/website/src/widgets/quickstart-models.js
@@ -1,12 +1,16 @@
-import React, { Fragment } from 'react'
+import React, { Fragment, useState } from 'react'
 import { StaticQuery, graphql } from 'gatsby'
 
 import { Quickstart, QS } from '../components/quickstart'
 
+const DEFAULT_LANG = 'en'
+const DEFAULT_OPT = 'efficiency'
+
 const data = [
     {
         id: 'lang',
         title: 'Language',
+        defaultValue: DEFAULT_LANG,
     },
     {
         id: 'load',
@@ -25,6 +29,16 @@ const data = [
             },
         ],
     },
+    {
+        id: 'optimize',
+        title: 'Optimize for',
+        help:
+            'Optimize for efficiency (faster & smaller model) or higher accuracy (larger & slower model)',
+        options: [
+            { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
+            { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
+        ],
+    },
     {
         id: 'config',
         title: 'Options',
@@ -33,57 +47,73 @@ const data = [
     },
 ]
 
-const QuickstartInstall = ({ id, title, description, defaultLang = 'en', children }) => (
-    <StaticQuery
-        query={query}
-        render={({ site }) => {
-            const models = site.siteMetadata.languages.filter(({ models }) => models !== null)
-            data[0].options = models.map(({ code, name }) => ({
-                id: code,
-                title: name,
-                checked: code === defaultLang,
-            }))
-            return (
-                <Quickstart data={data} title={title} id={id} description={description}>
-                    {models.map(({ code, models, example }) => {
-                        const pkg = models[0]
-                        const exampleText = example || 'No text available yet'
-                        return (
-                            <Fragment key={code}>
-                                <QS lang={code}>python -m spacy download {pkg}</QS>
-                                <QS lang={code} divider />
-                                <QS lang={code} load="spacy" prompt="python">
-                                    import spacy
-                                </QS>
-                                <QS lang={code} load="spacy" prompt="python">
-                                    nlp = spacy.load("{pkg}")
-                                </QS>
-                                <QS lang={code} load="module" prompt="python">
-                                    import {pkg}
-                                </QS>
-                                <QS lang={code} load="module" prompt="python">
-                                    nlp = {pkg}.load()
-                                </QS>
-                                <QS lang={code} config="example" prompt="python">
-                                    doc = nlp("{exampleText}")
-                                </QS>
-                                <QS lang={code} config="example" prompt="python">
-                                    print([
-                                    {code === 'xx'
-                                        ? '(ent.text, ent.label) for ent in doc.ents'
-                                        : '(w.text, w.pos_) for w in doc'}
-                                    ])
-                                </QS>
-                            </Fragment>
-                        )
-                    })}
+const QuickstartInstall = ({ id, title, description, children }) => {
+    const [lang, setLang] = useState(DEFAULT_LANG)
+    const [efficiency, setEfficiency] = useState(DEFAULT_OPT)
+    const setters = {
+        lang: setLang,
+        optimize: v => setEfficiency(v.includes('efficiency')),
+    }
+    return (
+        <StaticQuery
+            query={query}
+            render={({ site }) => {
+                const models = site.siteMetadata.languages.filter(({ models }) => models !== null)
+                data[0].dropdown = models
+                    .sort((a, b) => a.name.localeCompare(b.name))
+                    .map(({ code, name }) => ({
+                        id: code,
+                        title: name,
+                    }))
+                return (
+                    <Quickstart
+                        data={data}
+                        title={title}
+                        id={id}
+                        description={description}
+                        setters={setters}
+                        copy={false}
+                    >
+                        {models.map(({ code, models, example }) => {
+                            const pkg = efficiency ? models[0] : models[models.length - 1]
+                            const exampleText = example || 'No text available yet'
+                            return lang !== code ? null : (
+                                <Fragment key={code}>
+                                    <QS>python -m spacy download {pkg}</QS>
+                                    <QS divider />
+                                    <QS load="spacy" prompt="python">
+                                        import spacy
+                                    </QS>
+                                    <QS load="spacy" prompt="python">
+                                        nlp = spacy.load("{pkg}")
+                                    </QS>
+                                    <QS load="module" prompt="python">
+                                        import {pkg}
+                                    </QS>
+                                    <QS load="module" prompt="python">
+                                        nlp = {pkg}.load()
+                                    </QS>
+                                    <QS config="example" prompt="python">
+                                        doc = nlp("{exampleText}")
+                                    </QS>
+                                    <QS config="example" prompt="python">
+                                        print([
+                                        {code === 'xx'
+                                            ? '(ent.text, ent.label) for ent in doc.ents'
+                                            : '(w.text, w.pos_) for w in doc'}
+                                        ])
+                                    </QS>
+                                </Fragment>
+                            )
+                        })}
 
-                    {children}
-                </Quickstart>
-            )
-        }}
-    />
-)
+                        {children}
+                    </Quickstart>
+                )
+            }}
+        />
+    )
+}
 
 export default QuickstartInstall
 

From f758804401e288ee93561073ecee81f729a2b7a9 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 11:41:28 +0200
Subject: [PATCH 350/516] Save one line of code

---
 spacy/util.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/util.py b/spacy/util.py
index c43943ef7..4d68e829c 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1065,8 +1065,7 @@ def import_file(name: str, loc: Union[str, Path]) -> ModuleType:
     loc (str / Path): Path to the file.
     RETURNS: The loaded module.
     """
-    loc = str(loc)
-    spec = importlib.util.spec_from_file_location(name, loc)
+    spec = importlib.util.spec_from_file_location(name, str(loc))
     module = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(module)
     return module

From ae15c9de7971679df9bb60034d007530957205ad Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 11:43:56 +0200
Subject: [PATCH 351/516] Raise error from caught KeyError to preserve
 traceback

---
 spacy/language.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 18c08258f..d76741da3 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -973,8 +973,8 @@ class Language:
             try:
                 doc = proc(doc, **component_cfg.get(name, {}))
             except KeyError as e:
-                warnings.warn(str(e))
-                raise ValueError(Errors.E109.format(name=name)) from None
+                # This typically happens if a component is not initialized
+                raise ValueError(Errors.E109.format(name=name)) from e
             if doc is None:
                 raise ValueError(Errors.E005.format(name=name))
         return doc

From 5fb776556a1afa849e9a85a9cfcbec3c96f2136d Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 14:47:02 +0200
Subject: [PATCH 352/516] Update docs [ci skip]

---
 website/docs/usage/processing-pipelines.md | 315 +++++++++++++++------
 website/docs/usage/saving-loading.md       |  26 +-
 website/docs/usage/training.md             |   3 +-
 3 files changed, 250 insertions(+), 94 deletions(-)

diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index c98bd08bc..3d0c7b7e9 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -3,8 +3,11 @@ title: Language Processing Pipelines
 next: /usage/embeddings-transformers
 menu:
   - ['Processing Text', 'processing']
-  - ['How Pipelines Work', 'pipelines']
+  - ['Pipelines & Components', 'pipelines']
   - ['Custom Components', 'custom-components']
+  - ['Component Data', 'component-data']
+  - ['Type Hints & Validation', 'type-hints']
+  - ['Trainable Components', 'trainable-components']
   - ['Extension Attributes', 'custom-components-attributes']
   - ['Plugins & Wrappers', 'plugins']
 ---
@@ -89,26 +92,27 @@ have to call `list()` on it first:
 
 </Infobox>
 
-## How pipelines work {#pipelines}
+## Pipelines and built-in components {#pipelines}
 
 spaCy makes it very easy to create your own pipelines consisting of reusable
 components – this includes spaCy's default tagger, parser and entity recognizer,
 but also your own custom processing functions. A pipeline component can be added
-to an already existing `nlp` object, specified when initializing a `Language`
-class, or defined within a [pipeline package](/usage/saving-loading#models).
+to an already existing `nlp` object, specified when initializing a
+[`Language`](/api/language) class, or defined within a
+[pipeline package](/usage/saving-loading#models).
 
 > #### config.cfg (excerpt)
 >
 > ```ini
 >  [nlp]
 >  lang = "en"
->  pipeline = ["tagger", "parser"]
+>  pipeline = ["tok2vec", "parser"]
 >
 > [components]
 >
-> [components.tagger]
-> factory = "tagger"
-> # Settings for the tagger component
+> [components.tok2vec]
+> factory = "tok2vec"
+> # Settings for the tok2vec component
 >
 > [components.parser]
 > factory = "parser"
@@ -140,7 +144,7 @@ nlp = spacy.load("en_core_web_sm")
 ```
 
 ... the pipeline's `config.cfg` tells spaCy to use the language `"en"` and the
-pipeline `["tagger", "parser", "ner"]`. spaCy will then initialize
+pipeline `["tok2vec", "tagger", "parser", "ner"]`. spaCy will then initialize
 `spacy.lang.en.English`, and create each pipeline component and add it to the
 processing pipeline. It'll then load in the model data from the data directory
 and return the modified `Language` class for you to use as the `nlp` object.
@@ -739,6 +743,64 @@ make your factory a separate function. That's also how spaCy does it internally.
 
 </Accordion>
 
+### Language-specific factories {#factories-language new="3"}
+
+There are many use cases where you might want your pipeline components to be
+language-specific. Sometimes this requires entirely different implementations per
+language, sometimes the only difference is in the settings or data. spaCy allows
+you to register factories of the **same name** on both the `Language` base
+class, as well as its **subclasses** like `English` or `German`. Factories are
+resolved starting with the specific subclass. If the subclass doesn't define a
+component of that name, spaCy will check the `Language` base class.
+
+Here's an example of a pipeline component that overwrites the normalized form of
+a token, the `Token.norm_`, with an entry from a language-specific lookup table.
+It's registered twice under the name `"token_normalizer"` – once using
+`@English.factory` and once using `@German.factory`:
+
+```python
+### {executable="true"}
+from spacy.lang.en import English
+from spacy.lang.de import German
+
+class TokenNormalizer:
+    def __init__(self, norm_table):
+        self.norm_table = norm_table
+
+    def __call__(self, doc):
+        for token in doc:
+            # Overwrite the token.norm_ if there's an entry in the data
+            token.norm_ = self.norm_table.get(token.text, token.norm_)
+        return doc
+
+@English.factory("token_normalizer")
+def create_en_normalizer(nlp, name):
+    return TokenNormalizer({"realise": "realize", "colour": "color"})
+
+@German.factory("token_normalizer")
+def create_de_normalizer(nlp, name):
+    return TokenNormalizer({"daß": "dass", "wußte": "wusste"})
+
+nlp_en = English()
+nlp_en.add_pipe("token_normalizer")  # uses the English factory
+print([token.norm_ for token in nlp_en("realise colour daß wußte")])
+
+nlp_de = German()
+nlp_de.add_pipe("token_normalizer")  # uses the German factory
+print([token.norm_ for token in nlp_de("realise colour daß wußte")])
+```
+
+<Infobox title="Implementation details">
+
+Under the hood, language-specific factories are added to the
+[`factories` registry](/api/top-level#registry) prefixed with the language code,
+e.g. `"en.token_normalizer"`. When resolving the factory in
+[`nlp.add_pipe`](/api/language#add_pipe), spaCy first checks for a
+language-specific version of the factory using `nlp.lang` and if none is
+available, falls back to looking up the regular factory name.
+
+</Infobox>
+
 ### Example: Stateful component with settings {#example-stateful-components}
 
 This example shows a **stateful** pipeline component for handling acronyms:
@@ -808,34 +870,47 @@ doc = nlp("LOL, be right back")
 print(doc._.acronyms)
 ```
 
+## Initializing and serializing component data {#component-data}
+
 Many stateful components depend on **data resources** like dictionaries and
 lookup tables that should ideally be **configurable**. For example, it makes
-sense to make the `DICTIONARY` and argument of the registered function, so the
-`AcronymComponent` can be re-used with different data. One logical solution
-would be to make it an argument of the component factory, and allow it to be
-initialized with different dictionaries.
+sense to make the `DICTIONARY` in the above example an argument of the
+registered function, so the `AcronymComponent` can be re-used with different
+data. One logical solution would be to make it an argument of the component
+factory, and allow it to be initialized with different dictionaries.
 
-> #### Example
->
-> Making the data an argument of the registered function would result in output
-> like this in your `config.cfg`, which is typically not what you want (and only
-> works for JSON-serializable data).
+> #### config.cfg
 >
 > ```ini
-> [components.acronyms.dictionary]
+> [components.acronyms.data]
+> # 🚨 Problem: you don't want the data in the config
 > lol = "laugh out loud"
 > brb = "be right back"
 > ```
 
+```python
+@Language.factory("acronyms", default_config={"data": {}, "case_sensitive": False})
+def create_acronym_component(nlp: Language, name: str, data: Dict[str, str], case_sensitive: bool):
+    # 🚨 Problem: data ends up in the config file
+    return AcronymComponent(nlp, data, case_sensitive)
+```
+
 However, passing in the dictionary directly is problematic, because it means
 that if a component saves out its config and settings, the
 [`config.cfg`](/usage/training#config) will include a dump of the entire data,
-since that's the config the component was created with.
+since that's the config the component was created with. It will also fail if the
+data is not JSON-serializable.
 
-```diff
-DICTIONARY = {"lol": "laughing out loud", "brb": "be right back"}
-- default_config = {"dictionary:" DICTIONARY}
-```
+### Option 1: Using a registered function {#component-data-function}
+
+<Infobox>
+
+- ✅ **Pros:** can load anything in Python, easy to add to and configure via
+  config
+- ❌ **Cons:** requires the function and its dependencies to be available at
+  runtime
+
+</Infobox>
 
 If what you're passing in isn't JSON-serializable – e.g. a custom object like a
 [model](#trainable-components) – saving out the component config becomes
@@ -877,7 +952,7 @@ result of the registered function is passed in as the key `"dictionary"`.
 > [components.acronyms]
 > factory = "acronyms"
 >
-> [components.acronyms.dictionary]
+> [components.acronyms.data]
 > @misc = "acronyms.slang_dict.v1"
 > ```
 
@@ -895,11 +970,135 @@ the name. Registered functions can also take **arguments** by the way that can
 be defined in the config as well – you can read more about this in the docs on
 [training with custom code](/usage/training#custom-code).
 
-### Initializing components with data {#initialization}
+### Option 2: Save data with the pipeline and load it in once on initialization {#component-data-initialization}
 
-<!-- TODO: -->
+<Infobox>
 
-### Python type hints and pydantic validation {#type-hints new="3"}
+- ✅ **Pros:** lets components save and load their own data and reflect user
+  changes, load in data assets before training without depending on them at
+  runtime
+- ❌ **Cons:** requires more component methods, more complex config and data
+  flow
+
+</Infobox>
+
+Just like models save out their binary weights when you call
+[`nlp.to_disk`](/api/language#to_disk), components can also **serialize** any
+other data assets – for instance, an acronym dictionary. If a pipeline component
+implements its own `to_disk` and `from_disk` methods, those will be called
+automatically by `nlp.to_disk` and will receive the path to the directory to
+save to or load from. The component can then perform any custom saving or
+loading. If a user makes changes to the component data, they will be reflected
+when the `nlp` object is saved. For more examples of this, see the usage guide
+on [serialization methods](/usage/saving-loading/#serialization-methods).
+
+> #### About the data path
+>
+> The `path` argument spaCy passes to the serialization methods consists of the
+> path provided by the user, plus a directory of the component name. This means
+> that when you call `nlp.to_disk("/path")`, the `acronyms` component will
+> receive the directory path `/path/acronyms` and can then create files in this
+> directory.
+
+```python
+### Custom serialization methods {highlight="6-7,9-11"}
+import srsly
+
+class AcronymComponent:
+    # other methods here...
+
+    def to_disk(self, path, exclude=tuple()):
+        srsly.write_json(path / "data.json", self.data)
+
+    def from_disk(self, path, exclude=tuple()):
+        self.data = srsly.read_json(path / "data.json")
+        return self
+```
+
+Now the component can save to and load from a directory. The only remaining
+question: How do you **load in the initial data**? In Python, you could just
+call the pipe's `from_disk` method yourself. But if you're adding the component
+to your [training config](/usage/training#config), spaCy will need to know how
+to set it up, from start to finish, including the data to initialize it with.
+
+While you could use a registered function or a file loader like
+[`srsly.read_json.v1`](/api/top-level#file_readers) as an argument of the
+component factory, this approach is problematic: the component factory runs
+**every time the component is created**. This means it will run when creating
+the `nlp` object before training, but also every time a user loads your
+pipeline. So your runtime pipeline would either depend on a local path on your
+file system, or the data is loaded twice: once when the component is created,
+and then again when the data is loaded by `from_disk`.
+
+> ```ini
+> ### config.cfg
+> [components.acronyms.data]
+> # 🚨 Problem: Runtime pipeline depends on local path
+> @readers = "srsly.read_json.v1"
+> path = "/path/to/slang_dict.json"
+> ```
+>
+> ```ini
+> ### config.cfg
+> [components.acronyms.data]
+> # 🚨 Problem: this always runs
+> @misc = "acronyms.slang_dict.v1"
+> ```
+
+```python
+@Language.factory("acronyms", default_config={"data": {}, "case_sensitive": False})
+def create_acronym_component(nlp: Language, name: str, data: Dict[str, str], case_sensitive: bool):
+    # 🚨 Problem: data will be loaded every time component is created
+    return AcronymComponent(nlp, data, case_sensitive)
+```
+
+To solve this, your component can implement a separate method, `initialize`,
+which will be called by [`nlp.initialize`](/api/language#initialize) if
+available. This typically happens before training, but not at runtime when the
+pipeline is loaded. For more background on this, see the usage guides on the
+[config lifecycle](/usage/training#config-lifecycle) and
+[custom initialization](/usage/training#initialization).
+
+![Illustration of pipeline lifecycle](../images/lifecycle.svg)
+
+A component's `initialize` method needs to take at least **two named
+arguments**: a `get_examples` callback that gives it access to the training
+examples, and the current `nlp` object. This is mostly used by trainable
+components so they can initialize their models and label schemes from the data,
+so we can ignore those arguments here. All **other arguments** on the method can
+be defined via the config – in this case a dictionary `data`.
+
+> #### config.cfg
+>
+> ```ini
+> [initialize.components.my_component]
+>
+> [initialize.components.my_component.data]
+> # ✅ This only runs on initialization
+> @readers = "srsly.read_json.v1"
+> path = "/path/to/slang_dict.json"
+> ```
+
+```python
+### Custom initialize method {highlight="5-6"}
+class AcronymComponent:
+    def __init__(self):
+        self.data = {}
+
+    def initialize(self, get_examples=None, nlp=None, data={}):
+        self.data = data
+```
+
+When [`nlp.initialize`](/api/language#initialize) runs before training (or when
+you call it in your own code), the
+[`[initialize]`](/api/data-formats#config-initialize) block of the config is
+loaded and used to construct the `nlp` object. The custom acronym component will
+then be passed the data loaded from the JSON file. After training, the `nlp`
+object is saved to disk, which will run the component's `to_disk` method. When
+the pipeline is loaded back into spaCy later to use it, the `from_disk` method
+will load the data back in.
+
+## Python type hints and validation {#type-hints new="3"}
 
 spaCy's configs are powered by our machine learning library Thinc's
 [configuration system](https://thinc.ai/docs/usage-config), which supports
@@ -968,65 +1167,7 @@ nlp.add_pipe("debug", config={"log_level": "DEBUG"})
 doc = nlp("This is a text...")
 ```
 
-### Language-specific factories {#factories-language new="3"}
-
-There are many use case where you might want your pipeline components to be
-language-specific. Sometimes this requires entirely different implementation per
-language, sometimes the only difference is in the settings or data. spaCy allows
-you to register factories of the **same name** on both the `Language` base
-class, as well as its **subclasses** like `English` or `German`. Factories are
-resolved starting with the specific subclass. If the subclass doesn't define a
-component of that name, spaCy will check the `Language` base class.
-
-Here's an example of a pipeline component that overwrites the normalized form of
-a token, the `Token.norm_` with an entry from a language-specific lookup table.
-It's registered twice under the name `"token_normalizer"` – once using
-`@English.factory` and once using `@German.factory`:
-
-```python
-### {executable="true"}
-from spacy.lang.en import English
-from spacy.lang.de import German
-
-class TokenNormalizer:
-    def __init__(self, norm_table):
-        self.norm_table = norm_table
-
-    def __call__(self, doc):
-        for token in doc:
-            # Overwrite the token.norm_ if there's an entry in the data
-            token.norm_ = self.norm_table.get(token.text, token.norm_)
-        return doc
-
-@English.factory("token_normalizer")
-def create_en_normalizer(nlp, name):
-    return TokenNormalizer({"realise": "realize", "colour": "color"})
-
-@German.factory("token_normalizer")
-def create_de_normalizer(nlp, name):
-    return TokenNormalizer({"daß": "dass", "wußte": "wusste"})
-
-nlp_en = English()
-nlp_en.add_pipe("token_normalizer")  # uses the English factory
-print([token.norm_ for token in nlp_en("realise colour daß wußte")])
-
-nlp_de = German()
-nlp_de.add_pipe("token_normalizer")  # uses the German factory
-print([token.norm_ for token in nlp_de("realise colour daß wußte")])
-```
-
-<Infobox title="Implementation details">
-
-Under the hood, language-specific factories are added to the
-[`factories` registry](/api/top-level#registry) prefixed with the language code,
-e.g. `"en.token_normalizer"`. When resolving the factory in
-[`nlp.add_pipe`](/api/language#add_pipe), spaCy first checks for a
-language-specific version of the factory using `nlp.lang` and if none is
-available, falls back to looking up the regular factory name.
-
-</Infobox>
-
-### Trainable components {#trainable-components new="3"}
+## Trainable components {#trainable-components new="3"}
 
 spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
 components that have their own model instance, make predictions over `Doc`
diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index f8a5eea2a..c19ff39eb 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -2,6 +2,7 @@
 title: Saving and Loading
 menu:
   - ['Basics', 'basics']
+  - ['Serializing Docs', 'docs']
   - ['Serialization Methods', 'serialization-methods']
   - ['Entry Points', 'entry-points']
   - ['Trained Pipelines', 'models']
@@ -52,7 +53,7 @@ defined [factories](/usage/processing-pipeline#custom-components-factories) and
 _then_ loads in the binary data. You can read more about this process
 [here](/usage/processing-pipelines#pipelines).
 
-### Serializing Doc objects efficiently {#docs new="2.2"}
+## Serializing Doc objects efficiently {#docs new="2.2"}
 
 If you're working with lots of data, you'll probably need to pass analyses
 between machines, either to use something like [Dask](https://dask.org) or
@@ -179,9 +180,20 @@ example, model weights or terminology lists – you can take advantage of spaCy'
 built-in component serialization by making your custom component expose its own
 `to_disk` and `from_disk` or `to_bytes` and `from_bytes` methods. When an `nlp`
 object with the component in its pipeline is saved or loaded, the component will
-then be able to serialize and deserialize itself. The following example shows a
-custom component that keeps arbitrary JSON-serializable data, allows the user to
-add to that data and saves and loads the data to and from a JSON file.
+then be able to serialize and deserialize itself.
+
+<Infobox title="Custom components and data" emoji="📖">
+
+For more details on how to work with pipeline components that depend on data
+resources and manage data loading and initialization at training and runtime,
+see the usage guide on initializing and serializing
+[component data](/usage/processing-pipelines#component-data).
+
+</Infobox>
+
+The following example shows a custom component that keeps arbitrary
+JSON-serializable data, allows the user to add to that data and saves and loads
+the data to and from a JSON file.
 
 > #### Real-world example
 >
@@ -208,13 +220,13 @@ class CustomComponent:
         # Add something to the component's data
         self.data.append(data)
 
-    def to_disk(self, path, **kwargs):
+    def to_disk(self, path, exclude=tuple()):
         # This will receive the directory path + /my_component
         data_path = path / "data.json"
         with data_path.open("w", encoding="utf8") as f:
             f.write(json.dumps(self.data))
 
-    def from_disk(self, path, **cfg):
+    def from_disk(self, path, exclude=tuple()):
         # This will receive the directory path + /my_component
         data_path = path / "data.json"
         with data_path.open("r", encoding="utf8") as f:
@@ -276,6 +288,8 @@ custom components to spaCy automatically.
 
 </Infobox>
 
+<!-- ## Initializing components with data {#initialization new="3"} -->
+
 ## Using entry points {#entry-points new="2.1"}
 
 Entry points let you expose parts of a Python package you write to other Python
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 1dd57fd4a..74d2f6de5 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -819,7 +819,8 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
 
 ### Customizing the initialization {#initialization}
 
-<!-- TODO: -->
+<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
+</Infobox>
 
 ## Data utilities {#data}
 

From db419f6b2f31f603484d8cce2587f5fc2ad31825 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 3 Oct 2020 14:57:46 +0200
Subject: [PATCH 353/516] Improve control of training progress and logging
 (#6184)

* Make logging and progress easier to control

* Update docs

* Cleanup errors

* Fix ConfigValidationError

* Pass stdout/stderr, not wasabi.Printer

* Fix type

* Upd logging example

* Fix logger example

* Fix type
---
 spacy/cli/train.py             | 20 ++++-----
 spacy/training/initialize.py   |  2 +-
 spacy/training/loggers.py      | 74 ++++++++++++++++++++++++----------
 spacy/training/loop.py         | 66 +++++++++++++++---------------
 website/docs/usage/training.md | 41 +++++++++++--------
 5 files changed, 118 insertions(+), 85 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 57a88159d..0b27f63dc 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -3,6 +3,7 @@ from pathlib import Path
 from wasabi import msg
 import typer
 import logging
+import sys
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu
@@ -39,7 +40,12 @@ def train_cli(
     DOCS: https://nightly.spacy.io/api/cli#train
     """
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
-    verify_cli_args(config_path, output_path)
+    # Make sure all files and paths exist if they are needed
+    if not config_path or not config_path.exists():
+        msg.fail("Config file not found", config_path, exits=1)
+    if output_path is not None and not output_path.exists():
+        output_path.mkdir()
+        msg.good(f"Created output directory: {output_path}")
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
@@ -50,14 +56,4 @@ def train_cli(
         nlp = init_nlp(config, use_gpu=use_gpu)
     msg.good("Initialized pipeline")
     msg.divider("Training pipeline")
-    train(nlp, output_path, use_gpu=use_gpu, silent=False)
-
-
-def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
-    # Make sure all files and paths exists if they are needed
-    if not config_path or not config_path.exists():
-        msg.fail("Config file not found", config_path, exits=1)
-    if output_path is not None:
-        if not output_path.exists():
-            output_path.mkdir()
-            msg.good(f"Created output directory: {output_path}")
+    train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index d64f211c4..7cb1555d7 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -102,7 +102,7 @@ def load_vectors_into_model(
             "with the packaged vectors. Make sure that the vectors package you're "
             "loading is compatible with the current version of spaCy."
         )
-        err = ConfigValidationError.from_error(config=None, title=title, desc=desc)
+        err = ConfigValidationError.from_error(e, config=None, title=title, desc=desc)
         raise err from None
     nlp.vocab.vectors = vectors_nlp.vocab.vectors
     if add_strings:
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index 0f054d433..be2da4bd8 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -1,18 +1,24 @@
-from typing import Dict, Any, Tuple, Callable, List
+from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO
+import wasabi
+import tqdm
+import sys
 
 from ..util import registry
 from .. import util
 from ..errors import Errors
-from wasabi import msg
 
 
 @registry.loggers("spacy.ConsoleLogger.v1")
-def console_logger():
+def console_logger(progress_bar: bool=False):
     def setup_printer(
         nlp: "Language",
-    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
+        stdout: IO=sys.stdout,
+        stderr: IO=sys.stderr
+    ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable]:
+        msg = wasabi.Printer(no_print=True)
         # we assume here that only components are enabled that should be trained & logged
         logged_pipes = nlp.pipe_names
+        eval_frequency = nlp.config["training"]["eval_frequency"]
         score_weights = nlp.config["training"]["score_weights"]
         score_cols = [col for col, value in score_weights.items() if value is not None]
         score_widths = [max(len(col), 6) for col in score_cols]
@@ -22,10 +28,18 @@ def console_logger():
         table_header = [col.upper() for col in table_header]
         table_widths = [3, 6] + loss_widths + score_widths + [6]
         table_aligns = ["r" for _ in table_widths]
-        msg.row(table_header, widths=table_widths)
-        msg.row(["-" * width for width in table_widths])
+        stdout.write(msg.row(table_header, widths=table_widths))
+        stdout.write(msg.row(["-" * width for width in table_widths]))
+        progress = None
 
-        def log_step(info: Dict[str, Any]):
+        def log_step(info: Optional[Dict[str, Any]]):
+            nonlocal progress
+
+            if info is None:
+                # If we don't have a new checkpoint, just return.
+                if progress is not None:
+                    progress.update(1)
+                return 
             try:
                 losses = [
                     "{0:.2f}".format(float(info["losses"][pipe_name]))
@@ -39,24 +53,37 @@ def console_logger():
                         keys=list(info["losses"].keys()),
                     )
                 ) from None
+
             scores = []
             for col in score_cols:
                 score = info["other_scores"].get(col, 0.0)
                 try:
                     score = float(score)
-                    if col != "speed":
-                        score *= 100
-                    scores.append("{0:.2f}".format(score))
                 except TypeError:
                     err = Errors.E916.format(name=col, score_type=type(score))
                     raise ValueError(err) from None
+                if col != "speed":
+                    score *= 100
+                scores.append("{0:.2f}".format(score))
+
             data = (
                 [info["epoch"], info["step"]]
                 + losses
                 + scores
                 + ["{0:.2f}".format(float(info["score"]))]
             )
-            msg.row(data, widths=table_widths, aligns=table_aligns)
+            if progress is not None:
+                progress.close()
+            stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns))
+            if progress_bar:
+                # Set disable=None, so that it disables on non-TTY
+                progress = tqdm.tqdm(
+                    total=eval_frequency,
+                    disable=None,
+                    leave=False,
+                    file=stderr
+                )
+                progress.set_description(f"Epoch {info['epoch']+1}")
 
         def finalize():
             pass
@@ -70,10 +97,12 @@ def console_logger():
 def wandb_logger(project_name: str, remove_config_values: List[str] = []):
     import wandb
 
-    console = console_logger()
+    console = console_logger(progress_bar=False)
 
     def setup_logger(
         nlp: "Language",
+        stdout: IO=sys.stdout,
+        stderr: IO=sys.stderr
     ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
         config = nlp.config.interpolate()
         config_dot = util.dict_to_dot(config)
@@ -81,18 +110,19 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []):
             del config_dot[field]
         config = util.dot_to_dict(config_dot)
         wandb.init(project=project_name, config=config, reinit=True)
-        console_log_step, console_finalize = console(nlp)
+        console_log_step, console_finalize = console(nlp, stdout, stderr)
 
-        def log_step(info: Dict[str, Any]):
+        def log_step(info: Optional[Dict[str, Any]]):
             console_log_step(info)
-            score = info["score"]
-            other_scores = info["other_scores"]
-            losses = info["losses"]
-            wandb.log({"score": score})
-            if losses:
-                wandb.log({f"loss_{k}": v for k, v in losses.items()})
-            if isinstance(other_scores, dict):
-                wandb.log(other_scores)
+            if info is not None:
+                score = info["score"]
+                other_scores = info["other_scores"]
+                losses = info["losses"]
+                wandb.log({"score": score})
+                if losses:
+                    wandb.log({f"loss_{k}": v for k, v in losses.items()})
+                if isinstance(other_scores, dict):
+                    wandb.log(other_scores)
 
         def finalize():
             console_finalize()
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index e20cddd3e..093a9ebb3 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -1,11 +1,11 @@
-from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
+from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any, IO
 from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
 import random
-import tqdm
-from wasabi import Printer
+import wasabi
+import sys
 
 from .example import Example
 from ..schemas import ConfigSchemaTraining
@@ -21,7 +21,8 @@ def train(
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,
-    silent: bool = False,
+    stdout: IO=sys.stdout,
+    stderr: IO=sys.stderr
 ) -> None:
     """Train a pipeline.
 
@@ -29,10 +30,15 @@ def train(
     output_path (Path): Optional output path to save trained model to.
     use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
         before calling this function.
-    silent (bool): Whether to pretty-print outputs.
+    stdout (file): A file-like object to write output messages. To disable
+        printing, pass an io.StringIO() instance.
+    stderr (file): A second file-like object to write output messages. To disable
+        printing, pass an io.StringIO() instance.
+ 
     RETURNS (Path / None): The path to the final exported model.
     """
-    msg = Printer(no_print=silent)
+    # We use no_print here so we can respect the stdout/stderr options.
+    msg = wasabi.Printer(no_print=True)
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
     if config["training"]["seed"] is not None:
@@ -63,50 +69,44 @@ def train(
         eval_frequency=T["eval_frequency"],
         exclude=frozen_components,
     )
-    msg.info(f"Pipeline: {nlp.pipe_names}")
+    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}"))
     if frozen_components:
-        msg.info(f"Frozen components: {frozen_components}")
-    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
+        stdout.write(msg.info(f"Frozen components: {frozen_components}"))
+    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}"))
     with nlp.select_pipes(disable=frozen_components):
-        print_row, finalize_logger = train_logger(nlp)
+        log_step, finalize_logger = train_logger(nlp, stdout, stderr)
     try:
-        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-        progress.set_description(f"Epoch 1")
         for batch, info, is_best_checkpoint in training_step_iterator:
-            progress.update(1)
-            if is_best_checkpoint is not None:
-                progress.close()
-                print_row(info)
-                if is_best_checkpoint and output_path is not None:
-                    with nlp.select_pipes(disable=frozen_components):
-                        update_meta(T, nlp, info)
-                    with nlp.use_params(optimizer.averages):
-                        nlp = before_to_disk(nlp)
-                        nlp.to_disk(output_path / "model-best")
-                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-                progress.set_description(f"Epoch {info['epoch']}")
+            log_step(info if is_best_checkpoint else None)
+            if is_best_checkpoint is not None and output_path is not None:
+                with nlp.select_pipes(disable=frozen_components):
+                    update_meta(T, nlp, info)
+                with nlp.use_params(optimizer.averages):
+                    nlp = before_to_disk(nlp)
+                    nlp.to_disk(output_path / "model-best")
     except Exception as e:
-        finalize_logger()
         if output_path is not None:
             # We don't want to swallow the traceback if we don't have a
-            # specific error.
-            msg.warn(
-                f"Aborting and saving the final best model. "
-                f"Encountered exception: {str(e)}"
+            # specific error, but we do want to warn that we're trying
+            # to do something here.
+            stdout.write(
+                msg.warn(
+                    f"Aborting and saving the final best model. "
+                    f"Encountered exception: {str(e)}"
+                )
             )
-            nlp = before_to_disk(nlp)
-            nlp.to_disk(output_path / "model-final")
         raise e
     finally:
         finalize_logger()
         if output_path is not None:
-            final_model_path = output_path / "model-final"
+            final_model_path = output_path / "model-last"
             if optimizer.averages:
                 with nlp.use_params(optimizer.averages):
                     nlp.to_disk(final_model_path)
             else:
                 nlp.to_disk(final_model_path)
-            msg.good(f"Saved pipeline to output directory", final_model_path)
+    # This will only run if we don't hit an error
+    stdout.write(msg.good("Saved pipeline to output directory", final_model_path))
 
 
 def train_while_improving(
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 74d2f6de5..fb1efec1b 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -689,8 +689,8 @@ During training, the results of each step are passed to a logger function. By
 default, these results are written to the console with the
 [`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
 for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
-[`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a
-**dictionary** with the following keys:
+[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function
+receives a **dictionary** with the following keys:
 
 | Key            | Value                                                                                          |
 | -------------- | ---------------------------------------------------------------------------------------------- |
@@ -715,30 +715,37 @@ tabular results to a file:
 
 ```python
 ### functions.py
-from typing import Tuple, Callable, Dict, Any
+import sys
+from typing import IO, Tuple, Callable, Dict, Any, Optional
 import spacy
+from spacy import Language
 from pathlib import Path
 
 @spacy.registry.loggers("my_custom_logger.v1")
 def custom_logger(log_path):
-    def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
-        with Path(log_path).open("w", encoding="utf8") as file_:
-            file_.write("step\\t")
-            file_.write("score\\t")
-            for pipe in nlp.pipe_names:
-                file_.write(f"loss_{pipe}\\t")
-            file_.write("\\n")
+    def setup_logger(
+        nlp: Language,
+        stdout: IO=sys.stdout,
+        stderr: IO=sys.stderr
+    ) -> Tuple[Callable, Callable]:
+        stdout.write(f"Logging to {log_path}\n")
+        log_file = Path(log_path).open("w", encoding="utf8")
+        log_file.write("step\\t")
+        log_file.write("score\\t")
+        for pipe in nlp.pipe_names:
+            log_file.write(f"loss_{pipe}\\t")
+        log_file.write("\\n")
 
-        def log_step(info: Dict[str, Any]):
-            with Path(log_path).open("a") as file_:
-                file_.write(f"{info['step']}\\t")
-                file_.write(f"{info['score']}\\t")
+        def log_step(info: Optional[Dict[str, Any]]):
+            if info:
+                log_file.write(f"{info['step']}\\t")
+                log_file.write(f"{info['score']}\\t")
                 for pipe in nlp.pipe_names:
-                    file_.write(f"{info['losses'][pipe]}\\t")
-                file_.write("\\n")
+                    log_file.write(f"{info['losses'][pipe]}\\t")
+                log_file.write("\\n")
 
         def finalize():
-            pass
+            log_file.close()
 
         return log_step, finalize
 

From 7b127f307e648d4ddbb559efb0bf15c5620a4bcf Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 3 Oct 2020 16:06:42 +0200
Subject: [PATCH 354/516] Set version to v3.0.0a30

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index acf386ace..e61e5ab25 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a29"
+__version__ = "3.0.0a30"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 35d695a031853b1b914ef36bdb84da84f2042ac4 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 16:08:24 +0200
Subject: [PATCH 355/516] Update docs

---
 website/docs/api/dependencyparser.md | 30 +++++++--
 website/docs/api/entityrecognizer.md | 30 +++++++--
 website/docs/api/morphologizer.md    | 30 +++++++--
 website/docs/api/tagger.md           | 30 +++++++--
 website/docs/api/textcategorizer.md  | 35 +++++++---
 website/docs/usage/training.md       | 99 ++++++++++++++++++++++++----
 6 files changed, 209 insertions(+), 45 deletions(-)

diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index ea4b779c7..fe8f7d8d5 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -176,12 +176,12 @@ This method was previously called `begin_training`.
 > path = "corpus/labels/parser.json
 > ```
 
-| Name           | Description                                                                                                                                                                                                                                                                                                         |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
-| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
-| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
+| Name           | Description                                                                                                                                                                                                                                                                                                                                                                                                            |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                                                                                                                                  |
+| _keyword-only_ |                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                                                   |
+| `labels`       | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Dict[str, Dict[str, int]]]~~ |
 
 ## DependencyParser.predict {#predict tag="method"}
 
@@ -433,6 +433,24 @@ The labels currently added to the component.
 | ----------- | ------------------------------------------------------ |
 | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
 
+## DependencyParser.label_data {#label_data tag="property" new="3"}
+
+The labels currently added to the component and their internal meta information.
+This is the data generated by [`init labels`](/api/cli#init-labels) and used by
+[`DependencyParser.initialize`](/api/dependencyparser#initialize) to initialize
+the model with a pre-defined label set.
+
+> #### Example
+>
+> ```python
+> labels = parser.label_data
+> parser.initialize(lambda: [], nlp=nlp, labels=labels)
+> ```
+
+| Name        | Description                                                                     |
+| ----------- | ------------------------------------------------------------------------------- |
+| **RETURNS** | The label data added to the component. ~~Dict[str, Dict[str, int]]~~            |
+
 ## Serialization fields {#serialization-fields}
 
 During serialization, spaCy will export several data fields used to restore
diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 5fbd0b229..6ac0d163f 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -165,12 +165,12 @@ This method was previously called `begin_training`.
 > path = "corpus/labels/ner.json
 > ```
 
-| Name           | Description                                                                                                                                                                                                                                                                                                         |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
-| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
-| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
+| Name           | Description                                                                                                                                                                                                                                                                                                                                                                                                            |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                                                                                                                                  |
+| _keyword-only_ |                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                                                   |
+| `labels`       | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Dict[str, Dict[str, int]]]~~ |
 
 ## EntityRecognizer.predict {#predict tag="method"}
 
@@ -421,6 +421,24 @@ The labels currently added to the component.
 | ----------- | ------------------------------------------------------ |
 | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
 
+## EntityRecognizer.label_data {#label_data tag="property" new="3"}
+
+The labels currently added to the component and their internal meta information.
+This is the data generated by [`init labels`](/api/cli#init-labels) and used by
+[`EntityRecognizer.initialize`](/api/entityrecognizer#initialize) to initialize
+the model with a pre-defined label set.
+
+> #### Example
+>
+> ```python
+> labels = ner.label_data
+> ner.initialize(lambda: [], nlp=nlp, labels=labels)
+> ```
+
+| Name        | Description                                                                     |
+| ----------- | ------------------------------------------------------------------------------- |
+| **RETURNS** | The label data added to the component. ~~Dict[str, Dict[str, int]]~~            |
+
 ## Serialization fields {#serialization-fields}
 
 During serialization, spaCy will export several data fields used to restore
diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md
index 50e2bb33a..d32514fb0 100644
--- a/website/docs/api/morphologizer.md
+++ b/website/docs/api/morphologizer.md
@@ -147,12 +147,12 @@ config.
 > path = "corpus/labels/morphologizer.json
 > ```
 
-| Name           | Description                                                                                                                                                                                                                                                                                                         |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
-| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
-| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
+| Name           | Description                                                                                                                                                                                                                                                                                                                                                                                       |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                                                                                                             |
+| _keyword-only_ |                                                                                                                                                                                                                                                                                                                                                                                                   |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                              |
+| `labels`       | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
 
 ## Morphologizer.predict {#predict tag="method"}
 
@@ -377,6 +377,24 @@ coarse-grained POS as the feature `POS`.
 | ----------- | ------------------------------------------------------ |
 | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
 
+## Morphologizer.label_data {#label_data tag="property" new="3"}
+
+The labels currently added to the component and their internal meta information.
+This is the data generated by [`init labels`](/api/cli#init-labels) and used by
+[`Morphologizer.initialize`](/api/morphologizer#initialize) to initialize the
+model with a pre-defined label set.
+
+> #### Example
+>
+> ```python
+> labels = morphologizer.label_data
+> morphologizer.initialize(lambda: [], nlp=nlp, labels=labels)
+> ```
+
+| Name        | Description                                     |
+| ----------- | ----------------------------------------------- |
+| **RETURNS** | The label data added to the component. ~~dict~~ |
+
 ## Serialization fields {#serialization-fields}
 
 During serialization, spaCy will export several data fields used to restore
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md
index d7c56be67..2123004b6 100644
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@@ -148,12 +148,12 @@ This method was previously called `begin_training`.
 > path = "corpus/labels/tagger.json
 > ```
 
-| Name           | Description                                                                                                                                                                                                                                                                                                         |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
-| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
-| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[list]~~ |
+| Name           | Description                                                                                                                                                                                                                                                                                                                                                                                                |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                                                                                                                      |
+| _keyword-only_ |                                                                                                                                                                                                                                                                                                                                                                                                            |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                                       |
+| `labels`       | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
 
 ## Tagger.predict {#predict tag="method"}
 
@@ -411,6 +411,24 @@ The labels currently added to the component.
 | ----------- | ------------------------------------------------------ |
 | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
 
+## Tagger.label_data {#label_data tag="property" new="3"}
+
+The labels currently added to the component and their internal meta information.
+This is the data generated by [`init labels`](/api/cli#init-labels) and used by
+[`Tagger.initialize`](/api/tagger#initialize) to initialize the model with a
+pre-defined label set.
+
+> #### Example
+>
+> ```python
+> labels = tagger.label_data
+> tagger.initialize(lambda: [], nlp=nlp, labels=labels)
+> ```
+
+| Name        | Description                                                |
+| ----------- | ---------------------------------------------------------- |
+| **RETURNS** | The label data added to the component. ~~Tuple[str, ...]~~ |
+
 ## Serialization fields {#serialization-fields}
 
 During serialization, spaCy will export several data fields used to restore
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index dd8c81040..0901a6fa9 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -29,7 +29,6 @@ architectures and their arguments and hyperparameters.
 > ```python
 > from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
 > config = {
->    "labels": [],
 >    "threshold": 0.5,
 >    "model": DEFAULT_TEXTCAT_MODEL,
 > }
@@ -38,7 +37,6 @@ architectures and their arguments and hyperparameters.
 
 | Setting          | Description                                                                                                                                                      |
 | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `labels`         | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~                                      |
 | `threshold`      | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                                                   |
 | `positive_label` | The positive label for a binary task with exclusive classes, None otherwise and by default. ~~Optional[str]~~                                                    |
 | `model`          | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
@@ -61,7 +59,7 @@ architectures and their arguments and hyperparameters.
 >
 > # Construction from class
 > from spacy.pipeline import TextCategorizer
-> textcat = TextCategorizer(nlp.vocab, model, labels=[], threshold=0.5, positive_label="POS")
+> textcat = TextCategorizer(nlp.vocab, model, threshold=0.5, positive_label="POS")
 > ```
 
 Create a new pipeline instance. In your application, you would normally use a
@@ -74,7 +72,6 @@ shortcut for this and instantiate the component using its string name and
 | `model`          | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
 | `name`           | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                        |
 | _keyword-only_   |                                                                                                                            |
-| `labels`         | The labels to use. ~~Iterable[str]~~                                                                                       |
 | `threshold`      | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                             |
 | `positive_label` | The positive label for a binary task with exclusive classes, None otherwise. ~~Optional[str]~~                             |
 
@@ -161,12 +158,12 @@ This method was previously called `begin_training`.
 > path = "corpus/labels/textcat.json
 > ```
 
-| Name           | Description                                                                                                                                                                                                                                                                                                         |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                               |
-| _keyword-only_ |                                                                                                                                                                                                                                                                                                                     |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                |
-| `labels`       | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ |
+| Name           | Description                                                                                                                                                                                                                                                                                                                                                                                                |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                                                                                                                      |
+| _keyword-only_ |                                                                                                                                                                                                                                                                                                                                                                                                            |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                                       |
+| `labels`       | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
 
 ## TextCategorizer.predict {#predict tag="method"}
 
@@ -425,6 +422,24 @@ The labels currently added to the component.
 | ----------- | ------------------------------------------------------ |
 | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
 
+## TextCategorizer.label_data {#label_data tag="property" new="3"}
+
+The labels currently added to the component and their internal meta information.
+This is the data generated by [`init labels`](/api/cli#init-labels) and used by
+[`TextCategorizer.initialize`](/api/textcategorizer#initialize) to initialize
+the model with a pre-defined label set.
+
+> #### Example
+>
+> ```python
+> labels = textcat.label_data
+> textcat.initialize(lambda: [], nlp=nlp, labels=labels)
+> ```
+
+| Name        | Description                                                |
+| ----------- | ---------------------------------------------------------- |
+| **RETURNS** | The label data added to the component. ~~Tuple[str, ...]~~ |
+
 ## Serialization fields {#serialization-fields}
 
 During serialization, spaCy will export several data fields used to restore
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 74d2f6de5..6317479bc 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -692,14 +692,14 @@ for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
 [`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a
 **dictionary** with the following keys:
 
-| Key            | Value                                                                                          |
-| -------------- | ---------------------------------------------------------------------------------------------- |
-| `epoch`        | How many passes over the data have been completed. ~~int~~                                     |
-| `step`         | How many steps have been completed. ~~int~~                                                    |
-| `score`        | The main score from the last evaluation, measured on the dev set. ~~float~~                    |
-| `other_scores` | The other scores from the last evaluation, measured on the dev set. ~~Dict[str, Any]~~         |
-| `losses`       | The accumulated training losses, keyed by component name. ~~Dict[str, float]~~                 |
-| `checkpoints`  | A list of previous results, where each result is a (score, step, epoch) tuple. ~~List[Tuple]~~ |
+| Key            | Value                                                                                                 |
+| -------------- | ----------------------------------------------------------------------------------------------------- |
+| `epoch`        | How many passes over the data have been completed. ~~int~~                                            |
+| `step`         | How many steps have been completed. ~~int~~                                                           |
+| `score`        | The main score from the last evaluation, measured on the dev set. ~~float~~                           |
+| `other_scores` | The other scores from the last evaluation, measured on the dev set. ~~Dict[str, Any]~~                |
+| `losses`       | The accumulated training losses, keyed by component name. ~~Dict[str, float]~~                        |
+| `checkpoints`  | A list of previous results, where each result is a `(score, step)` tuple. ~~List[Tuple[float, int]]~~ |
 
 You can easily implement and plug in your own logger that records the training
 results in a custom way, or sends them to an experiment management tracker of
@@ -819,7 +819,84 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
 
 ### Customizing the initialization {#initialization}
 
-<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
+When you start training a new model from scratch,
+[`spacy train`](/api/cli#train) will call
+[`nlp.initialize`](/api/language#initialize) to initialize the pipeline for
+training. This process typically includes the following:
+
+> #### config.cfg (excerpt)
+>
+> ```ini
+> [initialize]
+> vectors = ${paths.vectors}
+> init_tok2vec = ${paths.init_tok2vec}
+>
+> [initialize.components]
+> # Settings for components
+> ```
+
+1. Load in **data resources** defined in the `[initialize]` config, including
+   **word vectors** and
+   [pretrained](/usage/embeddings-transformers/#pretraining) **tok2vec
+   weights**.
+2. Call the `initialize` methods of the tokenizer (if implemented, e.g. for
+   [Chinese](/usage/models#chinese)) and pipeline components with a callback to
+   access the training data, the current `nlp` object and any **custom
+   arguments** defined in the `[initialize]` config.
+3. In **pipeline components**: if needed, use the data to
+   [infer missing shapes](/usage/layers-architectures#thinc-shape-inference) and
+   set up the label scheme if no labels are provided. Components may also load
+   other data like lookup tables or dictionaries.
+
+The initialization step allows the config to define **all settings** required
+for the pipeline, while keeping a separation between settings and functions that
+should only be used **before training** to set up the initial pipeline, and
+logic and configuration that needs to be available **at runtime**. Without that
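+separation, settings that are only needed to set up training (such as paths to external data files) would also have to be available at runtime.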
+
+![Illustration of pipeline lifecycle](../images/lifecycle.svg)
+
+#### Initializing labels {#initialization-labels}
+
+Built-in pipeline components like the
+[`EntityRecognizer`](/api/entityrecognizer) or
+[`DependencyParser`](/api/dependencyparser) need to know their available labels
+and associated internal meta information to initialize their model weights.
+Using the `get_examples` callback provided on initialization, they're able to
+**read the labels off the training data** automatically, which is very
+convenient – but it can also slow down the training process to compute this
+information on every run.
+
+The [`init labels`](/api/cli#init-labels) command lets you auto-generate JSON
+files containing the label data for all supported components. You can then pass
+in the labels in the `[initialize]` settings for the respective components to
+allow them to initialize faster.
+
+> #### config.cfg
+>
+> ```ini
+> [initialize.components.ner]
+>
+> [initialize.components.ner.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "corpus/labels/ner.json"
+> ```
+
+```cli
+$ python -m spacy init labels config.cfg ./corpus --paths.train ./corpus/train.spacy
+```
+
+Under the hood, the command delegates to the `label_data` property of the
+pipeline components, for instance
+[`EntityRecognizer.label_data`](/api/entityrecognizer#label_data).
+
+<Infobox variant="warning" title="Important note">
+
+The JSON format differs for each component and some components need additional
+meta information about their labels. The format exported by
+[`init labels`](/api/cli#init-labels) matches what the components need, so you
+should always let spaCy **auto-generate the labels** for you.
+
 </Infobox>
 
 ## Data utilities {#data}
@@ -1298,8 +1375,8 @@ of being dropped.
 
 > - [`nlp`](/api/language): The `nlp` object with the pipeline components and
 >   their models.
-> - [`nlp.initialize`](/api/language#initialize): Start the training and return
->   an optimizer to update the component model weights.
+> - [`nlp.initialize`](/api/language#initialize): Initialize the pipeline and
+>   return an optimizer to update the component model weights.
 > - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds
 >   state between updates.
 > - [`nlp.update`](/api/language#update): Update component models with examples.

From 989a96308f2d8333718279021d8f42d994404e60 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 16:31:58 +0200
Subject: [PATCH 356/516] Tidy up, auto-format, types

---
 spacy/training/loggers.py | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index be2da4bd8..e8c948f54 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -1,5 +1,5 @@
 from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO
-import wasabi
+from wasabi import Printer
 import tqdm
 import sys
 
@@ -7,15 +7,16 @@ from ..util import registry
 from .. import util
 from ..errors import Errors
 
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401
+
 
 @registry.loggers("spacy.ConsoleLogger.v1")
-def console_logger(progress_bar: bool=False):
+def console_logger(progress_bar: bool = False):
     def setup_printer(
-        nlp: "Language",
-        stdout: IO=sys.stdout,
-        stderr: IO=sys.stderr
-    ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable]:
-        msg = wasabi.Printer(no_print=True)
+        nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
+    ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
+        msg = Printer(no_print=True)
         # we assume here that only components are enabled that should be trained & logged
         logged_pipes = nlp.pipe_names
         eval_frequency = nlp.config["training"]["eval_frequency"]
@@ -32,14 +33,14 @@ def console_logger(progress_bar: bool=False):
         stdout.write(msg.row(["-" * width for width in table_widths]))
         progress = None
 
-        def log_step(info: Optional[Dict[str, Any]]):
+        def log_step(info: Optional[Dict[str, Any]]) -> None:
             nonlocal progress
 
             if info is None:
                 # If we don't have a new checkpoint, just return.
                 if progress is not None:
                     progress.update(1)
-                return 
+                return
             try:
                 losses = [
                     "{0:.2f}".format(float(info["losses"][pipe_name]))
@@ -78,14 +79,11 @@ def console_logger(progress_bar: bool=False):
             if progress_bar:
                 # Set disable=None, so that it disables on non-TTY
                 progress = tqdm.tqdm(
-                    total=eval_frequency,
-                    disable=None,
-                    leave=False,
-                    file=stderr
+                    total=eval_frequency, disable=None, leave=False, file=stderr
                 )
                 progress.set_description(f"Epoch {info['epoch']+1}")
 
-        def finalize():
+        def finalize() -> None:
             pass
 
         return log_step, finalize
@@ -100,10 +98,8 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []):
     console = console_logger(progress_bar=False)
 
     def setup_logger(
-        nlp: "Language",
-        stdout: IO=sys.stdout,
-        stderr: IO=sys.stderr
-    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
+        nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
+    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
         config = nlp.config.interpolate()
         config_dot = util.dict_to_dot(config)
         for field in remove_config_values:
@@ -124,7 +120,7 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []):
                 if isinstance(other_scores, dict):
                     wandb.log(other_scores)
 
-        def finalize():
+        def finalize() -> None:
             console_finalize()
             wandb.join()
 

From dd542ec6a4d3784f20f44c726893a4a80c67baac Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 17:07:38 +0200
Subject: [PATCH 357/516] Fix label initialization of textcat component (#6190)

---
 spacy/errors.py                               |  5 +-
 spacy/pipeline/senter.pyx                     |  4 --
 spacy/pipeline/textcat.py                     | 52 +++++--------------
 spacy/tests/pipeline/test_textcat.py          | 50 +++++++++---------
 .../serialize/test_serialize_pipeline.py      |  8 +--
 spacy/training/initialize.py                  | 30 -----------
 website/docs/api/textcategorizer.md           | 40 +++++++-------
 7 files changed, 64 insertions(+), 125 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index dbb25479d..119b88369 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -497,8 +497,9 @@ class Errors:
     E919 = ("A textcat 'positive_label' '{pos_label}' was provided for training "
             "data that does not appear to be a binary classification problem "
             "with two labels. Labels found: {labels}")
-    E920 = ("The textcat's 'positive_label' config setting '{pos_label}' "
-            "does not match any label in the training data. Labels found: {labels}")
+    E920 = ("The textcat's 'positive_label' setting '{pos_label}' "
+            "does not match any label in the training data or provided during "
+            "initialization. Available labels: {labels}")
     E921 = ("The method 'set_output' can only be called on components that have "
             "a Model with a 'resize_output' attribute. Otherwise, the output "
             "layer can not be dynamically changed.")
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 65c17c771..ec635de5c 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -71,10 +71,6 @@ class SentenceRecognizer(Tagger):
         # are 0
         return tuple(["I", "S"])
 
-    @property
-    def label_data(self):
-        return self.labels
-
     def set_annotations(self, docs, batch_tag_ids):
         """Modify a batch of documents, using pre-computed scores.
 
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index a092d960f..989c65b8f 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -56,12 +56,7 @@ subword_features = true
 @Language.factory(
     "textcat",
     assigns=["doc.cats"],
-    default_config={
-        "labels": [],
-        "threshold": 0.5,
-        "positive_label": None,
-        "model": DEFAULT_TEXTCAT_MODEL,
-    },
+    default_config={"threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL},
     default_score_weights={
         "cats_score": 1.0,
         "cats_score_desc": None,
@@ -75,12 +70,7 @@ subword_features = true
     },
 )
 def make_textcat(
-    nlp: Language,
-    name: str,
-    model: Model[List[Doc], List[Floats2d]],
-    labels: List[str],
-    threshold: float,
-    positive_label: Optional[str],
+    nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float,
 ) -> "TextCategorizer":
     """Create a TextCategorizer compoment. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels can
@@ -90,19 +80,9 @@ def make_textcat(
 
     model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
         scores for each category.
-    labels (list): A list of categories to learn. If empty, the model infers the
-        categories from the data.
     threshold (float): Cutoff to consider a prediction "positive".
-    positive_label (Optional[str]): The positive label for a binary task with exclusive classes, None otherwise.
     """
-    return TextCategorizer(
-        nlp.vocab,
-        model,
-        name,
-        labels=labels,
-        threshold=threshold,
-        positive_label=positive_label,
-    )
+    return TextCategorizer(nlp.vocab, model, name, threshold=threshold)
 
 
 class TextCategorizer(Pipe):
@@ -112,14 +92,7 @@ class TextCategorizer(Pipe):
     """
 
     def __init__(
-        self,
-        vocab: Vocab,
-        model: Model,
-        name: str = "textcat",
-        *,
-        labels: List[str],
-        threshold: float,
-        positive_label: Optional[str],
+        self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float
     ) -> None:
         """Initialize a text categorizer.
 
@@ -127,9 +100,7 @@ class TextCategorizer(Pipe):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        labels (List[str]): The labels to use.
         threshold (float): Cutoff to consider a prediction "positive".
-        positive_label (Optional[str]): The positive label for a binary task with exclusive classes, None otherwise.
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#init
         """
@@ -137,11 +108,7 @@ class TextCategorizer(Pipe):
         self.model = model
         self.name = name
         self._rehearsal_model = None
-        cfg = {
-            "labels": labels,
-            "threshold": threshold,
-            "positive_label": positive_label,
-        }
+        cfg = {"labels": [], "threshold": threshold, "positive_label": None}
         self.cfg = dict(cfg)
 
     @property
@@ -348,6 +315,7 @@ class TextCategorizer(Pipe):
         *,
         nlp: Optional[Language] = None,
         labels: Optional[Dict] = None,
+        positive_label: Optional[str] = None,
     ):
         """Initialize the pipe for training, using a representative set
         of data examples.
@@ -369,6 +337,14 @@ class TextCategorizer(Pipe):
         else:
             for label in labels:
                 self.add_label(label)
+        if positive_label is not None:
+            if positive_label not in self.labels:
+                err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
+                raise ValueError(err)
+            if len(self.labels) != 2:
+                err = Errors.E919.format(pos_label=positive_label, labels=self.labels)
+                raise ValueError(err)
+        self.cfg["positive_label"] = positive_label
         subbatch = list(islice(get_examples(), 10))
         doc_sample = [eg.reference for eg in subbatch]
         label_sample, _ = self._examples_to_truth(subbatch)
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index e0a785851..dd0159927 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -10,7 +10,6 @@ from spacy.tokens import Doc
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy.scorer import Scorer
 from spacy.training import Example
-from spacy.training.initialize import verify_textcat_config
 
 from ..util import make_tempdir
 
@@ -21,6 +20,17 @@ TRAIN_DATA = [
 ]
 
 
+def make_get_examples(nlp):
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    def get_examples():
+        return train_examples
+
+    return get_examples
+
+
 @pytest.mark.skip(reason="Test is flakey when run with others")
 def test_simple_train():
     nlp = Language()
@@ -92,10 +102,7 @@ def test_no_label():
 def test_implicit_label():
     nlp = Language()
     nlp.add_pipe("textcat")
-    train_examples = []
-    for t in TRAIN_DATA:
-        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    nlp.initialize(get_examples=lambda: train_examples)
+    nlp.initialize(get_examples=make_get_examples(nlp))
 
 
 def test_no_resize():
@@ -113,29 +120,26 @@ def test_no_resize():
 def test_initialize_examples():
     nlp = Language()
     textcat = nlp.add_pipe("textcat")
-    train_examples = []
     for text, annotations in TRAIN_DATA:
-        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
         for label, value in annotations.get("cats").items():
             textcat.add_label(label)
     # you shouldn't really call this more than once, but for testing it should be fine
     nlp.initialize()
-    nlp.initialize(get_examples=lambda: train_examples)
+    get_examples = make_get_examples(nlp)
+    nlp.initialize(get_examples=get_examples)
     with pytest.raises(ValueError):
         nlp.initialize(get_examples=lambda: None)
     with pytest.raises(ValueError):
-        nlp.initialize(get_examples=train_examples)
+        nlp.initialize(get_examples=get_examples())
 
 
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
     fix_random_seed(0)
     nlp = English()
+    nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
     # Set exclusive labels
-    textcat = nlp.add_pipe(
-        "textcat",
-        config={"model": {"exclusive_classes": True}, "positive_label": "POSITIVE"},
-    )
+    textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}},)
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -203,28 +207,26 @@ def test_textcat_configs(textcat_config):
 
 def test_positive_class():
     nlp = English()
-    pipe_config = {"positive_label": "POS", "labels": ["POS", "NEG"]}
-    textcat = nlp.add_pipe("textcat", config=pipe_config)
+    textcat = nlp.add_pipe("textcat")
+    get_examples = make_get_examples(nlp)
+    textcat.initialize(get_examples, labels=["POS", "NEG"], positive_label="POS")
     assert textcat.labels == ("POS", "NEG")
-    verify_textcat_config(nlp, pipe_config)
 
 
 def test_positive_class_not_present():
     nlp = English()
-    pipe_config = {"positive_label": "POS", "labels": ["SOME", "THING"]}
-    textcat = nlp.add_pipe("textcat", config=pipe_config)
-    assert textcat.labels == ("SOME", "THING")
+    textcat = nlp.add_pipe("textcat")
+    get_examples = make_get_examples(nlp)
     with pytest.raises(ValueError):
-        verify_textcat_config(nlp, pipe_config)
+        textcat.initialize(get_examples, labels=["SOME", "THING"], positive_label="POS")
 
 
 def test_positive_class_not_binary():
     nlp = English()
-    pipe_config = {"positive_label": "POS", "labels": ["SOME", "THING", "POS"]}
-    textcat = nlp.add_pipe("textcat", config=pipe_config)
-    assert textcat.labels == ("SOME", "THING", "POS")
+    textcat = nlp.add_pipe("textcat")
+    get_examples = make_get_examples(nlp)
     with pytest.raises(ValueError):
-        verify_textcat_config(nlp, pipe_config)
+        textcat.initialize(get_examples, labels=["SOME", "THING", "POS"], positive_label="POS")
 
 
 def test_textcat_evaluation():
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 1c605fea8..f90531dbb 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -136,13 +136,7 @@ def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     cfg = {"model": DEFAULT_TEXTCAT_MODEL}
     model = registry.resolve(cfg, validate=True)["model"]
-    textcat = TextCategorizer(
-        en_vocab,
-        model,
-        labels=["ENTITY", "ACTION", "MODIFIER"],
-        threshold=0.5,
-        positive_label=None,
-    )
+    textcat = TextCategorizer(en_vocab, model, threshold=0.5)
     textcat.to_bytes(exclude=["vocab"])
 
 
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 7cb1555d7..bbdf4f62b 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -50,9 +50,6 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
         logger.info("Initialized pipeline components")
-    # Verify the config after calling 'initialize' to ensure labels
-    # are properly initialized
-    verify_config(nlp)
     return nlp
 
 
@@ -152,33 +149,6 @@ def init_tok2vec(
     return False
 
 
-def verify_config(nlp: "Language") -> None:
-    """Perform additional checks based on the config, loaded nlp object and training data."""
-    # TODO: maybe we should validate based on the actual components, the list
-    # in config["nlp"]["pipeline"] instead?
-    for pipe_config in nlp.config["components"].values():
-        # We can't assume that the component name == the factory
-        factory = pipe_config["factory"]
-        if factory == "textcat":
-            verify_textcat_config(nlp, pipe_config)
-
-
-def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None:
-    # if 'positive_label' is provided: double check whether it's in the data and
-    # the task is binary
-    if pipe_config.get("positive_label"):
-        textcat_labels = nlp.get_pipe("textcat").labels
-        pos_label = pipe_config.get("positive_label")
-        if pos_label not in textcat_labels:
-            raise ValueError(
-                Errors.E920.format(pos_label=pos_label, labels=textcat_labels)
-            )
-        if len(list(textcat_labels)) != 2:
-            raise ValueError(
-                Errors.E919.format(pos_label=pos_label, labels=textcat_labels)
-            )
-
-
 def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]:
     """RETURNS (List[str]): All sourced components in the original config,
     e.g. {"source": "en_core_web_sm"}. If the config contains a key
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index 0901a6fa9..447765e15 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -35,11 +35,10 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("textcat", config=config)
 > ```
 
-| Setting          | Description                                                                                                                                                      |
-| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `threshold`      | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                                                   |
-| `positive_label` | The positive label for a binary task with exclusive classes, None otherwise and by default. ~~Optional[str]~~                                                    |
-| `model`          | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
+| Setting     | Description                                                                                                                                                      |
+| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                                                   |
+| `model`     | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/textcat.py
@@ -59,21 +58,20 @@ architectures and their arguments and hyperparameters.
 >
 > # Construction from class
 > from spacy.pipeline import TextCategorizer
-> textcat = TextCategorizer(nlp.vocab, model, threshold=0.5, positive_label="POS")
+> textcat = TextCategorizer(nlp.vocab, model, threshold=0.5)
 > ```
 
 Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#create_pipe).
 
-| Name             | Description                                                                                                                |
-| ---------------- | -------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`          | The shared vocabulary. ~~Vocab~~                                                                                           |
-| `model`          | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
-| `name`           | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                        |
-| _keyword-only_   |                                                                                                                            |
-| `threshold`      | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                             |
-| `positive_label` | The positive label for a binary task with exclusive classes, None otherwise. ~~Optional[str]~~                             |
+| Name           | Description                                                                                                                |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`        | The shared vocabulary. ~~Vocab~~                                                                                           |
+| `model`        | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
+| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                        |
+| _keyword-only_ |                                                                                                                            |
+| `threshold`    | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                             |
 
 ## TextCategorizer.\_\_call\_\_ {#call tag="method"}
 
@@ -152,18 +150,20 @@ This method was previously called `begin_training`.
 > ```ini
 > ### config.cfg
 > [initialize.components.textcat]
+> positive_label = "POS"
 >
 > [initialize.components.textcat.labels]
 > @readers = "spacy.read_labels.v1"
 > path = "corpus/labels/textcat.json
 > ```
 
-| Name           | Description                                                                                                                                                                                                                                                                                                                                                                                                |
-| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                                                                                                                      |
-| _keyword-only_ |                                                                                                                                                                                                                                                                                                                                                                                                            |
-| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                                       |
-| `labels`       | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
+| Name             | Description                                                                                                                                                                                                                                                                                                                                                                                                |
+| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples`   | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                                                                                                                      |
+| _keyword-only_   |                                                                                                                                                                                                                                                                                                                                                                                                            |
+| `nlp`            | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                                       |
+| `labels`         | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
+| `positive_label` | The positive label for a binary task with exclusive classes, None otherwise and by default. ~~Optional[str]~~                                                                                                                                                                                                                                                                                              |
 
 ## TextCategorizer.predict {#predict tag="method"}
 

From 7c4ab7e82c5eba0133dee880f5e79d86ec083b13 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 17:16:10 +0200
Subject: [PATCH 358/516] Fix Lemmatizer.get_lookups_config

---
 spacy/lang/fr/lemmatizer.py          | 15 ++++-----------
 spacy/lang/nl/lemmatizer.py          | 14 ++++----------
 spacy/lang/pl/lemmatizer.py          | 25 ++++++++++---------------
 spacy/tests/lang/test_lemmatizers.py | 11 +++++++++--
 website/docs/api/lemmatizer.md       | 21 ++++-----------------
 5 files changed, 31 insertions(+), 55 deletions(-)

diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py
index 0dd782cc4..bb5a270ab 100644
--- a/spacy/lang/fr/lemmatizer.py
+++ b/spacy/lang/fr/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict
+from typing import List, Tuple
 
 from ...pipeline import Lemmatizer
 from ...tokens import Token
@@ -15,17 +15,10 @@ class FrenchLemmatizer(Lemmatizer):
     """
 
     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         if mode == "rule":
-            return {
-                "required_tables": [
-                    "lemma_lookup",
-                    "lemma_rules",
-                    "lemma_exc",
-                    "lemma_index",
-                ],
-                "optional_tables": [],
-            }
+            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+            return (required, [])
         else:
             return super().get_lookups_config(mode)
 
diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py
index 42b97a862..6c025dcf6 100644
--- a/spacy/lang/nl/lemmatizer.py
+++ b/spacy/lang/nl/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict
+from typing import List, Tuple
 
 from ...pipeline import Lemmatizer
 from ...tokens import Token
@@ -6,16 +6,10 @@ from ...tokens import Token
 
 class DutchLemmatizer(Lemmatizer):
     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         if mode == "rule":
-            return {
-                "required_tables": [
-                    "lemma_lookup",
-                    "lemma_rules",
-                    "lemma_exc",
-                    "lemma_index",
-                ],
-            }
+            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+            return (required, [])
         else:
             return super().get_lookups_config(mode)
 
diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py
index 406ef9e4a..059d0609a 100644
--- a/spacy/lang/pl/lemmatizer.py
+++ b/spacy/lang/pl/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict
+from typing import List, Dict, Tuple
 
 from ...pipeline import Lemmatizer
 from ...tokens import Token
@@ -11,21 +11,16 @@ class PolishLemmatizer(Lemmatizer):
     # lemmatization, as well as case-sensitive lemmatization for nouns.
 
     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         if mode == "pos_lookup":
-            return {
-                "required_tables": [
-                    "lemma_lookup_adj",
-                    "lemma_lookup_adp",
-                    "lemma_lookup_adv",
-                    "lemma_lookup_aux",
-                    "lemma_lookup_noun",
-                    "lemma_lookup_num",
-                    "lemma_lookup_part",
-                    "lemma_lookup_pron",
-                    "lemma_lookup_verb",
-                ]
-            }
+            # fmt: off
+            required = [
+                "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
+                "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
+                "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"
+            ]
+            # fmt: on
+            return (required, [])
         else:
             return super().get_lookups_config(mode)
 
diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py
index 5f45664eb..a49d70d6b 100644
--- a/spacy/tests/lang/test_lemmatizers.py
+++ b/spacy/tests/lang/test_lemmatizers.py
@@ -23,8 +23,9 @@ def test_lemmatizer_initialize(lang, capfd):
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups
 
+    lang_cls = get_lang_class(lang)
     # Test that languages can be initialized
-    nlp = get_lang_class(lang)()
+    nlp = lang_cls()
     lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     assert not lemmatizer.lookups.tables
     nlp.config["initialize"]["components"]["lemmatizer"] = {
@@ -41,7 +42,13 @@ def test_lemmatizer_initialize(lang, capfd):
     assert doc[0].lemma_ == "y"
 
     # Test initialization by calling .initialize() directly
-    nlp = get_lang_class(lang)()
+    nlp = lang_cls()
     lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     lemmatizer.initialize(lookups=lemmatizer_init_lookups())
     assert nlp("x")[0].lemma_ == "y"
+
+    # Test lookups config format
+    for mode in ("rule", "lookup", "pos_lookup"):
+        required, optional = lemmatizer.get_lookups_config(mode)
+        assert isinstance(required, list)
+        assert isinstance(optional, list)
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index 27ea04432..e838c75b2 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -190,23 +190,10 @@ lemmatization entirely.
 Returns the lookups configuration settings for a given mode for use in
 [`Lemmatizer.load_lookups`](/api/lemmatizer#load_lookups).
 
-| Name        | Description                                                                                                                                                                       |
-| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `mode`      | The lemmatizer mode. ~~str~~                                                                                                                                                      |
-| **RETURNS** | The lookups configuration settings for this mode. Includes the keys `"required_tables"` and `"optional_tables"`, mapped to a list of table string names. ~~Dict[str, List[str]]~~ |
-
-## Lemmatizer.load_lookups {#load_lookups tag="classmethod"}
-
-Load and validate lookups tables. If the provided lookups is `None`, load the
-default lookups tables according to the language and mode settings. Confirm that
-all required tables for the language and mode are present.
-
-| Name        | Description                                                                                        |
-| ----------- | -------------------------------------------------------------------------------------------------- |
-| `lang`      | The language. ~~str~~                                                                              |
-| `mode`      | The lemmatizer mode. ~~str~~                                                                       |
-| `lookups`   | The provided lookups, may be `None` if the default lookups should be loaded. ~~Optional[Lookups]~~ |
-| **RETURNS** | The lookups. ~~Lookups~~                                                                           |
+| Name        | Description                                                                            |
+| ----------- | -------------------------------------------------------------------------------------- |
+| `mode`      | The lemmatizer mode. ~~str~~                                                           |
+| **RETURNS** | The required table names and the optional table names. ~~Tuple[List[str], List[str]]~~ |
 
 ## Lemmatizer.to_disk {#to_disk tag="method"}
 

From 3bc3c05fcc7e4fa40f6a3e43681444b3c36b653e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 17:20:18 +0200
Subject: [PATCH 359/516] Tidy up and auto-format

---
 spacy/cli/debug_data.py                     |  2 +-
 spacy/lang/pl/__init__.py                   |  1 -
 spacy/lang/zh/__init__.py                   |  2 +-
 spacy/ml/models/tok2vec.py                  |  8 ++++++--
 spacy/pipeline/textcat.py                   |  2 +-
 spacy/tests/conftest.py                     |  3 ++-
 spacy/tests/doc/test_morphanalysis.py       | 14 ++++++++------
 spacy/tests/lang/zh/test_serialize.py       |  3 ++-
 spacy/tests/pipeline/test_textcat.py        |  7 +++++--
 spacy/tests/serialize/test_serialize_doc.py |  8 +++++++-
 spacy/tests/test_scorer.py                  |  6 ++----
 spacy/training/augment.py                   |  2 +-
 spacy/training/loop.py                      |  6 +++---
 spacy/vocab.pyx                             |  2 +-
 14 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 3dc8d262d..ead759e33 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -171,7 +171,7 @@ def debug_data(
         n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
         msg.warn(
             "{} words in training data without vectors ({:0.2f}%)".format(
-                n_missing_vectors, n_missing_vectors / gold_train_data["n_words"],
+                n_missing_vectors, n_missing_vectors / gold_train_data["n_words"]
             ),
         )
         msg.text(
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index f7be8a6c2..9e7303e83 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -8,7 +8,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import PolishLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...lookups import Lookups
 from ...language import Language
 
 
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 858f41f65..55a77330a 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -47,7 +47,7 @@ class Segmenter(str, Enum):
 
 
 @registry.tokenizers("spacy.zh.ChineseTokenizer")
-def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,):
+def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
     def chinese_tokenizer_factory(nlp):
         return ChineseTokenizer(nlp, segmenter=segmenter)
 
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 2870de1b9..f9a906397 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -165,8 +165,12 @@ def MultiHashEmbed(
 
 @registry.architectures.register("spacy.CharacterEmbed.v1")
 def CharacterEmbed(
-    width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool,
-    feature: Union[int, str]="LOWER"
+    width: int,
+    rows: int,
+    nM: int,
+    nC: int,
+    also_use_static_vectors: bool,
+    feature: Union[int, str] = "LOWER",
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 989c65b8f..fc60ebf89 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -70,7 +70,7 @@ subword_features = true
     },
 )
 def make_textcat(
-    nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float,
+    nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
 ) -> "TextCategorizer":
     """Create a TextCategorizer compoment. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels can
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index bcf582388..4a3d126d7 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -294,7 +294,8 @@ def zh_tokenizer_pkuseg():
                 "segmenter": "pkuseg",
             }
         },
-        "initialize": {"tokenizer": {
+        "initialize": {
+            "tokenizer": {
                 "pkuseg_model": "default",
             }
         },
diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py
index b44b13d4c..918d4acdc 100644
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@@ -5,12 +5,14 @@ import pytest
 def i_has(en_tokenizer):
     doc = en_tokenizer("I has")
     doc[0].set_morph({"PronType": "prs"})
-    doc[1].set_morph({
-        "VerbForm": "fin",
-        "Tense": "pres",
-        "Number": "sing",
-        "Person": "three",
-    })
+    doc[1].set_morph(
+        {
+            "VerbForm": "fin",
+            "Tense": "pres",
+            "Number": "sing",
+            "Person": "three",
+        }
+    )
 
     return doc
 
diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py
index 58c084ec8..03cdbbe24 100644
--- a/spacy/tests/lang/zh/test_serialize.py
+++ b/spacy/tests/lang/zh/test_serialize.py
@@ -34,7 +34,8 @@ def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
                 "segmenter": "pkuseg",
             }
         },
-        "initialize": {"tokenizer": {
+        "initialize": {
+            "tokenizer": {
                 "pkuseg_model": "medicine",
             }
         },
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index dd0159927..e950c81c6 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -139,7 +139,8 @@ def test_overfitting_IO():
     nlp = English()
     nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"}
     # Set exclusive labels
-    textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}},)
+    config = {"model": {"exclusive_classes": True}}
+    textcat = nlp.add_pipe("textcat", config=config)
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -226,7 +227,9 @@ def test_positive_class_not_binary():
     textcat = nlp.add_pipe("textcat")
     get_examples = make_get_examples(nlp)
     with pytest.raises(ValueError):
-        textcat.initialize(get_examples, labels=["SOME", "THING", "POS"], positive_label="POS")
+        textcat.initialize(
+            get_examples, labels=["SOME", "THING", "POS"], positive_label="POS"
+        )
 
 
 def test_textcat_evaluation():
diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py
index 8b6adb83b..00b9d12d4 100644
--- a/spacy/tests/serialize/test_serialize_doc.py
+++ b/spacy/tests/serialize/test_serialize_doc.py
@@ -92,7 +92,13 @@ def test_serialize_doc_bin_unknown_spaces(en_vocab):
 
 
 @pytest.mark.parametrize(
-    "writer_flag,reader_flag,reader_value", [(True, True, "bar"), (True, False, "bar"), (False, True, "nothing"), (False, False, "nothing")]
+    "writer_flag,reader_flag,reader_value",
+    [
+        (True, True, "bar"),
+        (True, False, "bar"),
+        (False, True, "nothing"),
+        (False, False, "nothing"),
+    ],
 )
 def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value):
     """Test that custom extensions are correctly serialized in DocBin."""
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 039f3d4d8..4c1b09849 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -158,7 +158,7 @@ def test_las_per_type(en_vocab):
     examples = []
     for input_, annot in test_las_apple:
         doc = Doc(
-            en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"],
+            en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"]
         )
         gold = {"heads": annot["heads"], "deps": annot["deps"]}
         doc[0].dep_ = "compound"
@@ -182,9 +182,7 @@ def test_ner_per_type(en_vocab):
     examples = []
     for input_, annot in test_ner_cardinal:
         doc = Doc(
-            en_vocab,
-            words=input_.split(" "),
-            ents=["B-CARDINAL", "O", "B-CARDINAL"],
+            en_vocab, words=input_.split(" "), ents=["B-CARDINAL", "O", "B-CARDINAL"]
         )
         entities = offsets_to_biluo_tags(doc, annot["entities"])
         example = Example.from_dict(doc, {"entities": entities})
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 8965c5457..7415ad335 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -30,7 +30,7 @@ class OrthVariants(BaseModel):
 
 @registry.augmenters("spacy.orth_variants.v1")
 def create_orth_variants_augmenter(
-    level: float, lower: float, orth_variants: OrthVariants,
+    level: float, lower: float, orth_variants: OrthVariants
 ) -> Callable[["Language", Example], Iterator[Example]]:
     """Create a data augmentation callback that uses orth-variant replacement.
     The callback can be added to a corpus or other data iterator during training.
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 093a9ebb3..fbfc5930f 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -21,8 +21,8 @@ def train(
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,
-    stdout: IO=sys.stdout,
-    stderr: IO=sys.stderr
+    stdout: IO = sys.stdout,
+    stderr: IO = sys.stderr,
 ) -> None:
     """Train a pipeline.
 
@@ -34,7 +34,7 @@ def train(
         printing, set to io.StringIO.
     stderr (file): A second file-like object to write output messages. To disable
         printing, set to io.StringIO.
- 
+
     RETURNS (Path / None): The path to the final exported model.
     """
     # We use no_print here so we can respect the stdout/stderr options.
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ce104d9db..a22f12c65 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -16,7 +16,7 @@ from .errors import Errors
 from .attrs import intify_attrs, NORM, IS_STOP
 from .vectors import Vectors
 from .util import registry
-from .lookups import Lookups, load_lookups
+from .lookups import Lookups
 from . import util
 from .lang.norm_exceptions import BASE_NORMS
 from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang

From d6c967401f1a6fb78f34ec70170cecb2e498e3b8 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 17:20:47 +0200
Subject: [PATCH 360/516] Increment version

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index e61e5ab25..ba0ba1f4a 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a30"
+__version__ = "3.0.0a31"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 989c59918c7c5e1b1c61187b53ec893f7358fcb0 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 18:53:39 +0200
Subject: [PATCH 361/516] Update docs [ci skip]

---
 website/docs/api/cli.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 32d73d762..e51e698dd 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -232,7 +232,7 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
 | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                 |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                 |
 | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
-| **CREATES**       | The final trained pipeline and the best trained pipeline.                                                                                                                                  |
+| **CREATES**       | The best trained pipeline and the final checkpoint (if training is terminated).                                                                                                            |
 
 ## convert {#convert tag="command"}
 

From 80603f0fa57c9735a9b07a9af315d695cb445568 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 18:54:09 +0200
Subject: [PATCH 362/516] Make SentenceRecognizer.label_data return None

Override the method from the base class (Tagger) but don't export anything in "init labels"
---
 spacy/pipeline/senter.pyx | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index ec635de5c..231072e9c 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger):
         # are 0
         return tuple(["I", "S"])
 
+    @property
+    def label_data(self):
+        return None
+
     def set_annotations(self, docs, batch_tag_ids):
         """Modify a batch of documents, using pre-computed scores.
 

From c2401fca411559c66fa4172886a24d4d632de162 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 3 Oct 2020 19:12:46 +0200
Subject: [PATCH 363/516] Add tests for Pipe.label_data

---
 spacy/tests/pipeline/test_pipe_methods.py | 33 ++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index ea09d990c..d6d04f158 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -1,6 +1,6 @@
 import pytest
 from spacy.language import Language
-from spacy.util import SimpleFrozenList
+from spacy.util import SimpleFrozenList, get_arg_names
 
 
 @pytest.fixture
@@ -346,3 +346,34 @@ def test_pipe_methods_frozen():
         nlp.components.sort()
     with pytest.raises(NotImplementedError):
         nlp.component_names.clear()
+
+
+@pytest.mark.parametrize(
+    "pipe",
+    [
+        "tagger",
+        "parser",
+        "ner",
+        "textcat",
+        pytest.param("morphologizer", marks=pytest.mark.xfail),
+    ],
+)
+def test_pipe_label_data_exports_labels(pipe):
+    nlp = Language()
+    pipe = nlp.add_pipe(pipe)
+    # Make sure pipe has pipe labels
+    assert getattr(pipe, "label_data", None) is not None
+    # Make sure pipe can be initialized with labels
+    initialize = getattr(pipe, "initialize", None)
+    assert initialize is not None
+    assert "labels" in get_arg_names(initialize)
+
+
+@pytest.mark.parametrize("pipe", ["senter", "entity_linker"])
+def test_pipe_label_data_no_labels(pipe):
+    nlp = Language()
+    pipe = nlp.add_pipe(pipe)
+    assert getattr(pipe, "label_data", None) is None
+    initialize = getattr(pipe, "initialize", None)
+    if initialize is not None:
+        assert "labels" not in get_arg_names(initialize)

From 8ea8b7d9406244deec7b357c68d1163268fa613f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 3 Oct 2020 19:13:42 +0200
Subject: [PATCH 364/516] Support loading labels in morphologizer

---
 spacy/pipeline/morphologizer.pyx | 34 ++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index ab0554692..db6fa0a11 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -134,7 +134,7 @@ class Morphologizer(Tagger):
             self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         return 1
 
-    def initialize(self, get_examples, *, nlp=None):
+    def initialize(self, get_examples, *, nlp=None, labels=None):
         """Initialize the pipe for training, using a representative set
         of data examples.
 
@@ -145,20 +145,24 @@ class Morphologizer(Tagger):
         DOCS: https://nightly.spacy.io/api/morphologizer#initialize
         """
         self._ensure_examples(get_examples)
-        # First, fetch all labels from the data
-        for example in get_examples():
-            for i, token in enumerate(example.reference):
-                pos = token.pos_
-                morph = str(token.morph)
-                # create and add the combined morph+POS label
-                morph_dict = Morphology.feats_to_dict(morph)
-                if pos:
-                    morph_dict[self.POS_FEAT] = pos
-                norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
-                # add label->morph and label->POS mappings
-                if norm_label not in self.cfg["labels_morph"]:
-                    self.cfg["labels_morph"][norm_label] = morph
-                    self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
+        if labels is not None:
+            self.cfg["labels_morph"] = labels["labels_morph"]
+            self.cfg["labels_pos"] = labels["labels_pos"]
+        else:
+            # First, fetch all labels from the data
+            for example in get_examples():
+                for i, token in enumerate(example.reference):
+                    pos = token.pos_
+                    morph = str(token.morph)
+                    # create and add the combined morph+POS label
+                    morph_dict = Morphology.feats_to_dict(morph)
+                    if pos:
+                        morph_dict[self.POS_FEAT] = pos
+                    norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
+                    # add label->morph and label->POS mappings
+                    if norm_label not in self.cfg["labels_morph"]:
+                        self.cfg["labels_morph"][norm_label] = morph
+                        self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
         if len(self.labels) <= 1:
             raise ValueError(Errors.E143.format(name=self.name))
         doc_sample = []

From b305f2ff5a40fed855fee71b6b0cf7dca775ac28 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 3 Oct 2020 19:26:10 +0200
Subject: [PATCH 365/516] Fix loggers

---
 spacy/training/loggers.py |  8 +++++---
 spacy/training/loop.py    | 12 +++++++-----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index e8c948f54..585764214 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -29,8 +29,8 @@ def console_logger(progress_bar: bool = False):
         table_header = [col.upper() for col in table_header]
         table_widths = [3, 6] + loss_widths + score_widths + [6]
         table_aligns = ["r" for _ in table_widths]
-        stdout.write(msg.row(table_header, widths=table_widths))
-        stdout.write(msg.row(["-" * width for width in table_widths]))
+        stdout.write(msg.row(table_header, widths=table_widths) + "\n")
+        stdout.write(msg.row(["-" * width for width in table_widths]) + "\n")
         progress = None
 
         def log_step(info: Optional[Dict[str, Any]]) -> None:
@@ -75,7 +75,9 @@ def console_logger(progress_bar: bool = False):
             )
             if progress is not None:
                 progress.close()
-            stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns))
+            stdout.write(
+                msg.row(data, widths=table_widths, aligns=table_aligns) + "\n"
+            )
             if progress_bar:
                 # Set disable=None, so that it disables on non-TTY
                 progress = tqdm.tqdm(
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index fbfc5930f..2e347829a 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -69,10 +69,10 @@ def train(
         eval_frequency=T["eval_frequency"],
         exclude=frozen_components,
     )
-    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}"))
+    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
     if frozen_components:
-        stdout.write(msg.info(f"Frozen components: {frozen_components}"))
-    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}"))
+        stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
+    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
     with nlp.select_pipes(disable=frozen_components):
         log_step, finalize_logger = train_logger(nlp, stdout, stderr)
     try:
@@ -93,7 +93,7 @@ def train(
                 msg.warn(
                     f"Aborting and saving the final best model. "
                     f"Encountered exception: {str(e)}"
-                )
+                ) + "\n"
             )
         raise e
     finally:
@@ -106,7 +106,9 @@ def train(
             else:
                 nlp.to_disk(final_model_path)
     # This will only run if we don't hit an error
-    stdout.write(msg.good("Saved pipeline to output directory", final_model_path))
+    stdout.write(
+        msg.good("Saved pipeline to output directory", final_model_path) + "\n"
+    )
 
 
 def train_while_improving(

From 85ede32680686c62bae2522cd5690ca7e826a2a5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 3 Oct 2020 19:26:23 +0200
Subject: [PATCH 366/516] Format

---
 spacy/training/loggers.py | 4 +---
 spacy/training/loop.py    | 3 ++-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index 585764214..f0ca7064a 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -75,9 +75,7 @@ def console_logger(progress_bar: bool = False):
             )
             if progress is not None:
                 progress.close()
-            stdout.write(
-                msg.row(data, widths=table_widths, aligns=table_aligns) + "\n"
-            )
+            stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns) + "\n")
             if progress_bar:
                 # Set disable=None, so that it disables on non-TTY
                 progress = tqdm.tqdm(
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 2e347829a..b63adb6c9 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -93,7 +93,8 @@ def train(
                 msg.warn(
                     f"Aborting and saving the final best model. "
                     f"Encountered exception: {str(e)}"
-                ) + "\n"
+                )
+                + "\n"
             )
         raise e
     finally:

From 70b9de8e589776ba90c000addfa24dffe5915b33 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 3 Oct 2020 19:26:52 +0200
Subject: [PATCH 367/516] Set version to v3.0.0a32

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index ba0ba1f4a..037ca6bcb 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a31"
+__version__ = "3.0.0a32"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 835070cedcc427bd111edf640fd923fa0a93ace8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 3 Oct 2020 19:35:10 +0200
Subject: [PATCH 368/516] Upd test

---
 spacy/tests/pipeline/test_pipe_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index d6d04f158..0b663fcb8 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -355,7 +355,7 @@ def test_pipe_methods_frozen():
         "parser",
         "ner",
         "textcat",
-        pytest.param("morphologizer", marks=pytest.mark.xfail),
+        "morphologizer"
     ],
 )
 def test_pipe_label_data_exports_labels(pipe):

From 3b2a78720c451773a0dd049a3b7f0c18a8558da4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sat, 3 Oct 2020 19:35:19 +0200
Subject: [PATCH 369/516] Upd morphologizer

---
 spacy/pipeline/morphologizer.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index db6fa0a11..29f0d7fb4 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -146,8 +146,8 @@ class Morphologizer(Tagger):
         """
         self._ensure_examples(get_examples)
         if labels is not None:
-            self.cfg["labels_morph"] = labels["labels_morph"]
-            self.cfg["labels_pos"] = labels["labels_pos"]
+            self.cfg["labels_morph"] = labels["morph"]
+            self.cfg["labels_pos"] = labels["pos"]
         else:
             # First, fetch all labels from the data
             for example in get_examples():

From 3f657ed3a1f7844b3629de018ab3fb6351971590 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sat, 3 Oct 2020 22:34:10 +0200
Subject: [PATCH 370/516] implement warning in __init_subclass__ instead

---
 spacy/errors.py         |  6 +++---
 spacy/language.py       |  6 +-----
 spacy/pipeline/pipe.pyx | 10 +++++++++-
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 2c076db52..791e567eb 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -85,9 +85,9 @@ class Warnings:
             "attribute or operator.")
 
     # TODO: fix numbering after merging develop into master
-    W089 = ("The 'begin_training' method has been renamed to 'initialize', "
-            "for calls to 'nlp' as well as for the individual pipeline "
-            "components.")
+    W088 = ("This component implements a 'begin_training' method, "
+            "which should probably be renamed to 'initialize'.")
+    W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
     W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
diff --git a/spacy/language.py b/spacy/language.py
index 36cd251f3..14b9f4eb0 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1207,11 +1207,7 @@ class Language:
             )
             self.tokenizer.initialize(get_examples, nlp=self, **tok_settings)
         for name, proc in self.pipeline:
-            # backwards compatibility for older components
-            if hasattr(proc, "begin_training"):
-                warnings.warn(Warnings.W089, DeprecationWarning)
-                proc.begin_training(get_examples, pipeline=self.pipeline, sgd=self._optimizer)
-            elif hasattr(proc, "initialize"):
+            if hasattr(proc, "initialize"):
                 p_settings = I["components"].get(name, {})
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 5316620e9..a18f04ee3 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -1,4 +1,5 @@
 # cython: infer_types=True, profile=True
+import warnings
 from typing import Optional, Tuple
 import srsly
 from thinc.api import set_dropout_rate, Model
@@ -6,7 +7,7 @@ from thinc.api import set_dropout_rate, Model
 from ..tokens.doc cimport Doc
 
 from ..training import validate_examples
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from .. import util
 
 
@@ -33,6 +34,13 @@ cdef class Pipe:
         self.name = name
         self.cfg = dict(cfg)
 
+    @classmethod
+    def __init_subclass__(cls, **kwargs):
+        """Raise a warning if an inheriting class implements 'begin_training'
+         (from v2) instead of the new 'initialize' method (from v3)"""
+        if hasattr(cls, "begin_training"):
+            warnings.warn(Warnings.W088)
+
     @property
     def labels(self) -> Optional[Tuple[str]]:
         return []

From 2c4b2ee5e9b29442c119e9c8bb2b5bce761a78aa Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sat, 3 Oct 2020 23:27:05 +0200
Subject: [PATCH 371/516] REL intro and get_candidates function

---
 website/docs/usage/layers-architectures.md | 54 ++++++++++++++++++++++
 website/docs/usage/processing-pipelines.md |  2 +-
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index b65c3d903..678f70667 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -486,6 +486,60 @@ with Model.define_operators({">>": chain}):
 
 ## Create new trainable components {#components}
 
+In addition to [swapping out](#swap-architectures) default models in built-in
+components, you can also implement an entirely new,
+[trainable pipeline component](usage/processing-pipelines#trainable-components)
+from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), 
+and linking it up to your custom model implementation.
+
+### Example: Pipeline component for relation extraction {#component-rel}
+
+This section will run through an example of implementing a novel relation extraction 
+component from scratch. As a first step, we need a method that will generate pairs of
+entities that we want to classify as being related or not. These candidate pairs are 
+typically formed within one document, which means we'll have a function that takes a 
+`Doc` as input and outputs a `List` of `Span` tuples. In this example, we will focus 
+on binary relation extraction, i.e. the tuple will be of length 2.
+
+We register this function in the 'misc' register so we can easily refer to it from the config, 
+and allow swapping it out for any candidate 
+generation function. For instance, a very straightforward implementation would be to just 
+take any two entities from the same document:
+
+```python
+@registry.misc.register("rel_cand_generator.v1")
+def create_candidate_indices() -> Callable[[Doc], List[Tuple[Span, Span]]]:
+    def get_candidate_indices(doc: "Doc"):
+        indices = []
+        for ent1 in doc.ents:
+            for ent2 in doc.ents:
+                indices.append((ent1, ent2))
+        return indices
+    return get_candidate_indices
+```
+
+But we could also refine this further by excluding relations of an entity with itself, 
+and posing a maximum distance (in number of tokens) between two entities:
+
+```python
+### {highlight="1,2,7,8"}
+@registry.misc.register("rel_cand_generator.v2")
+def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
+    def get_candidate_indices(doc: "Doc"):
+        indices = []
+        for ent1 in doc.ents:
+            for ent2 in doc.ents:
+                if ent1 != ent2:
+                    if max_length and abs(ent2.start - ent1.start) <= max_length:
+                        indices.append((ent1, ent2))
+        return indices
+    return get_candidate_indices
+```
+
+
+
+
+
 <Infobox title="This section is still under construction" emoji="🚧" variant="warning">
 </Infobox>
 
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index c98bd08bc..3619993c5 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1035,7 +1035,7 @@ plug fully custom machine learning components into your pipeline. You'll need
 the following:
 
 1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
-   can be a model using implemented in
+   can be a model implemented in
    [Thinc](/usage/layers-architectures#thinc), or a
    [wrapped model](/usage/layers-architectures#frameworks) implemented in
    PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a

From 08ad349a1851c3310a4ae7f34170eea37c9e2e3b Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sun, 4 Oct 2020 00:08:02 +0200
Subject: [PATCH 372/516] tok2vec layer

---
 website/docs/usage/layers-architectures.md | 87 ++++++++++++++--------
 1 file changed, 58 insertions(+), 29 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 678f70667..6f79cc6e8 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -489,51 +489,80 @@ with Model.define_operators({">>": chain}):
 In addition to [swapping out](#swap-architectures) default models in built-in
 components, you can also implement an entirely new,
 [trainable pipeline component](usage/processing-pipelines#trainable-components)
-from scratch. This can be done by creating a new class inheriting from [`Pipe`](/api/pipe), 
-and linking it up to your custom model implementation.
+from scratch. This can be done by creating a new class inheriting from
+[`Pipe`](/api/pipe), and linking it up to your custom model implementation.
 
 ### Example: Pipeline component for relation extraction {#component-rel}
 
-This section will run through an example of implementing a novel relation extraction 
-component from scratch. As a first step, we need a method that will generate pairs of
-entities that we want to classify as being related or not. These candidate pairs are 
-typically formed within one document, which means we'll have a function that takes a 
-`Doc` as input and outputs a `List` of `Span` tuples. In this example, we will focus 
-on binary relation extraction, i.e. the tuple will be of length 2.
-
-We register this function in the 'misc' register so we can easily refer to it from the config, 
-and allow swapping it out for any candidate 
-generation function. For instance, a very straightforward implementation would be to just 
-take any two entities from the same document:
+This section will run through an example of implementing a novel relation
+extraction component from scratch. As a first step, we need a method that will
+generate pairs of entities that we want to classify as being related or not.
+These candidate pairs are typically formed within one document, which means
+we'll have a function that takes a `Doc` as input and outputs a `List` of `Span`
+tuples. In this example, we will focus on binary relation extraction, i.e. the
+tuple will be of length 2. For instance, a very straightforward implementation
+would be to just take any two entities from the same document:
 
 ```python
-@registry.misc.register("rel_cand_generator.v1")
-def create_candidate_indices() -> Callable[[Doc], List[Tuple[Span, Span]]]:
-    def get_candidate_indices(doc: "Doc"):
-        indices = []
-        for ent1 in doc.ents:
-            for ent2 in doc.ents:
-                indices.append((ent1, ent2))
-        return indices
-    return get_candidate_indices
+def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
+    candidates = []
+    for ent1 in doc.ents:
+        for ent2 in doc.ents:
+            candidates.append((ent1, ent2))
+    return candidates
 ```
 
-But we could also refine this further by excluding relations of an entity with itself, 
-and posing a maximum distance (in number of tokens) between two entities:
+But we could also refine this further by excluding relations of an entity with
+itself, and posing a maximum distance (in number of tokens) between two
+entities. We'll also register this function in the
+[`@misc` registry](/api/top-level#registry) so we can refer to it from the
+config, and easily swap it out for any other candidate generation function.
+
+> ```
+> [get_candidates]
+> @misc = "rel_cand_generator.v2"
+> max_length = 6
+> ```
 
 ```python
 ### {highlight="1,2,7,8"}
 @registry.misc.register("rel_cand_generator.v2")
 def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
-    def get_candidate_indices(doc: "Doc"):
-        indices = []
+    def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
+        candidates = []
         for ent1 in doc.ents:
             for ent2 in doc.ents:
                 if ent1 != ent2:
                     if max_length and abs(ent2.start - ent1.start) <= max_length:
-                        indices.append((ent1, ent2))
-        return indices
-    return get_candidate_indices
+                        candidates.append((ent1, ent2))
+        return candidates
+    return get_candidates
+```
+
+> ```
+> [tok2vec]
+> @architectures = "spacy.HashEmbedCNN.v1"
+> pretrained_vectors = null
+> width = 96
+> depth = 2
+> embed_size = 300
+> window_size = 1
+> maxout_pieces = 3
+> subword_features = true
+> ```
+
+Next, we'll assume we have access to an
+[embedding layer](/usage/embeddings-transformers) such as a
+[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
+layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
+transforms a list of documents into a list of 2D vectors. Further, this
+`tok2vec` component will be trainable, which means that, following the Thinc
+paradigm, we'll apply it to some input, and receive the predicted results as
+well as a callback to perform backpropagation:
+
+```python
+tok2vec = model.get_ref("tok2vec")
+tokvecs, bp_tokvecs = tok2vec(docs, is_train=True)
 ```
 
 

From 2110e8f86dd47686c25d0d44fff314be0cf60d42 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 4 Oct 2020 10:06:49 +0200
Subject: [PATCH 373/516] Auto-format

---
 spacy/tests/pipeline/test_pipe_methods.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 0b663fcb8..c0b9762ed 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -349,14 +349,7 @@ def test_pipe_methods_frozen():
 
 
 @pytest.mark.parametrize(
-    "pipe",
-    [
-        "tagger",
-        "parser",
-        "ner",
-        "textcat",
-        "morphologizer"
-    ],
+    "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"],
 )
 def test_pipe_label_data_exports_labels(pipe):
     nlp = Language()

From d3b3663942ebe862a83cba4ac5a3e2b0e3a6a2cc Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 4 Oct 2020 10:11:27 +0200
Subject: [PATCH 374/516] Adjust error message and add test

---
 spacy/errors.py                           |  7 +++++--
 spacy/pipeline/pipe.pyx                   |  2 +-
 spacy/tests/pipeline/test_pipe_methods.py | 12 ++++++++++++
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 878eed114..5343e7ce8 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -85,8 +85,11 @@ class Warnings:
             "attribute or operator.")
 
     # TODO: fix numbering after merging develop into master
-    W088 = ("This component implements a 'begin_training' method, "
-            "which should probably be renamed to 'initialize'.")
+    W088 = ("The pipeline component {name} implements a 'begin_training' "
+            "method, which won't be called by spaCy. As of v3.0, 'begin_training' "
+            "has been renamed to 'initialize' so you likely want to rename the "
+            "component method. See the documentation for details: "
+            "https://nightly.spacy.io/api/language#initialize")
     W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
     W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index a18f04ee3..41ca23ace 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -39,7 +39,7 @@ cdef class Pipe:
         """Raise a warning if an inheriting class implements 'begin_training'
          (from v2) instead of the new 'initialize' method (from v3)"""
         if hasattr(cls, "begin_training"):
-            warnings.warn(Warnings.W088)
+            warnings.warn(Warnings.W088.format(name=cls.__name__))
 
     @property
     def labels(self) -> Optional[Tuple[str]]:
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index c0b9762ed..e647ba440 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -1,5 +1,6 @@
 import pytest
 from spacy.language import Language
+from spacy.pipeline import Pipe
 from spacy.util import SimpleFrozenList, get_arg_names
 
 
@@ -370,3 +371,14 @@ def test_pipe_label_data_no_labels(pipe):
     initialize = getattr(pipe, "initialize", None)
     if initialize is not None:
         assert "labels" not in get_arg_names(initialize)
+
+
+def test_warning_pipe_begin_training():
+    with pytest.warns(UserWarning, match="begin_training"):
+
+        class IncompatPipe(Pipe):
+            def __init__(self):
+                ...
+
+            def begin_training(*args, **kwargs):
+                ...

From ff914f4e6feec972b5475cc102be97754cd18dd5 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 4 Oct 2020 11:10:26 +0200
Subject: [PATCH 375/516] Lazy-load xx

---
 spacy/training/converters/conll_ner_to_docs.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py
index 3b851039c..902db585b 100644
--- a/spacy/training/converters/conll_ner_to_docs.py
+++ b/spacy/training/converters/conll_ner_to_docs.py
@@ -2,9 +2,9 @@ from wasabi import Printer
 
 from .. import tags_to_entities
 from ...training import iob_to_biluo
-from ...lang.xx import MultiLanguage
 from ...tokens import Doc, Span
 from ...util import load_model
+from ...util import load_model, get_lang_class
 
 
 def conll_ner_to_docs(
@@ -86,7 +86,7 @@ def conll_ner_to_docs(
     if model:
         nlp = load_model(model)
     else:
-        nlp = MultiLanguage()
+        nlp = get_lang_class("xx")()
     output_docs = []
     for conll_doc in input_data.strip().split(doc_delimiter):
         conll_doc = conll_doc.strip()
@@ -136,7 +136,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
             "Segmenting sentences with sentencizer. (Use `-b model` for "
             "improved parser-based sentence segmentation.)"
         )
-        nlp = MultiLanguage()
+        nlp = get_lang_class("xx")()
         sentencizer = nlp.create_pipe("sentencizer")
     lines = doc.strip().split("\n")
     words = [line.strip().split()[0] for line in lines]

From bcd52e5486b5b2747a39675c45d3bc9846afbe12 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 4 Oct 2020 11:16:31 +0200
Subject: [PATCH 376/516] Tidy up errors and warnings

---
 spacy/cli/_util.py                            |   3 +-
 spacy/errors.py                               | 323 ++++++++----------
 spacy/ml/models/tok2vec.py                    |   3 +-
 spacy/ml/staticvectors.py                     |  13 +-
 .../pipeline/_parser_internals/arc_eager.pyx  |  17 +-
 spacy/pipeline/_parser_internals/ner.pyx      |  10 +-
 spacy/pipeline/morphologizer.pyx              |   2 +-
 spacy/pipeline/senter.pyx                     |   2 +-
 spacy/pipeline/tagger.pyx                     |   4 +-
 spacy/scorer.py                               |   2 +-
 spacy/tokens/doc.pyx                          |   4 +-
 spacy/tokens/span.pyx                         |   6 +-
 .../training/converters/conll_ner_to_docs.py  |   8 +-
 spacy/training/converters/iob_to_docs.py      |   5 +-
 spacy/training/pretrain.py                    |   5 +-
 15 files changed, 186 insertions(+), 221 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 69c32bbad..c959c9861 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -322,8 +322,7 @@ def git_checkout(
     if dest.exists():
         msg.fail("Destination of checkout must not exist", exits=1)
     if not dest.parent.exists():
-        raise IOError("Parent of destination of checkout must exist")
-
+        msg.fail("Parent of destination of checkout must exist", exits=1)
     if sparse and git_version >= (2, 22):
         return git_sparse_checkout(repo, subpath, dest, branch)
     elif sparse:
diff --git a/spacy/errors.py b/spacy/errors.py
index 5343e7ce8..9145a7b19 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -16,8 +16,6 @@ def add_codes(err_cls):
 
 @add_codes
 class Warnings:
-    W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing "
-            "using ftfy.fix_text if necessary.")
     W005 = ("Doc object not parsed. This means displaCy won't be able to "
             "generate a dependency visualization for it. Make sure the Doc "
             "was processed with a model that supports dependency parsing, and "
@@ -51,8 +49,6 @@ class Warnings:
     W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
     W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
             "ignoring the duplicate entry.")
-    W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
-            "loaded. (Shape: {shape})")
     W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
@@ -65,7 +61,7 @@ class Warnings:
             "be more efficient to split your training data into multiple "
             "smaller JSON files instead.")
     W028 = ("Doc.from_array was called with a vector of type '{type}', "
-            "but is expecting one of type 'uint64' instead. This may result "
+            "but is expecting one of type uint64 instead. This may result "
             "in problems with the vocab further on in the pipeline.")
     W030 = ("Some entities could not be aligned in the text \"{text}\" with "
             "entities \"{entities}\". Use "
@@ -79,18 +75,17 @@ class Warnings:
             "If this is surprising, make sure you have the spacy-lookups-data "
             "package installed. The languages with lexeme normalization tables "
             "are currently: {langs}")
-    W034 = ("Please install the package spacy-lookups-data in order to include "
-            "the default lexeme normalization table for the language '{lang}'.")
     W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
             "attribute or operator.")
 
     # TODO: fix numbering after merging develop into master
-    W088 = ("The pipeline component {name} implements a 'begin_training' "
-            "method, which won't be called by spaCy. As of v3.0, 'begin_training' "
-            "has been renamed to 'initialize' so you likely want to rename the "
+    W088 = ("The pipeline component {name} implements a `begin_training` "
+            "method, which won't be called by spaCy. As of v3.0, `begin_training` "
+            "has been renamed to `initialize`, so you likely want to rename the "
             "component method. See the documentation for details: "
             "https://nightly.spacy.io/api/language#initialize")
-    W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
+    W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed "
+            "to `nlp.initialize`.")
     W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
@@ -108,39 +103,33 @@ class Warnings:
             "download a newer compatible model or retrain your custom model "
             "with the current spaCy version. For more details and available "
             "updates, run: python -m spacy validate")
-    W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' "
-            "instead.")
-    W097 = ("No Model config was provided to create the '{name}' component, "
-            "and no default configuration could be found either.")
-    W098 = ("No Model config was provided to create the '{name}' component, "
-            "so a default configuration was used.")
-    W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
-            "but got '{type}' instead, so ignoring it.")
+    W096 = ("The method `nlp.disable_pipes` is now deprecated - use "
+            "`nlp.select_pipes` instead.")
     W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
             "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
             "string \"Field1=Value1,Value2|Field2=Value3\".")
-    W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
+    W101 = ("Skipping Doc custom extension '{name}' while merging docs.")
     W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
     W103 = ("Unknown {lang} word segmenter '{segmenter}'. Supported "
             "word segmenters: {supported}. Defaulting to {default}.")
     W104 = ("Skipping modifications for '{target}' segmenter. The current "
             "segmenter is '{current}'.")
-    W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
-            "need to match on a stream of documents, you can use nlp.pipe and "
+    W105 = ("As of spaCy v3.0, the `{matcher}.pipe` method is deprecated. If you "
+            "need to match on a stream of documents, you can use `nlp.pipe` and "
             "call the {matcher} on each Doc object.")
-    W107 = ("The property Doc.{prop} is deprecated. Use "
-            "Doc.has_annotation(\"{attr}\") instead.")
+    W107 = ("The property `Doc.{prop}` is deprecated. Use "
+            "`Doc.has_annotation(\"{attr}\")` instead.")
 
 
 @add_codes
 class Errors:
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
     E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
-            "This usually happens when spaCy calls nlp.{method} with custom "
+            "This usually happens when spaCy calls `nlp.{method}` with custom "
             "component name that's not registered on the current language class. "
             "If you're using a custom component, make sure you've added the "
-            "decorator @Language.component (for function components) or "
-            "@Language.factory (for class components).\n\nAvailable "
+            "decorator `@Language.component` (for function components) or "
+            "`@Language.factory` (for class components).\n\nAvailable "
             "factories: {opts}")
     E003 = ("Not a valid pipeline component. Expected callable, but "
             "got {component} (name: '{name}'). If you're using a custom "
@@ -158,14 +147,13 @@ class Errors:
     E008 = ("Can't restore disabled pipeline component '{name}' because it "
             "doesn't exist in the pipeline anymore. If you want to remove "
             "components from the pipeline, you should do it before calling "
-            "`nlp.select_pipes()` or after restoring the disabled components.")
+            "`nlp.select_pipes` or after restoring the disabled components.")
     E010 = ("Word vectors set to length 0. This may be because you don't have "
             "a model installed or loaded, or because your model doesn't "
             "include word vectors. For more info, see the docs:\n"
             "https://nightly.spacy.io/usage/models")
     E011 = ("Unknown operator: '{op}'. Options: {opts}")
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
-    E014 = ("Unknown tag ID: {tag}")
     E016 = ("MultitaskObjective target should be function or one of: dep, "
             "tag, ent, dep_tag_offset, ent_tag.")
     E017 = ("Can only add unicode or bytes. Got type: {value_type}")
@@ -181,27 +169,24 @@ class Errors:
             "For example, are all labels added to the model? If you're "
             "training a named entity recognizer, also make sure that none of "
             "your annotated entity spans have leading or trailing whitespace "
-            "or punctuation. "
-            "You can also use the experimental `debug data` command to "
+            "or punctuation. You can also use the `debug data` command to "
             "validate your JSON-formatted training data. For details, run:\n"
             "python -m spacy debug data --help")
     E025 = ("String is too long: {length} characters. Max is 2**30.")
     E026 = ("Error accessing token at position {i}: out of bounds in Doc of "
             "length {length}.")
-    E027 = ("Arguments 'words' and 'spaces' should be sequences of the same "
-            "length, or 'spaces' should be left default at None. spaces "
+    E027 = ("Arguments `words` and `spaces` should be sequences of the same "
+            "length, or `spaces` should be left default at None. `spaces` "
             "should be a sequence of booleans, with True meaning that the "
             "word owns a ' ' character following it.")
-    E028 = ("orths_and_spaces expects either a list of unicode string or a "
-            "list of (unicode, bool) tuples. Got bytes instance: {value}")
-    E029 = ("noun_chunks requires the dependency parse, which requires a "
+    E028 = ("`words` expects a list of unicode strings, but got bytes instance: {value}")
+    E029 = ("`noun_chunks` requires the dependency parse, which requires a "
             "statistical model to be installed and loaded. For more info, see "
             "the documentation:\nhttps://nightly.spacy.io/usage/models")
     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
-            "component to the pipeline with: "
-            "nlp.add_pipe('sentencizer'). "
+            "component to the pipeline with: `nlp.add_pipe('sentencizer')`. "
             "Alternatively, add the dependency parser, or set sentence "
-            "boundaries by setting doc[i].is_sent_start.")
+            "boundaries by setting `doc[i].is_sent_start`.")
     E031 = ("Invalid token: empty string ('') at position {i}.")
     E033 = ("Cannot load into non-empty Doc of length {length}.")
     E035 = ("Error creating span with start {start} and end {end} for Doc of "
@@ -215,7 +200,7 @@ class Errors:
             "issue here: http://github.com/explosion/spaCy/issues")
     E040 = ("Attempt to access token at {i}, max length {max_length}.")
     E041 = ("Invalid comparison operator: {op}. Likely a Cython bug?")
-    E042 = ("Error accessing doc[{i}].nbor({j}), for doc of length {length}.")
+    E042 = ("Error accessing `doc[{i}].nbor({j})`, for doc of length {length}.")
     E043 = ("Refusing to write to token.sent_start if its document is parsed, "
             "because this may cause inconsistent state.")
     E044 = ("Invalid value for token.sent_start: {value}. Must be one of: "
@@ -235,7 +220,7 @@ class Errors:
     E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
             "original string.\nKey: {key}\nOrths: {orths}")
     E057 = ("Stepped slices not supported in Span objects. Try: "
-            "list(tokens)[start:stop:step] instead.")
+            "`list(tokens)[start:stop:step]` instead.")
     E058 = ("Could not retrieve vector for key {key}.")
     E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
     E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
@@ -244,7 +229,7 @@ class Errors:
             "and 63 are occupied. You can replace one by specifying the "
             "`flag_id` explicitly, e.g. "
             "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
-    E063 = ("Invalid value for flag_id: {value}. Flag IDs must be between 1 "
+    E063 = ("Invalid value for `flag_id`: {value}. Flag IDs must be between 1 "
             "and 63 (inclusive).")
     E064 = ("Error fetching a Lexeme from the Vocab. When looking up a "
             "string, the lexeme returned had an orth ID that did not match "
@@ -273,7 +258,7 @@ class Errors:
     E085 = ("Can't create lexeme for string '{string}'.")
     E087 = ("Unknown displaCy style: {style}.")
     E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
-            "v2.x parser and NER models require roughly 1GB of temporary "
+            "parser and NER models require roughly 1GB of temporary "
             "memory per 100,000 characters in the input. This means long "
             "texts may cause memory allocation errors. If you're not using "
             "the parser or NER, it's probably safe to increase the "
@@ -290,8 +275,8 @@ class Errors:
     E094 = ("Error reading line {line_num} in vectors file {loc}.")
     E095 = ("Can't write to frozen dictionary. This is likely an internal "
             "error. Are you writing to a default function argument?")
-    E096 = ("Invalid object passed to displaCy: Can only visualize Doc or "
-            "Span objects, or dicts if set to manual=True.")
+    E096 = ("Invalid object passed to displaCy: Can only visualize `Doc` or "
+            "`Span` objects, or dicts if set to `manual=True`.")
     E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
             "phrase pattern (string) but got:\n{pattern}")
     E098 = ("Invalid pattern: expected both RIGHT_ID and RIGHT_ATTRS.")
@@ -308,11 +293,11 @@ class Errors:
     E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
             "token can only be part of one entity, so make sure the entities "
             "you're setting don't overlap.")
-    E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
+    E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
             "settings: {opts}")
-    E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
+    E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}")
     E109 = ("Component '{name}' could not be run. Did you forget to "
-            "call initialize()?")
+            "call `initialize()`?")
     E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
     E111 = ("Pickling a token is not supported, because tokens are only views "
             "of the parent Doc and can't exist on their own. A pickled token "
@@ -329,8 +314,8 @@ class Errors:
     E117 = ("The newly split tokens must match the text of the original token. "
             "New orths: {new}. Old text: {old}.")
     E118 = ("The custom extension attribute '{attr}' is not registered on the "
-            "Token object so it can't be set during retokenization. To "
-            "register an attribute, use the Token.set_extension classmethod.")
+            "`Token` object so it can't be set during retokenization. To "
+            "register an attribute, use the `Token.set_extension` classmethod.")
     E119 = ("Can't set custom extension attribute '{attr}' during "
             "retokenization because it's not writable. This usually means it "
             "was registered with a getter function (and no setter) or as a "
@@ -354,7 +339,7 @@ class Errors:
     E130 = ("You are running a narrow unicode build, which is incompatible "
             "with spacy >= 2.1.0. To fix this, reinstall Python and use a wide "
             "unicode build instead. You can also rebuild Python and set the "
-            "--enable-unicode=ucs4 flag.")
+            "`--enable-unicode=ucs4` flag.")
     E131 = ("Cannot write the kb_id of an existing Span object because a Span "
             "is a read-only view of the underlying Token objects stored in "
             "the Doc. Instead, create a new Span object and specify the "
@@ -367,27 +352,20 @@ class Errors:
     E133 = ("The sum of prior probabilities for alias '{alias}' should not "
             "exceed 1, but found {sum}.")
     E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
-    E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
-            "to provide a valid JSON object as input with either the `text` "
-            "or `tokens` key. For more info, see the docs:\n"
-            "https://nightly.spacy.io/api/cli#pretrain-jsonl")
-    E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
-            "includes either the `text` or `tokens` key. For more info, see "
-            "the docs:\nhttps://nightly.spacy.io/api/cli#pretrain-jsonl")
-    E139 = ("Knowledge Base for component '{name}' is empty. Use the methods "
-            "kb.add_entity and kb.add_alias to add entries.")
+    E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
+            "`kb.add_entity` and `kb.add_alias` to add entries.")
     E140 = ("The list of entities, prior probabilities and entity vectors "
             "should be of equal length.")
     E141 = ("Entity vectors should be of length {required} instead of the "
             "provided {found}.")
     E143 = ("Labels for component '{name}' not initialized. This can be fixed "
             "by calling add_label, or by providing a representative batch of "
-            "examples to the component's initialize method.")
+            "examples to the component's `initialize` method.")
     E145 = ("Error reading `{param}` from input file.")
-    E146 = ("Could not access `{path}`.")
+    E146 = ("Could not access {path}.")
     E147 = ("Unexpected error in the {method} functionality of the "
             "EntityLinker: {msg}. This is likely a bug in spaCy, so feel free "
-            "to open an issue.")
+            "to open an issue: https://github.com/explosion/spaCy/issues")
     E148 = ("Expected {ents} KB identifiers but got {ids}. Make sure that "
             "each entity in `doc.ents` is assigned to a KB identifier.")
     E149 = ("Error deserializing model. Check that the config used to create "
@@ -395,18 +373,18 @@ class Errors:
     E150 = ("The language of the `nlp` object and the `vocab` should be the "
             "same, but found '{nlp}' and '{vocab}' respectively.")
     E152 = ("The attribute {attr} is not supported for token patterns. "
-            "Please use the option validate=True with Matcher, PhraseMatcher, "
+            "Please use the option `validate=True` with the Matcher, PhraseMatcher, "
             "or EntityRuler for more details.")
     E153 = ("The value type {vtype} is not supported for token patterns. "
             "Please use the option validate=True with Matcher, PhraseMatcher, "
             "or EntityRuler for more details.")
     E154 = ("One of the attributes or values is not supported for token "
-            "patterns. Please use the option validate=True with Matcher, "
+            "patterns. Please use the option `validate=True` with the Matcher, "
             "PhraseMatcher, or EntityRuler for more details.")
     E155 = ("The pipeline needs to include a {pipe} in order to use "
             "Matcher or PhraseMatcher with the attribute {attr}. "
-            "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
-            "instead of list(nlp.tokenizer.pipe()).")
+            "Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` "
+            "instead of `list(nlp.tokenizer.pipe())`.")
     E157 = ("Can't render negative values for dependency arc start or end. "
             "Make sure that you're passing in absolute token indices, not "
             "relative token offsets.\nstart: {start}, end: {end}, label: "
@@ -415,13 +393,11 @@ class Errors:
     E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}")
     E160 = ("Can't find language data file: {path}")
     E161 = ("Found an internal inconsistency when predicting entity links. "
-            "This is likely a bug in spaCy, so feel free to open an issue.")
-    E162 = ("Cannot evaluate textcat model on data with different labels.\n"
-            "Labels in model: {model_labels}\nLabels in evaluation "
-            "data: {eval_labels}")
+            "This is likely a bug in spaCy, so feel free to open an issue: "
+            "https://github.com/explosion/spaCy/issues")
     E163 = ("cumsum was found to be unstable: its last element does not "
             "correspond to sum")
-    E164 = ("x is neither increasing nor decreasing: {}.")
+    E164 = ("x is neither increasing nor decreasing: {x}.")
     E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
             "that case.")
     E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
@@ -436,10 +412,10 @@ class Errors:
     E178 = ("Each pattern should be a list of dicts, but got: {pat}. Maybe you "
             "accidentally passed a single pattern to Matcher.add instead of a "
             "list of patterns? If you only want to add one pattern, make sure "
-            "to wrap it in a list. For example: matcher.add('{key}', [pattern])")
+            "to wrap it in a list. For example: `matcher.add('{key}', [pattern])`")
     E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
             "Doc. If you only want to add one pattern, make sure to wrap it "
-            "in a list. For example: matcher.add('{key}', [doc])")
+            "in a list. For example: `matcher.add('{key}', [doc])`")
     E180 = ("Span attributes can't be declared as required or assigned by "
             "components, since spans are only views of the Doc. Use Doc and "
             "Token attributes (or custom extension attributes) only and remove "
@@ -447,17 +423,16 @@ class Errors:
     E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
             "Only Doc and Token attributes are supported.")
     E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
-            "to define the attribute? For example: {attr}.???")
+            "to define the attribute? For example: `{attr}.???`")
     E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level "
             "attributes are supported, for example: {solution}")
     E184 = ("Only attributes without underscores are supported in component "
             "attribute declarations (because underscore and non-underscore "
             "attributes are connected anyways): {attr} -> {solution}")
     E185 = ("Received invalid attribute in component attribute declaration: "
-            "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
-    E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
+            "`{obj}.{attr}`\nAttribute '{attr}' does not exist on {obj}.")
     E187 = ("Only unicode strings are supported as labels.")
-    E189 = ("Each argument to Doc.__init__ should be of equal length.")
+    E189 = ("Each argument to `Doc.__init__` should be of equal length.")
     E190 = ("Token head out of range in `Doc.from_array()` for token index "
             "'{index}' with value '{value}' (equivalent to relative head "
             "index: '{rel_head_index}'). The head indices should be relative "
@@ -471,17 +446,32 @@ class Errors:
             "({curr_dim}).")
     E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.")
     E195 = ("Matcher can be called on {good} only, got {got}.")
-    E196 = ("Refusing to write to token.is_sent_end. Sentence boundaries can "
-            "only be fixed with token.is_sent_start.")
+    E196 = ("Refusing to write to `token.is_sent_end`. Sentence boundaries can "
+            "only be fixed with `token.is_sent_start`.")
     E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
     E198 = ("Unable to return {n} most similar vectors for the current vectors "
             "table, which contains {n_rows} vectors.")
-    E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
-    E200 = ("Specifying a base model with a pretrained component '{component}' "
-            "can not be combined with adding a pretrained Tok2Vec layer.")
-    E201 = ("Span index out of range.")
+    E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
+    E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
+            "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
+    E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
+            "Try checking whitespace and delimiters. See "
+            "https://nightly.spacy.io/api/cli#convert")
+    E093 = ("The token-per-line NER file is not formatted correctly. Try checking "
+            "whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert")
+    E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
+            "dimension refers to the output width, after the linear projection "
+            "has been applied.")
+    E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
+            "dimension refers to the width of the vectors table.")
+    E906 = ("Unexpected `loss` value in pretraining objective: {loss_type}")
+    E907 = ("Unexpected `objective_type` value in pretraining objective: {objective_type}")
+    E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
+    E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
+    E910 = ("Encountered NaN value when computing loss for component '{name}'.")
+    E911 = ("Invalid feature: {feat}. Must be a token attribute.")
     E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
             "for mode '{mode}'. Required tables: {tables}. Found: {found}.")
     E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
@@ -494,44 +484,44 @@ class Errors:
             "final score, set its weight to null in the [training.score_weights] "
             "section of your training config.")
     E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})")
-    E917 = ("Received invalid value {value} for 'state_type' in "
+    E917 = ("Received invalid value {value} for `state_type` in "
             "TransitionBasedParser: only 'parser' or 'ner' are valid options.")
     E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
-            "values are an instance of spacy.vocab.Vocab or True to create one"
+            "values are an instance of `spacy.vocab.Vocab` or True to create one"
             " (default).")
-    E919 = ("A textcat 'positive_label' '{pos_label}' was provided for training "
+    E919 = ("A textcat `positive_label` '{pos_label}' was provided for training "
             "data that does not appear to be a binary classification problem "
             "with two labels. Labels found: {labels}")
-    E920 = ("The textcat's 'positive_label' setting '{pos_label}' "
+    E920 = ("The textcat's `positive_label` setting '{pos_label}' "
             "does not match any label in the training data or provided during "
             "initialization. Available labels: {labels}")
-    E921 = ("The method 'set_output' can only be called on components that have "
-            "a Model with a 'resize_output' attribute. Otherwise, the output "
+    E921 = ("The method `set_output` can only be called on components that have "
+            "a Model with a `resize_output` attribute. Otherwise, the output "
             "layer can not be dynamically changed.")
     E922 = ("Component '{name}' has been initialized with an output dimension of "
             "{nO} - cannot add any more labels.")
     E923 = ("It looks like there is no proper sample data to initialize the "
-            "Model of component '{name}'. "
-            "This is likely a bug in spaCy, so feel free to open an issue.")
+            "Model of component '{name}'. This is likely a bug in spaCy, so "
+            "feel free to open an issue: https://github.com/explosion/spaCy/issues")
     E924 = ("The '{name}' component does not seem to be initialized properly. "
-            "This is likely a bug in spaCy, so feel free to open an issue.")
+            "This is likely a bug in spaCy, so feel free to open an issue: "
+            "https://github.com/explosion/spaCy/issues")
     E925 = ("Invalid color values for displaCy visualizer: expected dictionary "
             "mapping label names to colors but got: {obj}")
-    E926 = ("It looks like you're trying to modify nlp.{attr} directly. This "
+    E926 = ("It looks like you're trying to modify `nlp.{attr}` directly. This "
             "doesn't work because it's an immutable computed property. If you "
             "need to modify the pipeline, use the built-in methods like "
-            "nlp.add_pipe, nlp.remove_pipe, nlp.disable_pipe or nlp.enable_pipe "
-            "instead.")
+            "`nlp.add_pipe`, `nlp.remove_pipe`, `nlp.disable_pipe` or "
+            "`nlp.enable_pipe` instead.")
     E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
             "property or default function argument?")
-    E928 = ("A 'KnowledgeBase' can only be serialized to/from from a directory, "
+    E928 = ("A KnowledgeBase can only be serialized to/from a directory, "
             "but the provided argument {loc} points to a file.")
-    E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
-            "not seem to exist.")
-    E930 = ("Received invalid get_examples callback in {name}.initialize. "
+    E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
+    E930 = ("Received invalid get_examples callback in `{name}.initialize`. "
             "Expected function that returns an iterable of Example objects but "
             "got: {obj}")
-    E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
+    E931 = ("Encountered Pipe subclass without `Pipe.{method}` method in component "
             "'{name}'. If the component is trainable and you want to use this "
             "method, make sure it's overwritten on the subclass. If your "
             "component isn't trainable, add a method that does nothing or "
@@ -544,21 +534,21 @@ class Errors:
             "models, see the models directory: https://spacy.io/models. If you "
             "want to create a blank model, use spacy.blank: "
             "nlp = spacy.blank(\"{name}\")")
-    E942 = ("Executing after_{name} callback failed. Expected the function to "
+    E942 = ("Executing `after_{name}` callback failed. Expected the function to "
             "return an initialized nlp object but got: {value}. Maybe "
             "you forgot to return the modified object in your function?")
-    E943 = ("Executing before_creation callback failed. Expected the function to "
+    E943 = ("Executing `before_creation` callback failed. Expected the function to "
             "return an uninitialized Language subclass but got: {value}. Maybe "
             "you forgot to return the modified object in your function or "
             "returned the initialized nlp object instead?")
-    E944 = ("Can't copy pipeline component '{name}' from source model '{model}': "
+    E944 = ("Can't copy pipeline component '{name}' from source '{model}': "
             "not found in pipeline. Available components: {opts}")
     E945 = ("Can't copy pipeline component '{name}' from source. Expected loaded "
             "nlp object, but got: {source}")
-    E947 = ("Matcher.add received invalid 'greedy' argument: expected "
+    E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
             "a string value from {expected} but got: '{arg}'")
-    E948 = ("Matcher.add received invalid 'patterns' argument: expected "
-            "a List, but got: {arg_type}")
+    E948 = ("`Matcher.add` received invalid `patterns` argument: expected "
+            "a list, but got: {arg_type}")
     E949 = ("Can only create an alignment when the texts are the same.")
     E952 = ("The section '{name}' is not a valid section in the provided config.")
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
@@ -570,9 +560,9 @@ class Errors:
             "for your language.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
-    E957 = ("Writing directly to Language.factories isn't needed anymore in "
-            "spaCy v3. Instead, you can use the @Language.factory decorator "
-            "to register your custom component factory or @Language.component "
+    E957 = ("Writing directly to `Language.factories` isn't needed anymore in "
+            "spaCy v3. Instead, you can use the `@Language.factory` decorator "
+            "to register your custom component factory or `@Language.component` "
             "to register a simple stateless function component that just takes "
             "a Doc and returns it.")
     E958 = ("Language code defined in config ({bad_lang_code}) does not match "
@@ -590,99 +580,93 @@ class Errors:
             "component.\n\n{config}")
     E962 = ("Received incorrect {style} for pipe '{name}'. Expected dict, "
             "got: {cfg_type}.")
-    E963 = ("Can't read component info from @Language.{decorator} decorator. "
+    E963 = ("Can't read component info from `@Language.{decorator}` decorator. "
             "Maybe you forgot to call it? Make sure you're using "
-            "@Language.{decorator}() instead of @Language.{decorator}.")
+            "`@Language.{decorator}()` instead of `@Language.{decorator}`.")
     E964 = ("The pipeline component factory for '{name}' needs to have the "
             "following named arguments, which are passed in by spaCy:\n- nlp: "
             "receives the current nlp object and lets you access the vocab\n- "
             "name: the name of the component instance, can be used to identify "
             "the component, output losses etc.")
-    E965 = ("It looks like you're using the @Language.component decorator to "
+    E965 = ("It looks like you're using the `@Language.component` decorator to "
             "register '{name}' on a class instead of a function component. If "
             "you need to register a class or function that *returns* a component "
-            "function, use the @Language.factory decorator instead.")
-    E966 = ("nlp.add_pipe now takes the string name of the registered component "
+            "function, use the `@Language.factory` decorator instead.")
+    E966 = ("`nlp.add_pipe` now takes the string name of the registered component "
             "factory, not a callable component. Expected string, but got "
             "{component} (name: '{name}').\n\n- If you created your component "
-            "with nlp.create_pipe('name'): remove nlp.create_pipe and call "
-            "nlp.add_pipe('name') instead.\n\n- If you passed in a component "
-            "like TextCategorizer(): call nlp.add_pipe with the string name "
-            "instead, e.g. nlp.add_pipe('textcat').\n\n- If you're using a custom "
-            "component: Add the decorator @Language.component (for function "
-            "components) or @Language.factory (for class components / factories) "
+            "with `nlp.create_pipe('name')`: remove `nlp.create_pipe` and call "
+            "`nlp.add_pipe('name')` instead.\n\n- If you passed in a component "
+            "like `TextCategorizer()`: call `nlp.add_pipe` with the string name "
+            "instead, e.g. `nlp.add_pipe('textcat')`.\n\n- If you're using a custom "
+            "component: Add the decorator `@Language.component` (for function "
+            "components) or `@Language.factory` (for class components / factories) "
             "to your custom component and assign it a name, e.g. "
-            "@Language.component('your_name'). You can then run "
-            "nlp.add_pipe('your_name') to add it to the pipeline.")
+            "`@Language.component('your_name')`. You can then run "
+            "`nlp.add_pipe('your_name')` to add it to the pipeline.")
     E967 = ("No {meta} meta information found for '{name}'. This is likely a bug in spaCy.")
-    E968 = ("nlp.replace_pipe now takes the string name of the registered component "
+    E968 = ("`nlp.replace_pipe` now takes the string name of the registered component "
             "factory, not a callable component. Expected string, but got "
             "{component}.\n\n- If you created your component with"
-            "with nlp.create_pipe('name'): remove nlp.create_pipe and call "
-            "nlp.replace_pipe('{name}', 'name') instead.\n\n- If you passed in a "
-            "component like TextCategorizer(): call nlp.replace_pipe with the "
-            "string name instead, e.g. nlp.replace_pipe('{name}', 'textcat').\n\n"
+            " `nlp.create_pipe('name')`: remove `nlp.create_pipe` and call "
+            "`nlp.replace_pipe('{name}', 'name')` instead.\n\n- If you passed in a "
+            "component like `TextCategorizer()`: call `nlp.replace_pipe` with the "
+            "string name instead, e.g. `nlp.replace_pipe('{name}', 'textcat')`.\n\n"
             "- If you're using a custom component: Add the decorator "
-            "@Language.component (for function components) or @Language.factory "
+            "`@Language.component` (for function components) or `@Language.factory` "
             "(for class components / factories) to your custom component and "
-            "assign it a name, e.g. @Language.component('your_name'). You can "
-            "then run nlp.replace_pipe('{name}', 'your_name').")
+            "assign it a name, e.g. `@Language.component('your_name')`. You can "
+            "then run `nlp.replace_pipe('{name}', 'your_name')`.")
     E969 = ("Expected string values for field '{field}', but received {types} instead. ")
     E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
-    E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
+    E971 = ("Found incompatible lengths in `Doc.from_array`: {array_length} for the "
             "array and {doc_length} for the Doc itself.")
-    E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
+    E972 = ("`Example.__init__` got None for '{arg}'. Requires Doc.")
     E973 = ("Unexpected type for NER data")
     E974 = ("Unknown {obj} attribute: {key}")
-    E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
+    E976 = ("The method `Example.from_dict` expects a {type} as {n} argument, "
             "but received None.")
     E977 = ("Can not compare a MorphAnalysis with a string object. "
-            "This is likely a bug in spaCy, so feel free to open an issue.")
+            "This is likely a bug in spaCy, so feel free to open an issue: "
+            "https://github.com/explosion/spaCy/issues")
     E978 = ("The {name} method takes a list of Example objects, but got: {types}")
-    E979 = ("Cannot convert {type} to an Example object.")
     E980 = ("Each link annotation should refer to a dictionary with at most one "
             "identifier mapping to 1.0, and all others to 0.0.")
-    E981 = ("The offsets of the annotations for 'links' could not be aligned "
+    E981 = ("The offsets of the annotations for `links` could not be aligned "
             "to token boundaries.")
-    E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
+    E982 = ("The `Token.ent_iob` attribute should be an integer indexing "
             "into {values}, but found {value}.")
     E983 = ("Invalid key for '{dict}': {key}. Available keys: "
             "{keys}")
     E984 = ("Invalid component config for '{name}': component block needs either "
-            "a key 'factory' specifying the registered function used to "
-            "initialize the component, or a key 'source' key specifying a "
-            "spaCy model to copy the component from. For example, factory = "
-            "\"ner\" will use the 'ner' factory and all other settings in the "
-            "block will be passed to it as arguments. Alternatively, source = "
-            "\"en_core_web_sm\" will copy the component from that model.\n\n{config}")
-    E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}")
+            "a key `factory` specifying the registered function used to "
+            "initialize the component, or a key `source` specifying a "
+            "spaCy model to copy the component from. For example, `factory = "
+            "\"ner\"` will use the 'ner' factory and all other settings in the "
+            "block will be passed to it as arguments. Alternatively, `source = "
+            "\"en_core_web_sm\"` will copy the component from that model.\n\n{config}")
+    E985 = ("Can't load model from config file: no [nlp] section found.\n\n{config}")
     E986 = ("Could not create any training batches: check your input. "
-            "Are the train and dev paths defined? "
-            "Is 'discard_oversize' set appropriately? ")
-    E987 = ("The text of an example training instance is either a Doc or "
-            "a string, but found {type} instead.")
-    E988 = ("Could not parse any training examples. Ensure the data is "
-            "formatted correctly.")
-    E989 = ("'nlp.update()' was called with two positional arguments. This "
+            "Are the train and dev paths defined? Is `discard_oversize` set appropriately? ")
+    E989 = ("`nlp.update()` was called with two positional arguments. This "
             "may be due to a backwards-incompatible change to the format "
             "of the training data in spaCy 3.0 onwards. The 'update' "
-            "function should now be called with a batch of 'Example' "
-            "objects, instead of (text, annotation) tuples. ")
-    E991 = ("The function 'select_pipes' should be called with either a "
-            "'disable' argument to list the names of the pipe components "
+            "function should now be called with a batch of Example "
+            "objects, instead of `(text, annotation)` tuples. ")
+    E991 = ("The function `nlp.select_pipes` should be called with either a "
+            "`disable` argument to list the names of the pipe components "
             "that should be disabled, or with an 'enable' argument that "
             "specifies which pipes should not be disabled.")
     E992 = ("The function `select_pipes` was called with `enable`={enable} "
             "and `disable`={disable} but that information is conflicting "
             "for the `nlp` pipeline with components {names}.")
-    E993 = ("The config for 'nlp' needs to include a key 'lang' specifying "
+    E993 = ("The config for the nlp object needs to include a key `lang` specifying "
             "the code of the language to initialize it with (for example "
-            "'en' for English) - this can't be 'None'.\n\n{config}")
-    E996 = ("Could not parse {file}: {msg}")
+            "'en' for English) - this can't be None.\n\n{config}")
     E997 = ("Tokenizer special cases are not allowed to modify the text. "
             "This would map '{chunk}' to '{orth}' given token attributes "
             "'{token_attrs}'.")
-    E999 = ("Unable to merge the `Doc` objects because they do not all share "
+    E999 = ("Unable to merge the Doc objects because they do not all share "
             "the same `Vocab`.")
     E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
              "loaded. Provide the name of a pretrained model or the path to "
@@ -694,35 +678,24 @@ class Errors:
     E1003 = ("Unsupported lemmatizer mode '{mode}'.")
     E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
              "Required tables: {tables}. Found: {found}. Maybe you forgot to "
-             "call nlp.initialize() to load in the data?")
+             "call `nlp.initialize()` to load in the data?")
     E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
              "'{chunk}'. Tokenizer exceptions are only allowed to specify "
-             "`ORTH` and `NORM`.")
-    E1006 = ("Unable to initialize {name} model with 0 labels.")
+             "ORTH and NORM.")
     E1007 = ("Unsupported DependencyMatcher operator '{op}'.")
     E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check "
              "that you are providing a list of patterns as `List[List[dict]]`.")
-    E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
-             "through token.morph_ instead or add the string to the "
-             "StringStore with `nlp.vocab.strings.add(string)`.")
     E1010 = ("Unable to set entity information for token {i} which is included "
              "in more than one span in entities, blocked, missing or outside.")
-    E1011 = ("Unsupported default '{default}' in doc.set_ents. Available "
+    E1011 = ("Unsupported default '{default}' in `doc.set_ents`. Available "
              "options: {modes}")
     E1012 = ("Entity spans and blocked/missing/outside spans should be "
-             "provided to doc.set_ents as lists of `Span` objects.")
+             "provided to `doc.set_ents` as lists of Span objects.")
     E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
              "token itself. To set the morph from this MorphAnalysis, set from "
              "the string value with: `token.set_morph(str(other_morph))`.")
 
 
-@add_codes
-class TempErrors:
-    T003 = ("Resizing pretrained Tagger models is not currently supported.")
-    T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
-            "issue tracker: http://github.com/explosion/spaCy/issues")
-
-
 # Deprecated model shortcuts, only used in errors and warnings
 OLD_MODEL_SHORTCUTS = {
     "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index f9a906397..1a0979cab 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -6,6 +6,7 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 
 from ...tokens import Doc
 from ...util import registry
+from ...errors import Errors
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
@@ -201,7 +202,7 @@ def CharacterEmbed(
     """
     feature = intify_attr(feature)
     if feature is None:
-        raise ValueError("Invalid feature: Must be a token attribute.")
+        raise ValueError(Errors.E911.format(feat=feature))
     if also_use_static_vectors:
         model = chain(
             concatenate(
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index 41afdbf80..c77247d33 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -1,11 +1,11 @@
 from typing import List, Tuple, Callable, Optional, cast
-
 from thinc.initializers import glorot_uniform_init
 from thinc.util import partial
 from thinc.types import Ragged, Floats2d, Floats1d
 from thinc.api import Model, Ops, registry
 
 from ..tokens import Doc
+from ..errors import Errors
 
 
 @registry.layers("spacy.StaticVectors.v1")
@@ -76,16 +76,9 @@ def init(
         nO = Y.data.shape[1]
 
     if nM is None:
-        raise ValueError(
-            "Cannot initialize StaticVectors layer: nM dimension unset. "
-            "This dimension refers to the width of the vectors table."
-        )
+        raise ValueError(Errors.E905)
     if nO is None:
-        raise ValueError(
-            "Cannot initialize StaticVectors layer: nO dimension unset. "
-            "This dimension refers to the output width, after the linear  "
-            "projection has been applied."
-        )
+        raise ValueError(Errors.E904)
     model.set_dim("nM", nM)
     model.set_dim("nO", nO)
     model.set_param("W", init_W(model.ops, (nO, nM)))
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index dafa99bdd..69f015bda 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -9,10 +9,11 @@ from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
 from ...training.example cimport Example
-from ...errors import Errors
 from .stateclass cimport StateClass
 from ._state cimport StateC
 
+from ...errors import Errors
+
 # Calculate cost as gold/not gold. We don't use scalar value anyway.
 cdef int BINARY_COSTS = 1
 cdef weight_t MIN_SCORE = -90000
@@ -86,7 +87,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls,
                 SENT_START_UNKNOWN,
                 0
             )
- 
+
         elif is_sent_start is None:
             gs.state_bits[i] = set_state_flag(
                 gs.state_bits[i],
@@ -109,7 +110,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls,
                 IS_SENT_START,
                 0
             )
- 
+
     for i, (head, label) in enumerate(zip(heads, labels)):
         if head is not None:
             gs.heads[i] = head
@@ -158,7 +159,7 @@ cdef void update_gold_state(GoldParseStateC* gs, StateClass stcls) nogil:
         )
         gs.n_kids_in_stack[i] = 0
         gs.n_kids_in_buffer[i] = 0
- 
+
     for i in range(stcls.stack_depth()):
         s_i = stcls.S(i)
         if not is_head_unknown(gs, s_i):
@@ -403,7 +404,7 @@ cdef class RightArc:
             return 0
         sent_start = st._sent[st.B_(0).l_edge].sent_start
         return sent_start != 1 and st.H(st.S(0)) != st.B(0)
-    
+
     @staticmethod
     cdef int transition(StateC* st, attr_t label) nogil:
         st.add_arc(st.S(0), st.B(0), label)
@@ -701,10 +702,10 @@ cdef class ArcEager(TransitionSystem):
                 output[i] = self.c[i].is_valid(st, self.c[i].label)
             else:
                 output[i] = is_valid[self.c[i].move]
-    
+
     def get_cost(self, StateClass stcls, gold, int i):
         if not isinstance(gold, ArcEagerGold):
-            raise TypeError("Expected ArcEagerGold")
+            raise TypeError(Errors.E909.format(name="ArcEagerGold"))
         cdef ArcEagerGold gold_ = gold
         gold_state = gold_.c
         n_gold = 0
@@ -717,7 +718,7 @@ cdef class ArcEager(TransitionSystem):
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        StateClass stcls, gold) except -1:
         if not isinstance(gold, ArcEagerGold):
-            raise TypeError("Expected ArcEagerGold")
+            raise TypeError(Errors.E909.format(name="ArcEagerGold"))
         cdef ArcEagerGold gold_ = gold
         gold_.update(stcls)
         gold_state = gold_.c
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index 0351bcaf7..4f142caaf 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -1,16 +1,18 @@
-from collections import Counter
 from libc.stdint cimport int32_t
 from cymem.cymem cimport Pool
 
+from collections import Counter
+
 from ...typedefs cimport weight_t, attr_t
 from ...lexeme cimport Lexeme
 from ...attrs cimport IS_SPACE
 from ...training.example cimport Example
-from ...errors import Errors
 from .stateclass cimport StateClass
 from ._state cimport StateC
 from .transition_system cimport Transition, do_func_t
 
+from ...errors import Errors
+
 
 cdef enum:
     MISSING
@@ -248,7 +250,7 @@ cdef class BiluoPushDown(TransitionSystem):
 
     def get_cost(self, StateClass stcls, gold, int i):
         if not isinstance(gold, BiluoGold):
-            raise TypeError("Expected BiluoGold")
+            raise TypeError(Errors.E909.format(name="BiluoGold"))
         cdef BiluoGold gold_ = gold
         gold_state = gold_.c
         n_gold = 0
@@ -261,7 +263,7 @@ cdef class BiluoPushDown(TransitionSystem):
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        StateClass stcls, gold) except -1:
         if not isinstance(gold, BiluoGold):
-            raise TypeError("Expected BiluoGold")
+            raise TypeError(Errors.E909.format(name="BiluoGold"))
         cdef BiluoGold gold_ = gold
         gold_.update(stcls)
         gold_state = gold_.c
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 29f0d7fb4..82f3bf37d 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -238,7 +238,7 @@ class Morphologizer(Tagger):
             truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
-            raise ValueError("nan value when computing loss")
+            raise ValueError(Errors.E910.format(name=self.name))
         return float(loss), d_scores
 
     def score(self, examples, **kwargs):
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 231072e9c..0bfef7c7b 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -125,7 +125,7 @@ class SentenceRecognizer(Tagger):
             truths.append(eg_truth)
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
-            raise ValueError("nan value when computing loss")
+            raise ValueError(Errors.E910.format(name=self.name))
         return float(loss), d_scores
 
     def initialize(self, get_examples, *, nlp=None):
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 37ad42b88..6cb582b36 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -15,7 +15,7 @@ from .pipe import Pipe, deserialize_config
 from ..language import Language
 from ..attrs import POS, ID
 from ..parts_of_speech import X
-from ..errors import Errors, TempErrors, Warnings
+from ..errors import Errors, Warnings
 from ..scorer import Scorer
 from ..training import validate_examples
 from .. import util
@@ -258,7 +258,7 @@ class Tagger(Pipe):
         truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
-            raise ValueError("nan value when computing loss")
+            raise ValueError(Errors.E910.format(name=self.name))
         return float(loss), d_scores
 
     def initialize(self, get_examples, *, nlp=None, labels=None):
diff --git a/spacy/scorer.py b/spacy/scorer.py
index db32dabae..d1065f3a9 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -905,7 +905,7 @@ def _auc(x, y):
         if np.all(dx <= 0):
             direction = -1
         else:
-            raise ValueError(Errors.E164.format(x))
+            raise ValueError(Errors.E164.format(x=x))
 
     area = direction * np.trapz(y, x)
     if isinstance(area, np.memmap):
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 9dfa6e714..3404274ce 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -245,7 +245,7 @@ cdef class Doc:
         self.noun_chunks_iterator = self.vocab.get_noun_chunks
         cdef bint has_space
         if words is None and spaces is not None:
-            raise ValueError("words must be set if spaces is set")
+            raise ValueError(Errors.E908)
         elif spaces is None and words is not None:
             self.has_unknown_spaces = True
         else:
@@ -309,7 +309,7 @@ cdef class Doc:
                 else:
                     if len(ent) < 3 or ent[1] != "-":
                         raise ValueError(Errors.E177.format(tag=ent))
-                    ent_iob, ent_type = ent.split("-", 1) 
+                    ent_iob, ent_type = ent.split("-", 1)
                     if ent_iob not in iob_strings:
                         raise ValueError(Errors.E177.format(tag=ent))
                     ent_iob = iob_strings.index(ent_iob)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 781474d3a..6a14e2849 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -17,7 +17,7 @@ from ..lexeme cimport Lexeme
 from ..symbols cimport dep
 
 from ..util import normalize_slice
-from ..errors import Errors, TempErrors, Warnings
+from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
 
 
@@ -652,7 +652,7 @@ cdef class Span:
             return self.root.ent_id
 
         def __set__(self, hash_t key):
-            raise NotImplementedError(TempErrors.T007.format(attr="ent_id"))
+            raise NotImplementedError(Errors.E200.format(attr="ent_id"))
 
     property ent_id_:
         """RETURNS (str): The (string) entity ID."""
@@ -660,7 +660,7 @@ cdef class Span:
             return self.root.ent_id_
 
         def __set__(self, hash_t key):
-            raise NotImplementedError(TempErrors.T007.format(attr="ent_id_"))
+            raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
 
     @property
     def orth_(self):
diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py
index 902db585b..28f0f87c3 100644
--- a/spacy/training/converters/conll_ner_to_docs.py
+++ b/spacy/training/converters/conll_ner_to_docs.py
@@ -3,7 +3,7 @@ from wasabi import Printer
 from .. import tags_to_entities
 from ...training import iob_to_biluo
 from ...tokens import Doc, Span
-from ...util import load_model
+from ...errors import Errors
 from ...util import load_model, get_lang_class
 
 
@@ -103,11 +103,7 @@ def conll_ner_to_docs(
             lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
             cols = list(zip(*[line.split() for line in lines]))
             if len(cols) < 2:
-                raise ValueError(
-                    "The token-per-line NER file is not formatted correctly. "
-                    "Try checking whitespace and delimiters. See "
-                    "https://nightly.spacy.io/api/cli#convert"
-                )
+                raise ValueError(Errors.E093)
             length = len(cols[0])
             words.extend(cols[0])
             sent_starts.extend([True] + [False] * (length - 1))
diff --git a/spacy/training/converters/iob_to_docs.py b/spacy/training/converters/iob_to_docs.py
index bfd981649..73ad8953d 100644
--- a/spacy/training/converters/iob_to_docs.py
+++ b/spacy/training/converters/iob_to_docs.py
@@ -4,6 +4,7 @@ from .conll_ner_to_docs import n_sents_info
 from ...vocab import Vocab
 from ...training import iob_to_biluo, tags_to_entities
 from ...tokens import Doc, Span
+from ...errors import Errors
 from ...util import minibatch
 
 
@@ -45,9 +46,7 @@ def read_iob(raw_sents, vocab, n_sents):
                 sent_words, sent_iob = zip(*sent_tokens)
                 sent_tags = ["-"] * len(sent_words)
             else:
-                raise ValueError(
-                    "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert"
-                )
+                raise ValueError(Errors.E092)
             words.extend(sent_words)
             tags.extend(sent_tags)
             iob.extend(sent_iob)
diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
index 4f05c6344..b91fb07a8 100644
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@@ -16,6 +16,7 @@ from ..attrs import ID
 from ..ml.models.multi_task import build_cloze_multi_task_model
 from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
+from ..errors import Errors
 from ..util import registry, load_model_from_config, dot_to_object
 
 
@@ -151,9 +152,9 @@ def create_objective(config: Config):
             distance = L2Distance(normalize=True, ignore_zeros=True)
             return partial(get_vectors_loss, distance=distance)
         else:
-            raise ValueError("Unexpected loss type", config["loss"])
+            raise ValueError(Errors.E906.format(loss_type=config["loss"]))
     else:
-        raise ValueError("Unexpected objective_type", objective_type)
+        raise ValueError(Errors.E907.format(objective_type=objective_type))
 
 
 def get_vectors_loss(ops, docs, prediction, distance):

From 96b636c2d3f8e8f62bf53e0c5c30147c48bca537 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 4 Oct 2020 13:08:21 +0200
Subject: [PATCH 377/516] Update attribute ruler

---
 spacy/pipeline/attributeruler.py | 38 ++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index f314953e9..b4580ff7c 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -18,15 +18,16 @@ from .. import util
 
 MatcherPatternType = List[Dict[Union[int, str], Any]]
 AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
+TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
+MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
 
 
 @Language.factory(
-    "attribute_ruler", default_config={"pattern_dicts": None, "validate": False}
+    "attribute_ruler", default_config={"validate": False}
 )
 def make_attribute_ruler(
     nlp: Language,
     name: str,
-    pattern_dicts: Optional[Iterable[AttributeRulerPatternType]],
     validate: bool,
 ):
     return AttributeRuler(
@@ -49,14 +50,14 @@ class AttributeRuler(Pipe):
         pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None,
         validate: bool = False,
     ) -> None:
-        """Initialize the AttributeRuler.
+        """Create the AttributeRuler. After creation, you can add patterns
+        with the `.initialize()` or `.add_patterns()` methods, or load patterns
+        with `.from_bytes()` or `.from_disk()`. Loading patterns will remove
+        any patterns you've added previously.
 
         vocab (Vocab): The vocab.
         name (str): The pipe name. Defaults to "attribute_ruler".
-        pattern_dicts (Iterable[Dict]): A list of pattern dicts with the keys as
-        the arguments to AttributeRuler.add (`patterns`/`attrs`/`index`) to add
-        as patterns.
-
+        
         RETURNS (AttributeRuler): The AttributeRuler component.
 
         DOCS: https://nightly.spacy.io/api/attributeruler#init
@@ -68,8 +69,27 @@ class AttributeRuler(Pipe):
         self._attrs_unnormed = []  # store for reference
         self.indices = []
 
-        if pattern_dicts:
-            self.add_patterns(pattern_dicts)
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        patterns: Optional[Iterable[AttributeRulerPatternType]] = None,
+        tag_map: Optional[TagMapType]=None,
+        morph_rules: Optional[MorphRulesType]=None
+    ):
+        """Initialize the attribute ruler by adding zero or more patterns.
+        
+        Rules can be specified as a sequence of dicts using the `patterns`
+        keyword argument. You can also provide rules using the "tag map" or
+        "morph rules" formats supported by spaCy prior to v3.
+        """
+        if patterns:
+            self.add_patterns(patterns)
+        if tag_map:
+            self.load_from_tag_map(tag_map)
+        if morph_rules:
+            self.load_from_morph_rules(morph_rules)
 
     def __call__(self, doc: Doc) -> Doc:
         """Apply the AttributeRuler to a Doc and set all attribute exceptions.

From 452b8309f9e34530e5f592699a3601400f40ffb0 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sun, 4 Oct 2020 13:26:46 +0200
Subject: [PATCH 378/516] slight rewrite to hide some thinc implementation
 details

---
 website/docs/usage/layers-architectures.md | 98 ++++++++++++++--------
 1 file changed, 63 insertions(+), 35 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 6f79cc6e8..25f9a568c 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -373,7 +373,7 @@ gpu_allocator = "pytorch"
 Of course it's also possible to define the `Model` from the previous section
 entirely in Thinc. The Thinc documentation provides details on the
 [various layers](https://thinc.ai/docs/api-layers) and helper functions
-available. Combinators can also be used to
+available. Combinators can be used to
 [overload operators](https://thinc.ai/docs/usage-models#operators) and a common
 usage pattern is to bind `chain` to `>>`. The "native" Thinc version of our
 simple neural network would then become:
@@ -494,13 +494,34 @@ from scratch. This can be done by creating a new class inheriting from
 
 ### Example: Pipeline component for relation extraction {#component-rel}
 
-This section will run through an example of implementing a novel relation
-extraction component from scratch. As a first step, we need a method that will
+This section outlines an example use-case of implementing a novel relation
+extraction component from scratch. We assume we want to implement a binary 
+relation extraction method that determines whether two entities in a document 
+are related or not, and if so, with what type of relation. We'll allow multiple 
+types of relations between two such entities - i.e. it is a multi-label setting.
+
+We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes 
+a list of documents as input, and outputs a two-dimensional matrix of scores:
+
+```python
+@registry.architectures.register("rel_model.v1")
+def create_relation_model(...) -> Model[List[Doc], Floats2d]:
+    model = _create_my_model()
+    return model
+```
+
+The first layer in this model will typically be an
+[embedding layer](/usage/embeddings-transformers) such as a
+[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
+layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
+transforms each document into a list of tokens, with each token being 
+represented by its embedding in the vector space.
+
+Next, we need a method that will
 generate pairs of entities that we want to classify as being related or not.
 These candidate pairs are typically formed within one document, which means
 we'll have a function that takes a `Doc` as input and outputs a `List` of `Span`
-tuples. In this example, we will focus on binary relation extraction, i.e. the
-tuple will be of length 2. For instance, a very straightforward implementation
+tuples. For instance, a very straightforward implementation
 would be to just take any two entities from the same document:
 
 ```python
@@ -512,18 +533,24 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
     return candidates
 ```
 
-But we could also refine this further by excluding relations of an entity with
-itself, and posing a maximum distance (in number of tokens) between two
-entities. We'll also register this function in the
-[`@misc` registry](/api/top-level#registry) so we can refer to it from the
-config, and easily swap it out for any other candidate generation function.
-
 > ```
-> [get_candidates]
+> [model]
+> @architectures = "rel_model.v1"
+> 
+> [model.tok2vec]
+> ...
+> 
+> [model.get_candidates]
 > @misc = "rel_cand_generator.v2"
 > max_length = 6
 > ```
 
+But we could also refine this further by excluding relations of an entity with
+itself, and posing a maximum distance (in number of tokens) between two
+entities. We'll register this function in the
+[`@misc` registry](/api/top-level#registry) so we can refer to it from the
+config, and easily swap it out for any other candidate generation function.
+
 ```python
 ### {highlight="1,2,7,8"}
 @registry.misc.register("rel_cand_generator.v2")
@@ -539,32 +566,33 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span
     return get_candidates
 ```
 
+Finally, we'll require a method that transforms the candidate pairs of entities into 
+a 2D tensor using the specified Tok2Vec function, and this `Floats2d` object will then be 
+processed by a final `output_layer` of the network. Taking all this together, we can define 
+our relation model like this in the config:
+
 > ```
-> [tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v1"
-> pretrained_vectors = null
-> width = 96
-> depth = 2
-> embed_size = 300
-> window_size = 1
-> maxout_pieces = 3
-> subword_features = true
+> [model]
+> @architectures = "rel_model.v1"
+> nO = null
+> 
+> [model.tok2vec]
+> ...
+> 
+> [model.get_candidates]
+> @misc = "rel_cand_generator.v2"
+> max_length = 6
+> 
+> [model.create_candidate_tensor]
+> @misc = "rel_cand_tensor.v1"
+> 
+> [model.output_layer]
+> @architectures = "rel_output_layer.v1"
+> nI = null
+> nO = null
 > ```
 
-Next, we'll assume we have access to an
-[embedding layer](/usage/embeddings-transformers) such as a
-[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
-layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
-transforms a list of documents into a list of 2D vectors. Further, this
-`tok2vec` component will be trainable, which means that, following the Thinc
-paradigm, we'll apply it to some input, and receive the predicted results as
-well as a callback to perform backpropagation:
-
-```python
-tok2vec = model.get_ref("tok2vec")
-tokvecs, bp_tokvecs = tok2vec(docs, is_train=True)
-```
-
+<!-- Link to project for implementation details -->
 
 
 

From 11347f34da5182d35559eae644231a432fb4d9c4 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 4 Oct 2020 13:54:05 +0200
Subject: [PATCH 379/516] Tidy up, tests and docs

---
 spacy/pipeline/attributeruler.py            |  57 ++++----
 spacy/tests/pipeline/test_attributeruler.py | 105 ++++++++------
 website/docs/api/attributeruler.md          | 145 +++++++++++---------
 website/docs/usage/linguistic-features.md   |  12 +-
 website/docs/usage/v3.md                    |  26 +++-
 5 files changed, 193 insertions(+), 152 deletions(-)

diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index b4580ff7c..9e6174d07 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -1,10 +1,11 @@
+from typing import List, Dict, Union, Iterable, Any, Optional, Callable, Iterator
+from typing import Tuple
 import srsly
-from typing import List, Dict, Union, Iterable, Any, Optional
 from pathlib import Path
 
 from .pipe import Pipe
 from ..errors import Errors
-from ..training import validate_examples
+from ..training import validate_examples, Example
 from ..language import Language
 from ..matcher import Matcher
 from ..scorer import Scorer
@@ -22,17 +23,9 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
 MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
 
 
-@Language.factory(
-    "attribute_ruler", default_config={"validate": False}
-)
-def make_attribute_ruler(
-    nlp: Language,
-    name: str,
-    validate: bool,
-):
-    return AttributeRuler(
-        nlp.vocab, name, pattern_dicts=pattern_dicts, validate=validate
-    )
+@Language.factory("attribute_ruler", default_config={"validate": False})
+def make_attribute_ruler(nlp: Language, name: str, validate: bool):
+    return AttributeRuler(nlp.vocab, name, validate=validate)
 
 
 class AttributeRuler(Pipe):
@@ -43,12 +36,7 @@ class AttributeRuler(Pipe):
     """
 
     def __init__(
-        self,
-        vocab: Vocab,
-        name: str = "attribute_ruler",
-        *,
-        pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None,
-        validate: bool = False,
+        self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False
     ) -> None:
         """Create the AttributeRuler. After creation, you can add patterns
         with the `.initialize()` or `.add_patterns()` methods, or load patterns
@@ -57,7 +45,7 @@ class AttributeRuler(Pipe):
 
         vocab (Vocab): The vocab.
         name (str): The pipe name. Defaults to "attribute_ruler".
-        
+
         RETURNS (AttributeRuler): The AttributeRuler component.
 
         DOCS: https://nightly.spacy.io/api/attributeruler#init
@@ -71,15 +59,15 @@ class AttributeRuler(Pipe):
 
     def initialize(
         self,
-        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        get_examples: Optional[Callable[[], Iterable[Example]]],
         *,
         nlp: Optional[Language] = None,
         patterns: Optional[Iterable[AttributeRulerPatternType]] = None,
-        tag_map: Optional[TagMapType]=None,
-        morph_rules: Optional[MorphRulesType]=None
+        tag_map: Optional[TagMapType] = None,
+        morph_rules: Optional[MorphRulesType] = None,
     ):
         """Initialize the attribute ruler by adding zero or more patterns.
-        
+
         Rules can be specified as a sequence of dicts using the `patterns`
         keyword argument. You can also provide rules using the "tag map" or
         "morph rules" formats supported by spaCy prior to v3.
@@ -126,7 +114,7 @@ class AttributeRuler(Pipe):
             set_token_attrs(span[index], attrs)
         return doc
 
-    def pipe(self, stream, *, batch_size=128):
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -210,16 +198,16 @@ class AttributeRuler(Pipe):
         self.attrs.append(attrs)
         self.indices.append(index)
 
-    def add_patterns(self, pattern_dicts: Iterable[AttributeRulerPatternType]) -> None:
+    def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None:
         """Add patterns from a list of pattern dicts with the keys as the
         arguments to AttributeRuler.add.
-        pattern_dicts (Iterable[dict]): A list of pattern dicts with the keys
+        patterns (Iterable[dict]): A list of pattern dicts with the keys
             as the arguments to AttributeRuler.add (patterns/attrs/index) to
             add as patterns.
 
         DOCS: https://nightly.spacy.io/api/attributeruler#add_patterns
         """
-        for p in pattern_dicts:
+        for p in patterns:
             self.add(**p)
 
     @property
@@ -234,7 +222,7 @@ class AttributeRuler(Pipe):
             all_patterns.append(p)
         return all_patterns
 
-    def score(self, examples, **kwargs):
+    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples.
 
         examples (Iterable[Example]): The examples to score.
@@ -275,7 +263,7 @@ class AttributeRuler(Pipe):
 
     def from_bytes(
         self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList()
-    ):
+    ) -> "AttributeRuler":
         """Load the AttributeRuler from a bytestring.
 
         bytes_data (bytes): The data to load.
@@ -293,7 +281,6 @@ class AttributeRuler(Pipe):
             "patterns": load_patterns,
         }
         util.from_bytes(bytes_data, deserialize, exclude)
-
         return self
 
     def to_disk(
@@ -303,6 +290,7 @@ class AttributeRuler(Pipe):
 
         path (Union[Path, str]): A path to a directory.
         exclude (Iterable[str]): String names of serialization fields to exclude.
+
         DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
         """
         serialize = {
@@ -313,11 +301,13 @@ class AttributeRuler(Pipe):
 
     def from_disk(
         self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
-    ) -> None:
+    ) -> "AttributeRuler":
         """Load the AttributeRuler from disk.
 
         path (Union[Path, str]): A path to a directory.
         exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (AttributeRuler): The loaded object.
+
         DOCS: https://nightly.spacy.io/api/attributeruler#from_disk
         """
 
@@ -329,11 +319,10 @@ class AttributeRuler(Pipe):
             "patterns": load_patterns,
         }
         util.from_disk(path, deserialize, exclude)
-
         return self
 
 
-def _split_morph_attrs(attrs):
+def _split_morph_attrs(attrs: dict) -> Tuple[dict, dict]:
     """Split entries from a tag map or morph rules dict into to two dicts, one
     with the token-level features (POS, LEMMA) and one with the remaining
     features, which are presumed to be individual MORPH features."""
diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py
index 5773127af..c967bcdcd 100644
--- a/spacy/tests/pipeline/test_attributeruler.py
+++ b/spacy/tests/pipeline/test_attributeruler.py
@@ -63,6 +63,39 @@ def morph_rules():
     return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}
 
 
+def check_tag_map(ruler):
+    doc = Doc(
+        ruler.vocab,
+        words=["This", "is", "a", "test", "."],
+        tags=["DT", "VBZ", "DT", "NN", "."],
+    )
+    doc = ruler(doc)
+    for i in range(len(doc)):
+        if i == 4:
+            assert doc[i].pos_ == "PUNCT"
+            assert str(doc[i].morph) == "PunctType=peri"
+        else:
+            assert doc[i].pos_ == ""
+            assert str(doc[i].morph) == ""
+
+
+def check_morph_rules(ruler):
+    doc = Doc(
+        ruler.vocab,
+        words=["This", "is", "the", "test", "."],
+        tags=["DT", "VBZ", "DT", "NN", "."],
+    )
+    doc = ruler(doc)
+    for i in range(len(doc)):
+        if i != 2:
+            assert doc[i].pos_ == ""
+            assert str(doc[i].morph) == ""
+        else:
+            assert doc[2].pos_ == "DET"
+            assert doc[2].lemma_ == "a"
+            assert str(doc[2].morph) == "Case=Nom"
+
+
 def test_attributeruler_init(nlp, pattern_dicts):
     a = nlp.add_pipe("attribute_ruler")
     for p in pattern_dicts:
@@ -78,7 +111,8 @@ def test_attributeruler_init(nlp, pattern_dicts):
 
 def test_attributeruler_init_patterns(nlp, pattern_dicts):
     # initialize with patterns
-    nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
+    ruler = nlp.add_pipe("attribute_ruler")
+    ruler.initialize(lambda: [], patterns=pattern_dicts)
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
     assert str(doc[2].morph) == "Case=Nom|Number=Plur"
@@ -88,10 +122,11 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     assert doc.has_annotation("MORPH")
     nlp.remove_pipe("attribute_ruler")
     # initialize with patterns from asset
-    nlp.add_pipe(
-        "attribute_ruler",
-        config={"pattern_dicts": {"@misc": "attribute_ruler_patterns"}},
-    )
+    nlp.config["initialize"]["components"]["attribute_ruler"] = {
+        "patterns": {"@misc": "attribute_ruler_patterns"}
+    }
+    nlp.add_pipe("attribute_ruler")
+    nlp.initialize()
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
     assert str(doc[2].morph) == "Case=Nom|Number=Plur"
@@ -103,18 +138,15 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
 
 def test_attributeruler_score(nlp, pattern_dicts):
     # initialize with patterns
-    nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
+    ruler = nlp.add_pipe("attribute_ruler")
+    ruler.initialize(lambda: [], patterns=pattern_dicts)
     doc = nlp("This is a test.")
     assert doc[2].lemma_ == "the"
     assert str(doc[2].morph) == "Case=Nom|Number=Plur"
     assert doc[3].lemma_ == "cat"
     assert str(doc[3].morph) == "Case=Nom|Number=Sing"
-
-    dev_examples = [
-        Example.from_dict(
-            nlp.make_doc("This is a test."), {"lemmas": ["this", "is", "a", "cat", "."]}
-        )
-    ]
+    doc = nlp.make_doc("This is a test.")
+    dev_examples = [Example.from_dict(doc, {"lemmas": ["this", "is", "a", "cat", "."]})]
     scores = nlp.evaluate(dev_examples)
     # "cat" is the only correct lemma
     assert scores["lemma_acc"] == pytest.approx(0.2)
@@ -139,40 +171,27 @@ def test_attributeruler_rule_order(nlp):
 
 
 def test_attributeruler_tag_map(nlp, tag_map):
-    a = AttributeRuler(nlp.vocab)
-    a.load_from_tag_map(tag_map)
-    doc = Doc(
-        nlp.vocab,
-        words=["This", "is", "a", "test", "."],
-        tags=["DT", "VBZ", "DT", "NN", "."],
-    )
-    doc = a(doc)
-    for i in range(len(doc)):
-        if i == 4:
-            assert doc[i].pos_ == "PUNCT"
-            assert str(doc[i].morph) == "PunctType=peri"
-        else:
-            assert doc[i].pos_ == ""
-            assert str(doc[i].morph) == ""
+    ruler = AttributeRuler(nlp.vocab)
+    ruler.load_from_tag_map(tag_map)
+    check_tag_map(ruler)
+
+
+def test_attributeruler_tag_map_initialize(nlp, tag_map):
+    ruler = nlp.add_pipe("attribute_ruler")
+    ruler.initialize(lambda: [], tag_map=tag_map)
+    check_tag_map(ruler)
 
 
 def test_attributeruler_morph_rules(nlp, morph_rules):
-    a = AttributeRuler(nlp.vocab)
-    a.load_from_morph_rules(morph_rules)
-    doc = Doc(
-        nlp.vocab,
-        words=["This", "is", "the", "test", "."],
-        tags=["DT", "VBZ", "DT", "NN", "."],
-    )
-    doc = a(doc)
-    for i in range(len(doc)):
-        if i != 2:
-            assert doc[i].pos_ == ""
-            assert str(doc[i].morph) == ""
-        else:
-            assert doc[2].pos_ == "DET"
-            assert doc[2].lemma_ == "a"
-            assert str(doc[2].morph) == "Case=Nom"
+    ruler = AttributeRuler(nlp.vocab)
+    ruler.load_from_morph_rules(morph_rules)
+    check_morph_rules(ruler)
+
+
+def test_attributeruler_morph_rules_initialize(nlp, morph_rules):
+    ruler = nlp.add_pipe("attribute_ruler")
+    ruler.initialize(lambda: [], morph_rules=morph_rules)
+    check_morph_rules(ruler)
 
 
 def test_attributeruler_indices(nlp):
diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md
index 60fda6bda..b89759080 100644
--- a/website/docs/api/attributeruler.md
+++ b/website/docs/api/attributeruler.md
@@ -4,6 +4,7 @@ tag: class
 source: spacy/pipeline/attributeruler.py
 new: 3
 teaser: 'Pipeline component for rule-based token attribute assignment'
+api_base_class: /api/pipe
 api_string_name: attribute_ruler
 api_trainable: false
 ---
@@ -25,17 +26,13 @@ how the component should be configured. You can override its settings via the
 > #### Example
 >
 > ```python
-> config = {
->    "pattern_dicts": None,
->    "validate": True,
-> }
+> config = {"validate": True}
 > nlp.add_pipe("attribute_ruler", config=config)
 > ```
 
-| Setting         | Description                                                                                                                                                                                                                                    |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `pattern_dicts` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ |
-| `validate`      | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~                                                                                                                                                  |
+| Setting    | Description                                                                                   |
+| ---------- | --------------------------------------------------------------------------------------------- |
+| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/attributeruler.py
@@ -43,36 +40,26 @@ how the component should be configured. You can override its settings via the
 
 ## AttributeRuler.\_\_init\_\_ {#init tag="method"}
 
-Initialize the attribute ruler. If pattern dicts are supplied here, they need to
-be a list of dictionaries with `"patterns"`, `"attrs"`, and optional `"index"`
-keys, e.g.:
-
-```python
-pattern_dicts = [
-    {"patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}},
-    {"patterns": [[{"LOWER": "an"}]], "attrs": {"LEMMA": "a"}},
-]
-```
+Initialize the attribute ruler.
 
 > #### Example
 >
 > ```python
 > # Construction via add_pipe
-> attribute_ruler = nlp.add_pipe("attribute_ruler")
+> ruler = nlp.add_pipe("attribute_ruler")
 > ```
 
-| Name            | Description                                                                                                                              |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`         | The shared vocabulary to pass to the matcher. ~~Vocab~~                                                                                  |
-| `name`          | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ |
-| _keyword-only_  |                                                                                                                                          |
-| `pattern_dicts` | Optional patterns to load in on initialization. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~      |
-| `validate`      | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~                       |
+| Name           | Description                                                                                                                              |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`        | The shared vocabulary to pass to the matcher. ~~Vocab~~                                                                                  |
+| `name`         | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ |
+| _keyword-only_ |                                                                                                                                          |
+| `validate`     | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~                       |
 
 ## AttributeRuler.\_\_call\_\_ {#call tag="method"}
 
-Apply the attribute ruler to a `Doc`, setting token attributes for tokens matched
-by the provided patterns.
+Apply the attribute ruler to a `Doc`, setting token attributes for tokens
+matched by the provided patterns.
 
 | Name        | Description                      |
 | ----------- | -------------------------------- |
@@ -90,10 +77,10 @@ may be negative to index from the end of the span.
 > #### Example
 >
 > ```python
-> attribute_ruler = nlp.add_pipe("attribute_ruler")
+> ruler = nlp.add_pipe("attribute_ruler")
 > patterns = [[{"TAG": "VB"}]]
 > attrs = {"POS": "VERB"}
-> attribute_ruler.add(patterns=patterns, attrs=attrs)
+> ruler.add(patterns=patterns, attrs=attrs)
 > ```
 
 | Name       | Description                                                                                                                       |
@@ -107,11 +94,10 @@ may be negative to index from the end of the span.
 > #### Example
 >
 > ```python
-> attribute_ruler = nlp.add_pipe("attribute_ruler")
-> pattern_dicts = [
+> ruler = nlp.add_pipe("attribute_ruler")
+> patterns = [
 >   {
->     "patterns": [[{"TAG": "VB"}]],
->     "attrs": {"POS": "VERB"}
+>     "patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}
 >   },
 >   {
 >     "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]],
@@ -119,15 +105,16 @@ may be negative to index from the end of the span.
 >     "index": -1
 >   },
 > ]
-> attribute_ruler.add_patterns(pattern_dicts)
+> ruler.add_patterns(patterns)
 > ```
 
-Add patterns from a list of pattern dicts with the keys as the arguments to
+Add patterns from a list of pattern dicts. Each pattern dict can specify the
+keys `"patterns"`, `"attrs"` and `"index"`, which match the arguments of
 [`AttributeRuler.add`](/api/attributeruler#add).
 
-| Name            | Description                                                                |
-| --------------- | -------------------------------------------------------------------------- |
-| `pattern_dicts` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ |
+| Name       | Description                                                                |
+| ---------- | -------------------------------------------------------------------------- |
+| `patterns` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ |
 
 ## AttributeRuler.patterns {#patterns tag="property"}
 
@@ -139,20 +126,39 @@ Get all patterns that have been added to the attribute ruler in the
 | ----------- | -------------------------------------------------------------------------------------------- |
 | **RETURNS** | The patterns added to the attribute ruler. ~~List[Dict[str, Union[List[dict], dict, int]]]~~ |
 
-## AttributeRuler.score {#score tag="method" new="3"}
+## AttributeRuler.initialize {#initialize tag="method"}
 
-Score a batch of examples.
+Initialize the component with data. Typically called before training to load in
+rules from a file. This method is typically called by
+[`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
 
 > #### Example
 >
 > ```python
-> scores = attribute_ruler.score(examples)
+> ruler = nlp.add_pipe("attribute_ruler")
+> ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.attribute_ruler]
+>
+> [initialize.components.attribute_ruler.patterns]
+> @readers = "srsly.read_json.v1"
+> path = "corpus/attribute_ruler_patterns.json"
 > ```
 
-| Name        | Description                                                                                                                                                                                                           |
-| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`  | The examples to score. ~~Iterable[Example]~~                                                                                                                                                                          |
-| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ |
+| Name           | Description                                                                                                                                                                                                                                    |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects (the training data). Not used by this component. ~~Callable[[], Iterable[Example]]~~                                                          |
+| _keyword-only_ |                                                                                                                                                                                                                                                |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                           |
+| `patterns`     | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ |
+| `tag_map`      | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. Defaults to `None`. ~~Optional[Dict[str, Dict[Union[int, str], Union[int, str]]]]~~                                                                 |
+| `morph_rules`  | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]]~~                            |
 
 ## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"}
 
@@ -170,6 +176,21 @@ Load attribute ruler patterns from morph rules.
 | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
 
+## AttributeRuler.score {#score tag="method" new="3"}
+
+Score a batch of examples.
+
+> #### Example
+>
+> ```python
+> scores = ruler.score(examples)
+> ```
+
+| Name        | Description                                                                                                                                                                                                           |
+| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`  | The examples to score. ~~Iterable[Example]~~                                                                                                                                                                          |
+| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ |
+
 ## AttributeRuler.to_disk {#to_disk tag="method"}
 
 Serialize the pipe to disk.
@@ -177,8 +198,8 @@ Serialize the pipe to disk.
 > #### Example
 >
 > ```python
-> attribute_ruler = nlp.add_pipe("attribute_ruler")
-> attribute_ruler.to_disk("/path/to/attribute_ruler")
+> ruler = nlp.add_pipe("attribute_ruler")
+> ruler.to_disk("/path/to/attribute_ruler")
 > ```
 
 | Name           | Description                                                                                                                                |
@@ -194,8 +215,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
 > #### Example
 >
 > ```python
-> attribute_ruler = nlp.add_pipe("attribute_ruler")
-> attribute_ruler.from_disk("/path/to/attribute_ruler")
+> ruler = nlp.add_pipe("attribute_ruler")
+> ruler.from_disk("/path/to/attribute_ruler")
 > ```
 
 | Name           | Description                                                                                     |
@@ -210,8 +231,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
 > #### Example
 >
 > ```python
-> attribute_ruler = nlp.add_pipe("attribute_ruler")
-> attribute_ruler_bytes = attribute_ruler.to_bytes()
+> ruler = nlp.add_pipe("attribute_ruler")
+> ruler_bytes = ruler.to_bytes()
 > ```
 
 Serialize the pipe to a bytestring.
@@ -229,9 +250,9 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
 > #### Example
 >
 > ```python
-> attribute_ruler_bytes = attribute_ruler.to_bytes()
-> attribute_ruler = nlp.add_pipe("attribute_ruler")
-> attribute_ruler.from_bytes(attribute_ruler_bytes)
+> ruler_bytes = ruler.to_bytes()
+> ruler = nlp.add_pipe("attribute_ruler")
+> ruler.from_bytes(ruler_bytes)
 > ```
 
 | Name           | Description                                                                                 |
@@ -250,12 +271,12 @@ serialization by passing in the string names via the `exclude` argument.
 > #### Example
 >
 > ```python
-> data = attribute_ruler.to_disk("/path", exclude=["vocab"])
+> data = ruler.to_disk("/path", exclude=["vocab"])
 > ```
 
-| Name       | Description                                                    |
-| ---------- | -------------------------------------------------------------- |
-| `vocab`    | The shared [`Vocab`](/api/vocab).                              |
-| `patterns` | The `Matcher` patterns. You usually don't want to exclude this.  |
-| `attrs`    | The attributes to set. You usually don't want to exclude this. |
-| `indices`  | The token indices. You usually don't want to exclude this.     |
+| Name       | Description                                                     |
+| ---------- | --------------------------------------------------------------- |
+| `vocab`    | The shared [`Vocab`](/api/vocab).                               |
+| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. |
+| `attrs`    | The attributes to set. You usually don't want to exclude this.  |
+| `indices`  | The token indices. You usually don't want to exclude this.      |
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 7b9aaa0b9..1964bac18 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1801,17 +1801,7 @@ print(doc2[5].tag_, doc2[5].pos_)  # WP PRON
 
 <Infobox variant="warning" title="Migrating from spaCy v2.x">
 
-For easy migration from from spaCy v2 to v3, the
-[`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules**
-in the v2 format with the methods
-[`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and
-[`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules).
-
-```diff
-nlp = spacy.blank("en")
-+ ruler = nlp.add_pipe("attribute_ruler")
-+ ruler.load_from_tag_map(YOUR_TAG_MAP)
-```
+The [`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules** in the v2.x format via its built-in methods or when the component is initialized before training. See the [migration guide](/usage/v3#migrating-training-mappings-exceptions) for details.
 
 </Infobox>
 
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 4ce57af01..a10fc6321 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -804,8 +804,30 @@ nlp = spacy.blank("en")
 Instead of defining a `tag_map` and `morph_rules` in the language data, spaCy
 v3.0 now manages mappings and exceptions with a separate and more flexible
 pipeline component, the [`AttributeRuler`](/api/attributeruler). See the
-[usage guide](/usage/linguistic-features#mappings-exceptions) for examples. The
-`AttributeRuler` provides two handy helper methods
+[usage guide](/usage/linguistic-features#mappings-exceptions) for examples. If
+you have tag maps and morph rules in the v2.x format, you can load them into the
+attribute ruler before training using the `[initialize]` block of your config.
+
+> #### What does the initialization do?
+>
+> The `[initialize]` block is used when
+> [`nlp.initialize`](/api/language#initialize) is called (usually right before
+> training). It lets you define data resources for initializing the pipeline in
+> your `config.cfg`. After training, the rules are saved to disk with the
+> exported pipeline, so your runtime model doesn't depend on local data. For
+> details see the [config lifecycle](/usage/training/#config-lifecycle) and
+> [initialization](/usage/training/#initialization) docs.
+
+```ini
+### config.cfg (excerpt)
+[initialize.components.attribute_ruler]
+
+[initialize.components.attribute_ruler.tag_map]
+@readers = "srsly.read_json.v1"
+path = "./corpus/tag_map.json"
+```
+
+The `AttributeRuler` also provides two handy helper methods
 [`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and
 [`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules) that let
 you load in your existing tag map or morph rules:

From 9f40d963fd92d2dc5de04af2bda45d79d440113e Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sun, 4 Oct 2020 14:11:53 +0200
Subject: [PATCH 380/516] highlight the two steps: the model and the pipeline
 component

---
 website/docs/usage/layers-architectures.md | 126 ++++++++++++++-------
 1 file changed, 88 insertions(+), 38 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 25f9a568c..c4b3fb9dc 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -495,12 +495,19 @@ from scratch. This can be done by creating a new class inheriting from
 ### Example: Pipeline component for relation extraction {#component-rel}
 
 This section outlines an example use-case of implementing a novel relation
-extraction component from scratch. We assume we want to implement a binary 
-relation extraction method that determines whether two entities in a document 
-are related or not, and if so, with what type of relation. We'll allow multiple 
+extraction component from scratch. We assume we want to implement a binary
+relation extraction method that determines whether two entities in a document
+are related or not, and if so, with what type of relation. We'll allow multiple
 types of relations between two such entities - i.e. it is a multi-label setting.
 
-We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes 
+There are two major steps required: first, we need to
+[implement a machine learning model](#component-rel-model) specific to this
+task, and then we'll use this model to
+[implement a custom pipeline component](#component-rel-pipe).
+
+#### Step 1: Implementing the Model {#component-rel-model}
+
+We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes
 a list of documents as input, and outputs a two-dimensional matrix of scores:
 
 ```python
@@ -514,15 +521,15 @@ The first layer in this model will typically be an
 [embedding layer](/usage/embeddings-transformers) such as a
 [`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
 layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
-transforms each document into a list of tokens, with each token being 
+transforms each document into a list of tokens, with each token being
 represented by its embedding in the vector space.
 
-Next, we need a method that will
-generate pairs of entities that we want to classify as being related or not.
-These candidate pairs are typically formed within one document, which means
-we'll have a function that takes a `Doc` as input and outputs a `List` of `Span`
-tuples. For instance, a very straightforward implementation
-would be to just take any two entities from the same document:
+Next, we need a method that will generate pairs of entities that we want to
+classify as being related or not. These candidate pairs are typically formed
+within one document, which means we'll have a function that takes a `Doc` as
+input and outputs a `List` of `Span` tuples. For instance, a very
+straightforward implementation would be to just take any two entities from the
+same document:
 
 ```python
 def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
@@ -536,10 +543,10 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
 > ```
 > [model]
 > @architectures = "rel_model.v1"
-> 
+>
 > [model.tok2vec]
 > ...
-> 
+>
 > [model.get_candidates]
 > @misc = "rel_cand_generator.v2"
 > max_length = 6
@@ -566,33 +573,76 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span
     return get_candidates
 ```
 
-Finally, we'll require a method that transforms the candidate pairs of entities into 
-a 2D tensor using the specified Tok2Vec function, and this `Floats2d` object will then be 
-processed by a final `output_layer` of the network. Taking all this together, we can define 
-our relation model like this in the config:
+Finally, we'll require a method that transforms the candidate pairs of entities
+into a 2D tensor using the specified Tok2Vec function, and this `Floats2d`
+object will then be processed by a final `output_layer` of the network. Taking
+all this together, we can define our relation model like this in the config:
 
-> ```
-> [model]
-> @architectures = "rel_model.v1"
-> nO = null
-> 
-> [model.tok2vec]
-> ...
-> 
-> [model.get_candidates]
-> @misc = "rel_cand_generator.v2"
-> max_length = 6
-> 
-> [components.relation_extractor.model.create_candidate_tensor]
-> @misc = "rel_cand_tensor.v1"
-> 
-> [components.relation_extractor.model.output_layer]
-> @architectures = "rel_output_layer.v1"
-> nI = null
-> nO = null
-> ```
+```
+[model]
+@architectures = "rel_model.v1"
+...
 
-<!-- Link to project for implementation details -->
+[model.tok2vec]
+...
+
+[model.get_candidates]
+@misc = "rel_cand_generator.v2"
+max_length = 6
+
+[model.create_candidate_tensor]
+@misc = "rel_cand_tensor.v1"
+
+[model.output_layer]
+@architectures = "rel_output_layer.v1"
+...
+```
+
+<!-- TODO: Link to project for implementation details -->
+
+When creating this model, we'll store the custom functions as
+[attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
+references, so we can access them easily:
+
+```python
+tok2vec_layer = model.get_ref("tok2vec")
+output_layer = model.get_ref("output_layer")
+create_candidate_tensor = model.attrs["create_candidate_tensor"]
+get_candidates = model.attrs["get_candidates"]
+```
+
+#### Step 2: Implementing the pipeline component {#component-rel-pipe}
+
+To use our new relation extraction model as part of a custom component, we 
+create a subclass of [`Pipe`](/api/pipe) that will hold the model:
+
+```python
+from spacy.pipeline import Pipe
+from spacy.language import Language
+
+class RelationExtractor(Pipe):
+    def __init__(self, vocab, model, name="rel", labels=[]):
+        ...
+
+    def predict(self, docs):
+        ...
+
+    def set_annotations(self, docs, scores):
+        ...
+
+@Language.factory("relation_extractor")
+def make_relation_extractor(nlp, name, model, labels):
+    return RelationExtractor(nlp.vocab, model, name, labels=labels)
+```
+
+The [`predict`](/api/pipe#predict ) function needs to be implemented for each subclass. 
+In our case, we can simply delegate to the internal model's 
+[predict](https://thinc.ai/docs/api-model#predict) function:
+```python
+def predict(self, docs: Iterable[Doc]) -> Floats2d:
+    scores = self.model.predict(docs)
+    return self.model.ops.asarray(scores)
+```
 
 
 

From 9b3a9343615bf98e01abaa0a7db0fe563458fdf5 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 4 Oct 2020 14:14:55 +0200
Subject: [PATCH 381/516] Update docs [ci skip]

---
 website/docs/usage/training.md | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 8516b444c..1981f03b7 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -8,6 +8,7 @@ menu:
   - ['Config System', 'config']
   - ['Custom Training', 'config-custom']
   - ['Custom Functions', 'custom-functions']
+  - ['Initialization', 'initialization']
   - ['Data Utilities', 'data']
   - ['Parallel Training', 'parallel-training']
   - ['Internal API', 'api']
@@ -824,12 +825,15 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
     return create_model(output_width)
 ```
 
-### Customizing the initialization {#initialization}
+## Customizing the initialization {#initialization}
 
 When you start training a new model from scratch,
 [`spacy train`](/api/cli#train) will call
-[`nlp.initialize`](/api/language#initialize) to initialize the pipeline for
-training. This process typically includes the following:
+[`nlp.initialize`](/api/language#initialize) to initialize the pipeline and load
+the required data. All settings for this are defined in the
+[`[initialize]`](/api/data-formats#config-initialize) block of the config, so
+you can keep track of how the initial `nlp` object was created. The
+initialization process typically includes the following:
 
 > #### config.cfg (excerpt)
 >
@@ -859,10 +863,22 @@ The initialization step allows the config to define **all settings** required
 for the pipeline, while keeping a separation between settings and functions that
 should only be used **before training** to set up the initial pipeline, and
 logic and configuration that needs to be available **at runtime**. Without that
-separation, TODO:
+separation, it would be very difficult to use the same, reproducible config file
+because the component settings required for training (load data from an external
+file) wouldn't match the component settings required at runtime (load what's
+included with the saved `nlp` object and don't depend on an external file).
 
 ![Illustration of pipeline lifecycle](../images/lifecycle.svg)
 
+<Infobox title="How components save and load data" emoji="📖">
+
+For details and examples of how pipeline components can **save and load data
+assets** like model weights or lookup tables, and how the component
+initialization is implemented under the hood, see the usage guide on
+[serializing and initializing component data](/usage/processing-pipelines#component-data-initialization).
+
+</Infobox>
+
 #### Initializing labels {#initialization-labels}
 
 Built-in pipeline components like the

From 84ae197dd6d229b1ef34a205d1103f87623c0db6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 4 Oct 2020 14:16:53 +0200
Subject: [PATCH 382/516] Fix logger

---
 spacy/training/loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index b63adb6c9..0d4414964 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -77,7 +77,7 @@ def train(
         log_step, finalize_logger = train_logger(nlp, stdout, stderr)
     try:
         for batch, info, is_best_checkpoint in training_step_iterator:
-            log_step(info if is_best_checkpoint else None)
+            log_step(info if is_best_checkpoint is not None else None)
             if is_best_checkpoint is not None and output_path is not None:
                 with nlp.select_pipes(disable=frozen_components):
                     update_meta(T, nlp, info)

From 8f018e47f84264ca852c67578af1ab95cbd74be3 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 4 Oct 2020 14:43:45 +0200
Subject: [PATCH 383/516] Adjust [initialize.components] on
 Language.remove_pipe and Language.rename_pipe

---
 spacy/language.py                         |  7 +++++++
 spacy/tests/pipeline/test_pipe_methods.py | 22 ++++++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/spacy/language.py b/spacy/language.py
index d76741da3..9fdde03d5 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -896,6 +896,10 @@ class Language:
         self._components[i] = (new_name, self._components[i][1])
         self._pipe_meta[new_name] = self._pipe_meta.pop(old_name)
         self._pipe_configs[new_name] = self._pipe_configs.pop(old_name)
+        # Make sure [initialize] config is adjusted
+        if old_name in self._config["initialize"]["components"]:
+            init_cfg = self._config["initialize"]["components"].pop(old_name)
+            self._config["initialize"]["components"][new_name] = init_cfg
 
     def remove_pipe(self, name: str) -> Tuple[str, Callable[[Doc], Doc]]:
         """Remove a component from the pipeline.
@@ -912,6 +916,9 @@ class Language:
         # because factory may be used for something else
         self._pipe_meta.pop(name)
         self._pipe_configs.pop(name)
+        # Make sure name is removed from the [initialize] config
+        if name in self._config["initialize"]["components"]:
+            self._config["initialize"]["components"].pop(name)
         # Make sure the name is also removed from the set of disabled components
         if name in self.disabled:
             self._disabled.remove(name)
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index e647ba440..a4297a1d1 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -382,3 +382,25 @@ def test_warning_pipe_begin_training():
 
             def begin_training(*args, **kwargs):
                 ...
+
+
+def test_pipe_methods_initialize():
+    """Test that the [initialize] config reflects the components correctly."""
+    nlp = Language()
+    nlp.add_pipe("tagger")
+    assert "tagger" not in nlp.config["initialize"]["components"]
+    nlp.config["initialize"]["components"]["tagger"] = {"labels": ["hello"]}
+    assert nlp.config["initialize"]["components"]["tagger"] == {"labels": ["hello"]}
+    nlp.remove_pipe("tagger")
+    assert "tagger" not in nlp.config["initialize"]["components"]
+    nlp.add_pipe("tagger")
+    assert "tagger" not in nlp.config["initialize"]["components"]
+    nlp.config["initialize"]["components"]["tagger"] = {"labels": ["hello"]}
+    nlp.rename_pipe("tagger", "my_tagger")
+    assert "tagger" not in nlp.config["initialize"]["components"]
+    assert nlp.config["initialize"]["components"]["my_tagger"] == {"labels": ["hello"]}
+    nlp.config["initialize"]["components"]["test"] = {"foo": "bar"}
+    nlp.add_pipe("ner", name="test")
+    assert "test" in nlp.config["initialize"]["components"]
+    nlp.remove_pipe("test")
+    assert "test" not in nlp.config["initialize"]["components"]

From b0463fbf75a83127352d52d6ac295bb73d16a6d0 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sun, 4 Oct 2020 14:56:48 +0200
Subject: [PATCH 384/516] set_annotations explanation

---
 website/docs/usage/layers-architectures.md | 48 ++++++++++++++++++++--
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index c4b3fb9dc..7e563cb5c 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -613,7 +613,7 @@ get_candidates = model.attrs["get_candidates"]
 
 #### Step 2: Implementing the pipeline component {#component-rel-pipe}
 
-To use our new relation extraction model as part of a custom component, we 
+To use our new relation extraction model as part of a custom component, we
 create a subclass of [`Pipe`](/api/pipe) that will hold the model:
 
 ```python
@@ -635,15 +635,57 @@ def make_relation_extractor(nlp, name, model, labels):
     return RelationExtractor(nlp.vocab, model, name, labels=labels)
 ```
 
-The [`predict`](/api/pipe#predict ) function needs to be implemented for each subclass. 
-In our case, we can simply delegate to the internal model's 
+The [`predict`](/api/pipe#predict) function needs to be implemented for each
+subclass. In our case, we can simply delegate to the internal model's
 [predict](https://thinc.ai/docs/api-model#predict) function:
+
 ```python
 def predict(self, docs: Iterable[Doc]) -> Floats2d:
     scores = self.model.predict(docs)
     return self.model.ops.asarray(scores)
 ```
 
+The other method that needs to be implemented is
+[`set_annotations`](/api/pipe#set_annotations). It takes the predicted scores,
+and modifies the given `Doc` object in place to hold the predictions. For our
+relation extraction component, we'll store the data as a dictionary in a custom
+extension attribute `doc._.rel`. As keys, we represent the candidate pair by the
+start offsets of each entity, as this defines an entity uniquely within one
+document.
+
+To interpret the scores predicted by the REL model correctly, we need to 
+refer to the model's `get_candidates` function that originally defined which 
+pairs of entities would be run through the model, so that the scores can be 
+related to those exact entities:
+
+> #### Example output
+>
+> ```python
+> doc = nlp("Amsterdam is the capital of the Netherlands.")
+> print(f"spans: {[(e.start, e.text, e.label_) for e in doc.ents]}")
+> for value, rel_dict in doc._.rel.items():
+>     print(f"{value}: {rel_dict}")
+> ```
+
+> ```
+> spans: [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
+> (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
+> (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
+> ```
+
+```python
+def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d):
+    c = 0
+    get_candidates = self.model.attrs["get_candidates"]
+    for doc in docs:
+        for (e1, e2) in get_candidates(doc):
+            offset = (e1.start, e2.start)
+            if offset not in doc._.rel:
+                doc._.rel[offset] = {}
+            for j, label in enumerate(self.labels):
+                doc._.rel[offset][label] = rel_scores[c, j]
+            c += 1
+```
 
 
 

From d38dc466c5d17cc66f6be4edc028e13e41788b6c Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 4 Oct 2020 15:26:01 +0200
Subject: [PATCH 385/516] Adjust error [ci skip]

---
 spacy/errors.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 9145a7b19..20edf45b5 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -185,8 +185,8 @@ class Errors:
             "the documentation:\nhttps://nightly.spacy.io/usage/models")
     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
             "component to the pipeline with: `nlp.add_pipe('sentencizer')`. "
-            "Alternatively, add the dependency parser, or set sentence "
-            "boundaries by setting `doc[i].is_sent_start`.")
+            "Alternatively, add the dependency parser or sentence recognizer, "
+            "or set sentence boundaries by setting `doc[i].is_sent_start`.")
     E031 = ("Invalid token: empty string ('') at position {i}.")
     E033 = ("Cannot load into non-empty Doc of length {length}.")
     E035 = ("Error creating span with start {start} and end {end} for Doc of "

From 3c36a57e84a4792af59cab5ea5b76c2301c303a4 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 4 Oct 2020 17:46:29 +0200
Subject: [PATCH 386/516] Update data augmenters (#6196)

* Draft lower-case augmenter

* Make warning a debug log

* Update lowercase augmenter, docs and tests

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
---
 spacy/tests/training/test_augmenters.py | 100 ++++++++++++++++++++++++
 spacy/tests/training/test_training.py   |  60 +-------------
 spacy/training/augment.py               |  31 ++++++++
 spacy/training/example.pyx              |   3 +-
 website/docs/api/top-level.md           |  32 ++++++--
 5 files changed, 161 insertions(+), 65 deletions(-)
 create mode 100644 spacy/tests/training/test_augmenters.py

diff --git a/spacy/tests/training/test_augmenters.py b/spacy/tests/training/test_augmenters.py
new file mode 100644
index 000000000..0bd4d5ef2
--- /dev/null
+++ b/spacy/tests/training/test_augmenters.py
@@ -0,0 +1,100 @@
+import pytest
+from spacy.training import Corpus
+from spacy.training.augment import create_orth_variants_augmenter
+from spacy.training.augment import create_lower_casing_augmenter
+from spacy.lang.en import English
+from spacy.tokens import DocBin, Doc
+from contextlib import contextmanager
+import random
+
+from ..util import make_tempdir
+
+
+@contextmanager
+def make_docbin(docs, name="roundtrip.spacy"):
+    with make_tempdir() as tmpdir:
+        output_file = tmpdir / name
+        DocBin(docs=docs).to_disk(output_file)
+        yield output_file
+
+
+@pytest.fixture
+def nlp():
+    return English()
+
+
+@pytest.fixture
+def doc(nlp):
+    # fmt: off
+    words = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
+    tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
+    pos = ["PROPN", "PART", "NOUN", "VERB", "ADP", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
+    ents = ["B-PERSON", "I-PERSON", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-GPE", "O"]
+    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
+    # fmt: on
+    doc = Doc(nlp.vocab, words=words, tags=tags, pos=pos, ents=ents)
+    doc.cats = cats
+    return doc
+
+
+@pytest.mark.filterwarnings("ignore::UserWarning")
+def test_make_orth_variants(nlp, doc):
+    single = [
+        {"tags": ["NFP"], "variants": ["…", "..."]},
+        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
+    ]
+    augmenter = create_orth_variants_augmenter(
+        level=0.2, lower=0.5, orth_variants={"single": single}
+    )
+    with make_docbin([doc]) as output_file:
+        reader = Corpus(output_file, augmenter=augmenter)
+        # Due to randomness, only test that it works without errors for now
+        list(reader(nlp))
+
+
+def test_lowercase_augmenter(nlp, doc):
+    augmenter = create_lower_casing_augmenter(level=1.0)
+    with make_docbin([doc]) as output_file:
+        reader = Corpus(output_file, augmenter=augmenter)
+        corpus = list(reader(nlp))
+    eg = corpus[0]
+    assert eg.reference.text == doc.text.lower()
+    assert eg.predicted.text == doc.text.lower()
+    ents = [(e.start, e.end, e.label) for e in doc.ents]
+    assert [(e.start, e.end, e.label) for e in eg.reference.ents] == ents
+    for ref_ent, orig_ent in zip(eg.reference.ents, doc.ents):
+        assert ref_ent.text == orig_ent.text.lower()
+    assert [t.pos_ for t in eg.reference] == [t.pos_ for t in doc]
+
+
+@pytest.mark.filterwarnings("ignore::UserWarning")
+def test_custom_data_augmentation(nlp, doc):
+    def create_spongebob_augmenter(randomize: bool = False):
+        def augment(nlp, example):
+            text = example.text
+            if randomize:
+                ch = [c.lower() if random.random() < 0.5 else c.upper() for c in text]
+            else:
+                ch = [c.lower() if i % 2 else c.upper() for i, c in enumerate(text)]
+            example_dict = example.to_dict()
+            doc = nlp.make_doc("".join(ch))
+            example_dict["token_annotation"]["ORTH"] = [t.text for t in doc]
+            yield example
+            yield example.from_dict(doc, example_dict)
+
+        return augment
+
+    with make_docbin([doc]) as output_file:
+        reader = Corpus(output_file, augmenter=create_spongebob_augmenter())
+        corpus = list(reader(nlp))
+    orig_text = "Sarah 's sister flew to Silicon Valley via London . "
+    augmented = "SaRaH 's sIsTeR FlEw tO SiLiCoN VaLlEy vIa lOnDoN . "
+    assert corpus[0].text == orig_text
+    assert corpus[0].reference.text == orig_text
+    assert corpus[0].predicted.text == orig_text
+    assert corpus[1].text == augmented
+    assert corpus[1].reference.text == augmented
+    assert corpus[1].predicted.text == augmented
+    ents = [(e.start, e.end, e.label) for e in doc.ents]
+    assert [(e.start, e.end, e.label) for e in corpus[0].reference.ents] == ents
+    assert [(e.start, e.end, e.label) for e in corpus[1].reference.ents] == ents
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 7d41c8908..07e1aef01 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -1,23 +1,20 @@
 import numpy
 from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment
 from spacy.training import biluo_tags_to_spans, iob_to_biluo
-from spacy.training import Corpus, docs_to_json
-from spacy.training.example import Example
+from spacy.training import Corpus, docs_to_json, Example
 from spacy.training.converters import json_to_docs
-from spacy.training.augment import create_orth_variants_augmenter
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
 from spacy.util import get_words_and_spaces, minibatch
 from thinc.api import compounding
 import pytest
 import srsly
-import random
 
 from ..util import make_tempdir
 
 
 @pytest.fixture
-def doc(en_vocab):
+def doc():
     nlp = English()  # make sure we get a new vocab every time
     # fmt: off
     words = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
@@ -495,59 +492,6 @@ def test_roundtrip_docs_to_docbin(doc):
     assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
 
 
-@pytest.mark.filterwarnings("ignore::UserWarning")
-def test_make_orth_variants(doc):
-    nlp = English()
-    orth_variants = {
-        "single": [
-            {"tags": ["NFP"], "variants": ["…", "..."]},
-            {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
-        ]
-    }
-    augmenter = create_orth_variants_augmenter(
-        level=0.2, lower=0.5, orth_variants=orth_variants
-    )
-    with make_tempdir() as tmpdir:
-        output_file = tmpdir / "roundtrip.spacy"
-        DocBin(docs=[doc]).to_disk(output_file)
-        # due to randomness, test only that this runs with no errors for now
-        reader = Corpus(output_file, augmenter=augmenter)
-        list(reader(nlp))
-
-
-@pytest.mark.filterwarnings("ignore::UserWarning")
-def test_custom_data_augmentation(doc):
-    def create_spongebob_augmenter(randomize: bool = False):
-        def augment(nlp, example):
-            text = example.text
-            if randomize:
-                ch = [c.lower() if random.random() < 0.5 else c.upper() for c in text]
-            else:
-                ch = [c.lower() if i % 2 else c.upper() for i, c in enumerate(text)]
-            example_dict = example.to_dict()
-            doc = nlp.make_doc("".join(ch))
-            example_dict["token_annotation"]["ORTH"] = [t.text for t in doc]
-            yield example
-            yield example.from_dict(doc, example_dict)
-
-        return augment
-
-    nlp = English()
-    with make_tempdir() as tmpdir:
-        output_file = tmpdir / "roundtrip.spacy"
-        DocBin(docs=[doc]).to_disk(output_file)
-        reader = Corpus(output_file, augmenter=create_spongebob_augmenter())
-        corpus = list(reader(nlp))
-    orig_text = "Sarah 's sister flew to Silicon Valley via London . "
-    augmented = "SaRaH 's sIsTeR FlEw tO SiLiCoN VaLlEy vIa lOnDoN . "
-    assert corpus[0].text == orig_text
-    assert corpus[0].reference.text == orig_text
-    assert corpus[0].predicted.text == orig_text
-    assert corpus[1].text == augmented
-    assert corpus[1].reference.text == augmented
-    assert corpus[1].predicted.text == augmented
-
-
 @pytest.mark.skip("Outdated")
 @pytest.mark.parametrize(
     "tokens_a,tokens_b,expected",
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 7415ad335..e6d10a195 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -34,16 +34,47 @@ def create_orth_variants_augmenter(
 ) -> Callable[["Language", Example], Iterator[Example]]:
     """Create a data augmentation callback that uses orth-variant replacement.
     The callback can be added to a corpus or other data iterator during training.
+
+    level (float): The percentage of texts that will be augmented.
+    lower (float): The percentage of texts that will be lowercased.
+    orth_variants (Dict[str, dict]): A dictionary containing the single and
+        paired orth variants. Typically loaded from a JSON file.
+    RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter.
     """
     return partial(
         orth_variants_augmenter, orth_variants=orth_variants, level=level, lower=lower
     )
 
 
+@registry.augmenters("spacy.lower_case.v1")
+def create_lower_casing_augmenter(
+    level: float,
+) -> Callable[["Language", Example], Iterator[Example]]:
+    """Create a data augmentation callback that converts documents to lowercase.
+    The callback can be added to a corpus or other data iterator during training.
+
+    level (float): The percentage of texts that will be augmented.
+    RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter.
+    """
+    return partial(lower_casing_augmenter, level=level)
+
+
 def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
     yield example
 
 
+def lower_casing_augmenter(
+    nlp: "Language", example: Example, *, level: float,
+) -> Iterator[Example]:
+    if random.random() >= level:
+        yield example
+    else:
+        example_dict = example.to_dict()
+        doc = nlp.make_doc(example.text.lower())
+        example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in doc]
+        yield example.from_dict(doc, example_dict)
+
+
 def orth_variants_augmenter(
     nlp: "Language",
     example: Example,
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index f6225135c..1f3a36b33 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -12,6 +12,7 @@ from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
 from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
 from ..pipeline._parser_internals import nonproj
+from ..util import logger
 
 
 cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
@@ -390,7 +391,7 @@ def _fix_legacy_dict_data(example_dict):
     if "HEAD" in token_dict and "SENT_START" in token_dict:
         # If heads are set, we don't also redundantly specify SENT_START.
         token_dict.pop("SENT_START")
-        warnings.warn(Warnings.W092)
+        logger.debug(Warnings.W092)
     return {
         "token_annotation": token_dict,
         "doc_annotation": doc_dict
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index d7273b651..eb2eb5d71 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -689,7 +689,8 @@ Data augmentation is the process of applying small modifications to the training
 data. It can be especially useful for punctuation and case replacement – for
 example, if your corpus only uses smart quotes and you want to include
 variations using regular quotes, or to make the model less sensitive to
-capitalization by including a mix of capitalized and lowercase examples. See the [usage guide](/usage/training#data-augmentation) for details and examples.
+capitalization by including a mix of capitalized and lowercase examples. See the
+[usage guide](/usage/training#data-augmentation) for details and examples.
 
 ### spacy.orth_variants.v1 {#orth_variants tag="registered function"}
 
@@ -707,7 +708,7 @@ capitalization by including a mix of capitalized and lowercase examples. See the
 > ```
 
 Create a data augmentation callback that uses orth-variant replacement. The
-callback can be added to a corpus or other data iterator during training. This
+callback can be added to a corpus or other data iterator during training. It
 is especially useful for punctuation and case replacement, to help generalize
 beyond corpora that don't have smart quotes, or only have smart quotes etc.
 
@@ -718,6 +719,25 @@ beyond corpora that don't have smart quotes, or only have smart quotes etc.
 | `orth_variants` | A dictionary containing the single and paired orth variants. Typically loaded from a JSON file. See [`en_orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. ~~Dict[str, Dict[List[Union[str, List[str]]]]]~~ |
 | **CREATES**     | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~                                                                                                                              |
 
+### spacy.lower_case.v1 {#lower_case tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [corpora.train.augmenter]
+> @augmenters = "spacy.lower_case.v1"
+> level = 0.3
+> ```
+
+Create a data augmentation callback that lowercases documents. The callback can
+be added to a corpus or other data iterator during training. It's especially
+useful for making the model less sensitive to capitalization.
+
+| Name        | Description                                                                                                                                                                  |
+| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `level`     | The percentage of texts that will be augmented. ~~float~~                                                                                                                    |
+| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
+
 ## Training data and alignment {#gold source="spacy/training"}
 
 ### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
@@ -827,10 +847,10 @@ utilities.
 ### util.get_lang_class {#util.get_lang_class tag="function"}
 
 Import and load a `Language` class. Allows lazy-loading
-[language data](/usage/linguistic-features#language-data) and importing languages using the
-two-letter language code. To add a language code for a custom language class,
-you can register it using the [`@registry.languages`](/api/top-level#registry)
-decorator.
+[language data](/usage/linguistic-features#language-data) and importing
+languages using the two-letter language code. To add a language code for a
+custom language class, you can register it using the
+[`@registry.languages`](/api/top-level#registry) decorator.
 
 > #### Example
 >

From f1d1f78636059abcbd680cd283d643c11310df30 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 4 Oct 2020 22:44:21 +0200
Subject: [PATCH 387/516] Make warning debug log [ci skip]

---
 spacy/tokens/doc.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 3404274ce..0499dc4a7 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1528,7 +1528,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
     while not heads_within_sents:
         heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count)
         if loop_count > 10:
-            warnings.warn(Warnings.W026)
+            util.logger.debug(Warnings.W026)
             break
         loop_count += 1
     # Set sentence starts

From 4b15ff7504a6af94b8e98f8406e430b437a889c5 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 4 Oct 2020 22:47:04 +0200
Subject: [PATCH 388/516] Increment version [ci skip]

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 037ca6bcb..dce627a38 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a32"
+__version__ = "3.0.0a33"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 549758f67dea544ec64271fe88513dbc4117fed8 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 4 Oct 2020 23:16:09 +0200
Subject: [PATCH 389/516] Adjust test for now

---
 spacy/tests/regression/test_issue5918.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/regression/test_issue5918.py b/spacy/tests/regression/test_issue5918.py
index db957709c..e4ee0135d 100644
--- a/spacy/tests/regression/test_issue5918.py
+++ b/spacy/tests/regression/test_issue5918.py
@@ -23,7 +23,8 @@ def test_issue5918():
     assert len(doc.ents) == 3
     # make it so that the third span's head is within the entity (ent_iob=I)
     # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents.
-    with pytest.warns(UserWarning):
-        doc[29].head = doc[33]
+    # TODO: test for logging here
+    # with pytest.warns(UserWarning):
+    #     doc[29].head = doc[33]
     doc = merge_entities(doc)
     assert len(doc.ents) == 3

From 52b660e9dcc412fc1d4bbdf269c1bd31d9e7d3a4 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 00:39:36 +0200
Subject: [PATCH 390/516] initialize and update explanation

---
 website/docs/api/pipe.md                   |   6 +
 website/docs/usage/layers-architectures.md | 149 ++++++++++++++++-----
 2 files changed, 119 insertions(+), 36 deletions(-)

diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index 4f5ac6f61..de35f9eb4 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -226,6 +226,12 @@ the "catastrophic forgetting" problem. This feature is experimental.
 Find the loss and gradient of loss for the batch of documents and their
 predicted scores.
 
+<Infobox variant="danger">
+
+This method needs to be overridden with your own custom `get_loss` method.
+
+</Infobox>
+
 > #### Example
 >
 > ```python
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 7e563cb5c..130a7144e 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -618,31 +618,97 @@ create a subclass of [`Pipe`](/api/pipe) that will hold the model:
 
 ```python
 from spacy.pipeline import Pipe
-from spacy.language import Language
 
 class RelationExtractor(Pipe):
      def __init__(self, vocab, model, name="rel", labels=[]):
+        self.model = model
         ...
 
     def predict(self, docs):
         ...
 
-    def set_annotations(self, docs, scores):
+    def set_annotations(self, docs, predictions):
          ...
-
-@Language.factory("relation_extractor")
-def make_relation_extractor(nlp, name, model, labels):
-    return RelationExtractor(nlp.vocab, model, name, labels=labels)
 ```
 
+Before the model can be used however, it needs to be 
+[initialized](/api/pipe#initialize). This function recieves either the full 
+training data set, or a representative sample. The training data can be used 
+to deduce all relevant labels. Alternatively, a list of labels can be provided, 
+or a script can call `rel_component.add_label()` to add each label separately.
+
+The number of labels will define the output dimensionality of the network, 
+and will be used to do 
+[shape inference](https://thinc.ai/docs/usage-models#validation) throughout 
+the layers of the neural network. This is triggerd by calling `model.initialize`.
+
+```python
+from itertools import islice
+
+def initialize(
+    self,
+    get_examples: Callable[[], Iterable[Example]],
+    *,
+    nlp: Language = None,
+    labels: Optional[List[str]] = None,
+):
+    if labels is not None:
+        for label in labels:
+            self.add_label(label)
+    else:
+        for example in get_examples():
+            relations = example.reference._.rel
+            for indices, label_dict in relations.items():
+                for label in label_dict.keys():
+                    self.add_label(label)
+    subbatch = list(islice(get_examples(), 10))
+    doc_sample = [eg.reference for eg in subbatch]
+    label_sample = self._examples_to_truth(subbatch)
+    self.model.initialize(X=doc_sample, Y=label_sample)
+```
+ 
+The `initialize` method will be triggered whenever this component is part of an 
+`nlp` pipeline, and `nlp.initialize()` is invoked. After doing so, the pipeline 
+component and its internal model can be trained and used to make predictions.
+
+During training the function [`update`](/api/pipe#update) is invoked which delegates to 
+[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and 
+needs a function [`get_loss`](/api/pipe#get_loss) that will calculate the 
+loss for a batch of examples, as well as the gradient of loss that will be used to update 
+the weights of the model layers.
+
+```python
+def update(
+    self,
+    examples: Iterable[Example],
+    *,
+    drop: float = 0.0,
+    set_annotations: bool = False,
+    sgd: Optional[Optimizer] = None,
+    losses: Optional[Dict[str, float]] = None,
+) -> Dict[str, float]:
+    ...
+    docs = [ex.predicted for ex in examples]
+    predictions, backprop = self.model.begin_update(docs)
+    loss, gradient = self.get_loss(examples, predictions)
+    backprop(gradient)
+    losses[self.name] += loss
+    ...
+    return losses
+```
+
+Thinc provides some [loss functions](https://thinc.ai/docs/api-loss) that can be used 
+for the implementation of the `get_loss` function.
+
+When the internal model is trained, the component can be used to make novel predictions. 
 The [`predict`](/api/pipe#predict) function needs to be implemented for each
-subclass. In our case, we can simply delegate to the internal model's
+subclass of `Pipe`. In our case, we can simply delegate to the internal model's
 [predict](https://thinc.ai/docs/api-model#predict) function:
 
 ```python
 def predict(self, docs: Iterable[Doc]) -> Floats2d:
-    scores = self.model.predict(docs)
-    return self.model.ops.asarray(scores)
+    predictions = self.model.predict(docs)
+    return self.model.ops.asarray(predictions)
 ```
 
 The other method that needs to be implemented, is
@@ -650,7 +716,7 @@ The other method that needs to be implemented, is
 and modifies the given `Doc` object in place to hold the predictions. For our
 relation extraction component, we'll store the data as a dictionary in a custom
 extension attribute `doc._.rel`. As keys, we represent the candidate pair by the
-start offsets of each entity, as this defines an entity uniquely within one
+start offsets of each entity, as this defines an entity pair uniquely within one
 document.
 
 To interpret the scores predicted by the REL model correctly, we need to 
@@ -674,7 +740,7 @@ related to those exact entities:
 > ```
 
 ```python
-def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d):
+def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
     c = 0
     get_candidates = self.model.attrs["get_candidates"]
     for doc in docs:
@@ -683,34 +749,45 @@ def set_annotations(self, docs: Iterable[Doc], rel_scores: Floats2d):
             if offset not in doc._.rel:
                 doc._.rel[offset] = {}
             for j, label in enumerate(self.labels):
-                doc._.rel[offset][label] = rel_scores[c, j]
+                doc._.rel[offset][label] = predictions[c, j]
             c += 1
 ```
 
-
-
-<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
-</Infobox>
-
-<!-- TODO: write trainable component section
-- Interaction with `predict`, `get_loss` and `set_annotations`
-- Initialization life-cycle with `initialize`, correlation with add_label
-Example: relation extraction component (implemented as project template)
-Avoid duplication with usage/processing-pipelines#trainable-components ?
--->
-
-<!-- ![Diagram of a pipeline component with its model](../images/layers-architectures.svg)
+Under the hood, when the pipe is applied to a document, it will delegate to these 
+two methods: 
 
 ```python
-def update(self, examples):
-    docs = [ex.predicted for ex in examples]
-    refs = [ex.reference for ex in examples]
-    predictions, backprop = self.model.begin_update(docs)
-    gradient = self.get_loss(predictions, refs)
-    backprop(gradient)
-
-def __call__(self, doc):
-    predictions = self.model([doc])
-    self.set_annotations(predictions)
+def __call__(self, Doc doc):
+    predictions = self.predict([doc])
+    self.set_annotations([doc], predictions)
+    return doc
 ```
--->
+
+Once our `Pipe` subclass is fully implemented, we can 
+[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories) 
+the component with the 
+`Language.factory` decorator. This will enable the creation of the component with 
+`nlp.add_pipe`, or via the config.
+
+> ```
+> 
+> [components.relation_extractor]
+> factory = "relation_extractor"
+> labels = []
+> 
+> [components.relation_extractor.model]
+> @architectures = "rel_model.v1"
+> ...
+> ```
+
+```python
+from spacy.language import Language
+
+@Language.factory("relation_extractor")
+def make_relation_extractor(nlp, name, model, labels):
+    return RelationExtractor(nlp.vocab, model, name, labels=labels)
+```
+
+<!-- TODO: refer once more to example project -->
+
+<!-- ![Diagram of a pipeline component with its model](../images/layers-architectures.svg) -->

From 9a6c9b133b796d4b766189740ef1fc88f6dbe3ee Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 01:05:37 +0200
Subject: [PATCH 391/516] various small fixes

---
 website/docs/usage/layers-architectures.md | 142 +++++++++++----------
 1 file changed, 74 insertions(+), 68 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 130a7144e..414562d6d 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -288,7 +288,7 @@ those parts of the network.
 
 To use our custom model including the PyTorch subnetwork, all we need to do is
 register the architecture using the
-[`architectures` registry](/api/top-level#registry). This will assign the
+[`architectures` registry](/api/top-level#registry). This assigns the
 architecture a name so spaCy knows how to find it, and allows passing in
 arguments like hyperparameters via the [config](/usage/training#config). The
 full example then becomes:
@@ -488,27 +488,27 @@ with Model.define_operators({">>": chain}):
 
 In addition to [swapping out](#swap-architectures) default models in built-in
 components, you can also implement an entirely new,
-[trainable pipeline component](usage/processing-pipelines#trainable-components)
+[trainable pipeline component](/usage/processing-pipelines#trainable-components)
 from scratch. This can be done by creating a new class inheriting from
 [`Pipe`](/api/pipe), and linking it up to your custom model implementation.
 
 ### Example: Pipeline component for relation extraction {#component-rel}
 
 This section outlines an example use-case of implementing a novel relation
-extraction component from scratch. We assume we want to implement a binary
-relation extraction method that determines whether two entities in a document
-are related or not, and if so, with what type of relation. We'll allow multiple
-types of relations between two such entities - i.e. it is a multi-label setting.
+extraction component from scratch. We'll implement a binary relation extraction
+method that determines whether or not two entities in a document are related,
+and if so, what type of relation. We'll allow multiple types of relations
+between two such entities (multi-label setting).
 
 There are two major steps required: first, we need to
 [implement a machine learning model](#component-rel-model) specific to this
-task, and then we'll use this model to
+task, and subsequently we use this model to
 [implement a custom pipeline component](#component-rel-pipe).
 
 #### Step 1: Implementing the Model {#component-rel-model}
 
-We'll need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes
-a list of documents as input, and outputs a two-dimensional matrix of scores:
+We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a
+list of documents as input, and outputs a two-dimensional matrix of predictions:
 
 ```python
 @registry.architectures.register("rel_model.v1")
@@ -519,17 +519,16 @@ def create_relation_model(...) -> Model[List[Doc], Floats2d]:
 
 The first layer in this model will typically be an
 [embedding layer](/usage/embeddings-transformers) such as a
-[`Tok2Vec`](/api/tok2vec) component or [`Transformer`](/api/transformer). This
-layer is assumed to be of type `Model[List["Doc"], List[Floats2d]]` as it
+[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This
+layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
 transforms each document into a list of tokens, with each token being
 represented by its embedding in the vector space.
 
-Next, we need a method that will generate pairs of entities that we want to
-classify as being related or not. These candidate pairs are typically formed
-within one document, which means we'll have a function that takes a `Doc` as
-input and outputs a `List` of `Span` tuples. For instance, a very
-straightforward implementation would be to just take any two entities from the
-same document:
+Next, we need a method that generates pairs of entities that we want to classify
+as being related or not. As these candidate pairs are typically formed within
+one document, this function takes a `Doc` as input and outputs a `List` of
+`Span` tuples. For instance, a very straightforward implementation would be to
+just take any two entities from the same document:
 
 ```python
 def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
@@ -549,12 +548,12 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
 >
 > [model.get_candidates]
 > @misc = "rel_cand_generator.v2"
-> max_length = 6
+> max_length = 20
 > ```
 
 But we could also refine this further by excluding relations of an entity with
 itself, and posing a maximum distance (in number of tokens) between two
-entities. We'll register this function in the
+entities. We register this function in the
 [`@misc` registry](/api/top-level#registry) so we can refer to it from the
 config, and easily swap it out for any other candidate generation function.
 
@@ -573,10 +572,10 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span
     return get_candidates
 ```
 
-Finally, we'll require a method that transforms the candidate pairs of entities
-into a 2D tensor using the specified Tok2Vec function, and this `Floats2d`
-object will then be processed by a final `output_layer` of the network. Taking
-all this together, we can define our relation model like this in the config:
+Finally, we require a method that transforms the candidate entity pairs into a
+2D tensor using the specified `Tok2Vec` function. The resulting `Floats2d`
+object will then be processed by a final `output_layer` of the network. Putting
+all this together, we can define our relation model in a config file as such:
 
 ```
 [model]
@@ -588,7 +587,7 @@ all this together, we can define our relation model like this in the config:
 
 [model.get_candidates]
 @misc = "rel_cand_generator.v2"
-max_length = 6
+max_length = 20
 
 [model.create_candidate_tensor]
 @misc = "rel_cand_tensor.v1"
@@ -600,7 +599,7 @@ max_length = 6
 
 <!-- TODO: Link to project for implementation details -->
 
-When creating this model, we'll store the custom functions as
+When creating this model, we store the custom functions as
 [attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
 references, so we can access them easily:
 
@@ -614,7 +613,7 @@ get_candidates = model.attrs["get_candidates"]
 #### Step 2: Implementing the pipeline component {#component-rel-pipe}
 
 To use our new relation extraction model as part of a custom component, we
-create a subclass of [`Pipe`](/api/pipe) that will hold the model:
+create a subclass of [`Pipe`](/api/pipe) that holds the model:
 
 ```python
 from spacy.pipeline import Pipe
@@ -624,6 +623,9 @@ class RelationExtractor(Pipe):
         self.model = model
         ...
 
+    def update(self, examples, ...):
+        ...
+
     def predict(self, docs):
         ...
 
@@ -631,18 +633,19 @@ class RelationExtractor(Pipe):
          ...
 ```
 
-Before the model can be used however, it needs to be 
-[initialized](/api/pipe#initialize). This function recieves either the full 
-training data set, or a representative sample. The training data can be used 
-to deduce all relevant labels. Alternatively, a list of labels can be provided, 
-or a script can call `rel_component.add_label()` to add each label separately.
+Before the model can be used, it needs to be
+[initialized](/api/pipe#initialize). This function receives either the full
+training data set, or a representative sample. This data set can be used to
+deduce all relevant labels. Alternatively, a list of labels can be provided, or
+a script can call `rel_component.add_label()` directly.
 
-The number of labels will define the output dimensionality of the network, 
-and will be used to do 
-[shape inference](https://thinc.ai/docs/usage-models#validation) throughout 
-the layers of the neural network. This is triggerd by calling `model.initialize`.
+The number of labels defines the output dimensionality of the network, and will
+be used to do [shape inference](https://thinc.ai/docs/usage-models#validation)
+throughout the layers of the neural network. This is triggered by calling
+`model.initialize`.
 
 ```python
+### {highlight="12,18,22"}
 from itertools import islice
 
 def initialize(
@@ -666,18 +669,21 @@ def initialize(
     label_sample = self._examples_to_truth(subbatch)
     self.model.initialize(X=doc_sample, Y=label_sample)
 ```
- 
-The `initialize` method will be triggered whenever this component is part of an 
-`nlp` pipeline, and `nlp.initialize()` is invoked. After doing so, the pipeline 
-component and its internal model can be trained and used to make predictions.
 
-During training the function [`update`](/api/pipe#update) is invoked which delegates to 
-[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and 
-needs a function [`get_loss`](/api/pipe#get_loss) that will calculate the 
-loss for a batch of examples, as well as the gradient of loss that will be used to update 
-the weights of the model layers.
+The `initialize` method is triggered whenever this component is part of an `nlp`
+pipeline, and [`nlp.initialize()`](/api/language#initialize) is invoked. After
+doing so, the pipeline component and its internal model can be trained and used
+to make predictions.
+
+During training, the function [`update`](/api/pipe#update) is invoked which
+delegates to
+[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
+[`get_loss`](/api/pipe#get_loss) function that calculates the loss for a batch of
+examples, as well as the gradient of loss that will be used to update the
+weights of the model layers.
 
 ```python
+### {highlight="12-14"}
 def update(
     self,
     examples: Iterable[Example],
@@ -697,13 +703,13 @@ def update(
     return losses
 ```
 
-Thinc provides some [loss functions](https://thinc.ai/docs/api-loss) that can be used 
-for the implementation of the `get_loss` function.
+Thinc provides several [loss functions](https://thinc.ai/docs/api-loss) that can
+be used for the implementation of the `get_loss` function.
 
-When the internal model is trained, the component can be used to make novel predictions. 
-The [`predict`](/api/pipe#predict) function needs to be implemented for each
-subclass of `Pipe`. In our case, we can simply delegate to the internal model's
-[predict](https://thinc.ai/docs/api-model#predict) function:
+When the internal model is trained, the component can be used to make novel
+predictions. The [`predict`](/api/pipe#predict) function needs to be implemented
+for each subclass of `Pipe`. In our case, we can simply delegate to the internal
+model's [predict](https://thinc.ai/docs/api-model#predict) function:
 
 ```python
 def predict(self, docs: Iterable[Doc]) -> Floats2d:
@@ -711,24 +717,24 @@ def predict(self, docs: Iterable[Doc]) -> Floats2d:
     return self.model.ops.asarray(predictions)
 ```
 
-The other method that needs to be implemented, is
-[`set_annotations`](/api/pipe#set_annotations). It takes the predicted scores,
-and modifies the given `Doc` object in place to hold the predictions. For our
-relation extraction component, we'll store the data as a dictionary in a custom
+The final method that needs to be implemented is
+[`set_annotations`](/api/pipe#set_annotations). This function takes the
+predictions, and modifies the given `Doc` object in place to store them. For our
+relation extraction component, we store the data as a dictionary in a custom
 extension attribute `doc._.rel`. As keys, we represent the candidate pair by the
 start offsets of each entity, as this defines an entity pair uniquely within one
 document.
 
-To interpret the scores predicted by the REL model correctly, we need to 
-refer to the model's `get_candidates` function that originally defined which 
-pairs of entities would be run through the model, so that the scores can be 
-related to those exact entities:
+To interpret the scores predicted by the REL model correctly, we need to refer
+to the model's `get_candidates` function that defined which pairs of entities
+were relevant candidates, so that the predictions can be linked to those exact
+entities:
 
 > #### Example output
 >
 > ```python
 > doc = nlp("Amsterdam is the capital of the Netherlands.")
-> print(f"spans: {[(e.start, e.text, e.label_) for e in doc.ents]}")
+> print(f"spans: {[(e.start, e.text, e.label_) for e in doc.ents]}")
 > for value, rel_dict in doc._.rel.items():
 >     print(f"{value}: {rel_dict}")
 > ```
@@ -740,6 +746,7 @@ related to those exact entities:
 > ```
 
 ```python
+###  {highlight="5-6,10"}
 def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
     c = 0
     get_candidates = self.model.attrs["get_candidates"]
@@ -753,8 +760,8 @@ def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
             c += 1
 ```
 
-Under the hood, when the pipe is applied to a document, it will delegate to these 
-two methods: 
+Under the hood, when the pipe is applied to a document, it delegates to the
+`predict` and `set_annotations` functions:
 
 ```python
 def __call__(self, Doc doc):
@@ -763,18 +770,17 @@ def __call__(self, Doc doc):
     return doc
 ```
 
-Once our `Pipe` subclass is fully implemented, we can 
-[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories) 
-the component with the 
-`Language.factory` decorator. This will enable the creation of the component with 
-`nlp.add_pipe`, or via the config.
+Once our `Pipe` subclass is fully implemented, we can
+[register](/usage/processing-pipelines#custom-components-factories)
+the component with the `Language.factory` decorator. This enables the creation
+of the component with `nlp.add_pipe`, or via the config.
 
 > ```
-> 
+>
 > [components.relation_extractor]
 > factory = "relation_extractor"
 > labels = []
-> 
+>
 > [components.relation_extractor.model]
 > @architectures = "rel_model.v1"
 > ...

From b0b93854cb2c522090c87544e33a19e6b361ed19 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 5 Oct 2020 09:26:43 +0200
Subject: [PATCH 392/516] Update ru/uk lemmatizers for new nlp.initialize

---
 spacy/lang/ru/__init__.py   | 10 ++++++++--
 spacy/lang/ru/lemmatizer.py |  5 ++---
 spacy/lang/uk/__init__.py   |  4 ++--
 spacy/lang/uk/lemmatizer.py |  5 ++---
 spacy/tests/conftest.py     |  1 -
 5 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 1d59ca043..2f3965fcc 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -25,8 +25,14 @@ class Russian(Language):
     default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool = False,
+):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 
 
 __all__ = ["Russian"]
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 8d7996c63..3bcac8730 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Tuple
 
 from thinc.api import Model
 
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 from ...symbols import POS
 from ...tokens import Token
@@ -22,9 +21,9 @@ class RussianLemmatizer(Lemmatizer):
         name: str = "lemmatizer",
         *,
         mode: str = "pymorphy2",
-        lookups: Optional[Lookups] = None,
+        overwrite: bool = False,
     ) -> None:
-        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
 
         try:
             from pymorphy2 import MorphAnalyzer
diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index 73c065379..0abe9170e 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -26,8 +26,8 @@ class Ukrainian(Language):
     default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False,):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 
 
 __all__ = ["Ukrainian"]
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index 0d6febce6..009ec5044 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -3,7 +3,6 @@ from typing import Optional
 from thinc.api import Model
 
 from ..ru.lemmatizer import RussianLemmatizer
-from ...lookups import Lookups
 from ...vocab import Vocab
 
 
@@ -15,9 +14,9 @@ class UkrainianLemmatizer(RussianLemmatizer):
         name: str = "lemmatizer",
         *,
         mode: str = "pymorphy2",
-        lookups: Optional[Lookups] = None,
+        overwrite: bool = False,
     ) -> None:
-        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
         try:
             from pymorphy2 import MorphAnalyzer
         except ImportError:
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 4a3d126d7..67860b7e4 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -248,7 +248,6 @@ def tt_tokenizer():
 @pytest.fixture(scope="session")
 def uk_tokenizer():
     pytest.importorskip("pymorphy2")
-    pytest.importorskip("pymorphy2.lang")
     return get_lang_class("uk")().tokenizer
 
 

From 03cfb2d2f4afbcc96f99757010ce3263cbc28ebd Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 5 Oct 2020 09:33:05 +0200
Subject: [PATCH 393/516] Always serialize lookups and vectors to disk

---
 spacy/lookups.py | 13 ++++++-------
 spacy/vocab.pyx  |  4 ++--
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/spacy/lookups.py b/spacy/lookups.py
index fb5e3d748..133cb0672 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -289,13 +289,12 @@ class Lookups:
 
         DOCS: https://nightly.spacy.io/api/lookups#to_disk
         """
-        if len(self._tables):
-            path = ensure_path(path)
-            if not path.exists():
-                path.mkdir()
-            filepath = path / filename
-            with filepath.open("wb") as file_:
-                file_.write(self.to_bytes())
+        path = ensure_path(path)
+        if not path.exists():
+            path.mkdir()
+        filepath = path / filename
+        with filepath.open("wb") as file_:
+            file_.write(self.to_bytes())
 
     def from_disk(
         self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index a22f12c65..93918250b 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -445,9 +445,9 @@ cdef class Vocab:
         setters = ["strings", "vectors"]
         if "strings" not in exclude:
             self.strings.to_disk(path / "strings.json")
-        if "vectors" not in "exclude" and self.vectors is not None:
+        if "vectors" not in "exclude":
             self.vectors.to_disk(path)
-        if "lookups" not in "exclude" and self.lookups is not None:
+        if "lookups" not in "exclude":
             self.lookups.to_disk(path)
 
     def from_disk(self, path, *, exclude=tuple()):

From 1c641e41c3d46c5b555891427833200c0f0087b5 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 11:50:11 +0200
Subject: [PATCH 394/516] Remove unused import [ci skip]

---
 spacy/tests/regression/test_issue5918.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/tests/regression/test_issue5918.py b/spacy/tests/regression/test_issue5918.py
index e4ee0135d..d25323ef6 100644
--- a/spacy/tests/regression/test_issue5918.py
+++ b/spacy/tests/regression/test_issue5918.py
@@ -1,6 +1,5 @@
 from spacy.lang.en import English
 from spacy.pipeline import merge_entities
-import pytest
 
 
 def test_issue5918():

From e3acad626443c9cf0b81f600aae2b3b9529b63cd Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 13:06:20 +0200
Subject: [PATCH 395/516] Update docs [ci skip]

---
 website/docs/usage/layers-architectures.md | 261 +++++++++++++--------
 1 file changed, 162 insertions(+), 99 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 414562d6d..24c7bf1cf 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -86,7 +86,8 @@ see are: ​
 | ~~Ragged~~         | A container to handle variable-length sequence data in an unpadded contiguous array.                 |
 | ~~Padded~~         | A container to handle variable-length sequence data in a padded contiguous array.                    |
 
-The model type signatures help you figure out which model architectures and
+See the [Thinc type reference](https://thinc.ai/docs/api-types) for details. The
+model type signatures help you figure out which model architectures and
 components can **fit together**. For instance, the
 [`TextCategorizer`](/api/textcategorizer) class expects a model typed
 ~~Model[List[Doc], Floats2d]~~, because the model will predict one row of
@@ -488,32 +489,57 @@ with Model.define_operators({">>": chain}):
 
 In addition to [swapping out](#swap-architectures) default models in built-in
 components, you can also implement an entirely new,
-[trainable pipeline component](/usage/processing-pipelines#trainable-components)
+[trainable](/usage/processing-pipelines#trainable-components) pipeline component
 from scratch. This can be done by creating a new class inheriting from
 [`Pipe`](/api/pipe), and linking it up to your custom model implementation.
 
-### Example: Pipeline component for relation extraction {#component-rel}
+<Infobox title="Trainable component API" emoji="💡">
 
-This section outlines an example use-case of implementing a novel relation
-extraction component from scratch. We'll implement a binary relation extraction
-method that determines whether or not two entities in a document are related,
-and if so, what type of relation. We'll allow multiple types of relations
-between two such entities (multi-label setting).
+For details on how to implement pipeline components, check out the usage guide
+on [custom components](/usage/processing-pipelines#custom-component) and the
+overview of the `Pipe` methods used by
+[trainable components](/usage/processing-pipelines#trainable-components).
 
-There are two major steps required: first, we need to
-[implement a machine learning model](#component-rel-model) specific to this
-task, and subsequently we use this model to
-[implement a custom pipeline component](#component-rel-pipe).
+</Infobox>
+
+### Example: Entity relation extraction component {#component-rel}
+
+This section outlines an example use-case of implementing a **novel relation
+extraction component** from scratch. We'll implement a binary relation
+extraction method that determines whether or not **two entities** in a document
+are related, and if so, what type of relation. We'll allow multiple types of
+relations between two such entities (multi-label setting). There are two major
+steps required:
+
+1. Implement a [machine learning model](#component-rel-model) specific to this
+   task. It will have to extract candidates from a [`Doc`](/api/doc) and predict
+   a relation for the available candidate pairs.
+2. Implement a custom [pipeline component](#component-rel-pipe) powered by the
+   machine learning model that sets annotations on the [`Doc`](/api/doc) passing
+   through the pipeline.
+
+<!-- TODO: <Project id="tutorials/ner-relations">
+
+</Project> -->
 
 #### Step 1: Implementing the Model {#component-rel-model}
 
 We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a
-list of documents as input, and outputs a two-dimensional matrix of predictions:
+**list of documents** (~~List[Doc]~~) as input, and outputs a **two-dimensional
+matrix** (~~Floats2d~~) of predictions:
+
+> #### Model type annotations
+>
+> The `Model` class is a generic type that can specify its input and output
+> types, e.g. ~~Model[List[Doc], Floats2d]~~. Type hints are used for static
+> type checks and validation. See the section on [type signatures](#type-sigs)
+> for details.
 
 ```python
+### Register the model architecture
 @registry.architectures.register("rel_model.v1")
 def create_relation_model(...) -> Model[List[Doc], Floats2d]:
-    model = _create_my_model()
+    model = ...  # 👈 model will go here
     return model
 ```
 
@@ -521,17 +547,18 @@ The first layer in this model will typically be an
 [embedding layer](/usage/embeddings-transformers) such as a
 [`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This
 layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
-transforms each document into a list of tokens, with each token being
+transforms each **document into a list of tokens**, with each token being
 represented by its embedding in the vector space.
 
-Next, we need a method that generates pairs of entities that we want to classify
-as being related or not. As these candidate pairs are typically formed within
-one document, this function takes a `Doc` as input and outputs a `List` of
-`Span` tuples. For instance, a very straightforward implementation would be to
-just take any two entities from the same document:
+Next, we need a method that **generates pairs of entities** that we want to
+classify as being related or not. As these candidate pairs are typically formed
+within one document, this function takes a [`Doc`](/api/doc) as input and
+outputs a `List` of `Span` tuples. For instance, a very straightforward
+implementation would be to just take any two entities from the same document:
 
 ```python
-def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
+### Simple candidate generation
+def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]:
     candidates = []
     for ent1 in doc.ents:
         for ent2 in doc.ents:
@@ -539,27 +566,29 @@ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
     return candidates
 ```
 
-> ```
-> [model]
-> @architectures = "rel_model.v1"
->
-> [model.tok2vec]
-> ...
->
-> [model.get_candidates]
-> @misc = "rel_cand_generator.v2"
-> max_length = 20
-> ```
-
-But we could also refine this further by excluding relations of an entity with
-itself, and posing a maximum distance (in number of tokens) between two
+But we could also refine this further by **excluding relations** of an entity
+with itself, and posing a **maximum distance** (in number of tokens) between two
 entities. We register this function in the
 [`@misc` registry](/api/top-level#registry) so we can refer to it from the
 config, and easily swap it out for any other candidate generation function.
 
+> #### config.cfg (excerpt)
+>
+> ```ini
+> [model]
+> @architectures = "rel_model.v1"
+>
+> [model.tok2vec]
+> # ...
+>
+> [model.get_candidates]
+> @misc = "rel_cand_generator.v1"
+> max_length = 20
+> ```
+
 ```python
-### {highlight="1,2,7,8"}
-@registry.misc.register("rel_cand_generator.v2")
+### Extended candidate generation {highlight="1,2,7,8"}
+@registry.misc.register("rel_cand_generator.v1")
 def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
     def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
         candidates = []
@@ -573,17 +602,19 @@ def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span
 ```
 
 Finally, we require a method that transforms the candidate entity pairs into a
-2D tensor using the specified `Tok2Vec` function. The resulting `Floats2d`
-object will then be processed by a final `output_layer` of the network. Putting
-all this together, we can define our relation model in a config file as such:
+2D tensor using the specified [`Tok2Vec`](/api/tok2vec) or
+[`Transformer`](/api/transformer). The resulting ~~Floats2d~~ object will then be
+processed by a final `output_layer` of the network. Putting all this together,
+we can define our relation model in a config file as such:
 
-```
+```ini
+### config.cfg
 [model]
 @architectures = "rel_model.v1"
-...
+# ...
 
 [model.tok2vec]
-...
+# ...
 
 [model.get_candidates]
 @misc = "rel_cand_generator.v2"
@@ -594,10 +625,11 @@ max_length = 20
 
 [model.output_layer]
 @architectures = "rel_output_layer.v1"
-...
+# ...
 ```
 
-<!-- TODO: Link to project for implementation details -->
+<!-- TODO: link to project for implementation details -->
+<!-- TODO: maybe embed files from project that show the architectures? -->
 
 When creating this model, we store the custom functions as
 [attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
@@ -612,40 +644,55 @@ get_candidates = model.attrs["get_candidates"]
 
 #### Step 2: Implementing the pipeline component {#component-rel-pipe}
 
-To use our new relation extraction model as part of a custom component, we
+To use our new relation extraction model as part of a custom
+[trainable component](/usage/processing-pipelines#trainable-components), we
 create a subclass of [`Pipe`](/api/pipe) that holds the model:
 
 ```python
+### Pipeline component skeleton
 from spacy.pipeline import Pipe
 
 class RelationExtractor(Pipe):
-     def __init__(self, vocab, model, name="rel", labels=[]):
+    def __init__(self, vocab, model, name="rel"):
+        """Create a component instance."""
         self.model = model
-        ...
+        self.vocab = vocab
+        self.name = name
 
-    def update(self, examples, ...):
+    def update(self, examples, drop=0.0, set_annotations=False, sgd=None, losses=None):
+        """Learn from a batch of Example objects."""
         ...
 
     def predict(self, docs):
+        """Apply the model to a batch of Doc objects."""
         ...
 
     def set_annotations(self, docs, predictions):
+        """Modify a batch of Doc objects using the predictions."""
          ...
+
+    def initialize(self, get_examples, nlp=None, labels=None):
+        """Initialize the model before training."""
+        ...
+
+    def add_label(self, label):
+        """Add a label to the component."""
+        ...
 ```
 
 Before the model can be used, it needs to be
-[initialized](/api/pipe#initialize). This function receives either the full
-training data set, or a representative sample. This data set can be used to
-deduce all relevant labels. Alternatively, a list of labels can be provided, or
-a script can call `rel_component.add_label()` directly.
-
-The number of labels defines the output dimensionality of the network, and will
-be used to do [shape inference](https://thinc.ai/docs/usage-models#validation)
-throughout the layers of the neural network. This is triggered by calling
-`model.initialize`.
+[initialized](/usage/training#initialization). This function receives a callback
+to access the full **training data set**, or a representative sample. This data
+set can be used to deduce all **relevant labels**. Alternatively, a list of
+labels can be provided to `initialize`, or you can call
+`RelationExtractor.add_label` directly. The number of labels defines the output
+dimensionality of the network, and will be used to do
+[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
+layers of the neural network. This is triggered by calling
+[`Model.initialize`](https://thinc.ai/docs/api-model#initialize).
 
 ```python
-### {highlight="12,18,22"}
+### The initialize method {highlight="12,18,22"}
 from itertools import islice
 
 def initialize(
@@ -671,19 +718,22 @@ def initialize(
 ```
 
 The `initialize` method is triggered whenever this component is part of an `nlp`
-pipeline, and [`nlp.initialize()`](/api/language#initialize) is invoked. After
-doing so, the pipeline component and its internal model can be trained and used
-to make predictions.
+pipeline, and [`nlp.initialize`](/api/language#initialize) is invoked.
+Typically, this happens when the pipeline is set up before training in
+[`spacy train`](/api/cli#training). After initialization, the pipeline component
+and its internal model can be trained and used to make predictions.
 
 During training, the function [`update`](/api/pipe#update) is invoked which
 delegates to
-[`self.model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
-[`get_loss`](/api/pipe#get_loss) function that calculate the loss for a batch of
-examples, as well as the gradient of loss that will be used to update the
-weights of the model layers.
+[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
+[`get_loss`](/api/pipe#get_loss) function that **calculate the loss** for a
+batch of examples, as well as the **gradient** of loss that will be used to
+update the weights of the model layers. Thinc provides several
+[loss functions](https://thinc.ai/docs/api-loss) that can be used for the
+implementation of the `get_loss` function.
 
 ```python
-### {highlight="12-14"}
+### The update method {highlight="12-14"}
 def update(
     self,
     examples: Iterable[Example],
@@ -703,15 +753,14 @@ def update(
     return losses
 ```
 
-Thinc provides several [loss functions](https://thinc.ai/docs/api-loss) that can
-be used for the implementation of the `get_loss` function.
-
 When the internal model is trained, the component can be used to make novel
-predictions. The [`predict`](/api/pipe#predict) function needs to be implemented
-for each subclass of `Pipe`. In our case, we can simply delegate to the internal
-model's [predict](https://thinc.ai/docs/api-model#predict) function:
+**predictions**. The [`predict`](/api/pipe#predict) function needs to be
+implemented for each subclass of `Pipe`. In our case, we can simply delegate to
+the internal model's [predict](https://thinc.ai/docs/api-model#predict) function
+that takes a batch of `Doc` objects and returns a ~~Floats2d~~ array:
 
 ```python
+### The predict method
 def predict(self, docs: Iterable[Doc]) -> Floats2d:
     predictions = self.model.predict(docs)
     return self.model.ops.asarray(predictions)
@@ -721,32 +770,36 @@ The final method that needs to be implemented, is
 [`set_annotations`](/api/pipe#set_annotations). This function takes the
 predictions, and modifies the given `Doc` object in place to store them. For our
 relation extraction component, we store the data as a dictionary in a custom
-extension attribute `doc._.rel`. As keys, we represent the candidate pair by the
-start offsets of each entity, as this defines an entity pair uniquely within one
-document.
+[extension attribute](/usage/processing-pipelines#custom-components-attributes)
+`doc._.rel`. As keys, we represent the candidate pair by the **start offsets of
+each entity**, as this defines an entity pair uniquely within one document.
 
-To interpret the scores predicted by the REL model correctly, we need to refer
-to the model's `get_candidates` function that defined which pairs of entities
-were relevant candidates, so that the predictions can be linked to those exact
-entities:
+To interpret the scores predicted by the relation extraction model correctly, we
+need to refer to the model's `get_candidates` function that defined which pairs
+of entities were relevant candidates, so that the predictions can be linked to
+those exact entities:
 
 > #### Example output
 >
 > ```python
 > doc = nlp("Amsterdam is the capital of the Netherlands.")
-> print(f"spans: [(e.start, e.text, e.label_) for e in doc.ents]")
+> print("spans", [(e.start, e.text, e.label_) for e in doc.ents])
 > for value, rel_dict in doc._.rel.items():
 >     print(f"{value}: {rel_dict}")
-> ```
-
-> ```
-> spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
-> (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
-> (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
+>
+> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
+> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
+> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
 > ```
 
 ```python
-###  {highlight="5-6,10"}
+### Registering the extension attribute
+from spacy.tokens import Doc
+Doc.set_extension("rel", default={})
+```
+
+```python
+### The set_annotations method {highlight="5-6,10"}
 def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
     c = 0
     get_candidates = self.model.attrs["get_candidates"]
@@ -761,9 +814,10 @@ def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
 ```
 
 Under the hood, when the pipe is applied to a document, it delegates to the
-`predict` and `set_annotations` functions:
+`predict` and `set_annotations` methods:
 
 ```python
+### The __call__ method
 def __call__(self, Doc doc):
     predictions = self.predict([doc])
     self.set_annotations([doc], predictions)
@@ -771,29 +825,38 @@ def __call__(self, Doc doc):
 ```
 
 Once our `Pipe` subclass is fully implemented, we can
-[register](http://localhost:8000/usage/processing-pipelines#custom-components-factories)
-the component with the `Language.factory` decorator. This enables the creation
-of the component with `nlp.add_pipe`, or via the config.
+[register](/usage/processing-pipelines#custom-components-factories) the
+component with the [`@Language.factory`](/api/language#factory) decorator. This
+assigns it a name and lets you create the component with
+[`nlp.add_pipe`](/api/language#add_pipe) and via the
+[config](/usage/training#config).
 
-> ```
+> #### config.cfg (excerpt)
 >
+> ```ini
 > [components.relation_extractor]
 > factory = "relation_extractor"
-> labels = []
 >
 > [components.relation_extractor.model]
 > @architectures = "rel_model.v1"
-> ...
+>
+> [components.relation_extractor.model.tok2vec]
+> # ...
+>
+> [components.relation_extractor.model.get_candidates]
+> @misc = "rel_cand_generator.v1"
+> max_length = 20
 > ```
 
 ```python
+### Registering the pipeline component
 from spacy.language import Language
 
 @Language.factory("relation_extractor")
-def make_relation_extractor(nlp, name, model, labels):
-    return RelationExtractor(nlp.vocab, model, name, labels=labels)
+def make_relation_extractor(nlp, name, model):
+    return RelationExtractor(nlp.vocab, model, name)
 ```
 
-<!-- TODO: refer once more to example project -->
+<!-- TODO: <Project id="tutorials/ner-relations">
 
-<!-- ![Diagram of a pipeline component with its model](../images/layers-architectures.svg) -->
+</Project> -->

From fd2d48556c1e77f4492693e4a69dc8f4a34cfe34 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 13:43:32 +0200
Subject: [PATCH 396/516] fix E902 and E903 numbering

---
 spacy/errors.py                                | 4 ++--
 spacy/training/converters/conll_ner_to_docs.py | 2 +-
 spacy/training/converters/iob_to_docs.py       | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 20edf45b5..9d9a716d2 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,10 +456,10 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
-    E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
+    E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
             "Try checking whitespace and delimiters. See "
             "https://nightly.spacy.io/api/cli#convert")
-    E093 = ("The token-per-line NER file is not formatted correctly. Try checking "
+    E903 = ("The token-per-line NER file is not formatted correctly. Try checking "
             "whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert")
     E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
             "dimension refers to the output width, after the linear projection "
diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py
index 28f0f87c3..c01686aee 100644
--- a/spacy/training/converters/conll_ner_to_docs.py
+++ b/spacy/training/converters/conll_ner_to_docs.py
@@ -103,7 +103,7 @@ def conll_ner_to_docs(
             lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
             cols = list(zip(*[line.split() for line in lines]))
             if len(cols) < 2:
-                raise ValueError(Errors.E093)
+                raise ValueError(Errors.E903)
             length = len(cols[0])
             words.extend(cols[0])
             sent_starts.extend([True] + [False] * (length - 1))
diff --git a/spacy/training/converters/iob_to_docs.py b/spacy/training/converters/iob_to_docs.py
index 73ad8953d..a2185fef7 100644
--- a/spacy/training/converters/iob_to_docs.py
+++ b/spacy/training/converters/iob_to_docs.py
@@ -46,7 +46,7 @@ def read_iob(raw_sents, vocab, n_sents):
                 sent_words, sent_iob = zip(*sent_tokens)
                 sent_tags = ["-"] * len(sent_words)
             else:
-                raise ValueError(Errors.E092)
+                raise ValueError(Errors.E902)
             words.extend(sent_words)
             tags.extend(sent_tags)
             iob.extend(sent_iob)

From 20f2a17a09dc053b5f2f06cff637fb92647137ad Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 13:45:57 +0200
Subject: [PATCH 397/516] Merge test_misc and test_util

---
 spacy/tests/test_misc.py | 134 ++++++++++++++++++++++++++++++++++++++
 spacy/tests/test_util.py | 137 ---------------------------------------
 2 files changed, 134 insertions(+), 137 deletions(-)
 delete mode 100644 spacy/tests/test_util.py

diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index e6ef45f90..bdf54ad6a 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -7,6 +7,15 @@ from spacy import util
 from spacy import prefer_gpu, require_gpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
+from spacy.util import dot_to_object, SimpleFrozenList
+from thinc.api import Config, Optimizer, ConfigValidationError
+from spacy.training.batchers import minibatch_by_words
+from spacy.lang.en import English
+from spacy.lang.nl import Dutch
+from spacy.language import DEFAULT_CONFIG_PATH
+from spacy.schemas import ConfigSchemaTraining
+
+from .util import get_random_doc
 
 
 @pytest.fixture
@@ -157,3 +166,128 @@ def test_dot_to_dict(dot_notation, expected):
     result = util.dot_to_dict(dot_notation)
     assert result == expected
     assert util.dict_to_dot(result) == dot_notation
+
+
+@pytest.mark.parametrize(
+    "doc_sizes, expected_batches",
+    [
+        ([400, 400, 199], [3]),
+        ([400, 400, 199, 3], [4]),
+        ([400, 400, 199, 3, 200], [3, 2]),
+        ([400, 400, 199, 3, 1], [5]),
+        ([400, 400, 199, 3, 1, 1500], [5]),  # 1500 will be discarded
+        ([400, 400, 199, 3, 1, 200], [3, 3]),
+        ([400, 400, 199, 3, 1, 999], [3, 3]),
+        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
+        ([1, 2, 999], [3]),
+        ([1, 2, 999, 1], [4]),
+        ([1, 200, 999, 1], [2, 2]),
+        ([1, 999, 200, 1], [2, 2]),
+    ],
+)
+def test_util_minibatch(doc_sizes, expected_batches):
+    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
+    tol = 0.2
+    batch_size = 1000
+    batches = list(
+        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
+    )
+    assert [len(batch) for batch in batches] == expected_batches
+
+    max_size = batch_size + batch_size * tol
+    for batch in batches:
+        assert sum([len(doc) for doc in batch]) < max_size
+
+
+@pytest.mark.parametrize(
+    "doc_sizes, expected_batches",
+    [
+        ([400, 4000, 199], [1, 2]),
+        ([400, 400, 199, 3000, 200], [1, 4]),
+        ([400, 400, 199, 3, 1, 1500], [1, 5]),
+        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
+        ([1, 2, 9999], [1, 2]),
+        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
+    ],
+)
+def test_util_minibatch_oversize(doc_sizes, expected_batches):
+    """ Test that oversized documents are returned in their own batch"""
+    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
+    tol = 0.2
+    batch_size = 1000
+    batches = list(
+        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
+    )
+    assert [len(batch) for batch in batches] == expected_batches
+
+
+def test_util_dot_section():
+    cfg_string = """
+    [nlp]
+    lang = "en"
+    pipeline = ["textcat"]
+
+    [components]
+
+    [components.textcat]
+    factory = "textcat"
+
+    [components.textcat.model]
+    @architectures = "spacy.TextCatBOW.v1"
+    exclusive_classes = true
+    ngram_size = 1
+    no_output_layer = false
+    """
+    nlp_config = Config().from_str(cfg_string)
+    en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
+    default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
+    default_config["nlp"]["lang"] = "nl"
+    nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
+    # Test that creation went OK
+    assert isinstance(en_nlp, English)
+    assert isinstance(nl_nlp, Dutch)
+    assert nl_nlp.pipe_names == []
+    assert en_nlp.pipe_names == ["textcat"]
+    # not exclusive_classes
+    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
+    # Test that default values got overwritten
+    assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
+    assert nl_nlp.config["nlp"]["pipeline"] == []  # default value []
+    # Test proper functioning of 'dot_to_object'
+    with pytest.raises(KeyError):
+        dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
+    with pytest.raises(KeyError):
+        dot_to_object(en_nlp.config, "nlp.unknownattribute")
+    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
+    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
+
+
+def test_simple_frozen_list():
+    t = SimpleFrozenList(["foo", "bar"])
+    assert t == ["foo", "bar"]
+    assert t.index("bar") == 1  # okay method
+    with pytest.raises(NotImplementedError):
+        t.append("baz")
+    with pytest.raises(NotImplementedError):
+        t.sort()
+    with pytest.raises(NotImplementedError):
+        t.extend(["baz"])
+    with pytest.raises(NotImplementedError):
+        t.pop()
+    t = SimpleFrozenList(["foo", "bar"], error="Error!")
+    with pytest.raises(NotImplementedError):
+        t.append("baz")
+
+
+def test_resolve_dot_names():
+    config = {
+        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
+        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
+    }
+    result = util.resolve_dot_names(config, ["training.optimizer"])
+    assert isinstance(result[0], Optimizer)
+    with pytest.raises(ConfigValidationError) as e:
+        util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
+    errors = e.value.errors
+    assert len(errors) == 1
+    assert errors[0]["loc"] == ["training", "xyz"]
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
deleted file mode 100644
index f710a38eb..000000000
--- a/spacy/tests/test_util.py
+++ /dev/null
@@ -1,137 +0,0 @@
-import pytest
-
-from spacy import util
-from spacy.util import dot_to_object, SimpleFrozenList
-from thinc.api import Config, Optimizer, ConfigValidationError
-from spacy.training.batchers import minibatch_by_words
-from spacy.lang.en import English
-from spacy.lang.nl import Dutch
-from spacy.language import DEFAULT_CONFIG_PATH
-from spacy.schemas import ConfigSchemaTraining
-
-from .util import get_random_doc
-
-
-@pytest.mark.parametrize(
-    "doc_sizes, expected_batches",
-    [
-        ([400, 400, 199], [3]),
-        ([400, 400, 199, 3], [4]),
-        ([400, 400, 199, 3, 200], [3, 2]),
-        ([400, 400, 199, 3, 1], [5]),
-        ([400, 400, 199, 3, 1, 1500], [5]),  # 1500 will be discarded
-        ([400, 400, 199, 3, 1, 200], [3, 3]),
-        ([400, 400, 199, 3, 1, 999], [3, 3]),
-        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
-        ([1, 2, 999], [3]),
-        ([1, 2, 999, 1], [4]),
-        ([1, 200, 999, 1], [2, 2]),
-        ([1, 999, 200, 1], [2, 2]),
-    ],
-)
-def test_util_minibatch(doc_sizes, expected_batches):
-    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
-    tol = 0.2
-    batch_size = 1000
-    batches = list(
-        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
-    )
-    assert [len(batch) for batch in batches] == expected_batches
-
-    max_size = batch_size + batch_size * tol
-    for batch in batches:
-        assert sum([len(doc) for doc in batch]) < max_size
-
-
-@pytest.mark.parametrize(
-    "doc_sizes, expected_batches",
-    [
-        ([400, 4000, 199], [1, 2]),
-        ([400, 400, 199, 3000, 200], [1, 4]),
-        ([400, 400, 199, 3, 1, 1500], [1, 5]),
-        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
-        ([1, 2, 9999], [1, 2]),
-        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
-    ],
-)
-def test_util_minibatch_oversize(doc_sizes, expected_batches):
-    """ Test that oversized documents are returned in their own batch"""
-    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
-    tol = 0.2
-    batch_size = 1000
-    batches = list(
-        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
-    )
-    assert [len(batch) for batch in batches] == expected_batches
-
-
-def test_util_dot_section():
-    cfg_string = """
-    [nlp]
-    lang = "en"
-    pipeline = ["textcat"]
-
-    [components]
-
-    [components.textcat]
-    factory = "textcat"
-
-    [components.textcat.model]
-    @architectures = "spacy.TextCatBOW.v1"
-    exclusive_classes = true
-    ngram_size = 1
-    no_output_layer = false
-    """
-    nlp_config = Config().from_str(cfg_string)
-    en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
-    default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
-    default_config["nlp"]["lang"] = "nl"
-    nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
-    # Test that creation went OK
-    assert isinstance(en_nlp, English)
-    assert isinstance(nl_nlp, Dutch)
-    assert nl_nlp.pipe_names == []
-    assert en_nlp.pipe_names == ["textcat"]
-    # not exclusive_classes
-    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
-    # Test that default values got overwritten
-    assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
-    assert nl_nlp.config["nlp"]["pipeline"] == []  # default value []
-    # Test proper functioning of 'dot_to_object'
-    with pytest.raises(KeyError):
-        dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
-    with pytest.raises(KeyError):
-        dot_to_object(en_nlp.config, "nlp.unknownattribute")
-    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
-    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
-
-
-def test_simple_frozen_list():
-    t = SimpleFrozenList(["foo", "bar"])
-    assert t == ["foo", "bar"]
-    assert t.index("bar") == 1  # okay method
-    with pytest.raises(NotImplementedError):
-        t.append("baz")
-    with pytest.raises(NotImplementedError):
-        t.sort()
-    with pytest.raises(NotImplementedError):
-        t.extend(["baz"])
-    with pytest.raises(NotImplementedError):
-        t.pop()
-    t = SimpleFrozenList(["foo", "bar"], error="Error!")
-    with pytest.raises(NotImplementedError):
-        t.append("baz")
-
-
-def test_resolve_dot_names():
-    config = {
-        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
-        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
-    }
-    result = util.resolve_dot_names(config, ["training.optimizer"])
-    assert isinstance(result[0], Optimizer)
-    with pytest.raises(ConfigValidationError) as e:
-        util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
-    errors = e.value.errors
-    assert len(errors) == 1
-    assert errors[0]["loc"] == ["training", "xyz"]

From 6958510bdaaa279c8b4f5184bbdbbe6cf3c7cf8a Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 13:53:07 +0200
Subject: [PATCH 398/516] Include spaCy version check in project CLI

---
 spacy/cli/_util.py                  |  7 +++++--
 spacy/cli/project/remote_storage.py |  7 +++++--
 spacy/cli/project/run.py            | 31 +++++++++++++++++++++++++----
 spacy/tests/test_misc.py            | 15 ++++++++++++++
 spacy/util.py                       | 27 +++++++++++++++++++++++++
 5 files changed, 79 insertions(+), 8 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index c959c9861..676a7c8d7 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,4 +1,4 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING
+from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
 import sys
 import shutil
 from pathlib import Path
@@ -193,12 +193,15 @@ def validate_project_commands(config: Dict[str, Any]) -> None:
                 )
 
 
-def get_hash(data) -> str:
+def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
     """Get the hash for a JSON-serializable object.
 
     data: The data to hash.
+    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
     RETURNS (str): The hash.
     """
+    if isinstance(data, dict):
+        data = {k: v for k, v in data.items() if k not in exclude}
     data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
     return hashlib.md5(data_str).hexdigest()
 
diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py
index e7e7cbbe8..7e2caa8d7 100644
--- a/spacy/cli/project/remote_storage.py
+++ b/spacy/cli/project/remote_storage.py
@@ -7,7 +7,8 @@ import tarfile
 from pathlib import Path
 
 from .._util import get_hash, get_checksum, download_file, ensure_pathy
-from ...util import make_tempdir
+from ...util import make_tempdir, get_minor_version
+from ... import about
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -129,7 +130,9 @@ def get_command_hash(
     currently installed packages, whatever environment variables have been marked
     as relevant, and the command.
     """
-    hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)]
+    spacy_v = get_minor_version(about.__version__)
+    dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
+    hashes = [spacy_v, site_hash, env_hash] + dep_checksums
     hashes.extend(cmd)
     creation_bytes = "".join(hashes).encode("utf8")
     return hashlib.md5(creation_bytes).hexdigest()
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index 69c49fba7..94d4371d0 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -4,8 +4,10 @@ from wasabi import msg
 import sys
 import srsly
 
+from ... import about
+from ...git_info import GIT_VERSION
 from ...util import working_dir, run_command, split_command, is_cwd, join_command
-from ...util import SimpleFrozenList
+from ...util import SimpleFrozenList, is_minor_version_match
 from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
 from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
 
@@ -63,11 +65,11 @@ def project_run(
                 err_kwargs = {"exits": 1} if not dry else {}
                 msg.fail(err, err_help, **err_kwargs)
         with working_dir(project_dir) as current_dir:
+            msg.divider(subcommand)
             rerun = check_rerun(current_dir, cmd)
             if not rerun and not force:
                 msg.info(f"Skipping '{cmd['name']}': nothing changed")
             else:
-                msg.divider(subcommand)
                 run_commands(cmd["script"], dry=dry)
                 if not dry:
                     update_lockfile(current_dir, cmd)
@@ -171,12 +173,18 @@ def validate_subcommand(
         )
 
 
-def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
+def check_rerun(
+    project_dir: Path,
+    command: Dict[str, Any],
+    check_spacy_version: bool = True,
+    check_spacy_commit: bool = False,
+) -> bool:
     """Check if a command should be rerun because its settings or inputs/outputs
     changed.
 
     project_dir (Path): The current project directory.
     command (Dict[str, Any]): The command, as defined in the project.yml.
+    check_spacy_version (bool): Whether to re-run if the spaCy minor version changed.
     RETURNS (bool): Whether to re-run the command.
     """
     lock_path = project_dir / PROJECT_LOCK
@@ -189,10 +197,23 @@ def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
     # Always run commands with no outputs (otherwise they'd always be skipped)
     if not entry.get("outs", []):
         return True
+    # Always rerun if spaCy version or commit hash changed
+    spacy_v = entry.get("spacy_version")
+    commit = entry.get("spacy_git_version")
+    if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
+        info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
+        return True
+    if check_spacy_commit and commit != GIT_VERSION:
+        info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
+        return True
     # If the entry in the lockfile matches the lockfile entry that would be
     # generated from the current command, we don't rerun because it means that
     # all inputs/outputs, hashes and scripts are the same and nothing changed
-    return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)
+    lock_entry = get_lock_entry(project_dir, command)
+    exclude = ["spacy_version", "spacy_git_version"]
+    return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
 
 
 def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
@@ -231,6 +252,8 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]
         "script": command["script"],
         "deps": deps,
         "outs": [*outs, *outs_nc],
+        "spacy_version": about.__version__,
+        "spacy_git_version": GIT_VERSION,
     }
 
 
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index bdf54ad6a..b9a0a9d05 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -149,6 +149,21 @@ def test_is_unconstrained_version(constraint, expected):
     assert util.is_unconstrained_version(constraint) is expected
 
 
+@pytest.mark.parametrize(
+    "a1,a2,b1,b2,is_match",
+    [
+        ("3.0.0", "3.0", "3.0.1", "3.0", True),
+        ("3.1.0", "3.1", "3.2.1", "3.2", False),
+        ("xxx", None, "1.2.3.dev0", "1.2", False),
+    ],
+)
+def test_minor_version(a1, a2, b1, b2, is_match):
+    assert util.get_minor_version(a1) == a2
+    assert util.get_minor_version(b1) == b2
+    assert util.is_minor_version_match(a1, b1) is is_match
+    assert util.is_minor_version_match(a2, b2) is is_match
+
+
 @pytest.mark.parametrize(
     "dot_notation,expected",
     [
diff --git a/spacy/util.py b/spacy/util.py
index 4d68e829c..4b2cb018a 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -584,6 +584,33 @@ def get_base_version(version: str) -> str:
     return Version(version).base_version
 
 
+def get_minor_version(version: str) -> Optional[str]:
+    """Get the major + minor version (without patch or prerelease identifiers).
+
+    version (str): The version.
+    RETURNS (str): The major + minor version or None if version is invalid.
+    """
+    try:
+        v = Version(version)
+    except (TypeError, InvalidVersion):
+        return None
+    return f"{v.major}.{v.minor}"
+
+
+def is_minor_version_match(version_a: str, version_b: str) -> bool:
+    """Compare two versions and check if they match in major and minor, without
+    patch or prerelease identifiers. Used internally for compatibility checks
+    that should be insensitive to patch releases.
+
+    version_a (str): The first version
+    version_b (str): The second version.
+    RETURNS (bool): Whether the versions match.
+    """
+    a = get_minor_version(version_a)
+    b = get_minor_version(version_b)
+    return a is not None and b is not None and a == b
+
+
 def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
     """Load a model meta.json from a path and validate its contents.
 

From d2b9aafb8c8d91ea74c2418d9fb32f1ce8812bbf Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 14:14:49 +0200
Subject: [PATCH 399/516] Fix augmenter

---
 spacy/training/augment.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index e6d10a195..06656bdd8 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -120,8 +120,8 @@ def make_orth_variants(
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
     logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
-    words = token_dict.get("words", [])
-    tags = token_dict.get("tags", [])
+    words = token_dict.get("ORTH", [])
+    tags = token_dict.get("TAG", [])
     # keep unmodified if words or tags are not defined
     if words and tags:
         if lower:
@@ -131,7 +131,7 @@ def make_orth_variants(
         for word_idx in range(len(words)):
             for punct_idx in range(len(ndsv)):
                 if (
-                    tags[word_idx] in ndsv[punct_idx]["tags"]
+                    tags[word_idx] in ndsv[punct_idx]["TAG"]
                     and words[word_idx] in ndsv[punct_idx]["variants"]
                 ):
                     words[word_idx] = punct_choices[punct_idx]
@@ -139,14 +139,14 @@ def make_orth_variants(
         punct_choices = [random.choice(x["variants"]) for x in ndpv]
         for word_idx in range(len(words)):
             for punct_idx in range(len(ndpv)):
-                if tags[word_idx] in ndpv[punct_idx]["tags"] and words[
+                if tags[word_idx] in ndpv[punct_idx]["TAG"] and words[
                     word_idx
                 ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
                     # backup option: random left vs. right from pair
                     pair_idx = random.choice([0, 1])
                     # best option: rely on paired POS tags like `` / ''
-                    if len(ndpv[punct_idx]["tags"]) == 2:
-                        pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
+                    if len(ndpv[punct_idx]["TAG"]) == 2:
+                        pair_idx = ndpv[punct_idx]["TAG"].index(tags[word_idx])
                     # next best option: rely on position in variants
                     # (may not be unambiguous, so order of variants matters)
                     else:
@@ -154,8 +154,8 @@ def make_orth_variants(
                             if words[word_idx] in pair:
                                 pair_idx = pair.index(words[word_idx])
                     words[word_idx] = punct_choices[punct_idx][pair_idx]
-        token_dict["words"] = words
-        token_dict["tags"] = tags
+        token_dict["ORTH"] = words
+        token_dict["TAG"] = tags
     # modify raw
     if raw is not None:
         variants = []

From 5d19dfc9d32c7fd039118d9fe0f8cf713e7af471 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 5 Oct 2020 14:21:53 +0200
Subject: [PATCH 400/516] Update Chinese tokenizer for spacy-pkuseg fork

---
 spacy/lang/zh/__init__.py | 62 +++++++++++----------------------------
 spacy/tests/conftest.py   |  5 ++--
 2 files changed, 19 insertions(+), 48 deletions(-)

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 55a77330a..8864ae119 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -17,8 +17,7 @@ from ... import util
 
 
 # fmt: off
-_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
-_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
+_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`"
 # fmt: on
 
 DEFAULT_CONFIG = """
@@ -120,12 +119,12 @@ class ChineseTokenizer(DummyTokenizer):
         if self.segmenter == Segmenter.pkuseg:
             if reset:
                 try:
-                    import pkuseg
+                    import spacy_pkuseg
 
-                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
+                    self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None)
                 except ImportError:
                     msg = (
-                        "pkuseg not installed: unable to reset pkuseg "
+                        "spacy_pkuseg not installed: unable to reset pkuseg "
                         "user dict. Please " + _PKUSEG_INSTALL_MSG
                     )
                     raise ImportError(msg) from None
@@ -156,22 +155,6 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.feature_extractor.save(tempdir)
                 self.pkuseg_seg.model.save(tempdir)
                 tempdir = Path(tempdir)
-                # pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
-                # means that it will be saved with pickle protocol 5 with
-                # python 3.8, which can't be reloaded with python 3.6-3.7.
-                # To try to make the model compatible with python 3.6+, reload
-                # the data with pickle5 and convert it back to protocol 4.
-                try:
-                    import pickle5
-
-                    with open(tempdir / "features.pkl", "rb") as fileh:
-                        features = pickle5.load(fileh)
-                    with open(tempdir / "features.pkl", "wb") as fileh:
-                        pickle5.dump(features, fileh, protocol=4)
-                except ImportError as e:
-                    raise e
-                except Exception:
-                    warnings.warn(_PKUSEG_PICKLE_WARNING)
                 with open(tempdir / "features.pkl", "rb") as fileh:
                     pkuseg_features_b = fileh.read()
                 with open(tempdir / "weights.npz", "rb") as fileh:
@@ -218,17 +201,17 @@ class ChineseTokenizer(DummyTokenizer):
                 with open(tempdir / "weights.npz", "wb") as fileh:
                     fileh.write(pkuseg_data["weights_b"])
                 try:
-                    import pkuseg
+                    import spacy_pkuseg
                 except ImportError:
                     raise ImportError(
-                        "pkuseg not installed. To use this model, "
+                        "spacy_pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG
                     ) from None
-                self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
+                self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
             if pkuseg_data["processors_data"]:
                 processors_data = pkuseg_data["processors_data"]
                 (user_dict, do_process, common_words, other_words) = processors_data
-                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
+                self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
                 self.pkuseg_seg.postprocesser.do_process = do_process
                 self.pkuseg_seg.postprocesser.common_words = set(common_words)
                 self.pkuseg_seg.postprocesser.other_words = set(other_words)
@@ -244,18 +227,6 @@ class ChineseTokenizer(DummyTokenizer):
                     path.mkdir(parents=True)
                 self.pkuseg_seg.model.save(path)
                 self.pkuseg_seg.feature_extractor.save(path)
-                # try to convert features.pkl to pickle protocol 4
-                try:
-                    import pickle5
-
-                    with open(path / "features.pkl", "rb") as fileh:
-                        features = pickle5.load(fileh)
-                    with open(path / "features.pkl", "wb") as fileh:
-                        pickle5.dump(features, fileh, protocol=4)
-                except ImportError as e:
-                    raise e
-                except Exception:
-                    warnings.warn(_PKUSEG_PICKLE_WARNING)
 
         def save_pkuseg_processors(path):
             if self.pkuseg_seg:
@@ -279,26 +250,26 @@ class ChineseTokenizer(DummyTokenizer):
 
         def load_pkuseg_model(path):
             try:
-                import pkuseg
+                import spacy_pkuseg
             except ImportError:
                 if self.segmenter == Segmenter.pkuseg:
                     raise ImportError(
-                        "pkuseg not installed. To use this model, "
+                        "spacy_pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG
                     ) from None
             if path.exists():
-                self.pkuseg_seg = pkuseg.pkuseg(path)
+                self.pkuseg_seg = spacy_pkuseg.pkuseg(path)
 
         def load_pkuseg_processors(path):
             try:
-                import pkuseg
+                import spacy_pkuseg
             except ImportError:
                 if self.segmenter == Segmenter.pkuseg:
                     raise ImportError(self._pkuseg_install_msg) from None
             if self.segmenter == Segmenter.pkuseg:
                 data = srsly.read_msgpack(path)
                 (user_dict, do_process, common_words, other_words) = data
-                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
+                self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
                 self.pkuseg_seg.postprocesser.do_process = do_process
                 self.pkuseg_seg.postprocesser.common_words = set(common_words)
                 self.pkuseg_seg.postprocesser.other_words = set(other_words)
@@ -341,12 +312,13 @@ def try_jieba_import() -> None:
 
 def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
     try:
-        import pkuseg
+        import spacy_pkuseg
 
-        return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
     except ImportError:
-        msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
+        msg = "spacy_pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
         raise ImportError(msg) from None
+    try:
+        return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
     except FileNotFoundError:
         msg = "Unable to load pkuseg model from: " + pkuseg_model
         raise FileNotFoundError(msg) from None
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 4a3d126d7..bb9f770bc 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -285,8 +285,7 @@ def zh_tokenizer_jieba():
 
 @pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
-    pytest.importorskip("pkuseg")
-    pytest.importorskip("pickle5")
+    pytest.importorskip("spacy_pkuseg")
     config = {
         "nlp": {
             "tokenizer": {
@@ -296,7 +295,7 @@ def zh_tokenizer_pkuseg():
         },
         "initialize": {
             "tokenizer": {
-                "pkuseg_model": "default",
+                "pkuseg_model": "web",
             }
         },
     }

From f4f49f5877d4a0cca4ef9e03ea1c39aa742ba797 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 5 Oct 2020 14:58:56 +0200
Subject: [PATCH 401/516] update blis (#6198)

* allow higher blis version

* fix typo

* bump to 3.0.0a34

* fix pins in other files
---
 pyproject.toml                   | 2 +-
 requirements.txt                 | 2 +-
 setup.cfg                        | 2 +-
 spacy/about.py                   | 2 +-
 spacy/pipeline/morphologizer.pyx | 2 +-
 spacy/pipeline/pipe.pyx          | 2 +-
 spacy/pipeline/senter.pyx        | 2 +-
 spacy/pipeline/tagger.pyx        | 2 +-
 spacy/pipeline/textcat.py        | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 611a95d27..d48886e0c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
     "thinc>=8.0.0a43,<8.0.0a50",
-    "blis>=0.4.0,<0.5.0",
+    "blis>=0.4.0,<0.8.0",
     "pytokenizations",
     "pathy"
 ]
diff --git a/requirements.txt b/requirements.txt
index 44dad38e3..29695e9b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.0a43,<8.0.0a50
-blis>=0.4.0,<0.5.0
+blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 7192ba9d4..d8362c4bd 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -41,7 +41,7 @@ install_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=8.0.0a43,<8.0.0a50
-    blis>=0.4.0,<0.5.0
+    blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.3.0,<3.0.0
     catalogue>=2.0.1,<2.1.0
diff --git a/spacy/about.py b/spacy/about.py
index dce627a38..392bfd589 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a33"
+__version__ = "3.0.0a34"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 82f3bf37d..6d97b062f 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -210,7 +210,7 @@ class Morphologizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/morphologizer#get_loss
         """
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 41ca23ace..8e103a638 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -162,7 +162,7 @@ cdef class Pipe:
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/pipe#get_loss
         """
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 0bfef7c7b..8fb1e664f 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -104,7 +104,7 @@ class SentenceRecognizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss
         """
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 6cb582b36..94ac0c082 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -249,7 +249,7 @@ class Tagger(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/tagger#get_loss
         """
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index fc60ebf89..292598e3a 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -281,7 +281,7 @@ class TextCategorizer(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
         """

From 251b3eb4e5c688e076f4e761a43ffbab9ea793b9 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 14:59:13 +0200
Subject: [PATCH 402/516] add initialize method for entity_ruler

---
 spacy/errors.py               |  2 ++
 spacy/pipeline/entityruler.py | 30 +++++++++++++++++++++++++++++-
 spacy/training/initialize.py  |  2 +-
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 20edf45b5..18abb6bba 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,6 +456,8 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
+    E900 = ("Patterns for component '{name}' not initialized. This can be fixed "
+            "by calling 'add_patterns' or 'initialize'.")
     E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
             "Try checking whitespace and delimiters. See "
             "https://nightly.spacy.io/api/cli#convert")
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 9166a69b8..a4bc098fb 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -1,7 +1,8 @@
-from typing import Optional, Union, List, Dict, Tuple, Iterable, Any
+from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable
 from collections import defaultdict
 from pathlib import Path
 import srsly
+from spacy.training import Example
 
 from ..language import Language
 from ..errors import Errors
@@ -133,6 +134,7 @@ class EntityRuler:
 
         DOCS: https://nightly.spacy.io/api/entityruler#call
         """
+        self._require_patterns()
         matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
         matches = set(
             [(m_id, start, end) for m_id, start, end in matches if start != end]
@@ -183,6 +185,27 @@ class EntityRuler:
                 all_labels.add(l)
         return tuple(all_labels)
 
+    def initialize(
+        self,
+        get_examples: Callable[[], Iterable[Example]],
+        *,
+        nlp: Optional[Language] = None,
+        patterns_path: Optional[Path] = None
+    ):
+        """Initialize the pipe for training.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        patterns_path: Path to serialized patterns.
+
+        DOCS (TODO): https://nightly.spacy.io/api/entityruler#initialize
+        """
+        if patterns_path:
+            patterns = srsly.read_jsonl(patterns_path)
+            self.add_patterns(patterns)
+
+
     @property
     def ent_ids(self) -> Tuple[str, ...]:
         """All entity ids present in the match patterns `id` properties
@@ -292,6 +315,11 @@ class EntityRuler:
         self.phrase_patterns = defaultdict(list)
         self._ent_ids = defaultdict(dict)
 
+    def _require_patterns(self) -> None:
+        """Raise an error if the component has no patterns."""
+        if not self.patterns or list(self.patterns) == [""]:
+            raise ValueError(Errors.E900.format(name=self.name))
+
     def _split_label(self, label: str) -> Tuple[str, str]:
         """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
 
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index bbdf4f62b..7c84caf95 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -49,7 +49,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-        logger.info("Initialized pipeline components")
+        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
     return nlp
 
 

From 8171e28b20aafc52ccf571b813b142b3355e550b Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 15:09:52 +0200
Subject: [PATCH 403/516] Remove logging [ci skip]

This would be fired on each example, which is wrong
---
 spacy/training/augment.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index e6d10a195..ee5992b36 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -5,7 +5,7 @@ import copy
 from functools import partial
 from pydantic import BaseModel, StrictStr
 
-from ..util import registry, logger
+from ..util import registry
 from ..tokens import Doc
 from .example import Example
 
@@ -119,7 +119,6 @@ def make_orth_variants(
     orig_token_dict = copy.deepcopy(token_dict)
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
-    logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
     words = token_dict.get("words", [])
     tags = token_dict.get("tags", [])
     # keep unmodified if words or tags are not defined

From 8ec79ad3fadd97f39b220c874e0df46921646fd0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 15:22:00 +0200
Subject: [PATCH 404/516] Allow configuration of MultiHashEmbed features

Update arguments to MultiHashEmbed layer so that the attributes can be
controlled. A kind of tricky scheme is used to allow optional
specification of the rows. I think it's an okay balance between
flexibility and convenience.
---
 spacy/ml/models/tok2vec.py | 100 +++++++++++++++++++++++++------------
 spacy/tests/test_models.py |  32 +++++++++++-
 2 files changed, 98 insertions(+), 34 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 1a0979cab..4abc1bee6 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -1,4 +1,4 @@
-from typing import Optional, List, Union
+from typing import Optional, List, Union, Dict
 from thinc.types import Floats2d
 from thinc.api import chain, clone, concatenate, with_array, with_padded
 from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
@@ -11,7 +11,7 @@ from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
+from ...attrs import ORTH, NORM, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
 
 
 @registry.architectures.register("spacy.Tok2VecListener.v1")
@@ -54,12 +54,16 @@ def build_hash_embed_cnn_tok2vec(
         a language such as Chinese.
     pretrained_vectors (bool): Whether to also use static vectors.
     """
+    if subword_features:
+        attrs = {"NORM": 1.0, "PREFIX": 0.5, "SUFFIX": 0.5, "SHAPE": 0.5}
+    else:
+        attrs = {"NORM": 1.0}
     return build_Tok2Vec_model(
         embed=MultiHashEmbed(
             width=width,
             rows=embed_size,
-            also_embed_subwords=subword_features,
-            also_use_static_vectors=bool(pretrained_vectors),
+            attrs=attrs,
+            include_static_vectors=bool(pretrained_vectors),
         ),
         encode=MaxoutWindowEncoder(
             width=width,
@@ -92,59 +96,89 @@ def build_Tok2Vec_model(
 
 
 @registry.architectures.register("spacy.MultiHashEmbed.v1")
-def MultiHashEmbed(
+def MultiHashEmbed_v1(
     width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
+) -> Model[List[Doc], List[Floats2d]]:
+    """Previous interface for MultiHashEmbed. This should be removed, it's only
+    here as a temporary compatibility."""
+    return MultiHashEmbed(
+        width=width,
+        rows=rows, 
+        attrs=[NORM, PREFIX, SUFFIX, SHAPE] if also_embed_subwords else [NORM],
+        include_static_vectors=also_use_static_vectors
+    )
+
+@registry.architectures.register("spacy.MultiHashEmbed.v2")
+def MultiHashEmbed(
+    width: int,
+    rows: int,
+    attrs: Union[List[Union[str, int]], Dict[Union[str, int], float]],
+    include_static_vectors: bool
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
     attributes using hash embedding, concatenates the results, and passes it
     through a feed-forward subnetwork to build a mixed representations.
 
-    The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
-    varying definitions depending on the Vocab of the Doc object passed in.
-    Vectors from pretrained static vectors can also be incorporated into the
-    concatenated representation.
+    The features used can be configured with the 'attrs' argument. The suggested
+    attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
+    account some subword information, without contruction a fully character-based
+    representation. If pretrained vectors are available, they can be included in
+    the representation as well, with the vectors table will be kept static
+    (i.e. it's not updated).
+
+    The `width` parameter specifices the output width of the layer and the widths
+    of all embedding tables. If static vectors are included, a learned linear
+    layer is used to map the vectors to the specified width before concatenating
+    it with the other embedding outputs. A single Maxout layer is then used to
+    reduce the concatenated vectors to the final width.
+    
+    The `rows` parameter controls the number of rows used by the `HashEmbed`
+    tables. The HashEmbed layer needs surprisingly few rows, due to its use of
+    the hashing trick. Generally between 2000 and 10000 rows is sufficient,
+    even for very large vocabularies. You can vary the number of rows per
+    attribute by specifying the attrs as a dict, mapping the keys to float
+    values which are interpreted as factors of `rows`. For instance, 
+    attrs={"NORM": 1.0, PREFIX: 0.2} will use rows*1 for the NORM table and 
+    rows*0.2 for the PREFIX table. If `attrs` is a list, factors of 1.0 are
+    assumed for all attributes.
 
     width (int): The output width. Also used as the width of the embedding tables.
         Recommended values are between 64 and 300.
-    rows (int): The number of rows for the embedding tables. Can be low, due
-        to the hashing trick. Embeddings for prefix, suffix and word shape
-        use half as many rows. Recommended values are between 2000 and 10000.
-    also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE
-        features in the embeddings. If not using these, you may need more
-        rows in your hash embeddings, as there will be increased chance of
-        collisions.
-    also_use_static_vectors (bool): Whether to also use static word vectors.
+    rows (int): The base number of rows for the embedding tables. Can be low, due
+        to the hashing trick. The rows can be varied per attribute by providing
+        a dictionary as the value of `attrs`.
+    attrs (dict or list of attr IDs): The token attributes to embed. A separate
+        embedding table will be constructed for each attribute. Attributes
+        can be specified as a list or as a dictionary, which lets you control
+        the number of rows used for each table.
+    include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
-    cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
+    if isinstance(attrs, dict):
+        # Exclude tables that would have 0 rows.
+        attrs = {key: value for key, value in attrs.items() if value > 0.0}
+    indices = {attr: i for i, attr in enumerate(attrs)}
     seed = 7
 
     def make_hash_embed(feature):
         nonlocal seed
+        row_factor = attrs[feature] if isinstance(attrs, dict) else 1.0
         seed += 1
         return HashEmbed(
             width,
-            rows if feature == LOWER else rows // 2,
-            column=cols.index(feature),
+            int(rows * row_factor),
+            column=indices[feature],
             seed=seed,
             dropout=0.0,
         )
 
-    if also_embed_subwords:
-        embeddings = [
-            make_hash_embed(LOWER),
-            make_hash_embed(PREFIX),
-            make_hash_embed(SUFFIX),
-            make_hash_embed(SHAPE),
-        ]
-    else:
-        embeddings = [make_hash_embed(LOWER)]
-    concat_size = width * (len(embeddings) + also_use_static_vectors)
-    if also_use_static_vectors:
+    embeddings = [make_hash_embed(attr) for attr in attrs]
+    concat_size = width * (len(embeddings) + include_static_vectors)
+    if include_static_vectors:
         model = chain(
             concatenate(
                 chain(
-                    FeatureExtractor(cols),
+                    FeatureExtractor(list(attrs)),
                     list2ragged(),
                     with_array(concatenate(*embeddings)),
                 ),
@@ -155,7 +189,7 @@ def MultiHashEmbed(
         )
     else:
         model = chain(
-            FeatureExtractor(cols),
+            FeatureExtractor(list(attrs)),
             list2ragged(),
             with_array(concatenate(*embeddings)),
             with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)),
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index a123f459d..3bd3b903d 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -6,6 +6,7 @@ from numpy.testing import assert_array_equal
 import numpy
 
 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
+from spacy.ml.models import MultiHashEmbed_v1
 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
 from spacy.lang.en import English
 from spacy.lang.en.examples import sentences as EN_SENTENCES
@@ -61,7 +62,10 @@ def get_tok2vec_kwargs():
     # This actually creates models, so seems best to put it in a function.
     return {
         "embed": MultiHashEmbed(
-            width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
+            width=32,
+            rows=500,
+            attrs=["NORM", "PREFIX", "SHAPE"],
+            include_static_vectors=False
         ),
         "encode": MaxoutWindowEncoder(
             width=32, depth=2, maxout_pieces=2, window_size=1
@@ -73,6 +77,32 @@ def test_tok2vec():
     return build_Tok2Vec_model(**get_tok2vec_kwargs())
 
 
+def test_multi_hash_embed():
+    embed = MultiHashEmbed(
+        width=32,
+        rows=500,
+        attrs=["NORM", "PREFIX", "SHAPE"],
+        include_static_vectors=False
+    )
+    hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
+    assert len(hash_embeds) == 3
+    # Check they look at different columns.
+    assert list(sorted(he.attrs["column"] for he in hash_embeds)) == [0, 1, 2]
+    # Check they use different seeds
+    assert len(set(he.attrs["seed"] for he in hash_embeds)) == 3
+    # Check they all have the same number of rows
+    assert [he.get_dim("nV") for he in hash_embeds] == [500, 500, 500]
+    # Now try with different row factors
+    embed = MultiHashEmbed(
+        width=32,
+        rows=500,
+        attrs={"NORM": 2.0, "PREFIX": 0.1, "SHAPE": 0.5},
+        include_static_vectors=False
+    )
+    hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
+    assert [he.get_dim("nV") for he in hash_embeds] == [1000, 50, 250]
+ 
+
 @pytest.mark.parametrize(
     "seed,model_func,kwargs",
     [

From f2f1deca662a197c8e605e32238bfa015851f2ad Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 15:24:33 +0200
Subject: [PATCH 405/516] Update test_empty_doc to new MultiHashEmbed args

---
 spacy/tests/pipeline/test_tok2vec.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 06212e351..78a677acf 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -25,8 +25,8 @@ def test_empty_doc():
         MultiHashEmbed(
             width=width,
             rows=embed_size,
-            also_use_static_vectors=False,
-            also_embed_subwords=True,
+            include_static_vectors=False,
+            attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
         ),
         MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
     )

From f4ca9a39cb5245da78f01d39f95efa53924ae15a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 15:27:06 +0200
Subject: [PATCH 406/516] Update tok2vec tests to new MultiHashEmbed args

---
 spacy/tests/pipeline/test_tok2vec.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 78a677acf..df844365b 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -45,8 +45,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
         MultiHashEmbed(
             width=width,
             rows=embed_size,
-            also_use_static_vectors=False,
-            also_embed_subwords=True,
+            include_static_vectors=False,
+            attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
         ),
         MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
     )
@@ -61,8 +61,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
 @pytest.mark.parametrize(
     "width,embed_arch,embed_config,encode_arch,encode_config",
     [
-        (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
-        (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
+        (8, MultiHashEmbed, {"rows": 100, "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
+        (8, MultiHashEmbed, {"rows": 100, "attrs": {"ORTH": 1.0, "PREFIX": 0.2}, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
         (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
         (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
     ],

From 7d93575f35a7fb8484096b772ce71834bfd1914a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 15:28:12 +0200
Subject: [PATCH 407/516] Fix leftover also_use_static_vectors in tok2vec test

---
 spacy/tests/pipeline/test_tok2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index df844365b..aa60faf5b 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -62,7 +62,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
     "width,embed_arch,embed_config,encode_arch,encode_config",
     [
         (8, MultiHashEmbed, {"rows": 100, "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
-        (8, MultiHashEmbed, {"rows": 100, "attrs": {"ORTH": 1.0, "PREFIX": 0.2}, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
+        (8, MultiHashEmbed, {"rows": 100, "attrs": {"ORTH": 1.0, "PREFIX": 0.2}, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
         (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
         (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
     ],

From eb9ba61517e4e7f39b5521313e797bdbbf6740af Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 15:29:49 +0200
Subject: [PATCH 408/516] Format

---
 spacy/ml/models/tok2vec.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 4abc1bee6..6e5aed77b 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -103,17 +103,18 @@ def MultiHashEmbed_v1(
     here as a temporary compatibility."""
     return MultiHashEmbed(
         width=width,
-        rows=rows, 
+        rows=rows,
         attrs=[NORM, PREFIX, SUFFIX, SHAPE] if also_embed_subwords else [NORM],
-        include_static_vectors=also_use_static_vectors
+        include_static_vectors=also_use_static_vectors,
     )
 
+
 @registry.architectures.register("spacy.MultiHashEmbed.v2")
 def MultiHashEmbed(
     width: int,
     rows: int,
     attrs: Union[List[Union[str, int]], Dict[Union[str, int], float]],
-    include_static_vectors: bool
+    include_static_vectors: bool,
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
     attributes using hash embedding, concatenates the results, and passes it

From 90040aacec90f18d7e5a0c5f051352316f9e5cd0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 16:12:01 +0200
Subject: [PATCH 409/516] Fix merge

---
 spacy/training/augment.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index bbe164aed..685016b62 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -119,7 +119,6 @@ def make_orth_variants(
     orig_token_dict = copy.deepcopy(token_dict)
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
-    logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
     words = token_dict.get("ORTH", [])
     tags = token_dict.get("TAG", [])
     # keep unmodified if words or tags are not defined

From 65abd777796b6850117180dc90399c5fb7f02ce3 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 16:23:33 +0200
Subject: [PATCH 410/516] add finish_update to Pipe

---
 spacy/language.py                    |  2 +-
 spacy/pipeline/entity_linker.py      |  2 +-
 spacy/pipeline/multitask.pyx         |  2 +-
 spacy/pipeline/pipe.pyx              | 13 ++++++++++++-
 spacy/pipeline/tagger.pyx            |  4 ++--
 spacy/pipeline/textcat.py            |  4 ++--
 spacy/pipeline/tok2vec.py            |  2 +-
 spacy/pipeline/transition_parser.pyx |  4 ++--
 website/docs/api/pipe.md             | 18 ++++++++++++++++++
 9 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 9fdde03d5..be5886efa 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1094,7 +1094,7 @@ class Language:
                     and hasattr(proc, "model")
                     and proc.model not in (True, False, None)
                 ):
-                    proc.model.finish_update(sgd)
+                    proc.finish_update(sgd)
         return losses
 
     def rehearse(
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index b67a15d32..2a5f3962d 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -238,7 +238,7 @@ class EntityLinker(Pipe):
         )
         bp_context(d_scores)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss
         if set_annotations:
             self.set_annotations(docs, predictions)
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
index ba351f16e..fa304b842 100644
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@@ -209,7 +209,7 @@ class ClozeMultitask(Pipe):
         loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
         bp_predictions(d_predictions)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if losses is not None:
             losses[self.name] += loss
         return losses
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 41ca23ace..585cdc780 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -132,7 +132,7 @@ cdef class Pipe:
         loss, d_scores = self.get_loss(examples, scores)
         bp_scores(d_scores)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]
@@ -245,6 +245,17 @@ cdef class Pipe:
         with self.model.use_params(params):
             yield
 
+    def finish_update(self, sgd):
+        """Update parameters using the current parameter gradients.
+        The Optimizer instance contains the functionality to perform
+        the stochastic gradient descent.
+
+        sgd (thinc.api.Optimizer): The optimizer.
+
+        DOCS: https://nightly.spacy.io/api/pipe#finish_update
+        """
+        self.model.finish_update(sgd)
+
     def score(self, examples, **kwargs):
         """Score a batch of examples.
 
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 6cb582b36..5122e8ea9 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -203,7 +203,7 @@ class Tagger(Pipe):
         loss, d_tag_scores = self.get_loss(examples, tag_scores)
         bp_tag_scores(d_tag_scores)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
 
         losses[self.name] += loss
         if set_annotations:
@@ -238,7 +238,7 @@ class Tagger(Pipe):
         target = self._rehearsal_model(examples)
         gradient = guesses - target
         backprop(gradient)
-        self.model.finish_update(sgd)
+        self.finish_update(sgd)
         if losses is not None:
             losses.setdefault(self.name, 0.0)
             losses[self.name] += (gradient**2).sum()
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index fc60ebf89..a37212e9e 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -212,7 +212,7 @@ class TextCategorizer(Pipe):
         loss, d_scores = self.get_loss(examples, scores)
         bp_scores(d_scores)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss
         if set_annotations:
             docs = [eg.predicted for eg in examples]
@@ -256,7 +256,7 @@ class TextCategorizer(Pipe):
         gradient = scores - target
         bp_scores(gradient)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if losses is not None:
             losses[self.name] += (gradient ** 2).sum()
         return losses
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 89f9df757..0f309326e 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -188,7 +188,7 @@ class Tok2Vec(Pipe):
             accumulate_gradient(one_d_tokvecs)
             d_docs = bp_tokvecs(d_tokvecs)
             if sgd is not None:
-                self.model.finish_update(sgd)
+                self.finish_update(sgd)
             return d_docs
 
         batch_id = Tok2VecListener.get_batch_id(docs)
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index bcaa8e8d4..2ad0acd3a 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -315,7 +315,7 @@ cdef class Parser(Pipe):
 
         backprop_tok2vec(golds)
         if sgd not in (None, False):
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         if set_annotations:
             docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, all_states)
@@ -367,7 +367,7 @@ cdef class Parser(Pipe):
         # Do the backprop
         backprop_tok2vec(docs)
         if sgd is not None:
-            self.model.finish_update(sgd)
+            self.finish_update(sgd)
         losses[self.name] += loss / n_scores
         del backprop
         del backprop_tok2vec
diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index de35f9eb4..b98768dcf 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -294,6 +294,24 @@ context, the original parameters are restored.
 | -------- | -------------------------------------------------- |
 | `params` | The parameter values to use in the model. ~~dict~~ |
 
+## Pipe.finish_update {#finish_update tag="method"}
+
+Update parameters using the current parameter gradients. Defaults to calling
+[`self.model.finish_update`](https://thinc.ai/docs/api-model#finish_update).
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> optimizer = nlp.initialize()
+> losses = pipe.update(examples, sgd=None)
+> pipe.finish_update(sgd)
+> ```
+
+| Name  | Description                           |
+| ----- | ------------------------------------- |
+| `sgd` | An optimizer. ~~Optional[Optimizer]~~ |
+
 ## Pipe.add_label {#add_label tag="method"}
 
 > #### Example

From 187234648cfb20974cdbf79b0d8a477c0aaf36b3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 5 Oct 2020 16:24:28 +0200
Subject: [PATCH 411/516] Revert back to "default" as default for
 pkuseg_user_dict

---
 spacy/lang/zh/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 8864ae119..5d4d55aed 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -81,9 +81,11 @@ class ChineseTokenizer(DummyTokenizer):
         *,
         nlp: Optional[Language] = None,
         pkuseg_model: Optional[str] = None,
-        pkuseg_user_dict: str = "default",
+        pkuseg_user_dict: Optional[str] = "default",
     ):
         if self.segmenter == Segmenter.pkuseg:
+            if pkuseg_user_dict is None:
+                pkuseg_user_dict = pkuseg_model
             self.pkuseg_seg = try_pkuseg_import(
                 pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
             )

From dc06912c764991d2d6718919e5e96cae867a472d Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 16:33:28 +0200
Subject: [PATCH 412/516] prevent loss keyerror for non-trainable components

---
 spacy/training/loggers.py | 17 ++++-------------
 spacy/training/loop.py    |  5 +++--
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index f0ca7064a..467f1e36b 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -41,19 +41,10 @@ def console_logger(progress_bar: bool = False):
                 if progress is not None:
                     progress.update(1)
                 return
-            try:
-                losses = [
-                    "{0:.2f}".format(float(info["losses"][pipe_name]))
-                    for pipe_name in logged_pipes
-                ]
-            except KeyError as e:
-                raise KeyError(
-                    Errors.E983.format(
-                        dict="scores (losses)",
-                        key=str(e),
-                        keys=list(info["losses"].keys()),
-                    )
-                ) from None
+            losses = [
+                "{0:.2f}".format(float(info["losses"][pipe_name]))
+                for pipe_name in logged_pipes if pipe_name in info["losses"]
+            ]
 
             scores = []
             for col in score_cols:
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 0d4414964..8f0aea6d4 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -184,7 +184,7 @@ def train_while_improving(
                 and hasattr(proc, "model")
                 and proc.model not in (True, False, None)
             ):
-                proc.model.finish_update(optimizer)
+                proc.finish_update(optimizer)
         optimizer.step_schedules()
         if not (step % eval_frequency):
             if optimizer.averages:
@@ -287,7 +287,8 @@ def update_meta(
         if metric is not None:
             nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
     for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
+        if pipe_name in info["losses"]:
+            nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 
 
 def create_before_to_disk_callback(

From 9f1bc3f24c6c9f0412f815abe044274d3840fa23 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 16:40:23 +0200
Subject: [PATCH 413/516] Fix augment

---
 spacy/training/augment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 685016b62..c538f02d0 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -130,7 +130,7 @@ def make_orth_variants(
         for word_idx in range(len(words)):
             for punct_idx in range(len(ndsv)):
                 if (
-                    tags[word_idx] in ndsv[punct_idx]["TAG"]
+                    tags[word_idx] in ndsv[punct_idx]["tags"]
                     and words[word_idx] in ndsv[punct_idx]["variants"]
                 ):
                     words[word_idx] = punct_choices[punct_idx]

From 4ed3e037df766aa2f2827a4b1a63a1f80a79485b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 16:40:23 +0200
Subject: [PATCH 414/516] Fix augment

---
 spacy/training/augment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 06656bdd8..7db8919e9 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -131,7 +131,7 @@ def make_orth_variants(
         for word_idx in range(len(words)):
             for punct_idx in range(len(ndsv)):
                 if (
-                    tags[word_idx] in ndsv[punct_idx]["TAG"]
+                    tags[word_idx] in ndsv[punct_idx]["tags"]
                     and words[word_idx] in ndsv[punct_idx]["variants"]
                 ):
                     words[word_idx] = punct_choices[punct_idx]

From 3ee3649b525a9bc1ddd8f531a10ffc213d185e46 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 16:59:49 +0200
Subject: [PATCH 415/516] Fix augment

---
 spacy/training/augment.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index b6e22542a..e76ee49f7 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -144,8 +144,8 @@ def make_orth_variants(
                     # backup option: random left vs. right from pair
                     pair_idx = random.choice([0, 1])
                     # best option: rely on paired POS tags like `` / ''
-                    if len(ndpv[punct_idx]["TAG"]) == 2:
-                        pair_idx = ndpv[punct_idx]["TAG"].index(tags[word_idx])
+                    if len(ndpv[punct_idx]["tags"]) == 2:
+                        pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
                     # next best option: rely on position in variants
                     # (may not be unambiguous, so order of variants matters)
                     else:

From 84fedcebab288a19aebb4dc4462f346bf2cecc8f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 17:07:35 +0200
Subject: [PATCH 416/516] Make args keyword-only [ci skip]

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
---
 spacy/cli/project/run.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index 94d4371d0..ea4675d60 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -176,6 +176,7 @@ def validate_subcommand(
 def check_rerun(
     project_dir: Path,
     command: Dict[str, Any],
+    *,
     check_spacy_version: bool = True,
     check_spacy_commit: bool = False,
 ) -> bool:

From 4e3ace4b8c32b1b8806874e2c3120989f9ddaba9 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 17:43:42 +0200
Subject: [PATCH 417/516] is_trainable method

---
 spacy/language.py             |  7 +++++--
 spacy/pipeline/entityruler.py | 17 +++++++++--------
 spacy/pipeline/pipe.pyx       |  3 +++
 spacy/training/loggers.py     | 10 +++++++---
 spacy/training/loop.py        |  3 ++-
 5 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index be5886efa..c3c49d331 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1091,7 +1091,8 @@ class Language:
             for name, proc in self.pipeline:
                 if (
                     name not in exclude
-                    and hasattr(proc, "model")
+                    and hasattr(proc, "is_trainable")
+                    and proc.is_trainable()
                     and proc.model not in (True, False, None)
                 ):
                     proc.finish_update(sgd)
@@ -1297,7 +1298,9 @@ class Language:
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("batch_size", batch_size)
-            if not hasattr(pipe, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if not hasattr(pipe, "pipe") or not hasattr(pipe, "is_trainable") or not pipe.is_trainable():
                 docs = _pipe(docs, pipe, kwargs)
             else:
                 docs = pipe.pipe(docs, **kwargs)
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index a4bc098fb..e89dd8410 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -2,8 +2,9 @@ from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable
 from collections import defaultdict
 from pathlib import Path
 import srsly
-from spacy.training import Example
 
+from .pipe import Pipe
+from ..training import Example
 from ..language import Language
 from ..errors import Errors
 from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
@@ -51,7 +52,7 @@ def make_entity_ruler(
     )
 
 
-class EntityRuler:
+class EntityRuler(Pipe):
     """The EntityRuler lets you add spans to the `Doc.ents` using token-based
     rules or exact phrase matches. It can be combined with the statistical
     `EntityRecognizer` to boost accuracy, or used on its own to implement a
@@ -134,7 +135,6 @@ class EntityRuler:
 
         DOCS: https://nightly.spacy.io/api/entityruler#call
         """
-        self._require_patterns()
         matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
         matches = set(
             [(m_id, start, end) for m_id, start, end in matches if start != end]
@@ -315,11 +315,6 @@ class EntityRuler:
         self.phrase_patterns = defaultdict(list)
         self._ent_ids = defaultdict(dict)
 
-    def _require_patterns(self) -> None:
-        """Raise an error if the component has no patterns."""
-        if not self.patterns or list(self.patterns) == [""]:
-            raise ValueError(Errors.E900.format(name=self.name))
-
     def _split_label(self, label: str) -> Tuple[str, str]:
         """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
 
@@ -348,6 +343,12 @@ class EntityRuler:
         validate_examples(examples, "EntityRuler.score")
         return Scorer.score_spans(examples, "ents", **kwargs)
 
+    def predict(self, docs):
+        pass
+
+    def set_annotations(self, docs, scores):
+        pass
+
     def from_bytes(
         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> "EntityRuler":
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 585cdc780..70cc1e54e 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -228,6 +228,9 @@ cdef class Pipe:
     def is_resizable(self):
         return hasattr(self, "model") and "resize_output" in self.model.attrs
 
+    def is_trainable(self):
+        return hasattr(self, "model") and isinstance(self.model, Model)
+
     def set_output(self, nO):
         if self.is_resizable():
             self.model.attrs["resize_output"](self.model, nO)
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index 467f1e36b..3a133a0df 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -17,8 +17,12 @@ def console_logger(progress_bar: bool = False):
         nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
     ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
         msg = Printer(no_print=True)
-        # we assume here that only components are enabled that should be trained & logged
-        logged_pipes = nlp.pipe_names
+        # ensure that only trainable components are logged
+        logged_pipes = [
+            name
+            for name, proc in nlp.pipeline
+            if hasattr(proc, "is_trainable") and proc.is_trainable()
+        ]
         eval_frequency = nlp.config["training"]["eval_frequency"]
         score_weights = nlp.config["training"]["score_weights"]
         score_cols = [col for col, value in score_weights.items() if value is not None]
@@ -43,7 +47,7 @@ def console_logger(progress_bar: bool = False):
                 return
             losses = [
                 "{0:.2f}".format(float(info["losses"][pipe_name]))
-                for pipe_name in logged_pipes if pipe_name in info["losses"]
+                for pipe_name in logged_pipes
             ]
 
             scores = []
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 8f0aea6d4..12395e0b4 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -181,7 +181,8 @@ def train_while_improving(
         for name, proc in nlp.pipeline:
             if (
                 name not in exclude
-                and hasattr(proc, "model")
+                and hasattr(proc, "is_trainable")
+                and proc.is_trainable()
                 and proc.model not in (True, False, None)
             ):
                 proc.finish_update(optimizer)

From f102ef6b54bbc0ddaf7c093dee7fcacaf667c2ed Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 5 Oct 2020 17:47:39 +0200
Subject: [PATCH 418/516] Read features.msgpack instead of features.pkl

---
 spacy/lang/zh/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 5d4d55aed..f9065f92c 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -157,7 +157,7 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.feature_extractor.save(tempdir)
                 self.pkuseg_seg.model.save(tempdir)
                 tempdir = Path(tempdir)
-                with open(tempdir / "features.pkl", "rb") as fileh:
+                with open(tempdir / "features.msgpack", "rb") as fileh:
                     pkuseg_features_b = fileh.read()
                 with open(tempdir / "weights.npz", "rb") as fileh:
                     pkuseg_weights_b = fileh.read()
@@ -198,7 +198,7 @@ class ChineseTokenizer(DummyTokenizer):
         if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
             with tempfile.TemporaryDirectory() as tempdir:
                 tempdir = Path(tempdir)
-                with open(tempdir / "features.pkl", "wb") as fileh:
+                with open(tempdir / "features.msgpack", "wb") as fileh:
                     fileh.write(pkuseg_data["features_b"])
                 with open(tempdir / "weights.npz", "wb") as fileh:
                     fileh.write(pkuseg_data["weights_b"])

From 3ac3447eee4b417ab257068bb894474bd6d6c059 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 17:50:37 +0200
Subject: [PATCH 419/516] cleanup

---
 spacy/errors.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 59da84890..9d9a716d2 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,8 +456,6 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
-    E900 = ("Patterns for component '{name}' not initialized. This can be fixed "
-            "by calling 'add_patterns' or 'initialize'.")
     E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
             "Try checking whitespace and delimiters. See "
             "https://nightly.spacy.io/api/cli#convert")

From 193e0d5a98e81a52730e1721bacb7b220e93affe Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 18:04:08 +0200
Subject: [PATCH 420/516] add docs for entity_ruler.initialize

---
 spacy/pipeline/entityruler.py   |  2 +-
 website/docs/api/entityruler.md | 26 +++++++++++++++++++++++++-
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index e89dd8410..cad6dbdbc 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -199,7 +199,7 @@ class EntityRuler(Pipe):
         nlp (Language): The current nlp object the component is part of.
         patterns_path: Path to serialized patterns.
 
-        DOCS (TODO): https://nightly.spacy.io/api/entityruler#initialize
+        DOCS: https://nightly.spacy.io/api/entityruler#initialize
         """
         if patterns_path:
             patterns = srsly.read_jsonl(patterns_path)
diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md
index 7b7e5b635..052047635 100644
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@@ -74,6 +74,30 @@ be a token pattern (list) or a phrase pattern (string). For example:
 | `ent_id_sep`                      | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~                                                                                                                                                                 |
 | `patterns`                        | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~                                                                                                                                 |
 
+## EntityRuler.initialize {#initialize tag="method" new="3"}
+
+Initialize the component with patterns from a file.
+
+> #### Example
+>
+> ```python
+> entity_ruler = nlp.add_pipe("entity_ruler")
+> entity_ruler.initialize(lambda: [], nlp=nlp, patterns_path=patterns_path)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.entity_ruler]
+> patterns_path = "data/patterns/patterns.jsonl"
+> ```
+
+| Name           | Description                                                                                                                                                          |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
+| _keyword-only_ |                                                                                                                                                                      |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                 |
+| `patterns_path` | Path to the .jsonl file holding the serialized patterns. ~~Path~~                                                                                                   |
+
 ## EntityRuler.\_\len\_\_ {#len tag="method"}
 
 The number of all patterns added to the entity ruler.
@@ -256,6 +280,6 @@ Get all patterns that were added to the entity ruler.
 | Name              | Description                                                                                                           |
 | ----------------- | --------------------------------------------------------------------------------------------------------------------- |
 | `matcher`         | The underlying matcher used to process token patterns. ~~Matcher~~                                                    |
-| `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~                                     |
+| `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~                                      |
 | `token_patterns`  | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
 | `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~                             |

From d2806f11f2ad87b97a6571b6b71d5fe33f544ae0 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 5 Oct 2020 18:08:32 +0200
Subject: [PATCH 421/516] Update to spacy-pkuseg==0.0.26 in Makefile

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index a4df0f8c8..3f10e79cc 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba spacy-pkuseg==0.0.26 sudachipy sudachidict_core
 endif
 
 ifndef PYVER

From 6dcc4a0ba63370f2b27713b5f7e86e6a8de6c825 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 19:57:45 +0200
Subject: [PATCH 422/516] Simplify MultiHashEmbed signature

---
 spacy/ml/models/tok2vec.py           | 48 +++++++++++-----------------
 spacy/tests/pipeline/test_tok2vec.py | 16 +++++-----
 spacy/tests/test_models.py           |  8 ++---
 3 files changed, 31 insertions(+), 41 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 6e5aed77b..f0e846bac 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -55,13 +55,15 @@ def build_hash_embed_cnn_tok2vec(
     pretrained_vectors (bool): Whether to also use static vectors.
     """
     if subword_features:
-        attrs = {"NORM": 1.0, "PREFIX": 0.5, "SUFFIX": 0.5, "SHAPE": 0.5}
+        attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+        row_sizes = [embed_size, embed_size//2, embed_size//2, embed_size//2]
     else:
-        attrs = {"NORM": 1.0}
+        attrs = ["NORM"]
+        row_sizes = [embed_size]
     return build_Tok2Vec_model(
         embed=MultiHashEmbed(
             width=width,
-            rows=embed_size,
+            rows=row_sizes,
             attrs=attrs,
             include_static_vectors=bool(pretrained_vectors),
         ),
@@ -103,7 +105,7 @@ def MultiHashEmbed_v1(
     here as a temporary compatibility."""
     return MultiHashEmbed(
         width=width,
-        rows=rows,
+        rows=[rows, rows//2, rows//2, rows//2] if also_embed_subwords else [rows],
         attrs=[NORM, PREFIX, SUFFIX, SHAPE] if also_embed_subwords else [NORM],
         include_static_vectors=also_use_static_vectors,
     )
@@ -112,8 +114,8 @@ def MultiHashEmbed_v1(
 @registry.architectures.register("spacy.MultiHashEmbed.v2")
 def MultiHashEmbed(
     width: int,
-    rows: int,
-    attrs: Union[List[Union[str, int]], Dict[Union[str, int], float]],
+    attrs: List[Union[str, int]],
+    rows: List[int],
     include_static_vectors: bool,
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
@@ -136,50 +138,38 @@ def MultiHashEmbed(
     The `rows` parameter controls the number of rows used by the `HashEmbed`
     tables. The HashEmbed layer needs surprisingly few rows, due to its use of
     the hashing trick. Generally between 2000 and 10000 rows is sufficient,
-    even for very large vocabularies. You can vary the number of rows per
-    attribute by specifying the attrs as a dict, mapping the keys to float
-    values which are interpreted as factors of `rows`. For instance, 
-    attrs={"NORM": 1.0, PREFIX: 0.2} will use rows*1 for the NORM table and 
-    rows*0.2 for the PREFIX table. If `attrs` is a list, factors of 1.0 are
-    assumed for all attributes.
+    even for very large vocabularies. A number of rows must be specified for each
+    table, so the `rows` list must be of the same length as the `attrs` parameter.
 
     width (int): The output width. Also used as the width of the embedding tables.
         Recommended values are between 64 and 300.
-    rows (int): The base number of rows for the embedding tables. Can be low, due
-        to the hashing trick. The rows can be varied per attribute by providing
-        a dictionary as the value of `attrs`.
-    attrs (dict or list of attr IDs): The token attributes to embed. A separate
-        embedding table will be constructed for each attribute. Attributes
-        can be specified as a list or as a dictionary, which lets you control
-        the number of rows used for each table.
+    attrs (list of attr IDs): The token attributes to embed. A separate
+        embedding table will be constructed for each attribute.
+    rows (List[int]): The number of rows in the embedding tables. Must have the
+        same length as attrs.
     include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
-    if isinstance(attrs, dict):
-        # Exclude tables that would have 0 rows.
-        attrs = {key: value for key, value in attrs.items() if value > 0.0}
-    indices = {attr: i for i, attr in enumerate(attrs)}
     seed = 7
 
-    def make_hash_embed(feature):
+    def make_hash_embed(index):
         nonlocal seed
-        row_factor = attrs[feature] if isinstance(attrs, dict) else 1.0
         seed += 1
         return HashEmbed(
             width,
-            int(rows * row_factor),
-            column=indices[feature],
+            rows[index],
+            column=index,
             seed=seed,
             dropout=0.0,
         )
 
-    embeddings = [make_hash_embed(attr) for attr in attrs]
+    embeddings = [make_hash_embed(i) for i in range(len(attrs))]
     concat_size = width * (len(embeddings) + include_static_vectors)
     if include_static_vectors:
         model = chain(
             concatenate(
                 chain(
-                    FeatureExtractor(list(attrs)),
+                    FeatureExtractor(attrs),
                     list2ragged(),
                     with_array(concatenate(*embeddings)),
                 ),
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index aa60faf5b..e86d97a54 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -24,7 +24,7 @@ def test_empty_doc():
     tok2vec = build_Tok2Vec_model(
         MultiHashEmbed(
             width=width,
-            rows=embed_size,
+            rows=[embed_size, embed_size, embed_size, embed_size],
             include_static_vectors=False,
             attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
         ),
@@ -44,7 +44,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
     tok2vec = build_Tok2Vec_model(
         MultiHashEmbed(
             width=width,
-            rows=embed_size,
+            rows=[embed_size] * 4,
             include_static_vectors=False,
             attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
         ),
@@ -61,8 +61,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
 @pytest.mark.parametrize(
     "width,embed_arch,embed_config,encode_arch,encode_config",
     [
-        (8, MultiHashEmbed, {"rows": 100, "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
-        (8, MultiHashEmbed, {"rows": 100, "attrs": {"ORTH": 1.0, "PREFIX": 0.2}, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
+        (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
+        (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
         (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
         (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
     ],
@@ -116,11 +116,11 @@ cfg_string = """
     @architectures = "spacy.Tok2Vec.v1"
 
     [components.tok2vec.model.embed]
-    @architectures = "spacy.MultiHashEmbed.v1"
+    @architectures = "spacy.MultiHashEmbed.v2"
     width = ${components.tok2vec.model.encode.width}
-    rows = 2000
-    also_embed_subwords = true
-    also_use_static_vectors = false
+    rows = [2000, 1000, 1000, 1000]
+    attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+    include_static_vectors = false
 
     [components.tok2vec.model.encode]
     @architectures = "spacy.MaxoutWindowEncoder.v1"
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index 3bd3b903d..d621be0ba 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -63,7 +63,7 @@ def get_tok2vec_kwargs():
     return {
         "embed": MultiHashEmbed(
             width=32,
-            rows=500,
+            rows=[500, 500, 500],
             attrs=["NORM", "PREFIX", "SHAPE"],
             include_static_vectors=False
         ),
@@ -80,7 +80,7 @@ def test_tok2vec():
 def test_multi_hash_embed():
     embed = MultiHashEmbed(
         width=32,
-        rows=500,
+        rows=[500, 500, 500],
         attrs=["NORM", "PREFIX", "SHAPE"],
         include_static_vectors=False
     )
@@ -95,8 +95,8 @@ def test_multi_hash_embed():
     # Now try with different row factors
     embed = MultiHashEmbed(
         width=32,
-        rows=500,
-        attrs={"NORM": 2.0, "PREFIX": 0.1, "SHAPE": 0.5},
+        rows=[1000, 50, 250],
+        attrs=["NORM", "PREFIX", "SHAPE"],
         include_static_vectors=False
     )
     hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]

From cdd2b79b6d2a87db04f59d478dfa0fd8c2d3abdb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 19:58:18 +0200
Subject: [PATCH 423/516] Remove deprecated MultiHashEmbed

---
 spacy/ml/models/tok2vec.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index f0e846bac..3a7da4a8e 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -98,20 +98,6 @@ def build_Tok2Vec_model(
 
 
 @registry.architectures.register("spacy.MultiHashEmbed.v1")
-def MultiHashEmbed_v1(
-    width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
-) -> Model[List[Doc], List[Floats2d]]:
-    """Previous interface for MultiHashEmbed. This should be removed, it's only
-    here as a temporary compatibility."""
-    return MultiHashEmbed(
-        width=width,
-        rows=[rows, rows//2, rows//2, rows//2] if also_embed_subwords else [rows],
-        attrs=[NORM, PREFIX, SUFFIX, SHAPE] if also_embed_subwords else [NORM],
-        include_static_vectors=also_use_static_vectors,
-    )
-
-
-@registry.architectures.register("spacy.MultiHashEmbed.v2")
 def MultiHashEmbed(
     width: int,
     attrs: List[Union[str, int]],

From db84d175c3e5d661f9358b6d8b85cd2fe9316392 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 19:59:30 +0200
Subject: [PATCH 424/516] Fix test

---
 spacy/tests/pipeline/test_tok2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index e86d97a54..90882ae3f 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -116,7 +116,7 @@ cfg_string = """
     @architectures = "spacy.Tok2Vec.v1"
 
     [components.tok2vec.model.embed]
-    @architectures = "spacy.MultiHashEmbed.v2"
+    @architectures = "spacy.MultiHashEmbed.v1"
     width = ${components.tok2vec.model.encode.width}
     rows = [2000, 1000, 1000, 1000]
     attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]

From d58fb4270748b9a4d96b077d69532af2ee7ded05 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 20:00:42 +0200
Subject: [PATCH 425/516] Add spacy_version option and validation for
 project.yml

---
 spacy/cli/_util.py | 21 ++++++++++++++++++++-
 spacy/schemas.py   |  1 +
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 676a7c8d7..373650172 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -16,7 +16,8 @@ import os
 
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import ENV_VARS
+from ..util import is_compatible_version, ENV_VARS
+from .. import about
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -142,6 +143,7 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
         msg.fail(invalid_err)
         print("\n".join(errors))
         sys.exit(1)
+    validate_project_version(config)
     validate_project_commands(config)
     # Make sure directories defined in config exist
     for subdir in config.get("directories", []):
@@ -167,6 +169,23 @@ def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
     return dict(interpolated["project"])
 
 
+def validate_project_version(config: Dict[str, Any]) -> None:
+    """If the project defines a compatible spaCy version range, check that it's
+    compatible with the current version of spaCy.
+
+    config (Dict[str, Any]): The loaded config.
+    """
+    spacy_version = config.get("spacy_version", None)
+    if spacy_version and not is_compatible_version(about.__version__, spacy_version):
+        err = (
+            f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
+            f"that's not compatible with the version of spaCy you're running "
+            f"({about.__version__}). You can edit version requirement in the "
+            f"{PROJECT_FILE} to load it, but the project may not run as expected."
+        )
+        msg.fail(err, exits=1)
+
+
 def validate_project_commands(config: Dict[str, Any]) -> None:
     """Check that project commands and workflows are valid, don't contain
     duplicates, don't clash  and only refer to commands that exist.
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 591b7e134..0d88d4090 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -448,6 +448,7 @@ class ProjectConfigSchema(BaseModel):
     workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
     commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
     title: Optional[str] = Field(None, title="Project title")
+    spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
     # fmt: on
 
     class Config:

From 582701519eb8454b60b138559a1f5c9e6684fbef Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 20:00:49 +0200
Subject: [PATCH 426/516] Remove __release__ flag

---
 spacy/about.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index dce627a38..9329b48e6 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,7 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
 __version__ = "3.0.0a33"
-__release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From e50047f1c5e9949894bbba0a3183295fc79f2f2b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 20:02:45 +0200
Subject: [PATCH 427/516] Check lengths match

---
 spacy/ml/models/tok2vec.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 3a7da4a8e..65d2bffbb 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -136,6 +136,8 @@ def MultiHashEmbed(
     include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
+    if len(rows) != len(attrs):
+        raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
     seed = 7
 
     def make_hash_embed(index):

From be99f1e4de604417bcee07602ae08178a23f6ede Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 20:11:16 +0200
Subject: [PATCH 428/516] Remove output dirs before training (#6204)

* Remove output dirs before training

* Re-raise error if cleaning fails
---
 spacy/errors.py        |  4 ++++
 spacy/training/loop.py | 32 +++++++++++++++++++++++++++-----
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 9d9a716d2..bf3628ce9 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,6 +456,10 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
+    E901 = ("Failed to remove existing output directory: {path}. If your "
+            "config and the components you train change between runs, a "
+            "non-empty output directory can lead to stale pipeline data. To "
+            "solve this, remove the existing directories in the output directory.")
     E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
             "Try checking whitespace and delimiters. See "
             "https://nightly.spacy.io/api/cli#convert")
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 0d4414964..67f61567e 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -3,19 +3,24 @@ from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
+from wasabi import Printer
 import random
-import wasabi
 import sys
+import shutil
 
 from .example import Example
 from ..schemas import ConfigSchemaTraining
 from ..errors import Errors
-from ..util import resolve_dot_names, registry
+from ..util import resolve_dot_names, registry, logger
 
 if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
 
 
+DIR_MODEL_BEST = "model-best"
+DIR_MODEL_LAST = "model-last"
+
+
 def train(
     nlp: "Language",
     output_path: Optional[Path] = None,
@@ -38,7 +43,7 @@ def train(
     RETURNS (Path / None): The path to the final exported model.
     """
     # We use no_print here so we can respect the stdout/stderr options.
-    msg = wasabi.Printer(no_print=True)
+    msg = Printer(no_print=True)
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
     if config["training"]["seed"] is not None:
@@ -69,6 +74,7 @@ def train(
         eval_frequency=T["eval_frequency"],
         exclude=frozen_components,
     )
+    clean_output_dir(output_path)
     stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
     if frozen_components:
         stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
@@ -83,7 +89,7 @@ def train(
                     update_meta(T, nlp, info)
                 with nlp.use_params(optimizer.averages):
                     nlp = before_to_disk(nlp)
-                    nlp.to_disk(output_path / "model-best")
+                    nlp.to_disk(output_path / DIR_MODEL_BEST)
     except Exception as e:
         if output_path is not None:
             # We don't want to swallow the traceback if we don't have a
@@ -100,7 +106,7 @@ def train(
     finally:
         finalize_logger()
         if output_path is not None:
-            final_model_path = output_path / "model-last"
+            final_model_path = output_path / DIR_MODEL_LAST
             if optimizer.averages:
                 with nlp.use_params(optimizer.averages):
                     nlp.to_disk(final_model_path)
@@ -305,3 +311,19 @@ def create_before_to_disk_callback(
         return modified_nlp
 
     return before_to_disk
+
+
+def clean_output_dir(path: Union[str, Path]) -> None:
+    """Remove an existing output directory. Typically used to ensure that
+    a directory like model-best and its contents aren't just being overwritten
+    by nlp.to_disk, which could preserve existing subdirectories (e.g.
+    components that don't exist anymore).
+    """
+    if path is not None and path.exists():
+        for subdir in [path / DIR_MODEL_BEST, path / DIR_MODEL_LAST]:
+            if subdir.exists():
+                try:
+                    shutil.rmtree(str(subdir))
+                    logger.debug(f"Removed existing output directory: {subdir}")
+                except Exception as e:
+                    raise IOError(Errors.E901.format(path=path)) from e

From b392d48e7667b95d820bf120dae4ab4a719af497 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 20:17:07 +0200
Subject: [PATCH 429/516] Fix test

---
 spacy/tests/test_models.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index d621be0ba..bad964786 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -6,7 +6,6 @@ from numpy.testing import assert_array_equal
 import numpy
 
 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
-from spacy.ml.models import MultiHashEmbed_v1
 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
 from spacy.lang.en import English
 from spacy.lang.en.examples import sentences as EN_SENTENCES

From 919790cb47b408c827e4cb40a1c6d3343fe0a28f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 20:28:21 +0200
Subject: [PATCH 430/516] Upd MultiHashEmbed docs

---
 website/docs/api/architectures.md | 51 +++++++++++++++++++++++--------
 1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 5cee45ba5..cea390bb1 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -136,25 +136,50 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 > [model]
 > @architectures = "spacy.MultiHashEmbed.v1"
 > width = 64
-> rows = 2000
-> also_embed_subwords = false
-> also_use_static_vectors = false
+> attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+> rows = [2000, 1000, 1000, 1000]
+> include_static_vectors = true
 > ```
 
 Construct an embedding layer that separately embeds a number of lexical
-attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build mixed representations. The features used are
-the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a
-[FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static
-vectors can also be incorporated into the concatenated representation.
+attributes using hash embedding, concatenates the results, and passes it
+through a feed-forward subnetwork to build mixed representations.
+
+The features used can be configured with the 'attrs' argument. The suggested
+attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
+account some subword information, without constructing a fully character-based
+representation. If pretrained vectors are available, they can be included in
+the representation as well, with the vectors table kept static
+(i.e. it's not updated).
+
+The `width` parameter specifies the output width of the layer and the widths
+of all embedding tables. If static vectors are included, a learned linear
+layer is used to map the vectors to the specified width before concatenating
+it with the other embedding outputs. A single Maxout layer is then used to
+reduce the concatenated vectors to the final width.
+    
+The `rows` parameter controls the number of rows used by the `HashEmbed`
+tables. The HashEmbed layer needs surprisingly few rows, due to its use of
+the hashing trick. Generally between 2000 and 10000 rows is sufficient,
+even for very large vocabularies. A number of rows must be specified for each
+table, so the `rows` list must be of the same length as the `attrs` parameter.
+
+    attrs (list of attr IDs): The token attributes to embed. A separate
+        embedding table will be constructed for each attribute.
+    rows (List[int]): The number of rows in the embedding tables. Must have the
+        same length as attrs.
+    include_static_vectors (bool): Whether to also use static word vectors.
+        Requires a vectors table to be loaded in the Doc objects' vocab.
+
 
 | Name                      | Description                                                                                                                                                                                                       |
 | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `width`                   | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~                                                                                          |
-| `rows`                    | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. ~~int~~ |
-| `also_embed_subwords`     | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. ~~bool~~       |
-| `also_use_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~                                                                                    |
-| **CREATES**               | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                            |
+| `width`                   | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ |
+| `attrs`                   | The token attributes to embed. A separate embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ |
+| `rows`                    | The number of rows for each embedding table. Can be low, due to the hashing trick. Recommended values are between `1000` and `10000`. ~~List[int]~~ |
+| `include_static_vectors`  | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ |
+| **CREATES**               | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
+
 
 ### spacy.CharacterEmbed.v1 {#CharacterEmbed}
 

From 0135f6ed95de6cc2bd7639f491f7a43c4e693116 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 20:51:15 +0200
Subject: [PATCH 431/516] Enable commit check via env var

---
 spacy/cli/project/remote_storage.py |  6 ++++--
 spacy/cli/project/run.py            |  6 ++++--
 spacy/util.py                       | 14 ++++++++++++++
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py
index 7e2caa8d7..6056458e2 100644
--- a/spacy/cli/project/remote_storage.py
+++ b/spacy/cli/project/remote_storage.py
@@ -7,7 +7,8 @@ import tarfile
 from pathlib import Path
 
 from .._util import get_hash, get_checksum, download_file, ensure_pathy
-from ...util import make_tempdir, get_minor_version
+from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
+from ...git_info import GIT_VERSION
 from ... import about
 
 if TYPE_CHECKING:
@@ -130,7 +131,8 @@ def get_command_hash(
     currently installed packages, whatever environment variables have been marked
     as relevant, and the command.
     """
-    spacy_v = get_minor_version(about.__version__)
+    check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
+    spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__)
     dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
     hashes = [spacy_v, site_hash, env_hash] + dep_checksums
     hashes.extend(cmd)
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index ea4675d60..1a9b447ea 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -7,7 +7,8 @@ import srsly
 from ... import about
 from ...git_info import GIT_VERSION
 from ...util import working_dir, run_command, split_command, is_cwd, join_command
-from ...util import SimpleFrozenList, is_minor_version_match
+from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
+from ...util import check_bool_env_var
 from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
 from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
 
@@ -64,9 +65,10 @@ def project_run(
                 err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                 err_kwargs = {"exits": 1} if not dry else {}
                 msg.fail(err, err_help, **err_kwargs)
+        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
         with working_dir(project_dir) as current_dir:
             msg.divider(subcommand)
-            rerun = check_rerun(current_dir, cmd)
+            rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
             if not rerun and not force:
                 msg.info(f"Skipping '{cmd['name']}': nothing changed")
             else:
diff --git a/spacy/util.py b/spacy/util.py
index 4b2cb018a..aa321b22f 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -73,6 +73,7 @@ logger = logging.getLogger("spacy")
 
 class ENV_VARS:
     CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
+    PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION"
 
 
 class registry(thinc.registry):
@@ -1342,3 +1343,16 @@ def is_cython_func(func: Callable) -> bool:
         cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
         return hasattr(cls_func, attr)
     return False
+
+
+def check_bool_env_var(env_var: str) -> bool:
+    """Convert the value of an environment variable to a boolean. An unset
+    variable, an empty string and "0" are falsy; any other value is truthy.
+
+    env_var (str): The name of the environment variable to check.
+    RETURNS (bool): Its boolean value.
+    """
+    value = os.environ.get(env_var, False)
+    if value == "0":
+        return False
+    return bool(value)

From 706b7f6973e4f62622bf96370016b25878ec950f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 20:51:22 +0200
Subject: [PATCH 432/516] Update docs

---
 website/docs/usage/projects.md | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 6d5746308..5fced922d 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -216,15 +216,16 @@ pipelines.
 %%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
 ```
 
-| Section       | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
-| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `title`       | An optional project title used in `--help` message and [auto-generated docs](#custom-docs).                                                                                                                                                                                                                                                                                                                                                                                                                  |
-| `description` | An optional project description used in [auto-generated docs](#custom-docs).                                                                                                                                                                                                                                                                                                                                                                                                                                 |
-| `vars`        | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`.                                                                                                                                                |
-| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist.                                                                                                                                                                                                                                                                                                                 |
-| `assets`      | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo.                                                                        |
-| `workflows`   | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command.                                                                                                                                                                                                                                                                                                                                         |
-| `commands`    | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
+| Section         | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `title`         | An optional project title used in `--help` message and [auto-generated docs](#custom-docs).                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| `description`   | An optional project description used in [auto-generated docs](#custom-docs).                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+| `vars`          | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`.                                                                                                                                                |
+| `directories`   | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist.                                                                                                                                                                                                                                                                                                                 |
+| `assets`        | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo.                                                                        |
+| `workflows`     | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command.                                                                                                                                                                                                                                                                                                                                         |
+| `commands`      | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
+| `spacy_version` | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. If it's loaded with an incompatible version, an error is raised when the project is loaded.                                                                                                                                                                                                                                                                                                                          |
 
 ### Data assets {#data-assets}
 

From 9aa07ad0018cb1e912aeeb97b9a0bde0ead7edfb Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 21:05:41 +0200
Subject: [PATCH 433/516] Update quickstarts [ci skip]

---
 website/src/styles/quickstart.module.sass | 11 ++++--
 website/src/widgets/quickstart-install.js | 47 +++++++++++++++++++----
 website/src/widgets/quickstart-models.js  | 22 +++++++----
 3 files changed, 62 insertions(+), 18 deletions(-)

diff --git a/website/src/styles/quickstart.module.sass b/website/src/styles/quickstart.module.sass
index a08d6bcb6..8ad106a78 100644
--- a/website/src/styles/quickstart.module.sass
+++ b/website/src/styles/quickstart.module.sass
@@ -38,7 +38,7 @@
     cursor: pointer
     display: inline-block
     padding: 0.35rem 0.5rem 0.25rem 0
-    margin: 0 1rem 0.75rem 0
+    margin: 0 1rem 0.5rem 0
     font-size: var(--font-size-xs)
     font-weight: bold
 
@@ -73,16 +73,19 @@
         background: var(--color-theme)
 
     .checkbox + &:before
+        $size: 18px
         content: ""
         display: inline-block
-        width: 20px
-        height: 20px
+        width: $size
+        height: $size
         border: 1px solid var(--color-subtle)
         vertical-align: middle
         margin-right: 0.5rem
         cursor: pointer
-        border-radius: var(--border-radius)
+        border-radius: $size / 4
         background: var(--color-back)
+        position: relative
+        top: -1px
 
     .checkbox:checked + &:before
         // Embed "check" icon here for simplicity
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index 741973945..a8bdf21dc 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -4,6 +4,8 @@ import { StaticQuery, graphql } from 'gatsby'
 import { Quickstart, QS } from '../components/quickstart'
 import { repo } from '../components/util'
 
+const DEFAULT_MODELS = ['en']
+const DEFAULT_OPT = 'efficiency'
 const DEFAULT_HARDWARE = 'cpu'
 const DEFAULT_CUDA = 'cuda100'
 const CUDA = {
@@ -68,9 +70,13 @@ const QuickstartInstall = ({ id, title }) => {
     const [train, setTrain] = useState(false)
     const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
     const [cuda, setCuda] = useState(DEFAULT_CUDA)
+    const [selectedModels, setModels] = useState(DEFAULT_MODELS)
+    const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
     const setters = {
         hardware: v => (Array.isArray(v) ? setHardware(v[0]) : setCuda(v)),
         config: v => setTrain(v.includes('train')),
+        models: setModels,
+        optimize: v => setEfficiency(v.includes('efficiency')),
     }
     const showDropdown = {
         hardware: () => hardware === 'gpu',
@@ -89,13 +95,37 @@ const QuickstartInstall = ({ id, title }) => {
                     ...DATA,
                     {
                         id: 'models',
-                        title: 'Trained Pipelines',
+                        title: 'Trained pipelines',
                         multiple: true,
                         options: models
                             .sort((a, b) => a.name.localeCompare(b.name))
-                            .map(({ code, name }) => ({ id: code, title: name })),
+                            .map(({ code, name }) => ({
+                                id: code,
+                                title: name,
+                                checked: DEFAULT_MODELS.includes(code),
+                            })),
                     },
                 ]
+                if (selectedModels.length) {
+                    data.push({
+                        id: 'optimize',
+                        title: 'Select pipeline for',
+                        options: [
+                            {
+                                id: 'efficiency',
+                                title: 'efficiency',
+                                checked: DEFAULT_OPT === 'efficiency',
+                                help: 'Faster and smaller pipeline, but less accurate',
+                            },
+                            {
+                                id: 'accuracy',
+                                title: 'accuracy',
+                                checked: DEFAULT_OPT === 'accuracy',
+                                help: 'Larger and slower pipeline, but more accurate',
+                            },
+                        ],
+                    })
+                }
                 return (
                     <Quickstart
                         data={data}
@@ -149,11 +179,14 @@ const QuickstartInstall = ({ id, title }) => {
                             conda install -c conda-forge spacy-lookups-data
                         </QS>
 
-                        {models.map(({ code, models: modelOptions }) => (
-                            <QS models={code} key={code}>
-                                python -m spacy download {modelOptions[0]}
-                            </QS>
-                        ))}
+                        {models.map(({ code, models: modelOptions }) => {
+                            const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]
+                            return (
+                                <QS models={code} key={code}>
+                                    python -m spacy download {pkg}
+                                </QS>
+                            )
+                        })}
                     </Quickstart>
                 )
             }}
diff --git a/website/src/widgets/quickstart-models.js b/website/src/widgets/quickstart-models.js
index ffd1b3df9..5f94c60cb 100644
--- a/website/src/widgets/quickstart-models.js
+++ b/website/src/widgets/quickstart-models.js
@@ -31,25 +31,33 @@ const data = [
     },
     {
         id: 'optimize',
-        title: 'Optimize for',
-        help:
-            'Optimize for efficiency (faster & smaller model) or higher accuracy (larger & slower model)',
+        title: 'Select for',
         options: [
-            { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
-            { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
+            {
+                id: 'efficiency',
+                title: 'efficiency',
+                checked: DEFAULT_OPT === 'efficiency',
+                help: 'Faster and smaller pipeline, but less accurate',
+            },
+            {
+                id: 'accuracy',
+                title: 'accuracy',
+                checked: DEFAULT_OPT === 'accuracy',
+                help: 'Larger and slower pipeline, but more accurate',
+            },
         ],
     },
     {
         id: 'config',
         title: 'Options',
         multiple: true,
-        options: [{ id: 'example', title: 'Show usage example' }],
+        options: [{ id: 'example', title: 'Show text example' }],
     },
 ]
 
 const QuickstartInstall = ({ id, title, description, children }) => {
     const [lang, setLang] = useState(DEFAULT_LANG)
-    const [efficiency, setEfficiency] = useState(DEFAULT_OPT)
+    const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
     const setters = {
         lang: setLang,
         optimize: v => setEfficiency(v.includes('efficiency')),

From 91d0fbb58821fcecf4b4af3d2bb32d12b490c565 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 21:13:53 +0200
Subject: [PATCH 434/516] Fix test

---
 spacy/tests/serialize/test_serialize_config.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index da048f3d6..8b3f5c2b8 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -89,9 +89,9 @@ def my_parser():
     tok2vec = build_Tok2Vec_model(
         MultiHashEmbed(
             width=321,
-            rows=5432,
-            also_embed_subwords=True,
-            also_use_static_vectors=False,
+            attrs=["LOWER", "SHAPE"],
+            rows=[5432, 5432],
+            include_static_vectors=False,
         ),
         MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
     )

From ff8b9807750e045f40c9a40208eba8c575c714cc Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 21:19:41 +0200
Subject: [PATCH 435/516] Upd quickstart template

---
 spacy/cli/templates/quickstart_training.jinja | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 3bd237b0a..c3419e67d 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -171,8 +171,13 @@ factory = "tok2vec"
 [components.tok2vec.model.embed]
 @architectures = "spacy.MultiHashEmbed.v1"
 width = ${components.tok2vec.model.encode.width}
-rows = {{ 2000 if optimize == "efficiency" else 7000 }}
-also_embed_subwords = {{ "true" if has_letters else "false" }}
+{% if has_letters -%}
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+rows = [5000, 2500, 2500, 2500]
+{% else -%}
+attrs = ["ORTH", "SHAPE"]
+rows = [5000, 2500]
+{% endif -%}
 also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
 
 [components.tok2vec.model.encode]

From b7e01d20246efbeeb1c6f9babbb08ac965a45582 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 5 Oct 2020 21:21:30 +0200
Subject: [PATCH 436/516] Fix quickstart

---
 spacy/cli/templates/quickstart_training.jinja | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index c3419e67d..d92de9c15 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -178,7 +178,7 @@ rows = [5000, 2500, 2500, 2500]
 attrs = ["ORTH", "SHAPE"]
 rows = [5000, 2500]
 {% endif -%}
-also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
+include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
 
 [components.tok2vec.model.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v1"

From 4cf73d85bc86e2b31a517437ef68ed8dd87f5038 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 21:37:09 +0200
Subject: [PATCH 437/516] Add [zh] to extras [ci skip]

---
 setup.cfg | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/setup.cfg b/setup.cfg
index d8362c4bd..e77bda2fc 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -92,6 +92,8 @@ ko =
     natto-py==0.9.0
 th =
     pythainlp>=2.0
+zh =
+    spacy-pkuseg==0.0.26
 
 [bdist_wheel]
 universal = false

From 2d0c0134bcaa2527a40d13e62be594bf05ac389b Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 21:38:23 +0200
Subject: [PATCH 438/516] Adjust message [ci skip]

---
 spacy/lang/zh/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index f9065f92c..ed988c1ba 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -206,7 +206,7 @@ class ChineseTokenizer(DummyTokenizer):
                     import spacy_pkuseg
                 except ImportError:
                     raise ImportError(
-                        "spacy_pkuseg not installed. To use this model, "
+                        "spacy-pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG
                     ) from None
                 self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
@@ -256,7 +256,7 @@ class ChineseTokenizer(DummyTokenizer):
             except ImportError:
                 if self.segmenter == Segmenter.pkuseg:
                     raise ImportError(
-                        "spacy_pkuseg not installed. To use this model, "
+                        "spacy-pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG
                     ) from None
             if path.exists():
@@ -317,7 +317,7 @@ def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
         import spacy_pkuseg
 
     except ImportError:
-        msg = "spacy_pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
+        msg = "spacy-pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
         raise ImportError(msg) from None
     try:
         return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)

From 8a39d5414e536d3ff5c3cde1fae71f604d1b3762 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 21:43:51 +0200
Subject: [PATCH 439/516] Update quickstart [ci skip]

---
 website/src/widgets/quickstart-install.js | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index a8bdf21dc..ab91b8e30 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -17,6 +17,7 @@ const CUDA = {
     '10.1': 'cuda101',
     '10.2': 'cuda102',
 }
+const LANG_EXTRAS = ['zh', 'ja'] // only for languages with models
 const DATA = [
     {
         id: 'os',
@@ -81,7 +82,13 @@ const QuickstartInstall = ({ id, title }) => {
     const showDropdown = {
         hardware: () => hardware === 'gpu',
     }
-    const pipExtras = [hardware === 'gpu' && cuda, train && 'transformers', train && 'lookups']
+    const modelExtras = train ? selectedModels.filter(m => LANG_EXTRAS.includes(m)) : []
+    const pipExtras = [
+        hardware === 'gpu' && cuda,
+        train && 'transformers',
+        train && 'lookups',
+        ...modelExtras,
+    ]
         .filter(e => e)
         .join(',')
     return (

From 9614e53b02749e8fec394c0f8a7f965a392918d2 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 21:55:18 +0200
Subject: [PATCH 440/516] Tidy up and auto-format

---
 spacy/ml/models/tok2vec.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 65d2bffbb..61edb86c4 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -1,4 +1,4 @@
-from typing import Optional, List, Union, Dict
+from typing import Optional, List, Union
 from thinc.types import Floats2d
 from thinc.api import chain, clone, concatenate, with_array, with_padded
 from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
@@ -11,7 +11,7 @@ from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, NORM, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
+from ...attrs import intify_attr
 
 
 @registry.architectures.register("spacy.Tok2VecListener.v1")
@@ -29,7 +29,7 @@ def build_hash_embed_cnn_tok2vec(
     window_size: int,
     maxout_pieces: int,
     subword_features: bool,
-    pretrained_vectors: Optional[bool]
+    pretrained_vectors: Optional[bool],
 ) -> Model[List[Doc], List[Floats2d]]:
     """Build spaCy's 'standard' tok2vec layer, which uses hash embedding
     with subword features and a CNN with layer-normalized maxout.
@@ -56,7 +56,7 @@ def build_hash_embed_cnn_tok2vec(
     """
     if subword_features:
         attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
-        row_sizes = [embed_size, embed_size//2, embed_size//2, embed_size//2]
+        row_sizes = [embed_size, embed_size // 2, embed_size // 2, embed_size // 2]
     else:
         attrs = ["NORM"]
         row_sizes = [embed_size]
@@ -120,7 +120,7 @@ def MultiHashEmbed(
     layer is used to map the vectors to the specified width before concatenating
     it with the other embedding outputs. A single Maxout layer is then used to
     reduce the concatenated vectors to the final width.
-    
+
     The `rows` parameter controls the number of rows used by the `HashEmbed`
     tables. The HashEmbed layer needs surprisingly few rows, due to its use of
     the hashing trick. Generally between 2000 and 10000 rows is sufficient,
@@ -143,13 +143,7 @@ def MultiHashEmbed(
     def make_hash_embed(index):
         nonlocal seed
         seed += 1
-        return HashEmbed(
-            width,
-            rows[index],
-            column=index,
-            seed=seed,
-            dropout=0.0,
-        )
+        return HashEmbed(width, rows[index], column=index, seed=seed, dropout=0.0)
 
     embeddings = [make_hash_embed(i) for i in range(len(attrs))]
     concat_size = width * (len(embeddings) + include_static_vectors)

From 1a554bdcb14f7409bf4111092962b0f9ba0000c4 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 21:55:27 +0200
Subject: [PATCH 441/516] Update docs and docstring [ci skip]

---
 spacy/ml/models/tok2vec.py        |  4 +--
 website/docs/api/architectures.md | 52 +++++++++----------------------
 2 files changed, 17 insertions(+), 39 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 61edb86c4..23cfe883b 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -110,12 +110,12 @@ def MultiHashEmbed(
 
     The features used can be configured with the 'attrs' argument. The suggested
     attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
-    account some subword information, without contruction a fully character-based
+    account some subword information, without constructing a fully character-based
     representation. If pretrained vectors are available, they can be included in
     the representation as well, with the vectors table will be kept static
     (i.e. it's not updated).
 
-    The `width` parameter specifices the output width of the layer and the widths
+    The `width` parameter specifies the output width of the layer and the widths
     of all embedding tables. If static vectors are included, a learned linear
     layer is used to map the vectors to the specified width before concatenating
     it with the other embedding outputs. A single Maxout layer is then used to
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index cea390bb1..5246a3ed6 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -142,44 +142,22 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 > ```
 
 Construct an embedding layer that separately embeds a number of lexical
-attributes using hash embedding, concatenates the results, and passes it
-through a feed-forward subnetwork to build a mixed representations.
+attributes using hash embedding, concatenates the results, and passes it through
+a feed-forward subnetwork to build a mixed representation. The features used
+can be configured with the `attrs` argument. The suggested attributes are
+`NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
+some subword information, without constructing a fully character-based
+representation. If pretrained vectors are available, they can be included in the
+representation as well, with the vectors table kept static (i.e. it's
+not updated).
 
-The features used can be configured with the 'attrs' argument. The suggested
-attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
-account some subword information, without contruction a fully character-based
-representation. If pretrained vectors are available, they can be included in
-the representation as well, with the vectors table will be kept static
-(i.e. it's not updated).
-
-The `width` parameter specifices the output width of the layer and the widths
-of all embedding tables. If static vectors are included, a learned linear
-layer is used to map the vectors to the specified width before concatenating
-it with the other embedding outputs. A single Maxout layer is then used to
-reduce the concatenated vectors to the final width.
-    
-The `rows` parameter controls the number of rows used by the `HashEmbed`
-tables. The HashEmbed layer needs surprisingly few rows, due to its use of
-the hashing trick. Generally between 2000 and 10000 rows is sufficient,
-even for very large vocabularies. A number of rows must be specified for each
-table, so the `rows` list must be of the same length as the `attrs` parameter.
-
-    attrs (list of attr IDs): The token attributes to embed. A separate
-        embedding table will be constructed for each attribute.
-    rows (List[int]): The number of rows in the embedding tables. Must have the
-        same length as attrs.
-    include_static_vectors (bool): Whether to also use static word vectors.
-        Requires a vectors table to be loaded in the Doc objects' vocab.
-
-
-| Name                      | Description                                                                                                                                                                                                       |
-| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `width`                   | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ |
-| `attrs`                   | The token attributes to embed. A separate |
-embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ |
-| `rows`                    | The number of rows for each embedding tables. Can be low, due to the hashing trick. Recommended values are between `1000` and `10000`. ~~List[int]~~ |
-| `include_static_vectors`  | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ |
-| **CREATES**               | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
+| Name                     | Description                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `width`                  | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~                                                              |
+| `attrs`                  | The token attributes to embed. A separate embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~                                                                                                                                                                                                                                                                                                                        |
+| `rows`                   | The number of rows for each embedding tables. Can be low, due to the hashing trick. Recommended values are between `1000` and `10000`. The layer needs surprisingly few rows, due to its use of the hashing trick. Generally between 2000 and 10000 rows is sufficient, even for very large vocabularies. A number of rows must be specified for each table, so the `rows` list must be of the same length as the `attrs` parameter. ~~List[int]~~ |
+| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~                                                                                                                                                                                                                                                                                                                   |
+| **CREATES**              | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                                                                                                                             |
 
 ### spacy.CharacterEmbed.v1 {#CharacterEmbed}
 

From 126268ce50d08d38aefa15e7925632c156c792d4 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 5 Oct 2020 21:58:18 +0200
Subject: [PATCH 442/516] Auto-format [ci skip]

---
 spacy/lang/uk/__init__.py                 |  4 +++-
 spacy/lang/zh/__init__.py                 |  6 ++----
 spacy/tests/doc/test_retokenize_split.py  | 10 +++++++---
 spacy/tests/pipeline/test_pipe_methods.py |  2 +-
 spacy/tests/test_models.py                |  8 ++++----
 spacy/training/augment.py                 |  2 +-
 6 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index 0abe9170e..24c88e5a7 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -26,7 +26,9 @@ class Ukrainian(Language):
     default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False,):
+def make_lemmatizer(
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
+):
     return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 
 
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index ed988c1ba..30560ed0d 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -54,9 +54,7 @@ def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
 
 
 class ChineseTokenizer(DummyTokenizer):
-    def __init__(
-        self, nlp: Language, segmenter: Segmenter = Segmenter.char,
-    ):
+    def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
         self.vocab = nlp.vocab
         if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
@@ -87,7 +85,7 @@ class ChineseTokenizer(DummyTokenizer):
             if pkuseg_user_dict is None:
                 pkuseg_user_dict = pkuseg_model
             self.pkuseg_seg = try_pkuseg_import(
-                pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
+                pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict
             )
 
     def __call__(self, text: str) -> Doc:
diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index da4a46a47..30f945165 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -209,9 +209,13 @@ def test_doc_retokenizer_split_norm(en_vocab):
     # Retokenize to split out the words in the token at doc[2].
     token = doc[2]
     with doc.retokenize() as retokenizer:
-      retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"], heads=[(token, idx) for idx in range(5)])
+        retokenizer.split(
+            token,
+            ["brown", "fox", "jumps", "over", "the"],
+            heads=[(token, idx) for idx in range(5)],
+        )
 
-    assert doc[9].text  == "w/"
+    assert doc[9].text == "w/"
     assert doc[9].norm_ == "with"
-    assert doc[5].text  == "over"
+    assert doc[5].text == "over"
     assert doc[5].norm_ == "over"
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index a4297a1d1..4b96992e1 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -350,7 +350,7 @@ def test_pipe_methods_frozen():
 
 
 @pytest.mark.parametrize(
-    "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"],
+    "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"]
 )
 def test_pipe_label_data_exports_labels(pipe):
     nlp = Language()
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index bad964786..17408f7e8 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -64,7 +64,7 @@ def get_tok2vec_kwargs():
             width=32,
             rows=[500, 500, 500],
             attrs=["NORM", "PREFIX", "SHAPE"],
-            include_static_vectors=False
+            include_static_vectors=False,
         ),
         "encode": MaxoutWindowEncoder(
             width=32, depth=2, maxout_pieces=2, window_size=1
@@ -81,7 +81,7 @@ def test_multi_hash_embed():
         width=32,
         rows=[500, 500, 500],
         attrs=["NORM", "PREFIX", "SHAPE"],
-        include_static_vectors=False
+        include_static_vectors=False,
     )
     hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
     assert len(hash_embeds) == 3
@@ -96,11 +96,11 @@ def test_multi_hash_embed():
         width=32,
         rows=[1000, 50, 250],
         attrs=["NORM", "PREFIX", "SHAPE"],
-        include_static_vectors=False
+        include_static_vectors=False,
     )
     hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
     assert [he.get_dim("nV") for he in hash_embeds] == [1000, 50, 250]
- 
+
 
 @pytest.mark.parametrize(
     "seed,model_func,kwargs",
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index e76ee49f7..13ae45bd2 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -64,7 +64,7 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
 
 
 def lower_casing_augmenter(
-    nlp: "Language", example: Example, *, level: float,
+    nlp: "Language", example: Example, *, level: float
 ) -> Iterator[Example]:
     if random.random() >= level:
         yield example

From ff9ac39c88d8eac8e599041a63a69ff754690f5a Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 5 Oct 2020 22:50:14 +0200
Subject: [PATCH 443/516] read entity_ruler patterns with srsly.read_jsonl.v1

---
 spacy/language.py                           |  4 +++-
 spacy/pipeline/entityruler.py               |  9 ++++---
 spacy/pipeline/transition_parser.pyx        |  4 +++-
 spacy/schemas.py                            |  2 +-
 spacy/tests/pipeline/test_attributeruler.py |  2 +-
 spacy/tests/pipeline/test_entity_ruler.py   | 26 +++++++++++++++++++++
 website/docs/api/entityruler.md             |  9 ++++---
 7 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index c3c49d331..ba244617e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1410,7 +1410,9 @@ class Language:
             kwargs = component_cfg.get(name, {})
             # Allow component_cfg to overwrite the top-level kwargs.
             kwargs.setdefault("batch_size", batch_size)
-            if hasattr(proc, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if hasattr(proc, "pipe") and hasattr(proc, "is_trainable") and proc.is_trainable():
                 f = functools.partial(proc.pipe, **kwargs)
             else:
                 # Apply the function, but yield the doc
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index cad6dbdbc..6ca586d05 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -1,4 +1,4 @@
-from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable
+from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
 from collections import defaultdict
 from pathlib import Path
 import srsly
@@ -190,19 +190,18 @@ class EntityRuler(Pipe):
         get_examples: Callable[[], Iterable[Example]],
         *,
         nlp: Optional[Language] = None,
-        patterns_path: Optional[Path] = None
+        patterns: Optional[Sequence[PatternType]] = None,
     ):
         """Initialize the pipe for training.
 
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
         nlp (Language): The current nlp object the component is part of.
-        patterns_path: Path to serialized patterns.
+        patterns (Optional[Sequence[PatternType]]): The list of patterns.
 
         DOCS: https://nightly.spacy.io/api/entityruler#initialize
         """
-        if patterns_path:
-            patterns = srsly.read_jsonl(patterns_path)
+        if patterns:
             self.add_patterns(patterns)
 
 
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 2ad0acd3a..3b4406757 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -437,7 +437,9 @@ cdef class Parser(Pipe):
             for name, component in nlp.pipeline:
                 if component is self:
                     break
-                if hasattr(component, "pipe"):
+                # non-trainable components may have a pipe() implementation that refers to dummy
+                # predict and set_annotations methods
+                if hasattr(component, "pipe") and hasattr(component, "is_trainable") and component.is_trainable():
                     doc_sample = list(component.pipe(doc_sample, batch_size=8))
                 else:
                     doc_sample = [component(doc) for doc in doc_sample]
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 591b7e134..f4d306fd7 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -119,7 +119,7 @@ def validate_init_settings(
     if types don't match or required values are missing.
 
     func (Callable): The initialize method of a given component etc.
-    settings (Dict[str, Any]): The settings from the repsective [initialize] block.
+    settings (Dict[str, Any]): The settings from the respective [initialize] block.
     section (str): Initialize section, for error message.
     name (str): Name of the block in the section.
     exclude (Iterable[str]): Parameter names to exclude from schema.
diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py
index c967bcdcd..fedeb192f 100644
--- a/spacy/tests/pipeline/test_attributeruler.py
+++ b/spacy/tests/pipeline/test_attributeruler.py
@@ -121,7 +121,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     assert doc.has_annotation("LEMMA")
     assert doc.has_annotation("MORPH")
     nlp.remove_pipe("attribute_ruler")
-    # initialize with patterns from asset
+    # initialize with patterns from misc registry
     nlp.config["initialize"]["components"]["attribute_ruler"] = {
         "patterns": {"@misc": "attribute_ruler_patterns"}
     }
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index d70d0326e..96deab24b 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -1,4 +1,6 @@
 import pytest
+
+from spacy import registry
 from spacy.tokens import Span
 from spacy.language import Language
 from spacy.pipeline import EntityRuler
@@ -11,6 +13,7 @@ def nlp():
 
 
 @pytest.fixture
+@registry.misc("entity_ruler_patterns")
 def patterns():
     return [
         {"label": "HELLO", "pattern": "hello world"},
@@ -42,6 +45,29 @@ def test_entity_ruler_init(nlp, patterns):
     assert doc.ents[1].label_ == "BYE"
 
 
+def test_entity_ruler_init_patterns(nlp, patterns):
+    # initialize with patterns
+    ruler = nlp.add_pipe("entity_ruler")
+    assert len(ruler.labels) == 0
+    ruler.initialize(lambda: [], patterns=patterns)
+    assert len(ruler.labels) == 4
+    doc = nlp("hello world bye bye")
+    assert doc.ents[0].label_ == "HELLO"
+    assert doc.ents[1].label_ == "BYE"
+    nlp.remove_pipe("entity_ruler")
+    # initialize with patterns from misc registry
+    nlp.config["initialize"]["components"]["entity_ruler"] = {
+        "patterns": {"@misc": "entity_ruler_patterns"}
+    }
+    ruler = nlp.add_pipe("entity_ruler")
+    assert len(ruler.labels) == 0
+    nlp.initialize()
+    assert len(ruler.labels) == 4
+    doc = nlp("hello world bye bye")
+    assert doc.ents[0].label_ == "HELLO"
+    assert doc.ents[1].label_ == "BYE"
+
+
 def test_entity_ruler_existing(nlp, patterns):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md
index 052047635..b8aab2f50 100644
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@@ -82,13 +82,16 @@ Initialize the component with patterns from a file.
 >
 > ```python
 > entity_ruler = nlp.add_pipe("entity_ruler")
-> entity_ruler.initialize(lambda: [], nlp=nlp, patterns_path=patterns_path)
+> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns)
 > ```
 >
 > ```ini
 > ### config.cfg
 > [initialize.components.entity_ruler]
-> patterns_path = "data/patterns/patterns.jsonl"
+>
+> [initialize.components.entity_ruler.patterns]
+> @readers = "srsly.read_jsonl.v1"
+> path = "corpus/entity_ruler_patterns.jsonl"
 > ```
 
 | Name           | Description                                                                                                                                                          |
@@ -96,7 +99,7 @@ Initialize the component with patterns from a file.
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                                                      |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                 |
-| `labels`       | Path to the .json file holding the serialized patterns. ~~Path~~                                                                                                     |
+| `patterns`     | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~                                                        |
 
 ## EntityRuler.\_\len\_\_ {#len tag="method"}
 

From fd0f60e2bc6046454dd5624b71aaebf21364e76e Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 6 Oct 2020 09:28:53 +0200
Subject: [PATCH 444/516] updates to data format for training and pretraining

---
 website/docs/api/data-formats.md | 58 ++++++++++++++++----------------
 website/docs/usage/training.md   |  2 +-
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index c1b9bfef4..a97dcd2f6 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -180,24 +180,24 @@ single corpus once and then divide it up into `train` and `dev` partitions.
 This section defines settings and controls for the training and evaluation
 process that are used when you run [`spacy train`](/api/cli#train).
 
-| Name                  | Description                                                                                                                                                                                                                               |
-| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                                                    |
-| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                              |
-| `before_to_disk`      | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
-| `dev_corpus`          | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~                                                                                                                                           |
-| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                                                            |
-| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                 |
-| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                           |
-| `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                                         |
-| `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                                                           |
-| `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                                                 |
-| `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                   |
-| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~                                                                                                                                           |
-| `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~                                                                       |
-| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                             |
-| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                           |
-| `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                                                       |
+| Name                  | Description                                                                                                                                                                                                                                                                                                                         |
+| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                                                                                                                                              |
+| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                                                                                                                        |
+| `before_to_disk`      | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~                                                                                           |
+| `dev_corpus`          | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~                                                                                                                                                                                                                                     |
+| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                                                                                                                                                      |
+| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                                                                                                           |
+| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                                                                                                      |
+| `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                                                                                                                                   |
+| `logger`              | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]~~   |
+| `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                                                                                                                                                     |
+| `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                                                                                                                                           |
+| `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                                                                                                             |
+| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~                                                                                                                                                                                                                                     |
+| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                                                                                                                       |
+| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                                                                                                                     |
+| `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                                                                                                                                                 |
 
 ### pretraining {#config-pretraining tag="section,optional"}
 
@@ -205,17 +205,17 @@ This section is optional and defines settings and controls for
 [language model pretraining](/usage/embeddings-transformers#pretraining). It's
 used when you run [`spacy pretrain`](/api/cli#pretrain).
 
-| Name           | Description                                                                                            |
-| -------------- | ------------------------------------------------------------------------------------------------------ |
-| `max_epochs`   | Maximum number of epochs. Defaults to `1000`. ~~int~~                                                  |
-| `dropout`      | The dropout rate. Defaults to `0.2`. ~~float~~                                                         |
-| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~                                                |
-| `objective`    | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
-| `optimizer`    | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~          |
-| `corpus`       | Dot notation of the config location defining the train corpus. Defaults to `corpora.pretrain`. ~~str~~ |
-| `batcher`      | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                      |
-| `component`    | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~                              |
-| `layer`        | The layer to pretrain. If empty, the whole component model will be used. ~~str~~                       |
+| Name           | Description                                                                                                                                                                                                  |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `max_epochs`   | Maximum number of epochs. Defaults to `1000`. ~~int~~                                                                                                                                                        |
+| `dropout`      | The dropout rate. Defaults to `0.2`. ~~float~~                                                                                                                                                               |
+| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~                                                                                                                                                      |
+| `objective`    | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~                                                                                                       |
+| `optimizer`    | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                      |
+| `corpus`       | Dot notation of the config location defining the corpus with raw text. Defaults to `corpora.pretrain`. ~~str~~                                                                                               |
+| `batcher`      | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
+| `component`    | Component name to identify the layer with the model to pretrain. Defaults to `"tok2vec"`. ~~str~~                                                                                                            |
+| `layer`        | The specific layer of the model to pretrain. If empty, the whole model will be used. ~~str~~                                                                                                                 |
 
 ### initialize {#config-initialize tag="section"}
 
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 1981f03b7..64b3b85ad 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -378,7 +378,7 @@ weights and [resume training](/api/language#resume_training).
 If you don't want a component to be updated, you can **freeze** it by adding it
 to the `frozen_components` list in the `[training]` block. Frozen components are
 **not updated** during training and are included in the final trained pipeline
-as-is.
+as-is. They are also excluded when calling `nlp.initialize()`.
 
 > #### Note on frozen components
 >

From 9b4cf7b0b6b614ff044ae610217a3a73dcf35851 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 6 Oct 2020 09:47:23 +0200
Subject: [PATCH 445/516] update output of debug config command

---
 spacy/cli/_util.py      |  2 +-
 website/docs/api/cli.md | 11 +++++------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 373650172..60e400fb4 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -278,7 +278,7 @@ def show_validation_error(
                 "fill-config' command to fill in all the defaults, if possible:",
                 spaced=True,
             )
-            print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
+            print(f"{COMMAND} init fill-config {config_path} {config_path} \n")
         sys.exit(1)
     except InterpolationError as e:
         msg.fail("Config validation error", e, exits=1)
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index e51e698dd..138b4b94b 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -300,17 +300,16 @@ $ python -m spacy debug config [config_path] [--code] [--show-functions] [--show
 
 ```
 ✘ Config validation error
+dropout     field required
+optimizer   field required
+optimize    extra fields not permitted
 
-training -> dropout     field required
-training -> optimizer   field required
-training -> optimize    extra fields not permitted
-
-{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'corpus': {'train': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'dev': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}} 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}}
+{'seed': 0, 'accumulate_gradient': 1, 'dev_corpus': 'corpora.dev', 'train_corpus': 'corpora.train', 'gpu_allocator': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'before_to_disk': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'logger': {'@loggers': 'spacy.ConsoleLogger.v1', 'progress_bar': False}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}}
 
 If your config contains missing values, you can run the 'init fill-config'
 command to fill in all the defaults, if possible:
 
-python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/starter-config_invalid.cfg
+python -m spacy init fill-config tmp/starter-config_invalid.cfg tmp/starter-config_invalid.cfg
 ```
 
 </Accordion>

From 2e961817cbcf63afa1ee81ea8338850fc4cc157f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 6 Oct 2020 10:23:01 +0200
Subject: [PATCH 446/516] Update docs [ci skip]

---
 website/docs/usage/training.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 64b3b85ad..e63e25e52 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -378,7 +378,7 @@ weights and [resume training](/api/language#resume_training).
 If you don't want a component to be updated, you can **freeze** it by adding it
 to the `frozen_components` list in the `[training]` block. Frozen components are
 **not updated** during training and are included in the final trained pipeline
-as-is. They are also excluded when calling `nlp.initialize()`.
+as-is. They are also excluded when calling [`nlp.initialize`](/api/language#initialize).
 
 > #### Note on frozen components
 >

From 2fd7122074259bde66d79e2f6a289809d545777e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 6 Oct 2020 10:31:48 +0200
Subject: [PATCH 447/516] Update docs [ci skip]

---
 website/docs/api/attributeruler.md | 4 ++--
 website/docs/api/data-formats.md   | 3 +++
 website/docs/api/entityruler.md    | 9 +++++++--
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md
index b89759080..d60362a47 100644
--- a/website/docs/api/attributeruler.md
+++ b/website/docs/api/attributeruler.md
@@ -128,8 +128,8 @@ Get all patterns that have been added to the attribute ruler in the
 
 ## AttributeRuler.initialize {#initialize tag="method"}
 
-Initialize the component with data. Typically called before training to load in
-rules from a file. This method is typically called by
+Initialize the component with data. Used before training to load in rules
+from a file. This method is typically called by
 [`Language.initialize`](/api/language#initialize) and lets you customize
 arguments it receives via the
 [`[initialize.components]`](/api/data-formats#config-initialize) block in the
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index a97dcd2f6..c4cc5b1e4 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -224,6 +224,9 @@ It's used by [`Language.initialize`](/api/language#initialize) and typically
 called right before training (but not at runtime). The section allows you to
 specify local file paths or custom functions to load data resources from,
 without requiring them at runtime when you load the trained pipeline back in.
+Also see the usage guides on the
+[config lifecycle](/usage/training#config-lifecycle) and
+[custom initialization](/usage/training#initialization).
 
 > #### Example
 >
diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md
index b8aab2f50..76a4b3604 100644
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@@ -76,7 +76,12 @@ be a token pattern (list) or a phrase pattern (string). For example:
 
 ## EntityRuler.initialize {#initialize tag="method" new="3"}
 
-Initialize the component with patterns from a file.
+Initialize the component with data. Used before training to load in rules
+from a file. This method is typically called by
+[`Language.initialize`](/api/language#initialize) and lets you customize
+arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config.
 
 > #### Example
 >
@@ -204,7 +209,7 @@ only the patterns are saved as JSONL. If a directory name is provided, a
 
 ## EntityRuler.from_disk {#from_disk tag="method"}
 
-Load the entity ruler from a file. Expects either a file containing
+Load the entity ruler from a path. Expects either a file containing
 newline-delimited JSON (JSONL) with one entry per line, or a directory
 containing a `patterns.jsonl` file and a `cfg` file with the component
 configuration.

From 59982d5ef8155fe8a3b1d58b016d669c05426f0b Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 6 Oct 2020 10:40:43 +0200
Subject: [PATCH 448/516] Add pip upgrade step to README

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3e5e5febe..5d310492d 100644
--- a/README.md
+++ b/README.md
@@ -104,9 +104,11 @@ For detailed installation instructions, see the
 ### pip
 
 Using pip, spaCy releases are available as source packages and binary wheels (as
-of `v2.0.13`).
+of `v2.0.13`). Before you install spaCy and its dependencies, make sure that
+your `pip`, `setuptools` and `wheel` are up to date.
 
 ```bash
+pip install -U pip setuptools wheel
 pip install spacy
 ```
 

From aa9c9f3bf0acf88c596b006c157b5f56ed306aeb Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 6 Oct 2020 11:21:17 +0200
Subject: [PATCH 449/516] Update Chinese usage for spacy-pkuseg

---
 website/docs/usage/models.md | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md
index dc41385f2..fe3ee6e04 100644
--- a/website/docs/usage/models.md
+++ b/website/docs/usage/models.md
@@ -98,10 +98,10 @@ The Chinese language class supports three word segmentation options, `char`,
 > # Jieba
 > cfg = {"segmenter": "jieba"}
 > nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
-> # PKUSeg with "default" model provided by pkuseg
+> # PKUSeg with "mixed" model provided by pkuseg
 > cfg = {"segmenter": "pkuseg"}
 > nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
-> nlp.tokenizer.initialize(pkuseg_model="default")
+> nlp.tokenizer.initialize(pkuseg_model="mixed")
 > ```
 
 ```ini
@@ -115,7 +115,7 @@ segmenter = "char"
 | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `char`    | **Character segmentation:** Character segmentation is the default segmentation option. It's enabled when you create a new `Chinese` language class or call `spacy.blank("zh")`.                                                                                                            |
 | `jieba`   | **Jieba:** to use [Jieba](https://github.com/fxsjy/jieba) for word segmentation, you can set the option `segmenter` to `"jieba"`.                                                                                                                                                          |
-| `pkuseg`  | **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support better segmentation for Chinese OntoNotes and the provided [Chinese pipelines](/models/zh). Enable PKUSeg by setting tokenizer option `segmenter` to `"pkuseg"`. |
+| `pkuseg`  | **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/explosion/spacy-pkuseg) has been added to support better segmentation for Chinese OntoNotes and the provided [Chinese pipelines](/models/zh). Enable PKUSeg by setting tokenizer option `segmenter` to `"pkuseg"`. |
 
 <Infobox title="Changed in v3.0" variant="warning">
 
@@ -133,10 +133,10 @@ runtime.
 The `initialize` method for the Chinese tokenizer class supports the following
 config settings for loading `pkuseg` models:
 
-| Name               | Description                                                                                                                           |
-| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------- |
-| `pkuseg_model`     | Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~                                                  |
-| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`. ~~str~~ |
+| Name               | Description                                                                                                                                                            |
+| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `pkuseg_model`     | Name of a model provided by `spacy-pkuseg` or the path to a local model directory. ~~str~~                                                                             |
+| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`, the default provided dictionary. ~~str~~ |
 
 The initialization settings are typically provided in the
 [training config](/usage/training#config) and the data is loaded in before
@@ -164,14 +164,17 @@ You can also initialize the tokenizer for a blank language class by calling its
 cfg = {"segmenter": "pkuseg"}
 nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
 
-# Load "default" model
-nlp.tokenizer.initialize(pkuseg_model="default")
+# Load spaCy's OntoNotes model
+nlp.tokenizer.initialize(pkuseg_model="spacy_ontonotes")
+
+# Load pkuseg's "news" model
+nlp.tokenizer.initialize(pkuseg_model="news")
 
 # Load local model
 nlp.tokenizer.initialize(pkuseg_model="/path/to/pkuseg_model")
 
 # Override the user directory
-nlp.tokenizer.initialize(pkuseg_model="default", pkuseg_user_dict="/path/to/user_dict")
+nlp.tokenizer.initialize(pkuseg_model="spacy_ontonotes", pkuseg_user_dict="/path/to/user_dict")
 ```
 
 You can also modify the user dictionary on-the-fly:
@@ -195,13 +198,13 @@ The [Chinese pipelines](/models/zh) provided by spaCy include a custom `pkuseg`
 model trained only on
 [Chinese OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), since the
 models provided by `pkuseg` include data restricted to research use. For
-research use, `pkuseg` provides models for several different domains
-(`"default"`, `"news"` `"web"`, `"medicine"`, `"tourism"`) and for other uses,
-`pkuseg` provides a simple
-[training API](https://github.com/lancopku/pkuseg-python/blob/master/readme/readme_english.md#usage):
+research use, `pkuseg` provides models for several different domains (`"mixed"`
+(equivalent to `"default"` from `pkuseg` packages), `"news"`, `"web"`,
+`"medicine"`, `"tourism"`) and for other uses, `pkuseg` provides a simple
+[training API](https://github.com/explosion/spacy-pkuseg/blob/master/readme/readme_english.md#usage):
 
 ```python
-import pkuseg
+import spacy_pkuseg as pkuseg
 from spacy.lang.zh import Chinese
 
 # Train pkuseg model

From 2a17566da3c7c39dfb6639f00f0453d9e988cb8f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 6 Oct 2020 14:15:08 +0200
Subject: [PATCH 450/516] Update docs [ci skip]

---
 website/docs/images/layers-architectures.svg | 97 --------------------
 website/docs/images/trainable_component.svg  | 55 +++++++++++
 website/docs/usage/layers-architectures.md   |  6 +-
 website/docs/usage/processing-pipelines.md   | 14 +--
 website/docs/usage/spacy-101.md              | 69 +++++++++++++-
 5 files changed, 134 insertions(+), 107 deletions(-)
 delete mode 100644 website/docs/images/layers-architectures.svg
 create mode 100644 website/docs/images/trainable_component.svg

diff --git a/website/docs/images/layers-architectures.svg b/website/docs/images/layers-architectures.svg
deleted file mode 100644
index 22e705ba1..000000000
--- a/website/docs/images/layers-architectures.svg
+++ /dev/null
@@ -1,97 +0,0 @@
-<svg xmlns="http://www.w3.org/2000/svg" width="918" height="633" viewBox="0 0 918 633">
-  <defs>
-    <linearGradient id="a" x1="50%" x2="50%" y1="0%" y2="100%">
-      <stop offset="0%" stop-color="#F03969"/>
-      <stop offset="100%" stop-color="#3AD787"/>
-    </linearGradient>
-    <linearGradient id="b" x1="50%" x2="50%" y1="0%" y2="100%">
-      <stop offset="0%" stop-color="#3AD787"/>
-      <stop offset="100%" stop-color="#F03969"/>
-    </linearGradient>
-    <linearGradient id="c" x1="50%" x2="50%" y1="0%" y2="100%">
-      <stop offset="0%" stop-color="#8978B5"/>
-      <stop offset="100%" stop-color="#F03969"/>
-    </linearGradient>
-    <linearGradient id="d" x1="50%" x2="50%" y1="0%" y2="100%">
-      <stop offset="0%" stop-color="#8978B5"/>
-      <stop offset="100%" stop-color="#F03969"/>
-    </linearGradient>
-    <linearGradient id="e" x1="50%" x2="50%" y1="0%" y2="100%">
-      <stop offset="0%" stop-color="#3AD787"/>
-      <stop offset="100%" stop-color="#F03969"/>
-    </linearGradient>
-    <linearGradient id="f" x1="50%" x2="50%" y1="0%" y2="100%">
-      <stop offset="0%" stop-color="#F03969"/>
-      <stop offset="100%" stop-color="#8978B5"/>
-    </linearGradient>
-    <linearGradient id="g" x1="50%" x2="50%" y1="0%" y2="100%">
-      <stop offset="0%" stop-color="#8978B5"/>
-      <stop offset="100%" stop-color="#F03969"/>
-    </linearGradient>
-  </defs>
-  <g fill="none" fill-rule="evenodd">
-    <g stroke="#F03969">
-      <rect width="289" height="115" fill="#EAC1CC" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(303 64)"/>
-      <rect width="64.3" height="21.5" x="3.8" y="1.8" fill="#F03969" stroke-width="3.5" rx="10.8" transform="translate(409 51)"/>
-    </g>
-    <rect width="132" height="59" x="2.5" y="2.5" fill="#D7CCF4" stroke="#8978B5" stroke-width="5" rx="12" transform="translate(683 78)"/>
-    <rect width="132" height="59" x="2.5" y="2.5" fill="#D7CCF4" stroke="#8978B5" stroke-width="5" rx="12" transform="translate(78 78)"/>
-    <path fill="url(#a)" fill-rule="nonzero" d="M12.8 28.8H20l-10 20-10-20h7.2V0h5.6v28.8z" transform="translate(328 213)"/>
-    <path fill="url(#b)" fill-rule="nonzero" d="M12.8 28.8H20l-10 20-10-20h7.2V0h5.6v28.8z" transform="rotate(-180 186 131)"/>
-    <path fill="url(#c)" fill-rule="nonzero" d="M726.2 144.7h-12V209H709v-69.5h17.2V132l19.8 10-19.8 9.9v-7.2z" transform="matrix(0 -1 -1 0 898 898)"/>
-    <path fill="url(#d)" fill-rule="nonzero" d="M199.8 220.3V103H228v5.2h-23v112h6.9l-10 19.8-9.9-19.7h7.8z" transform="rotate(-90 210 171.5)"/>
-    <path fill="url(#e)" fill-rule="nonzero" d="M432 178v82h7v-82z" transform="rotate(-180 435.5 219)"/>
-    <path fill="#EDFFF6" stroke="#3AD787" stroke-dasharray="0 16" stroke-linecap="round" stroke-linejoin="round" stroke-width="6" d="M107 328h708a30 30 0 0130 30v206a30 30 0 01-30 30H107a30 30 0 01-30-30V358a30 30 0 0130-30z"/>
-    <rect width="288" height="99" fill="#B5F3D4" stroke="#3AD787" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(300 272)"/>
-    <path fill="#F2E7A6" stroke="#CDB217" stroke-linejoin="round" stroke-width="3.8" d="M315 304l23 20-23 19h-90a12 12 0 01-12-12v-15a12 12 0 0112-12h90zM664 304l23 20-23 19h-98a12 12 0 01-12-12v-15a12 12 0 0112-12h98z"/>
-    <rect width="69.5" height="23.5" x="8.8" y="1.8" fill="#3AD787" stroke="#3AD787" stroke-width="3.5" rx="11.8" transform="translate(396 259)"/>
-    <rect width="217" height="119" fill="#B5F3D4" stroke="#3AD787" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(186 407)"/>
-    <rect width="241" height="119" fill="#B5F3D4" stroke="#3AD787" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(476 407)"/>
-    <path fill="#F2E7A6" stroke="#CDB217" stroke-linejoin="round" stroke-width="3.8" d="M803 439l23 20-23 19h-98a12 12 0 01-12-12v-15a12 12 0 0112-12h98zM506 439l23 20-23 19H383a12 12 0 01-12-12v-15a12 12 0 0112-12h123z"/>
-    <g stroke="#3AD787">
-      <rect width="114" height="55" fill="#B5F3D4" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(176 509)"/>
-      <rect width="69.5" height="23.5" x="8.8" y="1.8" fill="#3AD787" stroke-width="3.5" rx="11.8" transform="translate(190 496)"/>
-    </g>
-    <g stroke="#3AD787">
-      <rect width="114" height="55" fill="#B5F3D4" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(300 509)"/>
-      <rect width="69.5" height="23.5" x="8.8" y="1.8" fill="#3AD787" stroke-width="3.5" rx="11.8" transform="translate(314 496)"/>
-    </g>
-    <g stroke="#3AD787">
-      <rect width="114" height="55" fill="#B5F3D4" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(542 509)"/>
-      <rect width="69.5" height="23.5" x="8.8" y="1.8" fill="#3AD787" stroke-width="3.5" rx="11.8" transform="translate(556 496)"/>
-    </g>
-    <path fill="#F2E7A6" stroke="#CDB217" stroke-linejoin="round" stroke-width="3.8" d="M197 439l23 20-23 19h-90a12 12 0 01-12-12v-15a12 12 0 0112-12h90z"/>
-    <rect width="69.5" height="23.5" x="8.8" y="1.8" fill="#3AD787" stroke="#3AD787" stroke-width="3.5" rx="11.8" transform="translate(251 393)"/>
-    <rect width="69.5" height="23.5" x="8.8" y="1.8" fill="#3AD787" stroke="#3AD787" stroke-width="3.5" rx="11.8" transform="translate(559 394)"/>
-    <path fill="url(#f)" fill-rule="nonzero" d="M12.2 51H20L10 71 0 51h7V0h5.2v51z" transform="rotate(-90 363 -240.5)"/>
-    <path fill="#8978B5" fill-rule="nonzero" d="M850.5 108.3v-7.8l20 10-20 10v-7h-21v-5.2h21zM48.5 108.3v-7.8l20 10-20 10v-7h-21v-5.2h21z"/>
-    <path fill="url(#g)" fill-rule="nonzero" d="M12.2 51H20L10 71 0 51h7V0h5.2v51z" transform="rotate(-90 172.5 -51)"/>
-    <rect width="119" height="48" fill="#EAC1CC" stroke="#F03969" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(293 155)"/>
-    <rect width="217" height="48" fill="#EAC1CC" stroke="#F03969" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(453 155)"/>
-    <path fill="#3D4251" fill-rule="nonzero" d="M356.7 122c.4 0 .6 0 .9.3l2.3 2.6a12.8 12.8 0 01-4.8 3.7c-1.9.9-4.1 1.3-6.8 1.3-2.3 0-4.4-.4-6.3-1.2a14 14 0 01-8-8.6 19.8 19.8 0 01.2-13.3 15.2 15.2 0 0115.1-9.8 14.5 14.5 0 0110.7 4.2l-2 2.8-.4.4c-.1.2-.4.2-.7.2-.4 0-.7-.1-1-.4a12.8 12.8 0 00-3.5-1.7c-.8-.3-1.8-.4-3.1-.4-1.5 0-2.9.3-4.1.8a9.2 9.2 0 00-3.3 2.2c-.9 1-1.6 2.2-2 3.6-.6 1.4-.8 3-.8 4.7 0 1.9.2 3.5.7 4.9.5 1.4 1.2 2.6 2 3.5a8.8 8.8 0 007 3l2.1-.1a9.2 9.2 0 003.4-1.2l1.4-1.1.5-.3.5-.1zm17-15.7c1.7 0 3.2.3 4.6.9a9.9 9.9 0 015.7 6c.6 1.5.8 3 .8 4.9 0 1.8-.2 3.4-.8 4.9a9.7 9.7 0 01-5.8 6 12.3 12.3 0 01-9.2 0 9.8 9.8 0 01-5.7-6 14 14 0 01-.8-5c0-1.7.3-3.3.8-4.8a10 10 0 015.8-6c1.4-.6 3-.9 4.6-.9zm0 19.3c1.9 0 3.3-.6 4.2-1.9a9.7 9.7 0 001.3-5.6c0-2.4-.4-4.3-1.3-5.6-1-1.3-2.3-2-4.2-2-2 0-3.3.7-4.2 2a9.7 9.7 0 00-1.4 5.6c0 2.5.5 4.3 1.4 5.6.9 1.3 2.3 2 4.2 2zm15.4 4v-22.9h3.3c.7 0 1.2.3 1.4 1l.3 1.7 1.3-1.2a8 8 0 013-1.6 7 7 0 011.8-.3c1.4 0 2.6.4 3.5 1.2a7 7 0 012 3.1 7 7 0 013.2-3.2 9.6 9.6 0 017.5-.4c.9.3 1.7.9 2.4 1.6.6.8 1.1 1.7 1.5 2.7.3 1.1.5 2.3.5 3.7v14.5h-5.4V115c0-1.4-.3-2.5-1-3.3-.6-.7-1.5-1-2.8-1-.5 0-1 0-1.5.2a3.7 3.7 0 00-2.1 2.2c-.2.6-.3 1.2-.3 2v14.4h-5.5V115c0-1.5-.3-2.6-.9-3.3-.6-.7-1.5-1-2.7-1-.8 0-1.5.1-2.2.5-.7.4-1.3 1-1.9 1.6v16.7h-5.4zm37 7.4v-30.3h3.4c.3 0 .6 0 .9.2l.5.8.4 2.1c1-1 2-1.9 3.1-2.5a8.4 8.4 0 017.6-.2c1 .5 1.9 1.2 2.6 2.2.8 1 1.3 2.2 1.7 3.7a18.6 18.6 0 010 9.7c-.5 1.4-1.2 2.7-2 3.8a8.8 8.8 0 01-7 3.4 8 8 0 01-3.3-.6 8 8 0 01-2.4-1.6v9.3h-5.4zm10.6-26.4c-1.1 0-2 .3-2.9.7-.8.5-1.5 1.2-2.2 2v10.3a5.2 5.2 0 004.3 2c.9 0 1.6 0 2.3-.4.7-.3 1.2-.8 1.7-1.4a7 7 0 001-2.4c.3-1 .4-2.1.4-3.5 0-1.3 0-2.4-.3-3.3-.2-1-.5-1.7-1-2.3-.3-.6-.8-1-1.4-1.3-.5-.2-1.2-.4-1.9-.4zm24-4.3c1.7 0 3.2.3 4.6.9a9.9 9.9 0 015.7 6c.6 1.5.8 3 .8 4.9 0 1.8-.2 3.4-.8 4.9a9.7 9.7 0 01-5.8 6 12.3 12.3 0 01-9.2 0 9.8 9.8 0 01-5.7-6 14 14 0 01-.8-5c0-1.7.3-3.3.8-4.8a10 10 0 015.8-6c1.4-.6 2.9-.9 4.6-.9zm0 19.3c1.9 0 3.3-.6 4.2-1.9a9.7 9.7 0 
001.3-5.6c0-2.4-.4-4.3-1.3-5.6-1-1.3-2.3-2-4.2-2-2 0-3.3.7-4.2 2a9.7 9.7 0 00-1.4 5.6c0 2.5.5 4.3 1.4 5.6.9 1.3 2.3 2 4.2 2zm15.4 4v-22.9h3.3c.7 0 1.2.3 1.4 1l.4 1.8a12.4 12.4 0 013-2.3 8.1 8.1 0 014-.9 8 8 0 013.2.7c1 .4 1.8 1 2.4 1.8.6.7 1.1 1.6 1.5 2.7.3 1 .5 2.2.5 3.5v14.5h-5.5V115a5 5 0 00-1-3.2c-.6-.8-1.5-1.2-2.8-1.2-1 0-1.8.2-2.7.7-.8.4-1.6 1-2.3 1.7v16.5h-5.4zm34.3-23.3c1.5 0 2.8.3 4 .7a8.6 8.6 0 015 5.3 12.4 12.4 0 01.8 5.4l-.2.7-.4.3-.6.1h-14c.2 2.4.8 4 1.9 5.2 1 1 2.5 1.6 4.3 1.6.9 0 1.6 0 2.3-.3.6-.2 1.2-.4 1.6-.7l1.3-.7a2 2 0 011-.3c.3 0 .4 0 .6.2.2 0 .3.2.4.3l1.6 2a9 9 0 01-2 1.8 11.3 11.3 0 01-4.8 1.8l-2.5.2c-1.6 0-3-.3-4.4-.8a9.8 9.8 0 01-3.5-2.4 11 11 0 01-2.4-3.8 15 15 0 010-9.7c.4-1.3 1.1-2.5 2-3.5 1-1 2.1-1.9 3.5-2.5 1.3-.6 2.8-.9 4.5-.9zm.1 4a5 5 0 00-3.7 1.3c-.9 1-1.4 2.2-1.7 3.9h10.2a7 7 0 00-.3-2c-.2-.6-.4-1.2-.8-1.7s-1-.8-1.5-1.1a5 5 0 00-2.2-.4zm14.1 19.2v-22.8h3.4c.7 0 1.1.3 1.3 1l.4 1.8a12.4 12.4 0 013-2.3 8.1 8.1 0 014-.9 8 8 0 013.3.7c1 .4 1.7 1 2.4 1.8.6.7 1 1.6 1.4 2.7.3 1 .5 2.2.5 3.5v14.5H539V115a5 5 0 00-1-3.2c-.6-.8-1.6-1.2-2.9-1.2-.9 0-1.8.2-2.6.7-.8.4-1.6 1-2.3 1.7v16.5h-5.5zm32.5.4c-2 0-3.5-.6-4.5-1.7a6.5 6.5 0 01-1.6-4.6v-12.8h-2.3a1 1 0 01-.8-.3c-.2-.2-.3-.4-.3-.8v-2.2l3.6-.6 1.2-6.2c0-.3.2-.6.4-.7.2-.2.5-.3.8-.3h2.8v7.2h6v4h-6v12.3c0 .7.2 1.3.5 1.7.4.4.9.6 1.5.6.3 0 .6 0 .8-.2a4.6 4.6 0 001-.5h.8l.4.4 1.6 2.7c-.8.6-1.7 1.1-2.7 1.5-1 .3-2.1.5-3.2.5z"/>
-    <path fill="#FFF" fill-rule="nonzero" d="M430.5 56.8c.6 0 1.2.1 1.7.3l1.4.8.8 1.3c.2.4.3 1 .3 1.6 0 .6 0 1.1-.3 1.6l-.8 1.3a4 4 0 01-1.4.8c-.5.2-1.1.3-1.7.3h-2.3v5.1H426V57h4.5zm-2.3 6h2.3a2 2 0 001.4-.5c.4-.4.6-.9.6-1.5a2 2 0 00-.6-1.5 2 2 0 00-1.4-.6h-2.3v4.1zm8.2 5h3.5V62h-3v-2h5.2v7.8h2.8V70h-8.5v-2zm2.9-10.6c0-.4.1-.6.3-.9.3-.2.6-.3 1-.3h.4c.4 0 .7.1 1 .3.2.3.3.5.3.9s-.1.7-.4.9c-.2.2-.5.3-.9.3h-.4c-.4 0-.7-.1-1-.3-.2-.2-.3-.5-.3-.9zm7.2 2.7h2.1v1.7h.2c.1-.6.4-1 .9-1.4.4-.3 1-.5 1.7-.5.5 0 1 .1 1.4.3.4.2.8.4 1 .8.4.3.6.7.8 1.2l.2 1.6v2.5c0 .6 0 1.2-.2 1.7s-.4.9-.7 1.2c-.3.4-.7.6-1 .8l-1.5.3c-.7 0-1.3-.2-1.7-.5-.5-.4-.8-.8-1-1.4V68.8a19 19 0 000 1.1V73h-2.2V60zm2.2 3.8V66c0 .6.2 1.1.5 1.5.4.3.8.5 1.4.5.6 0 1.1-.2 1.4-.5.4-.4.6-1 .6-1.5v-2.4a2 2 0 00-.6-1.5c-.3-.3-.8-.5-1.4-.5-.6 0-1 .2-1.4.5a2 2 0 00-.5 1.5zm7.8 0c0-.7 0-1.2.3-1.7a3.5 3.5 0 012.2-2c.5-.2 1.1-.3 1.8-.3.6 0 1.2.1 1.7.3l1.3.8c.4.3.7.8.9 1.2.2.5.3 1 .3 1.7v1.8h-6.3v.6c0 .7.2 1.2.5 1.6.4.4 1 .6 1.6.6l1.2-.3c.3-.1.6-.3.7-.6h2.2c0 .4-.3.8-.5 1.1l-1 .9-1.2.5-1.4.2c-.7 0-1.3-.1-1.8-.3-.5-.2-1-.4-1.4-.8-.3-.3-.6-.7-.8-1.2-.2-.5-.3-1-.3-1.6v-2.6zm2.2.2h4.1v-.2c0-.7-.2-1.2-.5-1.6a2 2 0 00-1.5-.6 2 2 0 00-1.6.6c-.3.4-.5.9-.5 1.5v.3z"/>
-    <path fill="#3D4251" fill-rule="nonzero" d="M749.2 107.9c0 1.6-.3 3-.8 4.4a10.1 10.1 0 01-5.7 5.7 12 12 0 01-4.5.8H730V97h8.2c1.7 0 3.2.3 4.5.8a10 10 0 015.7 5.7c.5 1.3.8 2.8.8 4.4zm-4.1 0a10 10 0 00-.5-3.2c-.3-1-.8-1.8-1.4-2.4a5.9 5.9 0 00-2.1-1.6c-.8-.3-1.8-.5-2.9-.5H734v15.4h4.2c1.1 0 2-.2 2.9-.6.8-.3 1.5-.8 2.1-1.5.6-.6 1-1.4 1.4-2.4.3-1 .5-2 .5-3.2zm13.6-4.8c1.1 0 2.2.2 3 .5a6.7 6.7 0 014 4.1c.3 1 .5 2.1.5 3.3 0 1.2-.2 2.3-.5 3.3a6.6 6.6 0 01-4 4.1 8.4 8.4 0 01-6.2 0 6.7 6.7 0 01-3.9-4c-.4-1-.5-2.2-.5-3.4 0-1.2.1-2.3.5-3.3a6.8 6.8 0 014-4c.9-.4 2-.6 3-.6zm0 13c1.2 0 2.2-.4 2.8-1.3.6-.8 1-2 1-3.8 0-1.6-.4-2.9-1-3.8-.6-.8-1.6-1.3-2.8-1.3-1.3 0-2.3.5-2.9 1.3-.6 1-1 2.2-1 3.8 0 1.7.4 3 1 3.8.6.9 1.6 1.3 2.9 1.3zm21.2-9.5l-.3.3-.5.1a1 1 0 01-.5-.1 17.8 17.8 0 00-1.6-.8 4 4 0 00-1.3-.2c-.6 0-1.2.1-1.7.4-.4.2-.9.5-1.2 1-.3.4-.5 1-.7 1.6-.2.6-.2 1.3-.2 2.1s0 1.6.2 2.2c.2.7.4 1.2.8 1.6a3.3 3.3 0 002.8 1.4 3.9 3.9 0 002.3-.7l.6-.5c.2-.2.4-.2.7-.2.3 0 .5 0 .6.3l1.1 1.4a6.7 6.7 0 01-2.8 2l-1.6.4-1.6.1a6.2 6.2 0 01-4.9-2 7.5 7.5 0 01-1.5-2.6c-.3-1-.5-2.1-.5-3.4 0-1.1.1-2.2.5-3.1.3-1 .8-1.8 1.4-2.6.6-.7 1.4-1.2 2.3-1.6 1-.4 2-.6 3.2-.6 1.2 0 2.2.2 3 .5.9.4 1.7 1 2.4 1.6l-1 1.4zM140.2 107.9c0 1.6-.3 3-.8 4.4a10.1 10.1 0 01-5.7 5.7 12 12 0 01-4.5.8H121V97h8.2c1.7 0 3.2.3 4.5.8a10 10 0 015.7 5.7c.5 1.3.8 2.8.8 4.4zm-4.1 0a10 10 0 00-.5-3.2c-.3-1-.8-1.8-1.4-2.4a5.9 5.9 0 00-2.1-1.6c-.8-.3-1.8-.5-2.9-.5H125v15.4h4.2c1.1 0 2-.2 2.9-.6.8-.3 1.5-.8 2.1-1.5.6-.6 1-1.4 1.4-2.4.3-1 .5-2 .5-3.2zm13.6-4.8c1.1 0 2.2.2 3 .5a6.7 6.7 0 014 4.1c.3 1 .5 2.1.5 3.3 0 1.2-.2 2.3-.5 3.3a6.6 6.6 0 01-4 4.1 8.4 8.4 0 01-6.2 0 6.7 6.7 0 01-3.9-4c-.4-1-.5-2.2-.5-3.4 0-1.2.1-2.3.5-3.3a6.8 6.8 0 014-4c.9-.4 2-.6 3-.6zm0 13c1.2 0 2.2-.4 2.8-1.3.6-.8 1-2 1-3.8 0-1.6-.4-2.9-1-3.8-.6-.8-1.6-1.3-2.8-1.3-1.3 0-2.3.5-2.9 1.3-.6 1-1 2.2-1 3.8 0 1.7.4 3 1 3.8.6.9 1.6 1.3 2.9 1.3zm21.2-9.5l-.3.3-.5.1a1 1 0 01-.5-.1 17.8 17.8 0 00-1.6-.8 4 4 0 00-1.3-.2c-.6 0-1.2.1-1.7.4-.4.2-.9.5-1.2 1-.3.4-.5 1-.7 
1.6-.2.6-.2 1.3-.2 2.1s0 1.6.2 2.2c.2.7.4 1.2.8 1.6a3.3 3.3 0 002.8 1.4 3.9 3.9 0 002.3-.7l.6-.5c.2-.2.4-.2.7-.2.3 0 .5 0 .6.3l1.1 1.4a6.7 6.7 0 01-2.8 2l-1.6.4-1.6.1a6.2 6.2 0 01-4.9-2 7.5 7.5 0 01-1.5-2.6c-.3-1-.5-2.1-.5-3.4 0-1.1.1-2.2.5-3.1.3-1 .8-1.8 1.4-2.6.6-.7 1.4-1.2 2.3-1.6 1-.4 2-.6 3.2-.6 1.2 0 2.2.2 3 .5.9.4 1.7 1 2.4 1.6l-1 1.4zM401 323.5a17.9 17.9 0 011.3 3 49.6 49.6 0 011.3-3l9.4-17.7.4-.5.4-.3.5-.1h5.2v31.8h-5.2V316a26 26 0 01.1-2.6l-9.6 18c-.2.4-.5.8-.9 1-.3.2-.7.3-1.2.3h-.8c-.5 0-.9 0-1.2-.3-.4-.2-.7-.6-.9-1l-9.7-18a19.5 19.5 0 01.1 2.6v20.6H385v-31.8h5.1l.6.1.4.3.4.5 9.5 17.7zm34.2-9.7c1.7 0 3.3.2 4.6.8a9.9 9.9 0 015.8 6c.5 1.4.8 3 .8 4.7a14 14 0 01-.8 4.9c-.5 1.4-1.3 2.6-2.3 3.7-1 1-2.1 1.7-3.5 2.3-1.3.5-2.9.8-4.6.8-1.6 0-3.2-.3-4.6-.8a9.8 9.8 0 01-5.8-6c-.5-1.5-.8-3-.8-4.9 0-1.7.3-3.3.8-4.8a10 10 0 015.8-6c1.4-.5 3-.7 4.6-.7zm0 19c2 0 3.3-.6 4.2-1.9a9.5 9.5 0 001.4-5.5c0-2.4-.5-4.3-1.4-5.6-.9-1.2-2.3-1.9-4.2-1.9-1.9 0-3.3.7-4.2 2a9.5 9.5 0 00-1.3 5.5c0 2.4.4 4.3 1.3 5.5 1 1.3 2.3 2 4.2 2zm31.3 3.9c-.7 0-1.2-.4-1.4-1l-.4-2.2-1.5 1.4a9.3 9.3 0 01-3.6 1.8c-.7.2-1.4.3-2.2.3a7.5 7.5 0 01-6-3c-.8-1-1.3-2.2-1.7-3.6a18.3 18.3 0 010-9.6c.5-1.4 1.1-2.7 2-3.7.8-1.1 1.9-2 3-2.5 1.2-.6 2.6-.9 4-.9a7.8 7.8 0 015.7 2.2V304h5.4v32.7h-3.3zm-7.2-4c1.1 0 2-.2 2.8-.7.8-.5 1.6-1.1 2.3-2v-10.1a5.3 5.3 0 00-4.4-2c-.8 0-1.5 0-2.2.4-.7.3-1.2.8-1.7 1.4-.5.6-.8 1.4-1 2.4a15 15 0 000 6.7c.1 1 .4 1.7.8 2.3.4.5.9 1 1.5 1.2.5.3 1.2.4 1.9.4zm25.7-19c1.4 0 2.8.3 4 .7a8.6 8.6 0 015 5.2 12.1 12.1 0 01.7 5.4c0 .3 0 .5-.2.6 0 .2-.2.3-.3.4h-14.7c.3 2.4.9 4 2 5.2 1 1 2.5 1.6 4.3 1.6.9 0 1.6-.1 2.3-.3l1.6-.7 1.3-.7c.3-.2.7-.3 1-.3.2 0 .4 0 .6.2.2 0 .3.2.4.3l1.6 2a9 9 0 01-2 1.8 11.4 11.4 0 01-4.9 1.7l-2.4.2c-1.6 0-3-.3-4.4-.8a9.8 9.8 0 01-3.5-2.3c-1-1-1.8-2.3-2.4-3.8a14.6 14.6 0 010-9.5c.4-1.4 1.1-2.6 2-3.6a11.2 11.2 0 018-3.3zm.1 4a5 5 0 00-3.7 1.3c-.9.9-1.4 2.2-1.7 3.8h10.2c0-.7 0-1.4-.3-2-.2-.6-.5-1.2-.9-1.6-.4-.5-.9-.9-1.5-1.1-.6-.3-1.3-.4-2-.4zM505 304v32.7h-5.4V304h5.4z"/>
-    <g fill-rule="nonzero">
-      <path fill="#3D4251" d="M225.3 316v10.9h5.6v2H223v-13h2.3zm7.7 10.9h3.6V321h-3.1v-2h5.2v7.8h2.9v2H233v-2zm3-10.5c0-.4 0-.7.3-.9.2-.2.5-.3.9-.3h.5c.3 0 .6 0 .9.3.2.2.3.5.3.9 0 .3 0 .6-.3.8-.3.2-.6.3-1 .3h-.4c-.4 0-.7 0-1-.3-.2-.2-.3-.5-.3-.8zm6.9 10h2.3c0 .3.2.6.5.7.3.2.7.3 1.1.3h.8c.5 0 1-.1 1.3-.3.3-.3.4-.6.4-1 0-.6-.5-1-1.5-1.2l-1.4-.1c-1.1-.2-2-.5-2.5-1-.6-.4-.8-1.1-.8-2 0-1 .3-1.7 1-2.2.6-.5 1.6-.7 2.8-.7h.7c1.1 0 2 .2 2.7.7.6.4 1 1 1.1 1.9h-2.3a1 1 0 00-.5-.7 2 2 0 00-1-.2h-.7c-1 0-1.6.4-1.6 1.1 0 .6.4 1 1.3 1.1l1.5.2c1.2.2 2 .5 2.6 1 .6.5.8 1.2.8 2 0 1-.3 1.8-1 2.3-.7.6-1.7.8-3 .8h-.7c-1.1 0-2-.2-2.7-.7-.7-.5-1.1-1.1-1.2-2zm10-7.4h2.8v-3h2.2v3h3.8v2h-3.8v4.9c0 .3.1.5.3.7.2.2.4.3.8.3h2.5v2h-2.7c-1 0-1.7-.2-2.3-.8a3 3 0 01-.8-2.2V321h-2.8v-2z"/>
-      <path fill="#67708A" d="M265.8 330.9V314h5v2.1H268v12.7h2.8v2.1z"/>
-      <path fill="#3D4251" d="M274 316h4c.7 0 1.3 0 1.8.3.6.2 1 .5 1.4.8.4.4.7.8.9 1.3.2.5.3 1 .3 1.7v4.7c0 .6 0 1.2-.3 1.7a3.7 3.7 0 01-2.3 2.1c-.5.2-1.1.3-1.8.3h-4v-13zm2.3 10.9h1.7c.7 0 1.2-.2 1.6-.6.4-.4.6-.9.6-1.5v-4.7a2 2 0 00-.6-1.5c-.4-.4-1-.6-1.6-.6h-1.7v8.9zm12.1 2.2c-.6 0-1.2 0-1.8-.3a4 4 0 01-1.3-.8c-.4-.3-.7-.7-.9-1.2a4 4 0 01-.3-1.6v-2.4c0-.6.1-1.2.3-1.6a3.5 3.5 0 012.2-2l1.8-.3c.7 0 1.3 0 1.8.2l1.3.8 1 1.3.2 1.6v2.4c0 .6 0 1.1-.3 1.6a3.5 3.5 0 01-2.2 2c-.5.2-1.1.3-1.8.3zm-2-3.9c0 .6.1 1 .5 1.4.4.4.9.5 1.5.5a2 2 0 001.5-.5c.4-.3.6-.8.6-1.4v-2.4c0-.6-.2-1.1-.6-1.5a2 2 0 00-1.5-.5 2 2 0 00-1.5.5c-.4.4-.5.9-.5 1.5v2.4zm12.4 4c-.6 0-1.2-.2-1.7-.3-.6-.2-1-.5-1.4-.8l-.9-1.2a4 4 0 01-.3-1.6v-2.6c0-.6.1-1 .3-1.6.3-.4.5-.8 1-1.2a4 4 0 011.3-.8 5.3 5.3 0 013.3 0 4 4 0 012.7 3.3H301c0-.5-.3-.9-.7-1.2-.4-.2-.8-.4-1.4-.4-.6 0-1.1.2-1.5.5-.3.4-.5.8-.5 1.4v2.6c0 .6.2 1 .5 1.4.4.3.9.5 1.5.5s1-.2 1.4-.5c.4-.2.6-.6.7-1h2.2c0 .5-.2 1-.4 1.4a4 4 0 01-2.3 1.8 5 5 0 01-1.6.2z"/>
-      <path fill="#67708A" d="M306 330.9v-2.1h2.8v-12.7H306V314h5.1v16.9z"/>
-    </g>
-    <path fill="#3D4251" fill-rule="nonzero" d="M569.3 329.2c-.7 0-1.2-.1-1.8-.3l-1.3-.8c-.4-.4-.7-.8-.9-1.3-.2-.4-.3-1-.3-1.6V320c0-.6.1-1.1.3-1.6a3.6 3.6 0 012.2-2c.6-.2 1.1-.3 1.8-.3.6 0 1.2 0 1.8.3.5.2 1 .4 1.3.8.4.3.7.7.9 1.2.2.5.3 1 .3 1.6v5.3c0 .6-.1 1.1-.3 1.6-.2.5-.5 1-.9 1.3a4 4 0 01-1.4.8c-.5.2-1 .3-1.7.3zm-2-4c0 .6.1 1.1.5 1.5.4.3.9.5 1.5.5a2 2 0 001.5-.5c.4-.4.5-.9.5-1.5V320c0-.6-.1-1-.5-1.4a2 2 0 00-1.5-.6 2 2 0 00-1.5.6c-.4.3-.6.8-.6 1.4v5.3zm10.5-6v6.1c0 1.3.6 2 1.8 2 1.2 0 1.8-.7 1.8-2v-6.1h2.2v6.1c0 1.2-.4 2.2-1 2.9-.8.6-1.8 1-3 1-1.3 0-2.3-.4-3-1-.7-.7-1-1.7-1-2.9v-6.1h2.2zm7.5 0h2.7v-3h2.2v3h3.8v2h-3.8v4.8c0 .3.1.5.3.7.2.2.5.3.8.3h2.5v2h-2.7c-1 0-1.7-.3-2.3-.8a3 3 0 01-.8-2.2v-4.8h-2.7v-2zm10.7 0h2.2v1.7h.2c0-.6.4-1 .8-1.4a3 3 0 011.8-.5c.5 0 1 .1 1.4.3.4.2.8.4 1 .8.4.3.6.7.8 1.2l.2 1.5v2.5a5 5 0 01-.2 1.6c-.2.5-.4.9-.7 1.2-.3.4-.7.6-1 .8l-1.5.3a3 3 0 01-1.8-.5c-.4-.4-.7-.8-.8-1.4h-.2V327.9a18.1 18.1 0 00.1 1v3.1H596v-12.8zm2.3 3.7v2.4c0 .6.2 1 .5 1.4.3.3.8.5 1.4.5.6 0 1-.2 1.4-.5.4-.4.6-.9.6-1.5V323a2 2 0 00-2-2c-.6 0-1 .2-1.4.6a2 2 0 00-.5 1.4zm10.3-3.7v6.1c0 1.3.6 2 1.8 2 1.2 0 1.8-.7 1.8-2v-6.1h2.2v6.1c0 1.2-.3 2.2-1 2.9-.7.6-1.7 1-3 1s-2.3-.4-3-1c-.7-.7-1-1.7-1-2.9v-6.1h2.2zm7.5 0h2.7v-3h2.2v3h3.8v2H621v4.8c0 .3.1.5.3.7.2.2.5.3.8.3h2.5v2H622c-1 0-1.7-.3-2.3-.8a3 3 0 01-.8-2.2v-4.8h-2.7v-2zm10.1-1v-2h9.4v2h-3.5V329h-2.3v-10.8h-3.6zm10.2 1h2.5l2 5.2a4.8 4.8 0 01.3 1.3v.7h.2a7.2 7.2 0 01.2-1.3c0-.3 0-.5.2-.7l1.8-5.2h2.4l-4.7 12.8h-2.4l1.3-3.6-3.8-9.2zm11 0h2.2v1.7h.1c.2-.6.4-1 .9-1.4a3 3 0 011.8-.5c.5 0 1 .1 1.4.3.4.2.7.4 1 .8.3.3.6.7.7 1.2.2.4.3 1 .3 1.5v2.5a5 5 0 01-.3 1.6c-.1.5-.4.9-.7 1.2-.3.4-.6.6-1 .8-.4.2-1 .3-1.4.3a3 3 0 01-1.8-.5c-.5-.4-.7-.8-.9-1.4h-.1V327.9a18.1 18.1 0 000 1v3.1h-2.2v-12.8zm2.3 3.7v2.4c0 .6.1 1 .5 1.4.3.3.8.5 1.4.5.6 0 1-.2 1.4-.5.3-.4.5-.9.5-1.5V323c0-.6-.2-1-.5-1.4a2 2 0 00-1.4-.6c-.6 0-1 .2-1.4.6a2 2 0 00-.5 1.4zm7.8 0c0-.6 0-1.2.3-1.6.2-.5.5-1 .8-1.2.4-.4.8-.6 1.4-.8a5.2 5.2 0 013.5 0c.5.2 1 .4 1.3.8.4.3.7.7.9 
1.2.2.5.3 1 .3 1.6v1.7h-6.3v.7c0 .6.2 1.2.5 1.5.4.4.9.6 1.5.6.5 0 1-.1 1.3-.3.3-.1.6-.3.7-.6h2.2a3 3 0 01-.5 1.1l-1 .8c-.3.3-.7.4-1.2.6a5.7 5.7 0 01-3.2-.1c-.6-.2-1-.4-1.4-.8a4 4 0 01-1.2-2.8v-2.5zm2.2.2h4.1v-.2a2 2 0 00-.5-1.6 2 2 0 00-1.6-.5 2 2 0 00-1.5.5 2 2 0 00-.5 1.5v.3z"/>
-    <path fill="#FFF" fill-rule="nonzero" d="M417.6 265l1.4 4.1a6.1 6.1 0 01.3 1.2v.6h.2l.1-.6a8.2 8.2 0 01.3-1.2l1.2-4h2.7v12.7h-2v-8.2a11.7 11.7 0 01.1-2l.1-1h-.2l-.2 1a21.5 21.5 0 01-.5 1.6l-.9 3h-1.6l-1-3a24.6 24.6 0 01-.7-2.6h-.2a22.5 22.5 0 01.3 3v8.2h-2v-12.7h2.6zm12 13c-.6 0-1.2 0-1.7-.3a4 4 0 01-1.4-.8c-.4-.3-.6-.7-.8-1.2a4 4 0 01-.3-1.6v-2.4c0-.5 0-1 .3-1.5a3.5 3.5 0 012.2-2 5.4 5.4 0 013.5 0c.5.2 1 .4 1.3.8.4.3.7.7.9 1.2.2.4.3 1 .3 1.5v2.4a4 4 0 01-.3 1.6c-.2.5-.5 1-.9 1.2-.3.4-.8.6-1.3.8-.5.2-1.1.3-1.8.3zm-2-3.9c0 .6.2 1.1.5 1.4.4.4.9.6 1.5.6a2 2 0 001.5-.6c.4-.3.6-.8.6-1.4v-2.4c0-.5-.2-1-.6-1.4a2 2 0 00-1.5-.5 2 2 0 00-1.5.5c-.3.4-.5.9-.5 1.4v2.4zm8-2.4c0-.6 0-1.1.2-1.6l.7-1.2a3 3 0 011.1-.7c.4-.2.9-.3 1.4-.3a3 3 0 011.8.5c.4.3.7.8.8 1.4h.2v-.6a5.4 5.4 0 01-.1-1.1v-3h2.2v12.7h-2.1v-1.7h-.2a2 2 0 01-.8 1.4 3 3 0 01-1.8.5c-.5 0-1 0-1.4-.3a3 3 0 01-1-.7c-.4-.4-.6-.8-.8-1.3a5 5 0 01-.2-1.6v-2.4zm2.2 0v2.4c0 .6.2 1 .6 1.4.3.4.8.6 1.4.6a2 2 0 002-2l-.1-2.3c0-.6-.2-1.1-.5-1.5-.4-.3-.8-.5-1.4-.5a2 2 0 00-1.4.5 2 2 0 00-.6 1.5zm8 0c0-.6.1-1 .3-1.6l.9-1.2c.4-.3.8-.6 1.3-.7l1.8-.3c.7 0 1.2.1 1.8.3.5.2 1 .4 1.3.7a4 4 0 011.2 2.8v1.8H448v.6c0 .7.1 1.2.5 1.6.4.3.9.5 1.5.5.5 0 1 0 1.3-.2l.7-.6h2.2a3 3 0 01-.5 1c-.3.4-.6.7-1 .9l-1.2.5a5.7 5.7 0 01-3.2 0c-.6-.2-1-.5-1.4-.8l-.9-1.2a4 4 0 01-.3-1.6v-2.5zm2.3.3h4v-.3a2 2 0 00-.5-1.5 2 2 0 00-1.5-.5 2 2 0 00-1.5.5 2 2 0 00-.5 1.5v.3zm7.4-5v-2h5.4v9.7c0 .3.1.6.3.8.2.2.5.3.9.3h2.9v2h-3.1c-1 0-1.8-.3-2.4-.8a3 3 0 01-.8-2.3V267h-3.2z"/>
-    <path fill="#3D4251" fill-rule="nonzero" d="M257.8 467h10.4v4H253v-26h4.8v22zm27.9 4h-2c-.4 0-.8 0-1-.2-.2-.1-.4-.4-.5-.8l-.4-1.3a15.2 15.2 0 01-2.8 2 7 7 0 01-1.5.4 9.1 9.1 0 01-4-.1c-.6-.3-1.2-.6-1.6-1-.5-.4-.9-1-1.1-1.6-.3-.6-.4-1.3-.4-2.1 0-.7.1-1.4.5-2 .4-.8 1-1.4 1.8-2 .9-.5 2-1 3.4-1.3 1.4-.3 3.2-.5 5.3-.6v-1c0-1.3-.3-2.2-.8-2.8-.5-.6-1.3-.9-2.3-.9a5.5 5.5 0 00-3 .8l-1 .6a2 2 0 01-1 .2c-.3 0-.5 0-.8-.2l-.5-.6-.8-1.4c2.1-2 4.7-2.9 7.7-2.9 1 0 2 .2 2.8.5a6.1 6.1 0 013.5 3.8c.3.8.5 1.8.5 2.8V471zm-8.6-2.8l1.2-.1a4.7 4.7 0 002.1-1 9 9 0 001-.9v-3c-1.3 0-2.4 0-3.3.2-.8.2-1.5.4-2 .7-.6.2-1 .5-1.2.8a2 2 0 00-.3 1.1c0 .8.2 1.4.7 1.7.4.3 1 .5 1.8.5zm19.4 7.8c0 .4-.3.6-.5.8l-1 .2h-3.2l3.4-7.4-7.4-17h3.9c.3 0 .6 0 .8.2l.4.6 4 9.5a8.5 8.5 0 01.5 2 15.3 15.3 0 01.7-2l3.7-9.5a1.3 1.3 0 011.2-.9h3.6l-10 23.5zm20.2-23.7c1.2 0 2.3.1 3.2.5a7 7 0 014.2 4.3 10 10 0 01.5 4.3l-.1.6-.3.2-.5.1h-11.4c.1 2 .6 3.3 1.5 4.2 1 .9 2 1.3 3.5 1.3a6 6 0 001.9-.2l1.4-.6 1-.5.8-.3.5.1.3.3 1.3 1.6-1.6 1.5a9.2 9.2 0 01-4 1.4l-2 .2c-1.2 0-2.4-.3-3.5-.7a8 8 0 01-2.9-1.9 8.9 8.9 0 01-1.9-3 12 12 0 010-7.9c.3-1 .9-2 1.7-2.9.7-.8 1.6-1.4 2.7-2 1.1-.4 2.4-.6 3.7-.6zm.1 3.1a4 4 0 00-3 1.1c-.7.8-1.2 1.8-1.4 3.1h8.3c0-.5 0-1-.2-1.6a3.4 3.4 0 00-2-2.2c-.4-.3-1-.4-1.7-.4zm11.5 15.6v-18.5h2.6c.4 0 .8.1 1 .3l.3.9.3 2.2c.6-1.1 1.4-2 2.3-2.7.9-.7 1.8-1 3-1 .9 0 1.6.2 2.2.6l-.6 3.4c0 .2 0 .3-.2.4l-.5.2a3 3 0 01-.7-.2l-1.3-.1c-.9 0-1.7.3-2.3.8-.7.5-1.2 1.2-1.7 2.2V471h-4.4zM559.8 467h10.4v4H555v-26h4.8v22zm27.9 4h-2c-.4 0-.8 0-1-.2-.2-.1-.4-.4-.5-.8l-.4-1.3a15.2 15.2 0 01-2.8 2 7 7 0 01-1.5.4 9.1 9.1 0 01-4-.1c-.6-.3-1.2-.6-1.6-1-.5-.4-.9-1-1.1-1.6-.3-.6-.4-1.3-.4-2.1 0-.7.1-1.4.5-2 .4-.8 1-1.4 1.8-2 .9-.5 2-1 3.4-1.3 1.4-.3 3.2-.5 5.3-.6v-1c0-1.3-.3-2.2-.8-2.8-.5-.6-1.3-.9-2.3-.9a5.5 5.5 0 00-3 .8l-1 .6a2 2 0 01-1 .2c-.3 0-.5 0-.8-.2l-.5-.6-.8-1.4c2.1-2 4.7-2.9 7.7-2.9 1 0 2 .2 2.8.5a6.1 6.1 0 013.5 3.8c.3.8.5 1.8.5 2.8V471zm-8.6-2.8l1.2-.1a4.7 4.7 0 002.1-1 9 9 0 001-.9v-3c-1.3 0-2.4 0-3.3.2-.8.2-1.5.4-2 
.7-.6.2-1 .5-1.2.8a2 2 0 00-.3 1.1c0 .8.2 1.4.7 1.7.4.3 1 .5 1.8.5zm19.4 7.8c0 .4-.3.6-.5.8l-1 .2h-3.2l3.4-7.4-7.4-17h3.9c.3 0 .6 0 .8.2l.4.6 4 9.5a8.5 8.5 0 01.5 2 15.3 15.3 0 01.7-2l3.7-9.5a1.3 1.3 0 011.2-.9h3.6l-10 23.5zm20.2-23.7c1.2 0 2.3.1 3.2.5a7 7 0 014.2 4.3 10 10 0 01.5 4.3l-.1.6-.3.2-.5.1h-11.4c.1 2 .6 3.3 1.5 4.2 1 .9 2 1.3 3.5 1.3a6 6 0 001.9-.2l1.4-.6 1-.5.8-.3.5.1.3.3 1.3 1.6-1.6 1.5a9.2 9.2 0 01-4 1.4l-2 .2c-1.2 0-2.4-.3-3.5-.7a8 8 0 01-2.9-1.9 8.9 8.9 0 01-1.9-3 12 12 0 010-7.9c.3-1 .9-2 1.7-2.9.7-.8 1.6-1.4 2.7-2 1.1-.4 2.4-.6 3.7-.6zm.1 3.1a4 4 0 00-3 1.1c-.7.8-1.2 1.8-1.4 3.1h8.3c0-.5 0-1-.2-1.6a3.4 3.4 0 00-2-2.2c-.4-.3-1-.4-1.7-.4zm11.5 15.6v-18.5h2.6c.4 0 .8.1 1 .3l.3.9.3 2.2c.6-1.1 1.4-2 2.3-2.7.9-.7 1.8-1 3-1 .9 0 1.6.2 2.2.6l-.6 3.4c0 .2 0 .3-.2.4l-.5.2a3 3 0 01-.7-.2l-1.3-.1c-.9 0-1.7.3-2.3.8-.7.5-1.2 1.2-1.7 2.2V471h-4.4zM708.3 464.2c-.7 0-1.2-.1-1.8-.3l-1.3-.8c-.4-.4-.7-.8-.9-1.3-.2-.4-.3-1-.3-1.6V455c0-.6.1-1.1.3-1.6a3.6 3.6 0 012.2-2c.6-.2 1.1-.3 1.8-.3.6 0 1.2 0 1.8.3.5.2 1 .4 1.3.8.4.3.7.7.9 1.2.2.5.3 1 .3 1.6v5.3c0 .6-.1 1.1-.3 1.6-.2.5-.5 1-.9 1.3a4 4 0 01-1.4.8c-.5.2-1 .3-1.7.3zm-2-4c0 .6.1 1.1.5 1.5.4.3.9.5 1.5.5a2 2 0 001.5-.5c.4-.4.5-.9.5-1.5V455c0-.6-.1-1-.5-1.4a2 2 0 00-1.5-.6 2 2 0 00-1.5.6c-.4.3-.6.8-.6 1.4v5.3zm10.5-6v6.1c0 1.3.6 2 1.8 2 1.2 0 1.8-.7 1.8-2v-6.1h2.2v6.1c0 1.2-.4 2.2-1 2.9-.8.6-1.8 1-3 1-1.3 0-2.3-.4-3-1-.7-.7-1-1.7-1-2.9v-6.1h2.2zm7.5 0h2.7v-3h2.2v3h3.8v2h-3.8v4.8c0 .3.1.5.3.7.2.2.5.3.8.3h2.5v2h-2.7c-1 0-1.7-.3-2.3-.8a3 3 0 01-.8-2.2v-4.8h-2.7v-2zm10.7 0h2.2v1.7h.2c0-.6.4-1 .8-1.4a3 3 0 011.8-.5c.5 0 1 .1 1.4.3.4.2.8.4 1 .8.4.3.6.7.8 1.2l.2 1.5v2.5a5 5 0 01-.2 1.6c-.2.5-.4.9-.7 1.2-.3.4-.7.6-1 .8l-1.5.3a3 3 0 01-1.8-.5c-.4-.4-.7-.8-.8-1.4h-.2V462.9a18.1 18.1 0 00.1 1v3.1H735v-12.8zm2.3 3.7v2.4c0 .6.2 1 .5 1.4.3.3.8.5 1.4.5.6 0 1-.2 1.4-.5.4-.4.6-.9.6-1.5V458a2 2 0 00-2-2c-.6 0-1 .2-1.4.6a2 2 0 00-.5 1.4zm10.3-3.7v6.1c0 1.3.6 2 1.8 2 1.2 0 1.8-.7 1.8-2v-6.1h2.2v6.1c0 1.2-.3 2.2-1 2.9-.7.6-1.7 1-3 
1s-2.3-.4-3-1c-.7-.7-1-1.7-1-2.9v-6.1h2.2zm7.5 0h2.7v-3h2.2v3h3.8v2H760v4.8c0 .3.1.5.3.7.2.2.5.3.8.3h2.5v2H761c-1 0-1.7-.3-2.3-.8a3 3 0 01-.8-2.2v-4.8h-2.7v-2zm10.1-1v-2h9.4v2h-3.5V464h-2.3v-10.8h-3.6zm10.2 1h2.5l2 5.2a4.8 4.8 0 01.3 1.3v.7h.2a7.2 7.2 0 01.2-1.3c0-.3 0-.5.2-.7l1.8-5.2h2.4l-4.7 12.8h-2.4l1.3-3.6-3.8-9.2zm11 0h2.2v1.7h.1c.2-.6.4-1 .9-1.4a3 3 0 011.8-.5c.5 0 1 .1 1.4.3.4.2.7.4 1 .8.3.3.6.7.7 1.2.2.4.3 1 .3 1.5v2.5a5 5 0 01-.3 1.6c-.1.5-.4.9-.7 1.2-.3.4-.6.6-1 .8-.4.2-1 .3-1.4.3a3 3 0 01-1.8-.5c-.5-.4-.7-.8-.9-1.4h-.1V462.9a18.1 18.1 0 000 1v3.1h-2.2v-12.8zm2.3 3.7v2.4c0 .6.1 1 .5 1.4.3.3.8.5 1.4.5.6 0 1-.2 1.4-.5.3-.4.5-.9.5-1.5V458c0-.6-.2-1-.5-1.4a2 2 0 00-1.4-.6c-.6 0-1 .2-1.4.6a2 2 0 00-.5 1.4zm7.8 0c0-.6 0-1.2.3-1.6.2-.5.5-1 .8-1.2.4-.4.8-.6 1.4-.8a5.2 5.2 0 013.5 0c.5.2 1 .4 1.3.8.4.3.7.7.9 1.2.2.5.3 1 .3 1.6v1.7h-6.3v.7c0 .6.2 1.2.5 1.5.4.4.9.6 1.5.6.5 0 1-.1 1.3-.3.3-.1.6-.3.7-.6h2.2a3 3 0 01-.5 1.1l-1 .8c-.3.3-.7.4-1.2.6a5.7 5.7 0 01-3.2-.1c-.6-.2-1-.4-1.4-.8a4 4 0 01-1.2-2.8v-2.5zm2.2.2h4.1v-.2a2 2 0 00-.5-1.6 2 2 0 00-1.6-.5 2 2 0 00-1.5.5 2 2 0 00-.5 1.5v.3zM386.3 464.2c-.7 0-1.2-.1-1.8-.3l-1.3-.8c-.4-.4-.7-.8-.9-1.3-.2-.4-.3-1-.3-1.6V455c0-.6.1-1.1.3-1.6a3.6 3.6 0 012.2-2c.6-.2 1.1-.3 1.8-.3.6 0 1.2 0 1.8.3.5.2 1 .4 1.3.8.4.3.7.7.9 1.2.2.5.3 1 .3 1.6v5.3c0 .6-.1 1.1-.3 1.6-.2.5-.5 1-.9 1.3a4 4 0 01-1.3.8l-1.8.3zm-2-4c0 .6.1 1.1.5 1.5.4.3.9.5 1.5.5a2 2 0 001.5-.5c.4-.4.5-.9.5-1.5V455c0-.6-.1-1-.5-1.4a2 2 0 00-1.5-.6 2 2 0 00-1.5.6c-.4.3-.6.8-.6 1.4v5.3zm10.5-6v6.1c0 1.3.6 2 1.8 2 1.2 0 1.8-.7 1.8-2v-6.1h2.2v6.1c0 1.2-.3 2.2-1 2.9-.7.6-1.7 1-3 1s-2.3-.4-3-1c-.7-.7-1-1.7-1-2.9v-6.1h2.2zm7.5 0h2.7v-3h2.3v3h3.7v2h-3.7v4.8c0 .3 0 .5.2.7.2.2.5.3.8.3h2.5v2h-2.6c-1 0-1.8-.3-2.3-.8a3 3 0 01-.9-2.2v-4.8h-2.7v-2zm10.8 0h2.2v1.7h.1c.1-.6.4-1 .9-1.4a3 3 0 011.8-.5c.5 0 1 .1 1.4.3.4.2.7.4 1 .8.3.3.6.7.7 1.2.2.4.3 1 .3 1.5v2.5a5 5 0 01-.3 1.6c-.1.5-.4.9-.7 1.2-.3.4-.6.6-1 .8l-1.4.3a3 3 0 01-1.8-.5c-.5-.4-.8-.8-.9-1.4h-.1V462.9a18.1 18.1 0 000 
1v3.1h-2.2v-12.8zm2.2 3.7v2.4c0 .6.2 1 .6 1.4.3.3.8.5 1.3.5.7 0 1.1-.2 1.5-.5.3-.4.5-.9.5-1.5V458c0-.6-.2-1-.5-1.4a2 2 0 00-1.5-.6c-.5 0-1 .2-1.3.6a2 2 0 00-.6 1.4zm10.3-3.7v6.1c0 1.3.6 2 1.8 2 1.2 0 1.8-.7 1.8-2v-6.1h2.3v6.1c0 1.2-.4 2.2-1 2.9-.8.6-1.8 1-3 1-1.3 0-2.3-.4-3-1-.7-.7-1.1-1.7-1.1-2.9v-6.1h2.2zm7.5 0h2.8v-3h2.2v3h3.8v2H438v4.8c0 .3.1.5.3.7.2.2.5.3.8.3h2.5v2H439c-1 0-1.7-.3-2.3-.8a3 3 0 01-.8-2.2v-4.8H433v-2zm10.2-1v-2h9.4v2h-3.5V464h-2.3v-10.8h-3.6zm10.2 1h2.5l2 5.2a4.8 4.8 0 01.3 1.3v.7h.2a7.2 7.2 0 01.2-1.3c0-.3 0-.5.2-.7l1.8-5.2h2.4l-4.7 12.8H456l1.4-3.6-3.9-9.2zm11 0h2.2v1.7h.2c0-.6.4-1 .8-1.4a3 3 0 011.8-.5c.5 0 1 .1 1.4.3.4.2.8.4 1 .8.4.3.6.7.7 1.2.2.4.3 1 .3 1.5v2.5a5 5 0 01-.3 1.6c-.1.5-.3.9-.6 1.2-.3.4-.7.6-1.1.8-.4.2-.9.3-1.4.3a3 3 0 01-1.8-.5c-.4-.4-.7-.8-.8-1.4h-.2V462.9a18.1 18.1 0 000 1v3.1h-2.2v-12.8zm2.3 3.7v2.4c0 .6.1 1 .5 1.4.3.3.8.5 1.4.5.6 0 1-.2 1.4-.5.4-.4.5-.9.5-1.5V458c0-.6-.1-1-.5-1.4a2 2 0 00-1.4-.6c-.6 0-1 .2-1.4.6a2 2 0 00-.5 1.4zm7.8 0c0-.6 0-1.2.3-1.6.2-.5.5-1 .9-1.2.3-.4.8-.6 1.3-.8.5-.2 1.1-.3 1.8-.3.6 0 1.2.1 1.7.3.6.2 1 .4 1.4.8a4 4 0 011.2 2.8v1.7h-6.4v.7c0 .6.2 1.2.6 1.5.3.4.8.6 1.5.6.5 0 .9-.1 1.2-.3.4-.1.6-.3.8-.6h2.2a3 3 0 01-.6 1.1c-.2.3-.5.6-.9.8l-1.2.6a5.7 5.7 0 01-3.3-.1c-.5-.2-1-.4-1.3-.8-.4-.3-.7-.7-1-1.2a4 4 0 01-.2-1.6v-2.5zm2.2.2h4.1v-.2a2 2 0 00-.5-1.6 2 2 0 00-1.5-.5 2 2 0 00-1.5.5 2 2 0 00-.6 1.5v.3zm11.4 1.1h1.8l.6-3.3h-1.8v-1.5h2l.6-3.2h1.7l-.6 3.2h2.5l.6-3.2h1.6l-.6 3.2h1.5v1.5h-1.8l-.5 3.3h1.7v1.5h-2l-.6 3.3h-1.6l.5-3.3h-2.5l-.5 3.3H489l.6-3.3h-1.5v-1.5zm3.4 0h2.5l.6-3.3h-2.5l-.6 3.3zm7.9 2.8h3.4v-7.6a13.4 13.4 0 01.1-1.3l-.2.2a6.3 6.3 0 01-.6.4l-2.5 1.7v-2.2l2.9-2h2.6V462h2.8v2h-8.5v-2zM199.7 534.3c0 .2-.1.3-.3.4h-.3a1 1 0 01-.5-.1 16.8 16.8 0 00-1.5-.8l-1.2-.1-1.1.1-.8.4-.4.7a2 2 0 00-.2.8c0 .3.1.7.3.9.2.3.5.5.9.6l1.1.5a57.6 57.6 0 012.7 1c.5.2.9.5 1.2.8a3.6 3.6 0 011.1 2.8c0 .8 0 1.4-.3 2a4.6 4.6 0 01-2.8 2.8 7.6 7.6 0 01-7.6-1.7l.9-1.4.2-.3h.4c.2 0 .4 0 .6.2a18 18 0 001.8 1l1.4.2c.9 0 1.5-.2 
2-.6.4-.4.7-1 .7-1.7 0-.5-.1-.8-.4-1-.2-.3-.4-.5-.8-.7l-1.2-.5a39 39 0 01-2.6-1c-.5-.1-.9-.4-1.2-.7-.4-.3-.6-.8-.8-1.2a4.7 4.7 0 011-5 5 5 0 011.6-1 7.4 7.4 0 014.8 0c.8.3 1.5.7 2 1.2l-.7 1.4zm5.6 1.8v7.1c0 .7.2 1.2.5 1.6.3.4.8.6 1.4.6.5 0 1-.1 1.4-.3.4-.2.8-.5 1.1-.9v-8.1h2.7v11.2h-1.6c-.4 0-.6-.1-.7-.5l-.2-.9-.7.7a4.4 4.4 0 01-2.8.9 4 4 0 01-1.6-.3c-.5-.2-.9-.5-1.2-.9-.3-.4-.6-.8-.7-1.4-.2-.5-.3-1-.3-1.7v-7.1h2.7zm10 11.2V531h2.8v6.4l1.5-1.1a4.4 4.4 0 013.7 0c.5.2 1 .6 1.3 1 .4.5.6 1.1.8 1.8a9 9 0 010 4.8c-.2.8-.5 1.4-1 2a4.5 4.5 0 01-4.4 1.5 3.3 3.3 0 01-1.5-.8l-.6-.6v.8l-.3.4-.4.1h-1.8zm5.4-9.3c-.6 0-1 .1-1.5.3l-1.1 1v5.1a2.6 2.6 0 002.2 1c.4 0 .8 0 1-.2l1-.7.5-1.2.1-1.7-.1-1.7-.5-1c-.2-.4-.4-.6-.7-.7l-1-.2zm10-7v16.3H228V531h2.8zm12 16.3h-1.9l-.3-.6-.2-.8a9.3 9.3 0 01-1.7 1.2l-1 .3a5.6 5.6 0 01-2.4-.1c-.4-.1-.7-.3-1-.6-.3-.2-.5-.5-.7-1-.2-.3-.2-.8-.2-1.3 0-.4 0-.8.3-1.2.2-.4.6-.8 1.1-1.1.5-.4 1.2-.6 2.1-.9.9-.2 2-.3 3.2-.3v-.7c0-.8-.1-1.3-.5-1.7-.3-.3-.7-.5-1.4-.5a3.4 3.4 0 00-1.8.5l-.6.3-.6.2c-.2 0-.4 0-.5-.2l-.3-.3-.5-.9a6.7 6.7 0 014.7-1.8c.6 0 1.2.2 1.8.4a3.7 3.7 0 012 2.2c.3.6.4 1.2.4 1.8v7.1zm-5.3-1.7h.8a3 3 0 001.2-.7l.6-.5v-1.9l-2 .2c-.5.1-1 .2-1.2.4-.4.1-.6.3-.7.5-.2.2-.2.5-.2.7 0 .5.1.8.4 1 .3.2.6.3 1 .3zm12 4.8l-.4.5-.6.1h-2l2.1-4.5L244 536h2.3l.5.1c.2.1.2.2.3.4l2.4 5.8a5.2 5.2 0 01.4 1.2 9.3 9.3 0 01.4-1.2l2.3-5.8a.8.8 0 01.7-.5h2.2l-6.2 14.3zm12.3-14.5c.7 0 1.4 0 2 .3a4.3 4.3 0 012.5 2.6 6 6 0 01.4 2.7l-.1.3-.2.2H259c0 1.2.4 2 1 2.6a3 3 0 002.1.8c.4 0 .8 0 1.1-.2.4 0 .6-.2.9-.3l.6-.3.5-.2h.3l.2.3.8 1-1 .8a5.7 5.7 0 01-2.4.9h-1.2a6 6 0 01-2.2-.3c-.7-.3-1.3-.6-1.8-1.2-.5-.5-.9-1.1-1.2-1.8a7.3 7.3 0 010-4.8c.2-.7.6-1.3 1-1.8a5 5 0 011.8-1.2c.6-.3 1.4-.4 2.2-.4zm0 2c-.7 0-1.3.1-1.8.6-.4.4-.7 1-.8 1.9h5v-1l-.5-.8a2 2 0 00-.8-.6l-1-.2zm7.1 9.4v-11.2h1.6l.6.1.2.5.2 1.4c.4-.7.8-1.2 1.4-1.7a3 3 0 011.8-.6c.6 0 1 .2 1.4.4l-.4 2v.3l-.4.1h-.4a3 3 0 00-.8-.1c-.6 0-1 .1-1.5.4-.4.3-.7.8-1 1.4v7h-2.7z"/>
-    <path fill="#FFF" fill-rule="nonzero" d="M211.6 502l1.4 4.1a6.1 6.1 0 01.3 1.2v.6h.2l.1-.6a8.2 8.2 0 01.3-1.2l1.2-4h2.7v12.7h-2v-8.2a11.7 11.7 0 01.1-2l.1-1h-.2l-.2 1a21.5 21.5 0 01-.5 1.6l-.9 3h-1.6l-1-3a24.6 24.6 0 01-.7-2.6h-.2a22.5 22.5 0 01.3 3v8.2h-2v-12.7h2.6zm12 13c-.6 0-1.2 0-1.7-.3a4 4 0 01-1.4-.8c-.4-.3-.6-.7-.8-1.2a4 4 0 01-.3-1.6v-2.4c0-.5 0-1 .3-1.5a3.5 3.5 0 012.2-2 5.4 5.4 0 013.5 0c.5.2 1 .4 1.3.8.4.3.7.7.9 1.2.2.4.3 1 .3 1.5v2.4a4 4 0 01-.3 1.6c-.2.5-.5 1-.9 1.2-.3.4-.8.6-1.3.8-.5.2-1.1.3-1.8.3zm-2-3.9c0 .6.2 1.1.5 1.4.4.4.9.6 1.5.6a2 2 0 001.5-.6c.4-.3.6-.8.6-1.4v-2.4c0-.5-.2-1-.6-1.4a2 2 0 00-1.5-.5 2 2 0 00-1.5.5c-.3.4-.5.9-.5 1.4v2.4zm8-2.4c0-.6 0-1.1.2-1.6l.7-1.2a3 3 0 011.1-.7c.4-.2.9-.3 1.4-.3a3 3 0 011.8.5c.4.3.7.8.8 1.4h.2v-.6a5.4 5.4 0 01-.1-1.1v-3h2.2v12.7h-2.1v-1.7h-.2a2 2 0 01-.8 1.4 3 3 0 01-1.8.5c-.5 0-1 0-1.4-.3a3 3 0 01-1-.7c-.4-.4-.6-.8-.8-1.3a5 5 0 01-.2-1.6v-2.4zm2.2 0v2.4c0 .6.2 1 .6 1.4.3.4.8.6 1.4.6a2 2 0 002-2l-.1-2.3c0-.6-.2-1.1-.5-1.5-.4-.3-.8-.5-1.4-.5a2 2 0 00-1.4.5 2 2 0 00-.6 1.5zm8 0c0-.6.1-1 .3-1.6l.9-1.2c.4-.3.8-.6 1.3-.7l1.8-.3c.7 0 1.2.1 1.8.3.5.2 1 .4 1.3.7a4 4 0 011.2 2.8v1.8H242v.6c0 .7.1 1.2.5 1.6.4.3.9.5 1.5.5.5 0 1 0 1.3-.2l.7-.6h2.2a3 3 0 01-.5 1c-.3.4-.6.7-1 .9l-1.2.5a5.7 5.7 0 01-3.2 0c-.6-.2-1-.5-1.4-.8l-.9-1.2a4 4 0 01-.3-1.6v-2.5zm2.3.3h4v-.3a2 2 0 00-.5-1.5 2 2 0 00-1.5-.5 2 2 0 00-1.5.5 2 2 0 00-.5 1.5v.3zm7.4-5v-2h5.4v9.7c0 .3.1.6.3.8.2.2.5.3.9.3h2.9v2h-3.1c-1 0-1.8-.3-2.4-.8a3 3 0 01-.8-2.3V504h-3.2z"/>
-    <path fill="#3D4251" fill-rule="nonzero" d="M323.7 534.3c0 .2-.1.3-.3.4h-.3a1 1 0 01-.5-.1 16.8 16.8 0 00-1.5-.8l-1.2-.1-1.1.1-.8.4-.4.7a2 2 0 00-.2.8c0 .3.1.7.3.9.2.3.5.5.9.6l1.1.5a57.6 57.6 0 012.7 1c.5.2.9.5 1.2.8a3.6 3.6 0 011.1 2.8c0 .8 0 1.4-.3 2a4.6 4.6 0 01-2.8 2.8 7.6 7.6 0 01-7.6-1.7l.9-1.4.2-.3h.4c.2 0 .4 0 .6.2a18 18 0 001.8 1l1.4.2c.9 0 1.5-.2 2-.6.4-.4.7-1 .7-1.7 0-.5-.1-.8-.4-1-.2-.3-.4-.5-.8-.7l-1.2-.5a39 39 0 01-2.6-1c-.5-.1-.9-.4-1.2-.7-.4-.3-.6-.8-.8-1.2a4.7 4.7 0 011-5 5 5 0 011.6-1 7.4 7.4 0 014.8 0c.8.3 1.5.7 2 1.2l-.7 1.4zm5.6 1.8v7.1c0 .7.2 1.2.5 1.6.3.4.8.6 1.4.6.5 0 1-.1 1.4-.3.4-.2.8-.5 1.1-.9v-8.1h2.7v11.2h-1.6c-.4 0-.6-.1-.7-.5l-.2-.9-.7.7a4.4 4.4 0 01-2.8.9 4 4 0 01-1.6-.3c-.5-.2-.9-.5-1.2-.9-.3-.4-.6-.8-.7-1.4-.2-.5-.3-1-.3-1.7v-7.1h2.7zm10 11.2V531h2.8v6.4l1.5-1.1a4.4 4.4 0 013.7 0c.5.2 1 .6 1.3 1 .4.5.6 1.1.8 1.8a9 9 0 010 4.8c-.2.8-.5 1.4-1 2a4.5 4.5 0 01-4.4 1.5 3.3 3.3 0 01-1.5-.8l-.6-.6v.8l-.3.4-.4.1h-1.8zm5.4-9.3c-.6 0-1 .1-1.5.3l-1.1 1v5.1a2.6 2.6 0 002.2 1c.4 0 .8 0 1-.2l1-.7.5-1.2.1-1.7-.1-1.7-.5-1c-.2-.4-.4-.6-.7-.7l-1-.2zm10-7v16.3H352V531h2.8zm12 16.3h-1.9l-.3-.6-.2-.8a9.3 9.3 0 01-1.7 1.2l-1 .3a5.6 5.6 0 01-2.4-.1c-.4-.1-.7-.3-1-.6-.3-.2-.5-.5-.7-1-.2-.3-.2-.8-.2-1.3 0-.4 0-.8.3-1.2.2-.4.6-.8 1.1-1.1.5-.4 1.2-.6 2.1-.9.9-.2 2-.3 3.2-.3v-.7c0-.8-.1-1.3-.5-1.7-.3-.3-.7-.5-1.4-.5a3.4 3.4 0 00-1.8.5l-.6.3-.6.2c-.2 0-.4 0-.5-.2l-.3-.3-.5-.9a6.7 6.7 0 014.7-1.8c.6 0 1.2.2 1.8.4a3.7 3.7 0 012 2.2c.3.6.4 1.2.4 1.8v7.1zm-5.3-1.7h.8a3 3 0 001.2-.7l.6-.5v-1.9l-2 .2c-.5.1-1 .2-1.2.4-.4.1-.6.3-.7.5-.2.2-.2.5-.2.7 0 .5.1.8.4 1 .3.2.6.3 1 .3zm12 4.8l-.4.5-.6.1h-2l2.1-4.5L368 536h2.3l.5.1c.2.1.2.2.3.4l2.4 5.8a5.2 5.2 0 01.4 1.2 9.3 9.3 0 01.4-1.2l2.3-5.8a.8.8 0 01.7-.5h2.2l-6.2 14.3zm12.3-14.5c.7 0 1.4 0 2 .3a4.3 4.3 0 012.5 2.6 6 6 0 01.4 2.7l-.1.3-.2.2H383c0 1.2.4 2 1 2.6a3 3 0 002.1.8c.4 0 .8 0 1.1-.2.4 0 .6-.2.9-.3l.6-.3.5-.2h.3l.2.3.8 1-1 .8a5.7 5.7 0 01-2.4.9h-1.2a6 6 0 01-2.2-.3c-.7-.3-1.3-.6-1.8-1.2-.5-.5-.9-1.1-1.2-1.8a7.3 
7.3 0 010-4.8c.2-.7.6-1.3 1-1.8a5 5 0 011.8-1.2c.6-.3 1.4-.4 2.2-.4zm0 2c-.7 0-1.3.1-1.8.6-.4.4-.7 1-.8 1.9h5v-1l-.5-.8a2 2 0 00-.8-.6l-1-.2zm7.1 9.4v-11.2h1.6l.6.1.2.5.2 1.4c.4-.7.8-1.2 1.4-1.7a3 3 0 011.8-.6c.6 0 1 .2 1.4.4l-.4 2v.3l-.4.1h-.4a3 3 0 00-.8-.1c-.6 0-1 .1-1.5.4-.4.3-.7.8-1 1.4v7h-2.7z"/>
-    <path fill="#FFF" fill-rule="nonzero" d="M335.6 502l1.4 4.1a6.1 6.1 0 01.3 1.2v.6h.2l.1-.6a8.2 8.2 0 01.3-1.2l1.2-4h2.7v12.7h-2v-8.2a11.7 11.7 0 01.1-2l.1-1h-.2l-.2 1a21.5 21.5 0 01-.5 1.6l-.9 3h-1.6l-1-3a24.6 24.6 0 01-.7-2.6h-.2a22.5 22.5 0 01.3 3v8.2h-2v-12.7h2.6zm12 13c-.6 0-1.2 0-1.7-.3a4 4 0 01-1.4-.8c-.4-.3-.6-.7-.8-1.2a4 4 0 01-.3-1.6v-2.4c0-.5 0-1 .3-1.5a3.5 3.5 0 012.2-2 5.4 5.4 0 013.5 0c.5.2 1 .4 1.3.8.4.3.7.7.9 1.2.2.4.3 1 .3 1.5v2.4a4 4 0 01-.3 1.6c-.2.5-.5 1-.9 1.2-.3.4-.8.6-1.3.8-.5.2-1.1.3-1.8.3zm-2-3.9c0 .6.2 1.1.5 1.4.4.4.9.6 1.5.6a2 2 0 001.5-.6c.4-.3.6-.8.6-1.4v-2.4c0-.5-.2-1-.6-1.4a2 2 0 00-1.5-.5 2 2 0 00-1.5.5c-.3.4-.5.9-.5 1.4v2.4zm8-2.4c0-.6 0-1.1.2-1.6l.7-1.2a3 3 0 011.1-.7c.4-.2.9-.3 1.4-.3a3 3 0 011.8.5c.4.3.7.8.8 1.4h.2v-.6a5.4 5.4 0 01-.1-1.1v-3h2.2v12.7h-2.1v-1.7h-.2a2 2 0 01-.8 1.4 3 3 0 01-1.8.5c-.5 0-1 0-1.4-.3a3 3 0 01-1-.7c-.4-.4-.6-.8-.8-1.3a5 5 0 01-.2-1.6v-2.4zm2.2 0v2.4c0 .6.2 1 .6 1.4.3.4.8.6 1.4.6a2 2 0 002-2l-.1-2.3c0-.6-.2-1.1-.5-1.5-.4-.3-.8-.5-1.4-.5a2 2 0 00-1.4.5 2 2 0 00-.6 1.5zm8 0c0-.6.1-1 .3-1.6l.9-1.2c.4-.3.8-.6 1.3-.7l1.8-.3c.7 0 1.2.1 1.8.3.5.2 1 .4 1.3.7a4 4 0 011.2 2.8v1.8H366v.6c0 .7.1 1.2.5 1.6.4.3.9.5 1.5.5.5 0 1 0 1.3-.2l.7-.6h2.2a3 3 0 01-.5 1c-.3.4-.6.7-1 .9l-1.2.5a5.7 5.7 0 01-3.2 0c-.6-.2-1-.5-1.4-.8l-.9-1.2a4 4 0 01-.3-1.6v-2.5zm2.3.3h4v-.3a2 2 0 00-.5-1.5 2 2 0 00-1.5-.5 2 2 0 00-1.5.5 2 2 0 00-.5 1.5v.3zm7.4-5v-2h5.4v9.7c0 .3.1.6.3.8.2.2.5.3.9.3h2.9v2h-3.1c-1 0-1.8-.3-2.4-.8a3 3 0 01-.8-2.3V504h-3.2z"/>
-    <path fill="#3D4251" fill-rule="nonzero" d="M565.7 534.3c0 .2-.1.3-.3.4h-.3a1 1 0 01-.5-.1 16.8 16.8 0 00-1.5-.8l-1.2-.1-1.1.1-.8.4-.4.7a2 2 0 00-.2.8c0 .3.1.7.3.9.2.3.5.5.9.6l1.1.5a57.6 57.6 0 012.7 1c.5.2.9.5 1.2.8a3.6 3.6 0 011.1 2.8c0 .8 0 1.4-.3 2a4.6 4.6 0 01-2.8 2.8 7.6 7.6 0 01-7.6-1.7l.9-1.4.2-.3h.4c.2 0 .4 0 .6.2a18 18 0 001.8 1l1.4.2c.9 0 1.5-.2 2-.6.4-.4.7-1 .7-1.7 0-.5-.1-.8-.4-1-.2-.3-.4-.5-.8-.7l-1.2-.5a39 39 0 01-2.6-1c-.5-.1-.9-.4-1.2-.7-.4-.3-.6-.8-.8-1.2a4.7 4.7 0 011-5 5 5 0 011.6-1 7.4 7.4 0 014.8 0c.8.3 1.5.7 2 1.2l-.7 1.4zm5.6 1.8v7.1c0 .7.2 1.2.5 1.6.3.4.8.6 1.4.6.5 0 1-.1 1.4-.3.4-.2.8-.5 1.1-.9v-8.1h2.7v11.2h-1.6c-.4 0-.6-.1-.7-.5l-.2-.9-.7.7a4.4 4.4 0 01-2.8.9 4 4 0 01-1.6-.3c-.5-.2-.9-.5-1.2-.9-.3-.4-.6-.8-.7-1.4-.2-.5-.3-1-.3-1.7v-7.1h2.7zm10 11.2V531h2.8v6.4l1.5-1.1a4.4 4.4 0 013.7 0c.5.2 1 .6 1.3 1 .4.5.6 1.1.8 1.8a9 9 0 010 4.8c-.2.8-.5 1.4-1 2a4.5 4.5 0 01-4.4 1.5 3.3 3.3 0 01-1.5-.8l-.6-.6v.8l-.3.4-.4.1h-1.8zm5.4-9.3c-.6 0-1 .1-1.5.3l-1.1 1v5.1a2.6 2.6 0 002.2 1c.4 0 .8 0 1-.2l1-.7.5-1.2.1-1.7-.1-1.7-.5-1c-.2-.4-.4-.6-.7-.7l-1-.2zm10-7v16.3H594V531h2.8zm12 16.3h-1.9l-.3-.6-.2-.8a9.3 9.3 0 01-1.7 1.2l-1 .3a5.6 5.6 0 01-2.4-.1c-.4-.1-.7-.3-1-.6-.3-.2-.5-.5-.7-1-.2-.3-.2-.8-.2-1.3 0-.4 0-.8.3-1.2.2-.4.6-.8 1.1-1.1.5-.4 1.2-.6 2.1-.9.9-.2 2-.3 3.2-.3v-.7c0-.8-.1-1.3-.5-1.7-.3-.3-.7-.5-1.4-.5a3.4 3.4 0 00-1.8.5l-.6.3-.6.2c-.2 0-.4 0-.5-.2l-.3-.3-.5-.9a6.7 6.7 0 014.7-1.8c.6 0 1.2.2 1.8.4a3.7 3.7 0 012 2.2c.3.6.4 1.2.4 1.8v7.1zm-5.3-1.7h.8a3 3 0 001.2-.7l.6-.5v-1.9l-2 .2c-.5.1-1 .2-1.2.4-.4.1-.6.3-.7.5-.2.2-.2.5-.2.7 0 .5.1.8.4 1 .3.2.6.3 1 .3zm12 4.8l-.4.5-.6.1h-2l2.1-4.5L610 536h2.3l.5.1c.2.1.2.2.3.4l2.4 5.8a5.2 5.2 0 01.4 1.2 9.3 9.3 0 01.4-1.2l2.3-5.8a.8.8 0 01.7-.5h2.2l-6.2 14.3zm12.3-14.5c.7 0 1.4 0 2 .3a4.3 4.3 0 012.5 2.6 6 6 0 01.4 2.7l-.1.3-.2.2H625c0 1.2.4 2 1 2.6a3 3 0 002.1.8c.4 0 .8 0 1.1-.2.4 0 .6-.2.9-.3l.6-.3.5-.2h.3l.2.3.8 1-1 .8a5.7 5.7 0 01-2.4.9h-1.2a6 6 0 01-2.2-.3c-.7-.3-1.3-.6-1.8-1.2-.5-.5-.9-1.1-1.2-1.8a7.3 
7.3 0 010-4.8c.2-.7.6-1.3 1-1.8a5 5 0 011.8-1.2c.6-.3 1.4-.4 2.2-.4zm0 2c-.7 0-1.3.1-1.8.6-.4.4-.7 1-.8 1.9h5v-1l-.5-.8a2 2 0 00-.8-.6l-1-.2zm7.1 9.4v-11.2h1.6l.6.1.2.5.2 1.4c.4-.7.8-1.2 1.4-1.7a3 3 0 011.8-.6c.6 0 1 .2 1.4.4l-.4 2v.3l-.4.1h-.4a3 3 0 00-.8-.1c-.6 0-1 .1-1.5.4-.4.3-.7.8-1 1.4v7h-2.7z"/>
-    <path fill="#FFF" fill-rule="nonzero" d="M577.6 502l1.4 4.1a6.1 6.1 0 01.3 1.2v.6h.2l.1-.6a8.2 8.2 0 01.3-1.2l1.2-4h2.7v12.7h-2v-8.2a11.7 11.7 0 01.1-2l.1-1h-.2l-.2 1a21.5 21.5 0 01-.5 1.6l-.9 3h-1.6l-1-3a24.6 24.6 0 01-.7-2.6h-.2a22.5 22.5 0 01.3 3v8.2h-2v-12.7h2.6zm12 13c-.6 0-1.2 0-1.7-.3a4 4 0 01-1.4-.8c-.4-.3-.6-.7-.8-1.2a4 4 0 01-.3-1.6v-2.4c0-.5 0-1 .3-1.5a3.5 3.5 0 012.2-2 5.4 5.4 0 013.5 0c.5.2 1 .4 1.3.8.4.3.7.7.9 1.2.2.4.3 1 .3 1.5v2.4a4 4 0 01-.3 1.6c-.2.5-.5 1-.9 1.2-.3.4-.8.6-1.3.8-.5.2-1.1.3-1.8.3zm-2-3.9c0 .6.2 1.1.5 1.4.4.4.9.6 1.5.6a2 2 0 001.5-.6c.4-.3.6-.8.6-1.4v-2.4c0-.5-.2-1-.6-1.4a2 2 0 00-1.5-.5 2 2 0 00-1.5.5c-.3.4-.5.9-.5 1.4v2.4zm8-2.4c0-.6 0-1.1.2-1.6l.7-1.2a3 3 0 011.1-.7c.4-.2.9-.3 1.4-.3a3 3 0 011.8.5c.4.3.7.8.8 1.4h.2v-.6a5.4 5.4 0 01-.1-1.1v-3h2.2v12.7h-2.1v-1.7h-.2a2 2 0 01-.8 1.4 3 3 0 01-1.8.5c-.5 0-1 0-1.4-.3a3 3 0 01-1-.7c-.4-.4-.6-.8-.8-1.3a5 5 0 01-.2-1.6v-2.4zm2.2 0v2.4c0 .6.2 1 .6 1.4.3.4.8.6 1.4.6a2 2 0 002-2l-.1-2.3c0-.6-.2-1.1-.5-1.5-.4-.3-.8-.5-1.4-.5a2 2 0 00-1.4.5 2 2 0 00-.6 1.5zm8 0c0-.6.1-1 .3-1.6l.9-1.2c.4-.3.8-.6 1.3-.7l1.8-.3c.7 0 1.2.1 1.8.3.5.2 1 .4 1.3.7a4 4 0 011.2 2.8v1.8H608v.6c0 .7.1 1.2.5 1.6.4.3.9.5 1.5.5.5 0 1 0 1.3-.2l.7-.6h2.2a3 3 0 01-.5 1c-.3.4-.6.7-1 .9l-1.2.5a5.7 5.7 0 01-3.2 0c-.6-.2-1-.5-1.4-.8l-.9-1.2a4 4 0 01-.3-1.6v-2.5zm2.3.3h4v-.3a2 2 0 00-.5-1.5 2 2 0 00-1.5-.5 2 2 0 00-1.5.5 2 2 0 00-.5 1.5v.3zm7.4-5v-2h5.4v9.7c0 .3.1.6.3.8.2.2.5.3.9.3h2.9v2h-3.1c-1 0-1.8-.3-2.4-.8a3 3 0 01-.8-2.3V504h-3.2z"/>
-    <g fill-rule="nonzero">
-      <path fill="#3D4251" d="M109.3 451v10.9h5.6v2H107v-13h2.3zm7.7 10.9h3.6V456h-3.1v-2h5.2v7.8h2.9v2H117v-2zm3-10.5c0-.4 0-.7.3-.9.2-.2.5-.3.9-.3h.5c.3 0 .6 0 .9.3.2.2.3.5.3.9 0 .3 0 .6-.3.8-.3.2-.6.3-1 .3h-.4c-.4 0-.7 0-1-.3-.2-.2-.3-.5-.3-.8zm6.9 10h2.3c0 .3.2.6.5.7.3.2.7.3 1.1.3h.8c.5 0 1-.1 1.3-.3.3-.3.4-.6.4-1 0-.6-.5-1-1.5-1.2l-1.4-.1c-1.1-.2-2-.5-2.5-1-.6-.4-.8-1.1-.8-2 0-1 .3-1.7 1-2.2.6-.5 1.6-.7 2.8-.7h.7c1.1 0 2 .2 2.7.7.6.4 1 1 1.1 1.9h-2.3a1 1 0 00-.5-.7 2 2 0 00-1-.2h-.7c-1 0-1.6.4-1.6 1.1 0 .6.4 1 1.3 1.1l1.5.2c1.2.2 2 .5 2.6 1 .6.5.8 1.2.8 2 0 1-.3 1.8-1 2.3-.7.6-1.7.8-3 .8h-.7c-1.1 0-2-.2-2.7-.7-.7-.5-1.1-1.1-1.2-2zm10-7.4h2.8v-3h2.2v3h3.8v2h-3.8v4.9c0 .3.1.5.3.7.2.2.4.3.8.3h2.5v2h-2.7c-1 0-1.7-.2-2.3-.8a3 3 0 01-.8-2.2V456h-2.8v-2z"/>
-      <path fill="#67708A" d="M149.8 465.9V449h5v2.1H152v12.7h2.8v2.1z"/>
-      <path fill="#3D4251" d="M158 451h4c.7 0 1.3 0 1.8.3.6.2 1 .5 1.4.8l1 1.3.2 1.7v4.7c0 .6 0 1.2-.3 1.7a3.7 3.7 0 01-2.3 2.1c-.5.2-1.1.3-1.8.3h-4v-13zm2.3 10.9h1.7c.7 0 1.2-.2 1.6-.6.4-.4.6-.9.6-1.5v-4.7a2 2 0 00-.6-1.5c-.4-.4-1-.6-1.6-.6h-1.7v8.9zm12.1 2.2c-.6 0-1.2 0-1.8-.3a4 4 0 01-1.3-.8c-.4-.3-.7-.7-.9-1.2a4 4 0 01-.3-1.6v-2.4c0-.6.1-1.2.3-1.6a3.5 3.5 0 012.2-2l1.8-.3c.7 0 1.3 0 1.8.2l1.3.8 1 1.3.2 1.6v2.4c0 .6 0 1.1-.3 1.6a3.5 3.5 0 01-2.2 2c-.5.2-1.1.3-1.8.3zm-2-3.9c0 .6.1 1 .5 1.4.4.4.9.5 1.5.5a2 2 0 001.5-.5c.4-.3.6-.8.6-1.4v-2.4c0-.6-.2-1.1-.6-1.5a2 2 0 00-1.5-.5 2 2 0 00-1.5.5c-.4.4-.5.9-.5 1.5v2.4zm12.4 4c-.6 0-1.2-.2-1.7-.3-.6-.2-1-.5-1.4-.8l-.9-1.2a4 4 0 01-.3-1.6v-2.6c0-.6.1-1 .3-1.6.3-.4.5-.8 1-1.2a4 4 0 011.3-.8 5.3 5.3 0 013.3 0 4 4 0 012.7 3.3H185c0-.5-.3-.9-.7-1.2-.4-.2-.8-.4-1.4-.4-.6 0-1.1.2-1.5.5-.3.4-.5.8-.5 1.4v2.6c0 .6.2 1 .5 1.4.4.3.9.5 1.5.5s1-.2 1.4-.5c.4-.2.6-.6.7-1h2.2c0 .5-.2 1-.4 1.4a4 4 0 01-2.3 1.8 5 5 0 01-1.6.2z"/>
-      <path fill="#67708A" d="M190 465.9v-2.1h2.8v-12.7H190V449h5.1v16.9z"/>
-    </g>
-    <path fill="#FFF" fill-rule="nonzero" d="M272.6 399l1.4 4.1a6.1 6.1 0 01.3 1.2v.6h.2l.1-.6a8.2 8.2 0 01.3-1.2l1.2-4h2.7v12.7h-2v-8.2a11.7 11.7 0 01.1-2l.1-1h-.2l-.2 1a21.5 21.5 0 01-.5 1.6l-.9 3h-1.6l-1-3a24.6 24.6 0 01-.7-2.6h-.2a22.5 22.5 0 01.3 3v8.2h-2v-12.7h2.6zm12 13c-.6 0-1.2 0-1.7-.3a4 4 0 01-1.4-.8c-.4-.3-.6-.7-.8-1.2a4 4 0 01-.3-1.6v-2.4c0-.5 0-1 .3-1.5a3.5 3.5 0 012.2-2 5.4 5.4 0 013.5 0c.5.2 1 .4 1.3.8.4.3.7.7.9 1.2.2.4.3 1 .3 1.5v2.4a4 4 0 01-.3 1.6c-.2.5-.5 1-.9 1.2-.3.4-.8.6-1.3.8-.5.2-1.1.3-1.8.3zm-2-3.9c0 .6.2 1.1.5 1.4.4.4.9.6 1.5.6a2 2 0 001.5-.6c.4-.3.6-.8.6-1.4v-2.4c0-.5-.2-1-.6-1.4a2 2 0 00-1.5-.5 2 2 0 00-1.5.5c-.3.4-.5.9-.5 1.4v2.4zm8-2.4c0-.6 0-1.1.2-1.6l.7-1.2a3 3 0 011.1-.7c.4-.2.9-.3 1.4-.3a3 3 0 011.8.5c.4.3.7.8.8 1.4h.2v-.6a5.4 5.4 0 01-.1-1.1v-3h2.2v12.7h-2.1v-1.7h-.2a2 2 0 01-.8 1.4 3 3 0 01-1.8.5c-.5 0-1 0-1.4-.3a3 3 0 01-1-.7c-.4-.4-.6-.8-.8-1.3a5 5 0 01-.2-1.6v-2.4zm2.2 0v2.4c0 .6.2 1 .6 1.4.3.4.8.6 1.4.6a2 2 0 002-2l-.1-2.3c0-.6-.2-1.1-.5-1.5-.4-.3-.8-.5-1.4-.5a2 2 0 00-1.4.5 2 2 0 00-.6 1.5zm8 0c0-.6.1-1 .3-1.6l.9-1.2c.4-.3.8-.6 1.3-.7l1.8-.3c.7 0 1.2.1 1.8.3.5.2 1 .4 1.3.7a4 4 0 011.2 2.8v1.8H303v.6c0 .7.1 1.2.5 1.6.4.3.9.5 1.5.5.5 0 1 0 1.3-.2l.7-.6h2.2a3 3 0 01-.5 1c-.3.4-.6.7-1 .9l-1.2.5a5.7 5.7 0 01-3.2 0c-.6-.2-1-.5-1.4-.8l-.9-1.2a4 4 0 01-.3-1.6v-2.5zm2.3.3h4v-.3a2 2 0 00-.5-1.5 2 2 0 00-1.5-.5 2 2 0 00-1.5.5 2 2 0 00-.5 1.5v.3zm7.4-5v-2h5.4v9.7c0 .3.1.6.3.8.2.2.5.3.9.3h2.9v2h-3.1c-1 0-1.8-.3-2.4-.8a3 3 0 01-.8-2.3V401h-3.2zM580.6 400l1.4 4.1a6.1 6.1 0 01.3 1.2v.6h.2l.1-.6a8.2 8.2 0 01.3-1.2l1.2-4h2.7v12.7h-2v-8.2a11.7 11.7 0 01.1-2l.1-1h-.2l-.2 1a21.5 21.5 0 01-.5 1.6l-.9 3h-1.6l-1-3a24.6 24.6 0 01-.7-2.6h-.2a22.5 22.5 0 01.3 3v8.2h-2v-12.7h2.6zm12 13c-.6 0-1.2 0-1.7-.3a4 4 0 01-1.4-.8c-.4-.3-.6-.7-.8-1.2a4 4 0 01-.3-1.6v-2.4c0-.5 0-1 .3-1.5a3.5 3.5 0 012.2-2 5.4 5.4 0 013.5 0c.5.2 1 .4 1.3.8.4.3.7.7.9 1.2.2.4.3 1 .3 1.5v2.4a4 4 0 01-.3 1.6c-.2.5-.5 1-.9 1.2-.3.4-.8.6-1.3.8-.5.2-1.1.3-1.8.3zm-2-3.9c0 .6.2 1.1.5 
1.4.4.4.9.6 1.5.6a2 2 0 001.5-.6c.4-.3.6-.8.6-1.4v-2.4c0-.5-.2-1-.6-1.4a2 2 0 00-1.5-.5 2 2 0 00-1.5.5c-.3.4-.5.9-.5 1.4v2.4zm8-2.4c0-.6 0-1.1.2-1.6l.7-1.2a3 3 0 011.1-.7c.4-.2.9-.3 1.4-.3a3 3 0 011.8.5c.4.3.7.8.8 1.4h.2v-.6a5.4 5.4 0 01-.1-1.1v-3h2.2v12.7h-2.1v-1.7h-.2a2 2 0 01-.8 1.4 3 3 0 01-1.8.5c-.5 0-1 0-1.4-.3a3 3 0 01-1-.7c-.4-.4-.6-.8-.8-1.3a5 5 0 01-.2-1.6v-2.4zm2.2 0v2.4c0 .6.2 1 .6 1.4.3.4.8.6 1.4.6a2 2 0 002-2l-.1-2.3c0-.6-.2-1.1-.5-1.5-.4-.3-.8-.5-1.4-.5a2 2 0 00-1.4.5 2 2 0 00-.6 1.5zm8 0c0-.6.1-1 .3-1.6l.9-1.2c.4-.3.8-.6 1.3-.7l1.8-.3c.7 0 1.2.1 1.8.3.5.2 1 .4 1.3.7a4 4 0 011.2 2.8v1.8H611v.6c0 .7.1 1.2.5 1.6.4.3.9.5 1.5.5.5 0 1 0 1.3-.2l.7-.6h2.2a3 3 0 01-.5 1c-.3.4-.6.7-1 .9l-1.2.5a5.7 5.7 0 01-3.2 0c-.6-.2-1-.5-1.4-.8l-.9-1.2a4 4 0 01-.3-1.6v-2.5zm2.3.3h4v-.3a2 2 0 00-.5-1.5 2 2 0 00-1.5-.5 2 2 0 00-1.5.5 2 2 0 00-.5 1.5v.3zm7.4-5v-2h5.4v9.7c0 .3.1.6.3.8.2.2.5.3.9.3h2.9v2h-3.1c-1 0-1.8-.3-2.4-.8a3 3 0 01-.8-2.3V402h-3.2z"/>
-    <path fill="#3D4251" fill-rule="nonzero" d="M313 193v-18h2l.5.2.3.5.2 1.2c.6-.6 1.2-1.1 1.9-1.5a5 5 0 014.4-.1c.6.3 1.1.7 1.6 1.3.4.6.7 1.3 1 2.2a11 11 0 010 5.7 7 7 0 01-1.2 2.3c-.5.6-1.1 1-1.8 1.4-.7.4-1.5.6-2.4.6-.7 0-1.4-.1-1.9-.4-.5-.2-1-.5-1.4-1v5.6H313zm6.2-15.6c-.7 0-1.2.1-1.7.4a5 5 0 00-1.3 1.2v6a3 3 0 002.6 1.3c.4 0 .9 0 1.3-.3.4-.2.7-.4 1-.8l.6-1.4.2-2c0-.9 0-1.5-.2-2a4 4 0 00-.5-1.4c-.2-.4-.5-.6-.9-.8l-1-.2zm8.4 11.2V175h2l.6.1.3.7.2 1.6c.4-.8 1-1.5 1.6-2 .7-.4 1.4-.7 2.2-.7.6 0 1.2.2 1.6.5l-.4 2.4c0 .2-.1.3-.2.3l-.3.1h-.6l-.9-.1c-.6 0-1.2.1-1.7.5l-1.2 1.7v8.4h-3.2zm16.1-13.7c.8 0 1.6 0 2.3.4a5 5 0 013 3 7.4 7.4 0 01.4 3.3l-.1.4-.2.2h-8.6c.1 1.4.5 2.4 1.1 3 .6.7 1.5 1 2.5 1l1.4-.1 1-.4.7-.4c.2-.2.4-.2.6-.2a.7.7 0 01.6.3l1 1.2-1.3 1a6.6 6.6 0 01-2.8 1l-1.4.2a7 7 0 01-2.6-.5c-.8-.3-1.5-.8-2-1.4-.7-.6-1.1-1.3-1.5-2.2a9 9 0 010-5.8 6 6 0 013.3-3.6c.7-.3 1.6-.4 2.6-.4zm0 2.3a3 3 0 00-2.1.8 4 4 0 00-1 2.3h6c0-.5 0-.9-.2-1.2l-.5-1c-.2-.3-.5-.5-.9-.7a3 3 0 00-1.2-.2zm17.7 11.4c-.4 0-.7-.2-.8-.6l-.3-1.3a8 8 0 01-.9.8 5.5 5.5 0 01-2 1.1l-1.4.2a4.3 4.3 0 01-3.5-1.8c-.4-.6-.8-1.3-1-2.2a11.1 11.1 0 010-5.7c.3-.9.7-1.6 1.2-2.3a5.1 5.1 0 014.2-2 4.5 4.5 0 013.2 1.3V169h3.2v19.6h-2zm-4.3-2.4c.7 0 1.3-.1 1.7-.4a5 5 0 001.3-1.2v-6c-.3-.5-.7-.8-1.1-1a3.4 3.4 0 00-2.7 0c-.4.2-.8.4-1 .8l-.7 1.4-.2 2c0 .9 0 1.5.2 2 .1.6.3 1 .6 1.4.2.4.5.6.8.8l1.1.2zm13.1-11.1v13.5H367V175h3.2zm.5-4c0 .3 0 .6-.2.8a2.2 2.2 0 01-1 1.1 2 2 0 01-2.3-.4l-.4-.7a2 2 0 011.9-2.9c.2 0 .5 0 .7.2a2.1 2.1 0 011.1 1.1l.2.8zM383 178l-.2.3-.4.1-.5-.1a15.4 15.4 0 00-1.3-.7l-1.1-.2c-.6 0-1.1.1-1.5.3-.4.2-.8.5-1 1a4 4 0 00-.7 1.3l-.2 1.9c0 .7 0 1.4.2 2 .2.5.4 1 .7 1.3a2.8 2.8 0 002.4 1.2 3.3 3.3 0 002-.6l.6-.4c.1-.2.3-.2.5-.2.3 0 .5 0 .6.3l1 1.2a5.8 5.8 0 01-2.5 1.7l-1.4.4h-1.4c-.8 0-1.6 0-2.3-.4-.8-.3-1.4-.8-2-1.3-.5-.6-.9-1.4-1.2-2.2-.3-1-.5-1.9-.5-3 0-1 .2-2 .4-2.8.3-.8.7-1.5 1.3-2.2.5-.6 1.2-1 2-1.4.8-.4 1.7-.5 2.8-.5 1 0 1.8.1 2.6.4.7.4 1.4.8 2 1.4l-.9 1.2zm7.4 10.9c-1.1 
0-2-.3-2.6-1-.7-.7-1-1.6-1-2.7v-7.6h-1.3c-.2 0-.3 0-.5-.2l-.1-.5v-1.3l2-.3.8-3.7c0-.2 0-.3.2-.4l.5-.2h1.6v4.3h3.5v2.3h-3.5v7.3c0 .5.1.8.3 1 .2.3.5.4.9.4l.5-.1a2.7 2.7 0 00.6-.3h.4l.2.2 1 1.6c-.5.4-1 .7-1.6.9-.6.2-1.2.3-1.9.3zM478.1 177.5l-.3.3h-.8a34.3 34.3 0 00-1.5-.6l-1-.2c-.7 0-1.2.2-1.5.4-.4.3-.6.7-.6 1 0 .4.1.6.3.8.2.2.4.4.8.5l1 .4a34.4 34.4 0 012.4.9c.4.1.7.4 1 .6.4.3.6.6.8 1 .2.4.3.9.3 1.4 0 .6-.1 1.2-.4 1.8-.2.5-.6 1-1 1.4-.5.3-1 .7-1.7.9a7.3 7.3 0 01-3.6.2 8 8 0 01-2.4-.9l-.9-.6.7-1.2.4-.4.5-.1c.2 0 .4 0 .5.2a12.5 12.5 0 001.5.7l1.2.2c.4 0 .7 0 1-.2l.6-.3c.2-.2.4-.3.4-.5l.2-.6c0-.4-.1-.6-.3-.8l-.8-.5-1-.4a35 35 0 01-2.4-.9c-.4-.2-.8-.4-1-.7-.4-.3-.6-.6-.8-1l-.3-1.5a3.8 3.8 0 011.3-2.9l1.5-.9c.7-.2 1.4-.3 2.2-.3 1 0 1.8.2 2.5.5.8.3 1.4.7 2 1.2l-.8 1.1zm9-2.8c.8 0 1.6.1 2.3.4a5 5 0 013 3 7.1 7.1 0 01.4 3.2l-.1.4-.2.2a1 1 0 01-.4 0h-8.3c.1 1.4.5 2.4 1.2 3 .6.7 1.4 1 2.5 1 .5 0 1 0 1.4-.2a23 23 0 001.7-.8l.6-.1h.3l.3.2 1 1.2c-.4.4-.8.8-1.3 1a6.7 6.7 0 01-2.8 1c-.5.2-1 .2-1.5.2-1 0-1.8-.2-2.6-.5a5.8 5.8 0 01-3.4-3.6 8.6 8.6 0 01-.1-5.6c.3-.8.7-1.5 1.3-2a6 6 0 012-1.5c.8-.3 1.7-.5 2.7-.5zm0 2.3a3 3 0 00-2.2.8c-.5.5-.8 1.3-1 2.2h6l-.1-1.1-.5-1-1-.7a3 3 0 00-1.2-.2zm12.7 11.4c-1.1 0-2-.3-2.6-1-.7-.6-1-1.5-1-2.7v-7.4H495c-.2 0-.3 0-.5-.2 0 0-.2-.2-.2-.5v-1.2l2.2-.4.7-3.6c0-.2 0-.3.2-.4l.5-.1h1.6v4.1h3.6v2.3h-3.6v7.2c0 .4.2.7.4 1 .2.2.4.3.8.3h.5a2.8 2.8 0 00.6-.3l.2-.1h.3l.2.3 1 1.5a6 6 0 01-3.5 1.2zm14.3 1.5v2.1h-10.3v-2h10.3zm12.1-1.7h-1.4l-.7-.1c-.2-.1-.3-.3-.4-.6l-.3-1a11 11 0 01-2 1.4l-1.1.4a6.7 6.7 0 01-2.9-.1c-.4-.2-.9-.4-1.2-.7a3 3 0 01-.8-1.1 4 4 0 01-.3-1.6c0-.5.1-1 .4-1.5.3-.5.7-1 1.3-1.3.7-.4 1.5-.7 2.5-1 1-.2 2.3-.4 3.8-.4v-.8c0-.9-.2-1.5-.6-2-.3-.4-.9-.6-1.6-.6a4 4 0 00-2.2.6l-.7.4-.7.2a1 1 0 01-.6-.2l-.4-.4-.6-1a8 8 0 015.6-2.1c.8 0 1.5.1 2 .4a4.4 4.4 0 012.6 2.6l.3 2.1v8.4zm-6.2-2h1a3.5 3.5 0 001.4-.8l.7-.6v-2.3c-1 0-1.7.1-2.3.3-.7 0-1.2.2-1.6.4a2 2 0 00-.8.6l-.2.8c0 .6.2 1 .5 1.2.3.3.8.4 1.3.4zm9.4 2v-13.3h2c.4 0 .7.2.8.6l.2 1a7.3 7.3 0 011.8-1.3 
4.9 4.9 0 012.3-.5c.8 0 1.4.1 2 .4.5.2 1 .6 1.4 1 .4.5.7 1 .8 1.6.2.6.3 1.3.3 2v8.5h-3.2v-8.5c0-.8-.2-1.4-.5-1.8-.4-.5-1-.7-1.7-.7-.6 0-1.1.1-1.6.4-.5.2-1 .6-1.4 1v9.6h-3.2zm14.8 0v-13.3h2c.4 0 .7.2.8.6l.2 1a7.3 7.3 0 011.8-1.3 4.9 4.9 0 012.3-.5c.8 0 1.4.1 2 .4.5.2 1 .6 1.4 1 .4.5.6 1 .8 1.6.2.6.3 1.3.3 2v8.5h-3.2v-8.5c0-.8-.2-1.4-.5-1.8-.4-.5-1-.7-1.7-.7-.6 0-1.1.1-1.6.4-.5.2-1 .6-1.4 1v9.6h-3.2zm20.5-13.5c1 0 1.9.2 2.7.5a5.9 5.9 0 013.4 3.5 8.2 8.2 0 010 5.7 5.8 5.8 0 01-3.4 3.5c-.8.3-1.7.5-2.7.5-1 0-2-.2-2.7-.5a5.8 5.8 0 01-3.5-3.5 8 8 0 01-.4-2.9c0-1 .1-2 .4-2.8a6 6 0 013.4-3.5c.9-.3 1.8-.5 2.8-.5zm0 11.2c1.1 0 2-.3 2.5-1 .5-.8.8-2 .8-3.3 0-1.5-.3-2.6-.8-3.3-.6-.8-1.4-1.1-2.5-1.1s-2 .3-2.5 1c-.5.8-.8 2-.8 3.4s.3 2.5.8 3.2c.5.8 1.4 1.1 2.5 1.1zm13.5 2.5c-1.2 0-2-.3-2.7-1-.6-.6-1-1.5-1-2.7v-7.4h-1.3c-.2 0-.3 0-.4-.2-.2 0-.2-.2-.2-.5v-1.2l2.1-.4.7-3.6.2-.4.5-.1h1.7v4.1h3.5v2.3h-3.5v7.2c0 .4 0 .7.3 1 .2.2.5.3.8.3h.5a2.8 2.8 0 00.7-.3l.2-.1h.2l.2.3 1 1.5c-.5.4-1 .7-1.6.9a6 6 0 01-2 .3zm16.1-.2H593l-.7-.1c-.2-.1-.3-.3-.4-.6l-.3-1a11 11 0 01-2 1.4l-1.1.4a6.7 6.7 0 01-2.9-.1c-.5-.2-.9-.4-1.2-.7a3 3 0 01-.8-1.1 4 4 0 01-.3-1.6c0-.5.1-1 .4-1.5.3-.5.7-1 1.3-1.3.6-.4 1.5-.7 2.5-1 1-.2 2.3-.4 3.8-.4v-.8c0-.9-.2-1.5-.6-2-.4-.4-.9-.6-1.6-.6a4 4 0 00-2.2.6l-.7.4-.8.2a1 1 0 01-.5-.2l-.4-.4-.6-1a8 8 0 015.6-2.1c.8 0 1.5.1 2 .4a4.4 4.4 0 012.6 2.6l.3 2.1v8.4zm-6.2-2h.9a3.5 3.5 0 001.5-.8l.7-.6v-2.3c-1 0-1.7.1-2.4.3-.6 0-1 .2-1.5.4a2 2 0 00-.8.6l-.2.8c0 .6.1 1 .5 1.2.3.3.7.4 1.3.4zm13.8 2.2c-1.2 0-2-.3-2.7-1-.6-.6-1-1.5-1-2.7v-7.4H597c-.2 0-.3 0-.4-.2-.2 0-.2-.2-.2-.5v-1.2l2.1-.4.7-3.6.2-.4.5-.1h1.7v4.1h3.5v2.3h-3.5v7.2c0 .4 0 .7.3 1 .2.2.5.3.8.3h.5a2.8 2.8 0 00.6-.3l.3-.1h.2l.2.3 1 1.5c-.5.4-1 .7-1.6.9a6 6 0 01-2 .3zm9-13.5v13.3h-3.1v-13.3h3.2zm.6-3.9c0 .3 0 .6-.2.8a2.1 2.1 0 01-2 1.3 2 2 0 01-1.9-1.2v-.9a2 2 0 01.5-1.4 2 2 0 011.5-.6c.3 0 .5 0 .8.2a2.1 2.1 0 011 1l.3.8zm8.8 3.7c1 0 2 .2 2.7.5a5.9 5.9 0 013.4 3.5 8.2 8.2 0 010 5.7 5.8 5.8 0 01-3.4 3.5c-.8.3-1.7.5-2.7.5-1 
0-1.9-.2-2.7-.5a5.8 5.8 0 01-3.4-3.5 8 8 0 01-.5-2.9c0-1 .1-2 .5-2.8a6 6 0 013.4-3.5c.8-.3 1.7-.5 2.7-.5zm0 11.2c1.1 0 2-.3 2.5-1 .5-.8.8-2 .8-3.3 0-1.5-.3-2.6-.8-3.3-.6-.8-1.4-1.1-2.5-1.1s-2 .3-2.5 1c-.5.8-.8 2-.8 3.4s.3 2.5.8 3.2c.6.8 1.4 1.1 2.5 1.1zm9.1 2.3v-13.3h2c.4 0 .7.2.8.6l.2 1a7.3 7.3 0 011.8-1.3 4.9 4.9 0 012.3-.5c.8 0 1.4.1 2 .4.5.2 1 .6 1.4 1 .4.5.7 1 .8 1.6.2.6.3 1.3.3 2v8.5h-3.2v-8.5c0-.8-.2-1.4-.5-1.8-.4-.5-1-.7-1.7-.7-.6 0-1.1.1-1.6.4-.5.2-1 .6-1.4 1v9.6h-3.2zm22.7-10.7l-.2.3h-.9a34.3 34.3 0 00-1.4-.6l-1-.2c-.7 0-1.2.2-1.6.4-.4.3-.5.7-.5 1 0 .4 0 .6.2.8l.8.5 1 .4a34.4 34.4 0 012.4.9c.4.1.8.4 1 .6.4.3.6.6.8 1 .2.4.3.9.3 1.4 0 .6-.1 1.2-.4 1.8-.2.5-.5 1-1 1.4-.4.3-1 .7-1.7.9a7.3 7.3 0 01-3.6.2 8 8 0 01-2.4-.9l-.9-.6.8-1.2c0-.2.2-.3.3-.4l.5-.1c.2 0 .4 0 .6.2a12.5 12.5 0 001.4.7l1.2.2c.4 0 .7 0 1-.2.3 0 .5-.2.7-.3l.4-.5v-.6c0-.4 0-.6-.2-.8l-.7-.5-1-.4a35 35 0 01-2.5-.9l-1-.7c-.4-.3-.6-.6-.8-1l-.3-1.5a3.8 3.8 0 011.3-2.9l1.6-.9c.6-.2 1.3-.3 2.2-.3.9 0 1.7.2 2.5.5.7.3 1.4.7 1.9 1.2l-.8 1.1z"/>
-  </g>
-</svg>
diff --git a/website/docs/images/trainable_component.svg b/website/docs/images/trainable_component.svg
new file mode 100644
index 000000000..621ff90ef
--- /dev/null
+++ b/website/docs/images/trainable_component.svg
@@ -0,0 +1,55 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="1767" height="522" viewBox="0 0 1767 522">
+  <!-- Icons: Twemoji (https://twemoji.twitter.com/) -->
+  <g fill="none" fill-rule="evenodd">
+    <g stroke="#F03969">
+      <rect width="271" height="95" fill="#EAC1CC" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(217 32)"/>
+      <rect width="223.6" height="23.5" x="2.7" y="1.8" fill="#F03969" stroke-width="3.5" rx="11.8" transform="translate(238 19)"/>
+    </g>
+    <rect width="271" height="93" fill="#B5F3D4" stroke="#3AD787" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(207 187)"/>
+    <rect width="114.7" height="55.8" fill="#B5F3D4" stroke="#3AD787" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(207 300)"/>
+    <rect width="135.7" height="55.8" fill="#B5F3D4" stroke="#3AD787" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(342 300)"/>
+    <path fill="#F2E7A6" stroke="#CDB217" stroke-linejoin="round" stroke-width="3.8" d="M234 256h-90a12 12 0 01-12-12v-15a12 12 0 0112-12h90l23 20-23 19h0zM542 256h-98a12 12 0 01-12-12v-15a12 12 0 0112-12h98l23 20-23 19h0z"/>
+    <rect width="171" height="23.5" x="2.5" y="1.8" fill="#3AD787" stroke="#3AD787" stroke-width="3.5" rx="11.8" transform="translate(250 173)"/>
+    <rect width="392" height="62" fill="#D2F1FF" stroke="#30A6DB" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(658 162)"/>
+    <rect width="365" height="62" fill="#D2F1FF" stroke="#30A6DB" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(1267 158)"/>
+    <rect width="337" height="62" fill="#D2F1FF" stroke="#30A6DB" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(1267 242)"/>
+    <rect width="498" height="62" fill="#D2F1FF" stroke="#30A6DB" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(658 246)"/>
+    <rect width="380" height="62" fill="#D2F1FF" stroke="#30A6DB" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(658 360)"/>
+    <rect width="370" height="62" fill="#D2F1FF" stroke="#30A6DB" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(658 51)"/>
+    <rect width="357" height="62" fill="#D2F1FF" stroke="#30A6DB" stroke-linejoin="round" stroke-width="5" rx="12" transform="translate(658 444)"/>
+    <path fill="#95AAB6" fill-rule="nonzero" d="M518.5 181.4v10L638 191h2v4h-2l-119.5.4v10l-24-11.9 24-12zM577.6 236.8l1.8 1 38 18.7 4.4-9L638 269h-26.8l4.4-8.9-38-18.8-1.8-.9 1.8-3.6zM614 74l24 12-24 12V88l-109.5.5h-2v-4h2l109.5-.4V74zM1232.6 220.3l23.9 12.2-24.1 11.8v-10l-27.4-.3h-2v-4h2l27.5.3.1-10z"/>
+    <path stroke="#95AAB6" stroke-linecap="square" stroke-width="4" d="M1171.9 193h28.2M1171.9 277h28.2M1202 277v-84"/>
+    <path fill="#95AAB6" fill-rule="nonzero" d="M491.5 277.5l26.7 3-5.4 8.3L639 370.3l1.7 1.1-2.2 3.4-1.7-1.1-126.3-81.5-5.4 8.4-13.7-23.1zM497.5 343.5l26.3 5.5-6.3 7.9 121.7 96.5 1.6 1.3-2.5 3.1-1.5-1.2L515 360l-6.3 7.8-11.3-24.3z"/>
+    <g fill-rule="nonzero">
+      <path fill="#E1E8ED" d="M120 57L90 27a6.7 6.7 0 00-9.4 9.4L67.8 49.2l30 30 12.8-12.8a6.7 6.7 0 009.5-9.4z"/>
+      <path fill="#CCD6DD" d="M85.3 25h-50C28 25 22 31 22 38.3v93.4c0 7.3 6 13.3 13.3 13.3h73.4c7.3 0 13.3-6 13.3-13.3v-70H92a7.2 7.2 0 01-6.7-6.7V25z"/>
+      <path fill="#99AAB5" d="M85.3 25h-6.6v30c0 7.4 6 13.3 13.3 13.3h30v-6.6H92a7.2 7.2 0 01-6.7-6.7V25zM68.7 54.3c0 2.2-1.5 4-3.4 4H38.7c-1.9 0-3.4-1.8-3.4-4s1.5-4 3.4-4h26.6c1.9 0 3.4 1.8 3.4 4zm0 13.4c0 2.2-1.5 4-3.4 4H38.7c-1.9 0-3.4-1.8-3.4-4s1.5-4 3.4-4h26.6c1.9 0 3.4 1.8 3.4 4zm40 13.3c0 2.2-1.5 4-3.4 4H38.7c-1.9 0-3.4-1.8-3.4-4s1.5-4 3.4-4h66.6c1.9 0 3.4 1.8 3.4 4zm0 13.3c0 2.2-1.5 4-3.4 4H38.7c-1.9 0-3.4-1.8-3.4-4s1.5-4 3.4-4h66.6c1.9 0 3.4 1.8 3.4 4zm0 13.4c0 2.2-1.5 4-3.4 4H38.7c-1.9 0-3.4-1.8-3.4-4s1.5-4 3.4-4h66.6c1.9 0 3.4 1.8 3.4 4zm0 13.3c0 2.2-1.5 4-3.4 4H38.7c-1.9 0-3.4-1.8-3.4-4s1.5-4 3.4-4h66.6c1.9 0 3.4 1.8 3.4 4z"/>
+    </g>
+    <rect width="121.5" height="23.5" x="4.8" y="1.8" fill="#3D4251" stroke="#3D4251" stroke-width="3.5" rx="11.8" transform="translate(7 131)"/>
+    <path fill="#FFF" fill-rule="nonzero" d="M27 151.2l-1.8-.3c-.5-.2-1-.4-1.3-.8-.4-.3-.7-.7-.9-1.2a4 4 0 01-.3-1.6v-2.5c0-.6.1-1.1.3-1.6.2-.5.5-.9.9-1.2a4 4 0 011.3-.8 5.3 5.3 0 013.4 0 4 4 0 012.7 3.2h-2.2c-.1-.4-.3-.8-.7-1.1-.4-.3-.8-.4-1.4-.4-.6 0-1.1.1-1.5.5-.3.3-.5.8-.5 1.4v2.5c0 .6.2 1 .5 1.4.4.4.9.5 1.5.5.5 0 1-.1 1.4-.4.4-.3.6-.7.7-1.1h2.2c0 .5-.2 1-.5 1.4a4 4 0 01-2.2 1.8 5 5 0 01-1.6.3zm10.2 0l-1.8-.3a4 4 0 01-1.3-.8l-.9-1.2a4 4 0 01-.3-1.6v-2.5c0-.6.1-1.1.3-1.6a3.5 3.5 0 012.2-2l1.8-.3 1.8.3c.5.2 1 .4 1.3.8.4.3.7.7.9 1.2.2.5.3 1 .3 1.6v2.5l-.3 1.6a3.5 3.5 0 01-2.2 2c-.5.2-1.1.3-1.8.3zm-2-4c0 .7.1 1.1.5 1.5.4.3.9.5 1.5.5a2 2 0 001.5-.5c.4-.4.6-.8.6-1.4v-2.5c0-.6-.2-1-.6-1.4a2 2 0 00-1.5-.5 2 2 0 00-1.5.5c-.4.4-.6.8-.6 1.4v2.5zm8.2-6.1h2.1v1.7h.2c.1-.6.4-1 .8-1.4a3 3 0 011.8-.5c1 0 1.8.3 2.4 1 .6.7.9 1.5.9 2.6v6.5h-2.3v-6.2c0-.6-.1-1-.5-1.4-.3-.4-.7-.5-1.3-.5-.6 0-1 .1-1.4.5a2 2 0 00-.5 1.4v6.2h-2.2v-9.9zm10 3.1v-2H56v-1.3c0-.9.3-1.6.8-2.1.6-.5 1.4-.8 2.3-.8h3v2h-3a1 1 0 00-.6.3c-.2.1-.2.3-.2.6v1.3H62v2h-3.7v6.8H56v-6.8h-2.7zm10.8 4.7h3.5v-5.8h-3v-2h5.2v7.8h2.9v2.1h-8.6v-2zm2.9-10.5c0-.3.1-.6.3-.8.3-.3.6-.4 1-.4h.4c.4 0 .7.1 1 .4.2.2.3.5.3.8 0 .4-.1.7-.3.9-.3.2-.6.3-1 .3h-.4c-.4 0-.7-.1-1-.3-.2-.2-.3-.5-.3-.9zm7 6.4c0-.6 0-1.1.2-1.6.1-.5.4-.9.7-1.2a3 3 0 011-.8l1.5-.3a3 3 0 011.8.6c.5.3.8.8.9 1.4h.1v-1.8h2.2v9.3c0 .6-.1 1-.3 1.5-.2.5-.5.9-.8 1.2l-1.3.7c-.5.2-1.1.3-1.7.3h-2.7v-2h2.7c.5 0 1 0 1.3-.4.4-.3.5-.7.5-1.3v-.2l.1-1.9h-.1c-.1.6-.4 1.1-1 1.5a3 3 0 01-1.7.5c-.6 0-1 0-1.4-.3a3.2 3.2 0 01-1.8-2c-.2-.4-.3-1-.3-1.5v-1.7zm2.2 1.6c0 .6.2 1 .5 1.4.4.4.9.5 1.5.5s1-.1 1.4-.5c.4-.3.5-.8.5-1.4v-1.6c0-.6-.1-1-.5-1.4a2 2 0 00-1.4-.5 2 2 0 00-1.5.5 2 2 0 00-.5 1.4v1.6zm10.7 3c0-.5.1-1 .4-1.3.3-.3.8-.4 1.3-.4s1 .1 1.3.4c.3.3.4.8.4 1.3s-.1 1-.4 1.3c-.3.3-.8.5-1.3.5s-1-.2-1.3-.5c-.3-.3-.4-.8-.4-1.3zm12.1 1.8l-1.8-.3c-.5-.2-1-.4-1.3-.8-.4-.3-.7-.7-.9-1.2a4 4 0 01-.3-1.6v-2.5c0-.6.1-1.1.3-1.6.2-.5.5-.9.9-1.2a4 4 0 011.3-.8 5.3 5.3 0 013.4 0 4 4 0 012.7 
3.2h-2.2c-.1-.4-.3-.8-.7-1-.4-.4-.8-.5-1.4-.5-.6 0-1.1.1-1.5.5-.3.3-.5.8-.5 1.4v2.5c0 .6.2 1 .5 1.4.4.4.9.5 1.5.5.5 0 1-.1 1.4-.4.4-.3.6-.7.7-1.1h2.2c0 .5-.2 1-.5 1.4a4 4 0 01-2.2 1.8 5 5 0 01-1.6.3zm5.8-7v-2h2.7v-1.3c0-.9.3-1.6.8-2 .6-.6 1.4-.9 2.3-.9h3v2h-3a1 1 0 00-.6.3c-.2.1-.2.3-.2.6v1.3h3.7v2H110v6.8h-2.3v-6.8h-2.7zm10.3.6c0-.6.1-1.1.3-1.6.1-.5.4-.9.7-1.2a3 3 0 011-.8l1.5-.3a3 3 0 011.8.6c.5.3.8.8.9 1.4h.1v-1.8h2.2v9.3c0 .6-.1 1-.3 1.5-.2.5-.5.9-.8 1.2l-1.3.7c-.5.2-1.1.3-1.7.3h-2.7v-2h2.7c.5 0 1 0 1.3-.4.4-.3.5-.7.5-1.3v-.2l.1-1.9h-.1c-.1.6-.4 1.1-1 1.5a3 3 0 01-1.7.5c-.6 0-1 0-1.4-.3a3.2 3.2 0 01-1.8-2c-.2-.4-.3-1-.3-1.5v-1.7zm2.3 1.6c0 .6.2 1 .5 1.4.4.4.9.5 1.5.5s1-.1 1.4-.5c.4-.3.5-.8.5-1.4v-1.6c0-.6-.1-1-.5-1.4a2 2 0 00-1.4-.5 2 2 0 00-1.5.5 2 2 0 00-.5 1.4v1.6z"/>
+    <path fill="#F03969" fill-rule="nonzero" d="M121 59a12 12 0 110 24 12 12 0 010-24zm93.5 10.5h5v4h-5v-4zm-9 0h5v4h-5v-4zm-9-.1h5v4h-5v-4zm-9 0h5v4h-5v-4zm-9-.1h5v4h-5v-4zm-9 0h5v4h-5v-4zm-9-.1h5v4h-5v-4zm-9 0h5v4h-5v-4zm-9 0h5v4h-5v-4zm-9-.1h5v4h-5v-4z"/>
+    <path fill="#3AD787" fill-rule="nonzero" d="M205 178.5l1.5 1.3.8.7 1.5 1.3-2.6 3-1.5-1.3-.8-.6-1.5-1.3 2.6-3zm-6.8-5.8l1.5 1.3.8.6 1.5 1.3-2.6 3-1.5-1.2-.8-.7-1.5-1.3 2.6-3zm-6.8-5.9l1.5 1.3.7.7 1.6 1.3-2.6 3-1.6-1.3-.7-.7-1.5-1.3 2.6-3zm-6.9-5.9l1.6 1.3.7.7 1.5 1.3-2.6 3-1.5-1.3-.7-.6-1.6-1.3 2.6-3zm-6.8-5.8l1.5 1.3.8.6 1.5 1.3-2.6 3-1.5-1.2-.8-.7-1.5-1.3 2.6-3zm-6.8-5.9l1.5 1.3.8.7 1.5 1.3-2.6 3-1.5-1.3-.8-.7-1.5-1.3 2.6-3zm-6.8-5.9l1.5 1.3.7.7 1.6 1.3-2.7 3-1.5-1.3-.7-.6-1.5-1.3 2.6-3zm-6.9-5.8l1.6 1.3.7.6 1.5 1.3-2.6 3-1.5-1.2-.8-.7-1.5-1.3 2.6-3zm-6.8-5.9l1.5 1.3.8.7 1.5 1.3-2.6 3-1.5-1.3-.8-.6-1.5-1.3 2.6-3zm-6.8-5.8l1.5 1.3.8.6 1.5 1.3-2.6 3-1.6-1.3-.7-.6-1.5-1.3 2.6-3zm-6.8-6l1.5 1.4.7.6 1.5 1.3-2.6 3-1.5-1.2-.7-.7-1.6-1.3 2.7-3zm-24.9-18.6a12 12 0 0119.4 14l.1.1.8.7 1.5 1.3-2.6 3-1.5-1.3-.8-.6-.1-.2a12 12 0 01-16.8-17z"/>
+    <path fill="#F2E7A6" stroke="#CDB217" stroke-linejoin="round" stroke-width="3.8" d="M1652 209h-28a12 12 0 01-12-12v-15a12 12 0 0112-12h28l23 20-23 19h0z"/>
+    <path stroke="#3AD787" stroke-linecap="square" stroke-width="5" d="M264.5 281.5v18M412.5 281.5v18"/>
+    <path fill="#F2E7A6" stroke="#CDB217" stroke-linejoin="round" stroke-width="3.8" d="M1717 292h-133a12 12 0 01-12-12v-15a12 12 0 0112-12h133l23 20-23 19h0zM1127 213h-98a12 12 0 01-12-12v-15a12 12 0 0112-12h98l23 20-23 19h0z"/>
+    <path fill="#3D4251" fill-rule="nonzero" d="M253.2 95.4c2.6 0 4.9-.5 6.8-1.3 1.9-.9 3.4-2.1 4.7-3.7l-2.3-2.5c-.2-.3-.5-.4-.8-.4l-.6.1a2 2 0 00-.4.3 9.8 9.8 0 01-3 1.9l-1.9.4-2.1.1a8.8 8.8 0 01-6.8-3c-1-.9-1.6-2-2.1-3.4a14 14 0 01-.8-4.8c0-1.7.3-3.3.8-4.7.5-1.4 1.2-2.5 2-3.5 1-1 2-1.7 3.3-2.2 1.3-.6 2.6-.8 4.1-.8a10.6 10.6 0 016.6 2c.3.3.7.5 1 .5.3 0 .6 0 .8-.2l.4-.5 2-2.7a14.5 14.5 0 00-10.7-4.2c-2.4 0-4.7.4-6.7 1.2a15.1 15.1 0 00-8.5 8.5 19.3 19.3 0 00-.1 13.1c.7 2 1.7 3.7 3 5.2a14 14 0 005 3.3c1.8.8 4 1.3 6.3 1.3zm25.3 0c1.7 0 3.2-.3 4.6-.9a9.7 9.7 0 005.8-6c.5-1.4.8-3 .8-4.8 0-1.8-.3-3.4-.8-4.8a9.9 9.9 0 00-5.8-6c-1.4-.6-2.9-.8-4.6-.8-1.7 0-3.2.2-4.6.8a10 10 0 00-5.8 6c-.5 1.4-.8 3-.8 4.8 0 1.8.3 3.4.8 4.8a9.8 9.8 0 005.8 6c1.4.6 3 .8 4.6.8zm0-4.3c-1.9 0-3.3-.6-4.2-1.9a9.4 9.4 0 01-1.4-5.5c0-2.4.5-4.3 1.4-5.5 1-1.3 2.3-2 4.2-2 1.9 0 3.3.7 4.2 2a9.6 9.6 0 011.3 5.5c0 2.4-.4 4.3-1.3 5.6-1 1.2-2.3 1.9-4.2 1.9zm20.8 3.9V78.5c.6-.7 1.3-1.2 2-1.6.6-.4 1.4-.6 2.1-.6 1.2 0 2.1.3 2.7 1 .6.7 1 1.8 1 3.3V95h5.4V80.6c0-.7.1-1.3.3-1.8.2-.6.5-1 .8-1.4l1.3-.8c.5-.2 1-.3 1.6-.3 1.2 0 2 .4 2.7 1 .7.8 1 1.9 1 3.3V95h5.4V80.6c0-1.3-.2-2.5-.5-3.6-.4-1-.9-2-1.5-2.7a6.4 6.4 0 00-2.5-1.7 9.7 9.7 0 00-7.5.4 7.2 7.2 0 00-3 3.3 7 7 0 00-2-3c-1-.8-2.1-1.2-3.6-1.2a7 7 0 00-4.8 1.8L299 75l-.4-1.7c-.2-.6-.7-1-1.4-1H294V95h5.4zm37 7.4V93a8 8 0 002.5 1.6c.9.4 2 .6 3.2.6a8.9 8.9 0 007-3.3c1-1 1.6-2.3 2-3.8a18.2 18.2 0 000-9.6c-.3-1.4-.9-2.6-1.6-3.6a7.4 7.4 0 00-6-3c-1.6 0-3 .3-4.2 1-1.1.6-2.2 1.5-3.1 2.5l-.4-2c-.1-.4-.3-.7-.5-.8-.3-.2-.6-.3-1-.3H331v30h5.4zm4.4-11.2a5.2 5.2 0 01-4.4-2V79c.8-.8 1.5-1.5 2.3-2a5.5 5.5 0 014.8-.3c.6.3 1 .7 1.5 1.3.4.5.7 1.3.9 2.2a15 15 0 010 6.7 7 7 0 01-1.1 2.4c-.5.7-1 1.1-1.7 1.4-.7.3-1.4.5-2.3.5zm24.8 4.1c1.7 0 3.2-.2 4.6-.8a9.7 9.7 0 005.7-6c.5-1.4.8-3 .8-4.8 0-1.8-.3-3.4-.8-4.8-.5-1.5-1.3-2.7-2.2-3.7-1-1-2.2-1.8-3.5-2.3-1.4-.6-3-.8-4.6-.8-1.7 0-3.3.2-4.6.8a10 10 0 00-5.8 6c-.6 1.4-.8 3-.8 4.8 0 1.8.2 3.4.8 4.8.5 1.4 1.3 2.7 2.2 3.7 1 1 2.2 1.8 3.6 2.3 
1.3.6 2.9.8 4.6.8zm0-4.1c-2 0-3.3-.7-4.3-2a9.4 9.4 0 01-1.3-5.5c0-2.4.4-4.3 1.3-5.5 1-1.3 2.4-2 4.3-2 1.8 0 3.2.7 4.1 2a9.6 9.6 0 011.4 5.5c0 2.4-.5 4.3-1.4 5.6-.9 1.2-2.3 1.9-4.1 1.9zm20.8 3.8V78.7c.7-.8 1.5-1.3 2.3-1.8.8-.4 1.7-.6 2.7-.6 1.2 0 2.2.4 2.8 1.1.7.8 1 1.9 1 3.2V95h5.4V80.6c0-1.2-.1-2.4-.5-3.4a6.5 6.5 0 00-3.8-4.5 9.5 9.5 0 00-7.2.2 10 10 0 00-3 2.3l-.4-1.8c-.3-.6-.7-1-1.4-1h-3.4V95h5.5zm29.2.3a17.8 17.8 0 004.9-.8c.8-.2 1.6-.6 2.3-1a9 9 0 002-1.9l-1.5-2-.4-.3-.6-.2c-.4 0-.7.2-1 .4a38.9 38.9 0 01-3 1.3c-.6.2-1.4.3-2.3.3-1.8 0-3.2-.5-4.3-1.6-1-1-1.7-2.8-1.8-5.1h13.9l.6-.1c.2 0 .3-.2.4-.3l.2-.7v-1c0-1.6-.2-3-.7-4.3a8.6 8.6 0 00-5.1-5.2 11 11 0 00-4-.7c-1.6 0-3.1.3-4.5.9a10.1 10.1 0 00-5.5 5.9 14.6 14.6 0 00.1 9.6c.6 1.5 1.4 2.7 2.4 3.7s2.1 1.8 3.5 2.3c1.3.6 2.8.8 4.4.8zm4.6-14.2h-10.2c.2-1.6.8-2.9 1.7-3.8a5 5 0 013.7-1.3 5 5 0 012.1.4c.6.2 1.1.6 1.5 1 .4.6.7 1.1.9 1.7.2.7.3 1.3.3 2zM434.8 95V78.7c.7-.8 1.5-1.3 2.3-1.8.9-.4 1.7-.6 2.7-.6 1.3 0 2.2.4 2.9 1.1.6.8 1 1.9 1 3.2V95h5.3V80.6c0-1.2-.1-2.4-.4-3.4-.4-1-.9-2-1.5-2.7a6.5 6.5 0 00-2.4-1.8 9.5 9.5 0 00-7.2.2 10 10 0 00-3 2.3l-.4-1.8c-.2-.6-.7-1-1.4-1h-3.3V95h5.4zm27 .4c1.1 0 2.2-.2 3.2-.6 1-.3 2-.8 2.7-1.5l-1.6-2.6-.3-.4a.6.6 0 00-.4 0h-.4a7.7 7.7 0 01-1 .6h-1c-.5 0-1-.1-1.3-.5-.4-.4-.6-1-.6-1.7V76.5h6v-3.9h-6v-7h-2.8c-.3 0-.6 0-.8.2a1 1 0 00-.4.6l-1.1 6.2-3.7.6v2.2c0 .3.2.6.4.8.2.2.4.3.7.3h2.3v12.6c0 2 .5 3.5 1.6 4.6 1 1.1 2.6 1.7 4.5 1.7z"/>
+    <path fill="#FFF" fill-rule="nonzero" d="M260.4 38.2c1.3 0 2.3-.3 3-.8.6-.5 1-1.3 1-2.3 0-.9-.3-1.6-.9-2-.5-.6-1.4-.9-2.6-1l-1.4-.2c-1-.1-1.4-.5-1.4-1.1 0-.8.6-1.2 1.6-1.2h.7c.5 0 .8.1 1 .3l.6.6h2.3c-.2-.8-.6-1.4-1.2-1.9-.7-.4-1.6-.7-2.7-.7h-.7c-1.2 0-2.2.3-2.8.8-.7.5-1 1.2-1 2.1 0 1 .3 1.6.8 2.1.6.5 1.4.8 2.5 1l1.4.1c1 .1 1.6.5 1.6 1.2 0 .4-.2.7-.5 1-.3.1-.7.3-1.3.3h-.7c-.5 0-.8-.1-1.1-.3-.3-.2-.5-.4-.6-.7h-2.3c.1.8.5 1.5 1.2 2 .7.4 1.6.7 2.8.7h.7zm8.1 2.9V38a8.5 8.5 0 000-1 6.3 6.3 0 000-.7h.1c.1.6.4 1 .9 1.4.4.3 1 .5 1.8.5.5 0 1-.1 1.4-.3.4-.2.7-.4 1-.8.3-.3.5-.7.7-1.2.2-.5.3-1 .3-1.6v-2.5a5 5 0 00-.3-1.6c-.2-.5-.4-.9-.7-1.2a3 3 0 00-1-.8l-1.4-.3a3 3 0 00-1.8.5c-.5.4-.8.8-.9 1.4h-.2v-1.7h-2.1v13h2.2zm2-4.9c-.6 0-1.1-.2-1.5-.5a2 2 0 01-.5-1.4v-2.5c0-.6.2-1 .6-1.4.3-.4.8-.5 1.3-.5.7 0 1.1.1 1.5.5.3.3.5.8.5 1.4v2.4a2 2 0 01-.5 1.5c-.4.3-.8.5-1.5.5zm8.9 2c.8 0 1.5-.2 2-.6.6-.4 1-.9 1-1.5h.2V38h2.2v-6.8a3 3 0 00-1.1-2.4l-1.4-.7a6.2 6.2 0 00-3.3 0l-1.2.6c-.4.2-.6.5-.9.8a3 3 0 00-.4 1.2h2.2c0-.3.3-.5.6-.7.4-.2.8-.3 1.3-.3.6 0 1 .1 1.4.4.3.3.5.6.5 1.1v.8H280c-.6 0-1.1.1-1.6.3-.4.1-.9.3-1.2.6l-.8 1-.2 1.3c0 1 .3 1.6.8 2.2.6.5 1.4.8 2.4.8zm.8-1.8a2 2 0 01-1.3-.4c-.3-.2-.5-.6-.5-1 0-.5.2-.9.5-1.1.2-.3.6-.4 1.1-.4h2.5v1.2c0 .5-.2.9-.7 1.2-.4.4-1 .5-1.6.5zm10.9 1.8a5 5 0 001.6-.3 4.1 4.1 0 002.2-1.8l.5-1.4H293c0 .4-.3.8-.7 1-.3.4-.8.5-1.3.5-.7 0-1.2-.2-1.5-.5-.4-.3-.6-.8-.6-1.4v-2.5c0-.6.2-1 .6-1.4.3-.3.8-.5 1.5-.5.5 0 1 .1 1.3.4.4.3.6.7.7 1.1h2.3A4.1 4.1 0 00294 29a4 4 0 00-1.3-.7 5.3 5.3 0 00-3.4 0 4 4 0 00-1.4.8 4 4 0 00-1.2 2.8v2.5c0 .6.2 1.1.4 1.6s.5.9.8 1.2c.4.4.9.6 1.4.8.5.2 1.1.3 1.8.3zm10.2 2.9l4.7-13h-2.4l-1.8 5.3a5.9 5.9 0 00-.3 1.3v.7h-.2a13.3 13.3 0 00-.4-2l-2-5.3h-2.4l3.8 9.3-1.4 3.7h2.4zm10.3-3c.5 0 1 0 1.2-.4.3-.3.5-.8.5-1.3s-.2-1-.5-1.3c-.3-.3-.7-.4-1.3-.4-.5 0-.9.1-1.2.4-.3.3-.5.8-.5 1.3s.2 1 .5 1.3c.3.3.7.5 1.3.5zm8.4 3V38a8.5 8.5 0 000-1 6.3 6.3 0 000-.7h.1c.1.6.4 1 .9 1.4.4.3 1 .5 1.8.5.5 0 1-.1 1.4-.3.4-.2.7-.4 1-.8.3-.3.5-.7.7-1.2.2-.5.3-1 
.3-1.6v-2.5a5 5 0 00-.3-1.6c-.2-.5-.4-.9-.7-1.2a3 3 0 00-1-.8l-1.4-.3a3 3 0 00-1.8.5c-.5.4-.8.8-.9 1.4h-.2v-1.7h-2.1v13h2.2zm2-4.9c-.6 0-1.1-.2-1.5-.5a2 2 0 01-.5-1.4v-2.5c0-.6.2-1 .6-1.4.3-.4.8-.5 1.3-.5.7 0 1.1.1 1.5.5.3.3.5.8.5 1.4v2.4a2 2 0 01-.5 1.5c-.4.3-.8.5-1.5.5zm10.9-9.6c.4 0 .7-.1 1-.3.1-.2.3-.5.3-.9 0-.3-.2-.6-.4-.8-.2-.3-.5-.4-1-.4h-.4c-.4 0-.7.1-1 .4-.1.2-.3.5-.3.8 0 .4.2.7.4.9.2.2.5.3 1 .3h.4zm4 11.4v-2h-3V28h-5.2v2h3V36h-3.5V38h8.6zm3.7 3v-3a8.5 8.5 0 000-1 6.3 6.3 0 000-.7h.1c.1.6.4 1 .9 1.4.4.3 1 .5 1.8.5.5 0 1-.1 1.4-.3.4-.2.7-.4 1-.8.3-.3.5-.7.7-1.2.2-.5.3-1 .3-1.6v-2.5a5 5 0 00-.3-1.6c-.2-.5-.4-.9-.7-1.2a3 3 0 00-1-.8l-1.4-.3a3 3 0 00-1.8.5c-.5.4-.8.8-.9 1.4h-.2v-1.7h-2.1v13h2.2zm2-4.8c-.6 0-1.1-.2-1.5-.5a2 2 0 01-.5-1.4v-2.5c0-.6.2-1 .6-1.4.3-.4.8-.5 1.3-.5.7 0 1.1.1 1.5.5.3.3.5.8.5 1.4v2.4a2 2 0 01-.5 1.5c-.4.3-.8.5-1.5.5zm10.1 2c.6 0 1 0 1.5-.2l1.3-.5.9-.9c.3-.3.4-.7.5-1.1h-2.2c-.1.3-.4.5-.7.6a3 3 0 01-1.3.3 2 2 0 01-1.5-.6c-.3-.3-.5-.9-.5-1.5v-.7h6.3v-1.8a4 4 0 00-.3-1.6c-.2-.5-.5-.9-.9-1.2a4 4 0 00-1.3-.8c-.5-.2-1.1-.3-1.8-.3-.6 0-1.2.1-1.7.3-.6.2-1 .4-1.4.8-.4.3-.6.7-.8 1.2a4 4 0 00-.4 1.6v2.5c0 .6.2 1.1.4 1.6a3.5 3.5 0 002.2 2c.5.2 1.1.3 1.8.3zm2.1-6.1h-4.1v-.3c0-.7.2-1.2.5-1.5a2 2 0 011.6-.6 2 2 0 011.5.6c.3.3.5.8.5 1.5v.3zm13 5.9v-2h-3c-.4 0-.7-.1-.9-.3-.2-.2-.3-.5-.3-.9V25h-5.4v2h3.2v7.8c0 1 .2 1.8.8 2.4.6.5 1.4.8 2.4.8h3.1zm6.3-11.4c.4 0 .7-.1 1-.3.1-.2.3-.5.3-.9 0-.3-.2-.6-.4-.8-.2-.3-.5-.4-1-.4h-.4c-.4 0-.7.1-1 .4-.1.2-.3.5-.3.8 0 .4.2.7.4.9.2.2.5.3 1 .3h.4zm4 11.4v-2h-3V28H370v2h3V36h-3.5V38h8.6zm3.7 0v-6.2c0-.6.2-1 .5-1.4.3-.4.8-.5 1.3-.5.6 0 1 .1 1.4.5.3.3.5.8.5 1.4V38h2.2v-6.5c0-1-.3-2-.8-2.6a3 3 0 00-2.4-1 3 3 0 00-1.8.5c-.5.3-.7.8-.9 1.4h-.1v-1.7h-2.2V38h2.3zm12.1.2c.6 0 1 0 1.5-.2l1.3-.5.9-.9c.3-.3.4-.7.5-1.1H396c-.1.3-.4.5-.7.6a3 3 0 01-1.3.3 2 2 0 01-1.5-.6c-.3-.3-.5-.9-.5-1.5v-.7h6.3v-1.8a4 4 0 00-.3-1.6c-.2-.5-.5-.9-.9-1.2a4 4 0 00-1.3-.8c-.5-.2-1.1-.3-1.8-.3-.6 0-1.2.1-1.7.3-.6.2-1 .4-1.4.8-.4.3-.6.7-.8 1.2a4 4 0 00-.4 
1.6v2.5c0 .6.1 1.1.4 1.6a3.5 3.5 0 002.2 2c.5.2 1.1.3 1.8.3zM396 32H392v-.3c0-.7.2-1.2.5-1.5a2 2 0 011.6-.6 2 2 0 011.5.6c.3.3.5.8.5 1.5v.3zm8.3 6c.5 0 1 0 1.2-.4.3-.3.5-.8.5-1.3s-.2-1-.5-1.3c-.3-.3-.7-.4-1.3-.4-.5 0-.9.1-1.2.4-.3.3-.5.8-.5 1.3s.2 1 .5 1.3c.3.3.7.5 1.3.5zm8.4-.1v-5h2.3c.6 0 1.2-.2 1.7-.4a4 4 0 001.4-.8c.4-.3.7-.7.9-1.2.2-.5.3-1 .3-1.6 0-.6-.1-1.2-.3-1.6a3.5 3.5 0 00-2.3-2c-.5-.3-1-.4-1.7-.4h-4.5v13h2.2zm2.3-7h-2.3v-4h2.3a2 2 0 011.4.5c.4.4.6.9.6 1.5a2 2 0 01-.6 1.4 2 2 0 01-1.4.6zm10.6-4.4c.4 0 .7-.1 1-.3.1-.2.3-.5.3-.9 0-.3-.2-.6-.4-.8-.2-.3-.5-.4-1-.4h-.4c-.4 0-.7.1-1 .4-.1.2-.3.5-.3.8 0 .4.2.7.4.9.2.2.5.3 1 .3h.4zm4 11.4v-2h-3V28h-5.2v2h3V36H421V38h8.6zm3.7 3v-3a8.5 8.5 0 000-1 6.3 6.3 0 000-.7h.1c.1.6.4 1 .9 1.4.4.3 1 .5 1.8.5.5 0 1-.1 1.4-.3.4-.2.7-.4 1-.8.3-.3.5-.7.7-1.2.2-.5.3-1 .3-1.6v-2.5a5 5 0 00-.3-1.6c-.2-.5-.4-.9-.7-1.2a3 3 0 00-1-.8l-1.4-.3a3 3 0 00-1.8.5c-.5.4-.8.8-.9 1.4h-.2v-1.7h-2.1v13h2.2zm2-4.8c-.6 0-1.1-.2-1.5-.5a2 2 0 01-.5-1.4v-2.5c0-.6.2-1 .6-1.4.3-.4.8-.5 1.3-.5.7 0 1.1.1 1.5.5.3.3.5.8.5 1.4v2.4a2 2 0 01-.5 1.5c-.4.3-.8.5-1.5.5zm10.1 2c.6 0 1 0 1.5-.2l1.3-.5.9-.9c.3-.3.4-.7.5-1.1h-2.2c-.1.3-.4.5-.7.6a3 3 0 01-1.3.3 2 2 0 01-1.5-.6c-.3-.3-.5-.9-.5-1.5v-.7h6.3v-1.8a4 4 0 00-.3-1.6c-.2-.5-.5-.9-.9-1.2a4 4 0 00-1.3-.8c-.5-.2-1.1-.3-1.8-.3-.6 0-1.2.1-1.7.3-.6.2-1 .4-1.4.8-.4.3-.6.7-.8 1.2a4 4 0 00-.4 1.6v2.5c0 .6.1 1.1.4 1.6a3.5 3.5 0 002.2 2c.5.2 1.1.3 1.8.3zm2.1-6.1h-4.1v-.3c0-.7.2-1.2.5-1.5a2 2 0 011.6-.6 2 2 0 011.5.6c.3.3.5.8.5 1.5v.3z"/>
+    <path fill="#3D4251" fill-rule="nonzero" d="M287.7 251v-20.5a27.2 27.2 0 00-.2-2.8l9.8 18.2.8 1c.4.1.8.3 1.3.3h.8a2.5 2.5 0 002-1.3l9.7-18a30.8 30.8 0 00-.1 2.6V251h5.2v-31.8h-4.5a9 9 0 00-.7 0l-.5.1-.4.3-.4.5-9.4 17.7a19.6 19.6 0 00-1.3 3 95.1 95.1 0 00-1.3-3l-9.5-17.7-.4-.5-.5-.3h-.5a9 9 0 00-.7-.1h-4.4V251h5.2zm45 .3c1.7 0 3.2-.2 4.6-.8a9.7 9.7 0 005.8-6c.5-1.4.8-3 .8-4.8 0-1.8-.3-3.4-.8-4.8a9.9 9.9 0 00-5.8-6c-1.4-.6-2.9-.8-4.6-.8-1.7 0-3.2.2-4.6.8a10 10 0 00-5.8 6c-.5 1.4-.8 3-.8 4.8 0 1.8.3 3.4.8 4.8a9.8 9.8 0 005.8 6c1.4.6 3 .8 4.6.8zm0-4.2c-1.9 0-3.3-.6-4.2-1.9a9.4 9.4 0 01-1.4-5.5c0-2.4.5-4.3 1.4-5.5 1-1.3 2.3-2 4.2-2 2 0 3.3.7 4.2 2a9.6 9.6 0 011.4 5.5c0 2.4-.5 4.3-1.4 5.6-.9 1.2-2.3 1.8-4.2 1.8zm22.2 4.2a9.2 9.2 0 004.1-1c.6-.2 1.2-.6 1.7-1l1.5-1.5.4 2.2c.2.7.7 1 1.4 1h3.3v-32.7H362v12a7.8 7.8 0 00-5.6-2.2 8.8 8.8 0 00-7.1 3.3c-.9 1-1.5 2.3-2 3.7a18.3 18.3 0 000 9.6c.4 1.4 1 2.7 1.7 3.6.7 1 1.6 1.7 2.6 2.3 1 .5 2.2.7 3.4.7zm1.9-4.3c-.7 0-1.4-.1-2-.4-.5-.2-1-.6-1.4-1.2-.4-.6-.7-1.3-1-2.3a15 15 0 01.1-6.7c.3-1 .6-1.8 1.1-2.4.5-.6 1-1 1.7-1.4.7-.3 1.4-.5 2.2-.5a5.3 5.3 0 014.4 2v10.2c-.7.9-1.5 1.5-2.3 2-.8.5-1.7.7-2.8.7zm26 4.3a17.8 17.8 0 005-.8l2.3-1a9 9 0 002-1.9l-1.6-2-.4-.3-.6-.2a38.9 38.9 0 01-4 1.7c-.6.2-1.4.3-2.2.3-1.8 0-3.3-.5-4.3-1.6-1.1-1-1.8-2.8-2-5.1h14l.6-.1c.2 0 .3-.2.4-.3l.2-.7v-1c0-1.6-.2-3-.6-4.3a8.6 8.6 0 00-5.1-5.2 11 11 0 00-4-.7c-1.7 0-3.2.3-4.5.9a10.1 10.1 0 00-5.6 5.9 14.6 14.6 0 00.1 9.6c.6 1.5 1.4 2.7 2.4 3.7s2.2 1.8 3.5 2.3c1.4.6 2.8.8 4.4.8zm4.6-14.2h-10.2c.2-1.6.8-2.9 1.7-3.8a5 5 0 013.7-1.3 5 5 0 012.1.4c.6.2 1.1.6 1.5 1l1 1.7.2 2zm15 13.9v-32.7h-5.3V251h5.4zM238.6 335.8v-3H231v-16h-3.6v19h11.2zm5.6.2l1.3-.1a5.2 5.2 0 002.1-1l1-.8.3 1c.1.3.2.5.4.6l.7.1h1.5v-8.6c0-.7-.1-1.4-.3-2a4.5 4.5 0 00-2.6-2.7c-.6-.4-1.3-.5-2.1-.5a8 8 0 00-5.6 2.1l.6 1c0 .3.2.4.3.5l.6.2c.3 0 .5 0 .7-.2a33.5 33.5 0 001.7-.8 4 4 0 011.3-.2c.7 0 1.3.2 1.7.6.3.5.5 1.1.5 2v.8c-1.5 0-2.8.2-3.8.5s-2 .6-2.5 1c-.7.4-1.1.8-1.4 1.3-.2.5-.4 1-.4 1.5 0 .6.1 1.2.3 
1.6a3.1 3.1 0 002 1.9l1.7.2zm1-2.2c-.6 0-1-.2-1.4-.4-.3-.2-.5-.7-.5-1.2 0-.3.1-.6.3-.8.1-.3.4-.5.8-.7l1.5-.4c.7-.2 1.5-.2 2.4-.3v2.3c-.2.2-.4.5-.7.6l-.7.5-.8.3h-1zm13.2 6.4c.4 0 .6 0 .7-.2.2 0 .3-.2.4-.5l7.4-17.2h-2.6a1 1 0 00-.9.6l-2.7 7a21.5 21.5 0 00-.5 1.4 11.3 11.3 0 00-.5-1.4l-2.9-7c0-.2-.1-.3-.3-.5l-.6-.1h-2.8l5.4 12.5-2.5 5.4h2.4zm16.2-4.2a10.7 10.7 0 003-.5c.4-.1.9-.4 1.3-.7.5-.2.9-.6 1.2-1l-1-1.2c0-.1 0-.2-.2-.2l-.3-.1c-.2 0-.4 0-.6.2a23.3 23.3 0 01-1.8.8l-1.4.2c-1 0-1.9-.3-2.5-1-.7-.7-1-1.7-1.2-3h8.4l.4-.1.2-.2.1-.4v-.6c0-1 0-1.9-.4-2.6-.3-.8-.7-1.4-1.2-2-.5-.5-1.1-.9-1.8-1.1a6.7 6.7 0 00-5 .1 6 6 0 00-3.5 3.5c-.3.8-.4 1.7-.4 2.6 0 1.2.2 2.3.5 3.2.4.9.8 1.6 1.4 2.2.6.7 1.3 1.1 2.1 1.4.8.3 1.7.5 2.7.5zm2.7-8.5h-6a4 4 0 011-2.3 3 3 0 012.2-.8c.4 0 .9 0 1.2.2l1 .7.5 1 .1 1.2zm8.9 8.3v-8.4c.3-.8.7-1.3 1.2-1.7.5-.4 1-.5 1.7-.5h1l.5.1h.4l.1-.4.4-2.4c-.4-.3-1-.5-1.6-.5-.8 0-1.6.2-2.2.7-.7.5-1.2 1.2-1.7 2l-.2-1.6c0-.3-.1-.5-.3-.7l-.7-.1H283v13.5h3.3zm11 .2c.9 0 1.7-.1 2.3-.3.7-.2 1.3-.6 1.7-1a4 4 0 001-1.4c.3-.5.4-1.1.4-1.8 0-.5 0-1-.3-1.4l-.7-1-1-.7a13.1 13.1 0 00-2.5-.8l-1-.4-.8-.6a1 1 0 01-.3-.7c0-.4.2-.8.5-1 .4-.3 1-.5 1.6-.5a3.9 3.9 0 011.9.5 34.7 34.7 0 011 .4l.5-.1.2-.3.8-1.2a6.8 6.8 0 00-4.5-1.7c-.8 0-1.6.2-2.2.4-.6.2-1.2.5-1.6.9a3.7 3.7 0 00-1.3 2.9c0 .6.1 1 .3 1.5.2.4.4.8.7 1l1.1.8a11.4 11.4 0 002.5.9l1 .4.8.5c.2.2.3.5.3.8 0 .2 0 .4-.2.6 0 .2-.2.4-.3.5l-.7.4-1 .1a3.7 3.7 0 01-2-.5l-.7-.4a1 1 0 00-.6-.2 1 1 0 00-.5.2 1 1 0 00-.3.3l-.8 1.2 1 .7a7 7 0 002.3.9l1.4.1zM369.6 335.8l4.2-13a30.5 30.5 0 00.4-1.2 12.4 12.4 0 00.3 1.2l4.2 13h3.2l6-19H385c-.3 0-.5 0-.7.2a1 1 0 00-.4.5l-3.3 11.4a19.3 19.3 0 00-.4 2l-.5-2-3.7-11.4-.4-.6-.8-.2h-1c-.3 0-.6.1-.8.3-.2.1-.4.3-.4.5l-3.8 11.4a11.6 11.6 0 00-.5 2 33.2 33.2 0 00-.4-2l-3.2-11.4a1 1 0 00-.4-.6l-.8-.2h-3l6 19.1h3.1zm25.9.2a10.7 10.7 0 003-.5c.4-.1.9-.4 1.3-.7.5-.2.9-.6 1.3-1l-1-1.2-.3-.2-.3-.1c-.2 0-.4 0-.6.2a23.3 23.3 0 01-1.8.8l-1.3.2c-1.1 0-2-.3-2.6-1-.7-.7-1-1.7-1.2-3h8.4l.4-.1.2-.2.1-.4v-.6c0-1 
0-1.9-.4-2.6-.3-.8-.7-1.4-1.2-2-.5-.5-1.1-.9-1.8-1.1a6.7 6.7 0 00-5 .1 6 6 0 00-3.5 3.5 8.8 8.8 0 000 5.7c.5 1 1 1.7 1.5 2.3.6.7 1.3 1.1 2.1 1.4.8.3 1.7.5 2.7.5zm2.7-8.5h-6a4 4 0 011-2.3 3 3 0 012.2-.8c.5 0 .9 0 1.2.2l1 .7.5 1 .1 1.2zm7.5-7.1l.8-.2a2.2 2.2 0 001.1-1.1l.2-.8c0-.3 0-.6-.2-.8a2.1 2.1 0 00-2-1.3 2.1 2.1 0 00-1.9 1.3 2 2 0 001.2 2.7l.8.2zm1.6 15.4v-13.5H404v13.5h3.3zm8.4 4.8c1 0 2-.2 2.8-.4.8-.3 1.5-.7 2-1.1.6-.4 1-1 1.3-1.5.3-.6.4-1.1.4-1.7a3 3 0 00-.3-1.6c-.3-.4-.6-.8-1-1l-1.4-.5a12.6 12.6 0 00-3.2-.3l-1.4-.1a3 3 0 01-1-.3.6.6 0 01-.4-.6c0-.3.2-.6.7-.9a7.6 7.6 0 003.6-.1c.6-.2 1.2-.5 1.6-.9a4 4 0 001.3-4.7l1.2-.3c.2 0 .4 0 .5-.2l.2-.5v-1.2h-3.9l-1.4-.5a7.2 7.2 0 00-5.5 1 4 4 0 00-1.5 3.2 4 4 0 002.2 3.6 3.8 3.8 0 00-1.3 1.2l-.4.6v.6c0 .5 0 1 .2 1.2l.8.8a4 4 0 00-1.6 1c-.3.4-.5 1-.5 1.6 0 .5 0 .9.3 1.3.3.5.6.8 1.1 1.2.5.3 1.2.6 2 .8.7.2 1.6.3 2.6.3zm0-11.9a3 3 0 01-1-.2 2 2 0 01-1.2-1.2c-.2-.2-.2-.5-.2-.8 0-.7.2-1.2.6-1.6.4-.4 1-.6 1.8-.6s1.4.2 1.8.6c.4.4.6 1 .6 1.6 0 .3 0 .6-.2.9a1.9 1.9 0 01-1.2 1.1 3 3 0 01-1 .2zm0 9.5h-1.5l-1-.4-.5-.6c-.2-.2-.2-.4-.2-.6 0-.4 0-.7.3-1 .2-.3.5-.5 1-.7a16 16 0 002.4.2 17.6 17.6 0 012.2.3c.2 0 .4.2.6.4.2.1.2.3.2.6s0 .5-.2.7c0 .3-.3.5-.6.6l-1 .4-1.6.1zm12.2-2.4V326l1.4-1c.5-.3 1-.4 1.6-.4.7 0 1.3.2 1.7.7.4.4.6 1 .6 1.9v8.6h3.2v-8.6c0-.8 0-1.5-.3-2.1-.2-.6-.4-1.2-.8-1.6-.4-.5-.9-.8-1.5-1-.5-.3-1.2-.5-2-.5a5 5 0 00-2.2.5c-.6.3-1.2.7-1.7 1.2v-7.5h-3.3v19.6h3.3zm16.2.2a6 6 0 002-.3 5 5 0 001.5-.9l-1-1.6-.1-.2h-.3-.2a4.6 4.6 0 01-.6.3h-.5a1 1 0 01-.9-.3c-.2-.2-.3-.5-.3-1v-7.3h3.6v-2.3h-3.6V318H442c-.2 0-.3 0-.5.2l-.2.4-.7 3.7-2.2.3v1.3c0 .2 0 .4.2.5l.5.2h1.3v7.6c0 1.1.4 2 1 2.7.6.7 1.5 1 2.7 1zm9.3 0c.8 0 1.6-.1 2.3-.3.7-.2 1.2-.6 1.7-1a4 4 0 001-1.4c.3-.5.4-1.1.4-1.8 0-.5-.1-1-.3-1.4l-.7-1-1.1-.7a13.1 13.1 0 00-2.4-.8l-1.1-.4-.8-.6a1 1 0 01-.2-.7c0-.4.1-.8.5-1 .4-.3.9-.5 1.5-.5a3.9 3.9 0 012 .5 34.7 34.7 0 011 .4l.4-.1.3-.3.7-1.2c-.5-.5-1.1-.9-1.9-1.2a6.7 6.7 0 00-4.8-.1c-.6.2-1.1.5-1.5.9a3.7 3.7 0 00-1.3 2.9c0 .6 0 1 .3 
1.5.1.4.4.8.7 1 .3.4.7.6 1 .8a11.4 11.4 0 002.5.9l1.1.4.8.5.2.8v.6l-.5.5-.7.4-1 .1a3.7 3.7 0 01-2-.5l-.7-.4a1 1 0 00-.5-.2 1 1 0 00-.5.2 1 1 0 00-.4.3l-.7 1.2c.2.3.5.5.9.7a7 7 0 002.4.9l1.4.1z"/>
+    <g fill-rule="nonzero">
+      <path fill="#3D4251" d="M150.3 242v-2h-5.6v-11h-2.3v13h8zm6.8-11.4c.4 0 .7-.1 1-.3.1-.2.3-.5.3-.9 0-.3-.2-.6-.4-.8-.2-.3-.5-.4-1-.4h-.4c-.4 0-.7.1-1 .4-.1.2-.3.5-.3.8 0 .4.2.7.4.9.2.2.5.3 1 .3h.4zm4 11.4v-2h-3V232H153v2h3v5.8h-3.5v2.1h8.6zm5.9.2c1.3 0 2.3-.3 3-.8.6-.5 1-1.3 1-2.3 0-.9-.3-1.6-.9-2-.5-.6-1.4-.9-2.6-1l-1.4-.2c-1-.1-1.4-.5-1.4-1.1 0-.8.6-1.2 1.6-1.2h.7c.5 0 .8.1 1 .3l.6.6h2.3c-.2-.8-.6-1.4-1.2-1.9-.7-.4-1.6-.7-2.7-.7h-.7c-1.2 0-2.2.3-2.8.8-.7.5-1 1.2-1 2.1 0 1 .3 1.6.8 2.1.6.5 1.4.8 2.5 1l1.4.1c1 .1 1.6.5 1.6 1.2 0 .4-.2.7-.5 1-.3.1-.7.3-1.3.3h-.7c-.5 0-.8-.1-1.1-.3-.3-.2-.5-.4-.6-.7h-2.3c.1.8.5 1.5 1.2 2 .7.4 1.6.7 2.8.7h.7zm14-.2v-2h-2.6a1 1 0 01-.8-.3 1 1 0 01-.3-.8v-4.8h3.8v-2h-3.8v-3h-2.2v3h-2.7v2h2.7v4.8c0 1 .3 1.7.8 2.3.6.5 1.4.8 2.3.8h2.7z"/>
+      <path fill="#67708A" d="M190.2 244v-2.1h-2.8v-12.7h2.8V227h-5V244z"/>
+      <path fill="#3D4251" d="M197.5 242l1.8-.3a3.7 3.7 0 002.3-2.1c.1-.5.3-1.1.3-1.7V233c0-.6-.2-1.1-.4-1.6a3.7 3.7 0 00-2.2-2.2l-1.8-.3h-4v13h4zm0-2h-1.8V231h1.8c.6 0 1.1.2 1.5.6.4.3.6.8.6 1.4v4.8a2 2 0 01-.6 1.5c-.4.3-.9.5-1.5.5zm10.3 2.2c.7 0 1.3-.1 1.8-.3a3.5 3.5 0 002.2-2c.2-.5.4-1 .4-1.6v-2.5a4 4 0 00-.4-1.6 3.5 3.5 0 00-2.2-2c-.5-.2-1.1-.3-1.8-.3-.6 0-1.2.1-1.7.3a3.5 3.5 0 00-2.2 2 4 4 0 00-.4 1.6v2.5c0 .5.2 1 .4 1.6.2.4.4.8.8 1.2.4.3.8.6 1.4.8.5.2 1.1.3 1.8.3zm0-2a2 2 0 01-1.5-.5c-.3-.4-.5-.8-.5-1.4v-2.5c0-.6.2-1 .5-1.4a2 2 0 011.5-.5 2 2 0 011.6.5c.3.4.5.8.5 1.4v2.5c0 .6-.2 1-.5 1.4a2 2 0 01-1.5.5zm10.5 2a5 5 0 001.6-.3 4.1 4.1 0 002.2-1.8l.5-1.4h-2.3c0 .4-.3.8-.7 1-.3.4-.8.5-1.3.5-.7 0-1.2-.1-1.5-.5-.4-.3-.6-.8-.6-1.4v-2.5c0-.6.2-1 .6-1.4.3-.4.8-.5 1.5-.5.5 0 1 .1 1.3.4.4.3.6.7.7 1.1h2.3a4.1 4.1 0 00-1.4-2.5 4 4 0 00-1.3-.7 5.3 5.3 0 00-3.4 0 4 4 0 00-1.4.8 4 4 0 00-1.2 2.8v2.5c0 .6.2 1.1.4 1.6s.5.9.8 1.2c.4.4.9.6 1.4.8.5.2 1.1.3 1.8.3z"/>
+      <path fill="#67708A" d="M230.5 244v-17h-5v2.2h2.7v12.7h-2.8v2z"/>
+    </g>
+    <path fill="#3D4251" fill-rule="nonzero" d="M446.9 242.2l1.8-.3a4 4 0 001.3-.8c.4-.4.7-.8.9-1.3.2-.5.3-1 .3-1.6v-5.4c0-.6-.1-1.1-.3-1.6a3.6 3.6 0 00-2.2-2l-1.8-.3c-.6 0-1.2 0-1.8.2l-1.3.8c-.4.4-.7.8-.9 1.3-.2.5-.3 1-.3 1.6v5.4c0 .6.1 1.1.3 1.6a3.6 3.6 0 002.2 2.1l1.8.3zm0-2a2 2 0 01-1.5-.5 2 2 0 01-.6-1.5v-5.4c0-.6.2-1 .6-1.4a2 2 0 011.5-.6 2 2 0 011.5.6c.4.3.6.8.6 1.4v5.4a2 2 0 01-.6 1.5 2 2 0 01-1.5.5zm10.3 2c1.3 0 2.3-.4 3-1 .7-.7 1-1.7 1-3v-6.1H459v6.2c0 1.3-.6 2-1.8 2-1.2 0-1.8-.7-1.8-2V232h-2.3v6.2c0 1.2.4 2.2 1.1 2.9.7.6 1.7 1 3 1zm14.3-.2v-2H469a1 1 0 01-.8-.3 1 1 0 01-.3-.8v-4.8h3.8v-2h-3.8v-3h-2.3v3H463v2h2.7v4.8c0 1 .3 1.7.9 2.3.6.5 1.3.8 2.3.8h2.7zm4.5 3v-3a8.5 8.5 0 000-1 6.3 6.3 0 00-.1-.7h.2c0 .6.4 1 .8 1.4.5.3 1 .5 1.8.5.5 0 1-.1 1.4-.3.4-.2.8-.4 1-.8.4-.3.6-.7.8-1.2l.2-1.6v-2.5a5 5 0 00-.2-1.6c-.2-.5-.4-.9-.7-1.2a3 3 0 00-1.1-.8c-.4-.2-.9-.3-1.4-.3a3 3 0 00-1.8.5c-.4.4-.7.8-.8 1.4h-.2v-1.7h-2.2v13h2.3zm1.9-4.8c-.6 0-1-.2-1.4-.5a2 2 0 01-.5-1.4v-2.5c0-.6.2-1 .5-1.4.4-.4.8-.5 1.4-.5.6 0 1 .1 1.4.5.4.3.6.8.6 1.4v2.4a2 2 0 01-.6 1.5c-.3.3-.8.5-1.4.5zm10.2 2c1.3 0 2.3-.4 3-1 .7-.7 1-1.7 1-3v-6.1H490v6.2c0 1.3-.6 2-1.8 2-1.2 0-1.8-.7-1.8-2V232H484v6.2c0 1.2.4 2.2 1.1 2.9.7.6 1.7 1 3 1zm14.3-.2v-2h-2.5a1 1 0 01-.8-.3 1 1 0 01-.3-.8v-4.8h3.8v-2h-3.8v-3h-2.3v3h-2.7v2h2.7v4.8c0 1 .3 1.7.9 2.3.6.5 1.3.8 2.3.8h2.7zm7.4 0v-10.9h3.6v-2H504v2h3.6V242h2.2zm9.3 3l4.7-12.9h-2.4l-1.8 5.3a5.9 5.9 0 00-.3 1.3l-.1.7h-.1a13.3 13.3 0 00-.5-2l-1.9-5.3h-2.5l3.8 9.3-1.3 3.7h2.4zm8.4 0v-3a8.5 8.5 0 000-1 6.3 6.3 0 00-.1-.7h.2c0 .6.4 1 .8 1.4.5.3 1 .5 1.8.5.5 0 1-.1 1.4-.3.4-.2.8-.4 1-.8.4-.3.6-.7.8-1.2l.2-1.6v-2.5a5 5 0 00-.2-1.6c-.2-.5-.4-.9-.7-1.2a3 3 0 00-1.1-.8c-.4-.2-.9-.3-1.4-.3a3 3 0 00-1.8.5c-.4.4-.7.8-.8 1.4h-.2v-1.7h-2.2v13h2.3zm1.9-4.8c-.6 0-1-.2-1.4-.5a2 2 0 01-.5-1.4v-2.5c0-.6.2-1 .5-1.4.4-.4.8-.5 1.4-.5.6 0 1 .1 1.4.5.4.3.6.8.6 1.4v2.4a2 2 0 01-.6 1.5c-.3.3-.8.5-1.4.5zm10.2 2c.5 0 1 0 1.5-.2l1.2-.5 1-.9.5-1.1h-2.2c-.2.3-.4.5-.8.6a3 3 0 01-1.2.3 2 2 0 
01-1.5-.6c-.4-.4-.6-.9-.6-1.5v-.7h6.4v-1.8a4 4 0 00-.3-1.6c-.2-.5-.5-.9-.9-1.2a4 4 0 00-1.3-.8l-1.8-.3c-.7 0-1.3.1-1.8.3-.5.2-1 .4-1.3.8-.4.3-.7.7-.9 1.2a4 4 0 00-.3 1.6v2.5c0 .6.1 1.1.3 1.6a3.5 3.5 0 002.2 2l1.8.3zm2-6.1h-4v-.3c0-.7.1-1.2.5-1.5a2 2 0 011.5-.6 2 2 0 011.5.6c.4.3.6.8.6 1.5v.3z"/>
+    <path fill="#FFF" fill-rule="nonzero" d="M270.1 192v-2h-2.5a1 1 0 01-.8-.3 1 1 0 01-.3-.8v-4.8h3.8v-2h-3.8v-3h-2.2v3h-2.7v2h2.7v4.8c0 1 .3 1.7.8 2.3.6.5 1.4.8 2.3.8h2.7zm4.5 0v-6.2c0-.6.2-1 .5-1.4.3-.4.8-.5 1.3-.5.6 0 1 .1 1.4.5.3.3.5.8.5 1.4v6.2h2.2v-6.5c0-1-.3-2-.9-2.6a3 3 0 00-2.3-1 3 3 0 00-1.8.5c-.5.3-.7.8-.9 1.4h-.1v-.2a235.4 235.4 0 01.1-1.5v-3h-2.3V192h2.3zm12.9-11.4c.4 0 .7-.1 1-.3.1-.2.3-.5.3-.9 0-.3-.2-.6-.4-.8-.2-.3-.5-.4-1-.4h-.4c-.4 0-.7.1-1 .4-.1.2-.3.5-.3.8 0 .4.2.7.4.9.2.2.5.3 1 .3h.4zm4 11.4v-2h-3V182h-5.2v2h3v5.8h-3.5v2.1h8.6zm3.7 0v-6.2c0-.6.2-1 .5-1.4.3-.4.8-.5 1.4-.5.5 0 1 .1 1.3.5.3.3.5.8.5 1.4v6.2h2.2v-6.5c0-1-.3-2-.8-2.6a3 3 0 00-2.4-1 3 3 0 00-1.8.5c-.5.3-.7.8-.9 1.4h-.1v-1.7h-2.2v9.9h2.3zm12.3.2a5 5 0 001.6-.3 4.1 4.1 0 002.2-1.8l.5-1.4h-2.3c0 .4-.3.8-.7 1-.3.4-.8.5-1.3.5-.7 0-1.2-.1-1.5-.5-.4-.3-.6-.8-.6-1.4v-2.5c0-.6.2-1 .6-1.4.3-.3.8-.5 1.5-.5.5 0 1 .1 1.3.4.4.3.6.7.7 1.1h2.3a4.1 4.1 0 00-1.4-2.5 4 4 0 00-1.3-.7 5.3 5.3 0 00-3.4 0 4 4 0 00-1.4.8 4 4 0 00-1.2 2.8v2.5c0 .6.2 1.1.4 1.6s.5.9.8 1.2c.4.4.9.6 1.4.8.5.2 1.1.3 1.8.3zm10.1 0c.6 0 1-.2 1.3-.5.3-.3.5-.8.5-1.3s-.2-1-.5-1.3c-.3-.3-.7-.4-1.3-.4-.5 0-.9.1-1.2.4-.3.3-.5.8-.5 1.3s.2 1 .5 1.3c.3.3.7.5 1.3.5zm9 0c.9 0 1.6-.2 2.2-.6.5-.4.9-.9 1-1.5h.1v1.9h2.2v-6.8a3 3 0 00-1.1-2.4l-1.4-.7a6.2 6.2 0 00-3.3 0l-1.2.6c-.4.2-.6.5-.9.8a3 3 0 00-.4 1.2h2.2c0-.3.3-.5.6-.7.4-.2.8-.3 1.3-.3.6 0 1 .1 1.4.4.3.3.5.6.5 1.1v.8h-2.5c-.6 0-1.1.1-1.6.3-.4.1-.9.3-1.2.6l-.8 1-.2 1.3c0 1 .3 1.6.8 2.2.6.5 1.4.8 2.4.8zm.9-1.8a2 2 0 01-1.3-.4c-.3-.2-.5-.6-.5-1 0-.5.2-.9.4-1.1.3-.3.7-.4 1.2-.4h2.5v1.2c0 .5-.2.9-.7 1.2-.4.4-1 .5-1.6.5zm9 4.7V192a8.5 8.5 0 00-.1-1 6.3 6.3 0 000-.7h.1c.1.6.4 1 .9 1.4.4.3 1 .5 1.8.5.5 0 1-.1 1.4-.3.4-.2.7-.4 1-.8.3-.3.5-.7.7-1.2.2-.5.3-1 .3-1.6v-2.5a5 5 0 00-.3-1.6c-.2-.5-.4-.9-.7-1.2a3 3 0 00-1-.8l-1.5-.3a3 3 0 00-1.7.5c-.5.4-.8.8-.9 1.4h-.2v-1.7h-2.1v13h2.2zm1.8-4.9c-.5 0-1-.2-1.4-.5a2 2 0 01-.5-1.4v-2.5c0-.6.2-1 .6-1.4.3-.4.8-.5 1.3-.5.7 0 1.1.1 1.5.5.3.3.5.8.5 1.4v2.4a2 2 0 
01-.5 1.5c-.4.3-.8.5-1.5.5zm11-9.6c.4 0 .7-.1 1-.3.1-.2.3-.5.3-.9 0-.3-.2-.6-.4-.8-.2-.3-.5-.4-1-.4h-.4c-.4 0-.7.1-1 .4-.1.2-.3.5-.3.8 0 .4.2.7.4.9.2.2.5.3 1 .3h.4zm4 11.4v-2h-3V182h-5.2v2h3v5.8h-3.5v2.1h8.6zm5.6.2c.5 0 1-.2 1.2-.5.3-.3.5-.8.5-1.3s-.2-1-.5-1.3c-.3-.3-.7-.4-1.3-.4-.5 0-.9.1-1.2.4-.3.3-.5.8-.5 1.3s.2 1 .5 1.3c.3.3.7.5 1.3.5zm7.8-.2v-8.4a15 15 0 00-.2-3h.2a107.9 107.9 0 01.7 2.6l1 3h1.6l1-3a10.3 10.3 0 00.4-1.7l.2-.9h.2a23.1 23.1 0 00-.2 3v8.4h2v-13h-2.7l-1.3 4.2a4.9 4.9 0 00-.2 1.2l-.1.6h-.2a12 12 0 00-.3-1.8l-1.4-4.2h-2.7v13h2zm12.8.2c.6 0 1.2-.1 1.7-.3a3.5 3.5 0 002.2-2c.2-.5.4-1 .4-1.6v-2.5a4 4 0 00-.4-1.6 3.5 3.5 0 00-2.2-2c-.5-.2-1.1-.3-1.8-.3-.6 0-1.2.1-1.7.3a3.5 3.5 0 00-2.2 2 4 4 0 00-.4 1.6v2.5c0 .5.1 1 .4 1.6.2.4.4.8.8 1.2.4.3.8.6 1.4.8.5.2 1.1.3 1.8.3zm0-2a2 2 0 01-1.6-.5c-.3-.4-.5-.8-.5-1.4v-2.5c0-.6.2-1 .5-1.4a2 2 0 011.6-.5 2 2 0 011.5.5c.3.4.5.8.5 1.4v2.5c0 .6-.2 1-.5 1.4a2 2 0 01-1.6.5zm9.4 2a3 3 0 001.7-.5c.5-.4.8-.8.9-1.4h.2v1.7h2.1v-13h-2.2v3.1a17.7 17.7 0 000 1.1v.6h-.1c-.1-.6-.4-1-.9-1.4a3 3 0 00-1.8-.5c-.5 0-1 .1-1.3.3a3 3 0 00-1.1.8c-.3.3-.5.7-.7 1.2a5 5 0 00-.3 1.6v2.5c0 .6.1 1.1.3 1.6.2.5.4.9.7 1.2.3.4.6.6 1 .8l1.5.3zm.8-2c-.7 0-1.1-.2-1.5-.5a2 2 0 01-.5-1.5v-2.4c0-.6.2-1 .5-1.4a2 2 0 011.5-.5c.5 0 1 .1 1.4.5.3.4.5.8.5 1.4v2.5c0 .5-.2 1-.5 1.4-.4.3-.9.5-1.4.5zm10.4 2c.5 0 1 0 1.4-.2l1.3-.5.9-.9c.3-.3.4-.7.5-1.1H402c-.1.3-.4.5-.7.6a3 3 0 01-1.3.3 2 2 0 01-1.5-.6c-.3-.4-.5-.9-.5-1.5v-.7h6.3v-1.8a4 4 0 00-.3-1.6c-.2-.5-.5-.9-.9-1.2a4 4 0 00-1.3-.8c-.5-.2-1.1-.3-1.8-.3-.6 0-1.2.1-1.7.3-.6.2-1 .4-1.4.8-.4.3-.6.7-.8 1.2a4 4 0 00-.4 1.6v2.5c0 .6.2 1.1.4 1.6a3.5 3.5 0 002.2 2c.5.2 1.1.3 1.8.3zm2-6.1H398v-.3c0-.7.2-1.2.5-1.5a2 2 0 011.6-.6 2 2 0 011.5.6c.3.3.5.8.5 1.5v.3zm13 5.9v-2h-3c-.4 0-.7-.1-.9-.3-.2-.2-.3-.5-.3-.9V179h-5.4v2h3.2v7.8c0 1 .2 1.8.8 2.4.6.5 1.4.8 2.4.8h3.1z"/>
+    <path fill="#3D4251" fill-rule="nonzero" d="M694.7 206.3c2.1 0 4-.4 5.5-1 1.6-.8 2.9-1.8 4-3l-2-2.1a1 1 0 00-1.1-.2l-.4.2a8 8 0 01-2.5 1.5l-1.4.4H695c-1.1 0-2.2-.1-3.1-.5-1-.5-1.8-1-2.5-1.8a8.6 8.6 0 01-1.7-3c-.4-1-.6-2.4-.6-3.8s.2-2.7.6-3.8c.4-1.2 1-2.1 1.7-3 .8-.7 1.6-1.3 2.7-1.8 1-.4 2.1-.6 3.3-.6a8.7 8.7 0 015.4 1.7c.3.3.6.4.8.4l.7-.2.3-.4 1.7-2.2a11.8 11.8 0 00-8.8-3.4c-2 0-3.8.3-5.5 1a12.4 12.4 0 00-7 7c-.6 1.5-.9 3.3-.9 5.3s.3 3.7.9 5.3a11.5 11.5 0 006.5 7c1.5.7 3.2 1 5.2 1zm20.7 0c1.4 0 2.6-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.5-1-1-2-1.9-2.9a8 8 0 00-2.8-1.9c-1.2-.4-2.4-.7-3.8-.7s-2.6.3-3.8.7a8.2 8.2 0 00-4.7 4.9c-.4 1.2-.7 2.5-.7 4 0 1.4.3 2.7.7 3.9a8 8 0 004.7 5c1.2.3 2.4.6 3.8.6zm0-3.5c-1.6 0-2.7-.5-3.5-1.5-.7-1-1-2.6-1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6s2.7.6 3.4 1.6c.7 1 1.1 2.6 1.1 4.6 0 2-.4 3.5-1 4.5-.8 1-2 1.5-3.5 1.5zm17 3.2v-13.5c.5-.6 1-1 1.6-1.3.5-.3 1.1-.5 1.8-.5 1 0 1.7.3 2.2.8.5.6.7 1.5.7 2.7V206h4.5v-11.8c0-.5 0-1 .2-1.5.2-.4.4-.8.7-1a3 3 0 011-.8l1.3-.2c1 0 1.8.3 2.3.9.5.6.8 1.5.8 2.6V206h4.4v-11.8c0-1-.1-2-.4-3a6 6 0 00-1.2-2.1c-.6-.6-1.2-1-2-1.4a8 8 0 00-6.1.3 5.9 5.9 0 00-2.6 2.7c-.3-1-.9-1.9-1.6-2.5-.8-.6-1.7-1-3-1a5.8 5.8 0 00-3.9 1.5l-1 1-.3-1.4c-.1-.5-.5-.8-1.1-.8H728V206h4.4zm30.3 6v-7.5c.6.5 1.3 1 2 1.3a7.3 7.3 0 008.4-2.3c.7-.8 1.3-1.9 1.6-3a14.9 14.9 0 000-7.9 8.8 8.8 0 00-1.3-3 6 6 0 00-5-2.5 7 7 0 00-3.3.9c-1 .5-1.9 1.2-2.6 2l-.4-1.7a1 1 0 00-.4-.6l-.7-.2h-2.7V212h4.4zm3.6-9.1a4.2 4.2 0 01-3.6-1.7V193c.6-.7 1.2-1.2 1.9-1.6a4.5 4.5 0 014-.3c.4.2.8.6 1 1 .4.5.7 1.1.8 1.9a12.3 12.3 0 010 5.5c-.2.8-.5 1.4-.9 2-.4.5-.8.9-1.4 1.1-.5.3-1.1.4-1.8.4zm20.2 3.4c1.4 0 2.7-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.4-1-1-2-1.8-2.9a8 8 0 00-3-1.9 10.2 10.2 0 00-7.5 0 8.2 8.2 0 00-4.7 4.9c-.4 1.2-.6 2.5-.6 4 0 1.4.2 2.7.6 3.9.5 1.2 1 2.2 1.9 3a8 8 0 002.9 1.9c1.1.4 2.4.7 3.7.7zm0-3.5c-1.5 0-2.7-.5-3.4-1.5-.8-1-1.1-2.6-1.1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6 1.6 0 2.7.6 3.4 1.6.8 1 1.2 
2.6 1.2 4.6 0 2-.4 3.5-1.2 4.5-.7 1-1.8 1.5-3.4 1.5zm17 3.2v-13.4l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V206h4.4v-11.8c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V206h4.5zm24 .3a14.6 14.6 0 004-.7c.6-.2 1.3-.5 1.9-1 .6-.3 1.2-.8 1.6-1.4l-1.3-1.6a1 1 0 00-.3-.3 1 1 0 00-.5 0c-.2 0-.5 0-.8.2a31.8 31.8 0 01-2.4 1c-.5.2-1.2.3-1.9.3-1.4 0-2.6-.4-3.5-1.3-.9-.9-1.4-2.3-1.5-4.2h11.9l.3-.3.2-.5v-.9a10 10 0 00-.6-3.5c-.4-1-1-2-1.6-2.7a7 7 0 00-2.6-1.6 9 9 0 00-3.2-.6c-1.4 0-2.6.3-3.7.8a8.3 8.3 0 00-4.5 4.8c-.5 1.1-.7 2.3-.7 3.6 0 1.6.3 3 .7 4.2a8.9 8.9 0 002 3.1 8 8 0 002.8 2c1.2.4 2.3.6 3.6.6zm3.7-11.7H823c.2-1.3.6-2.3 1.4-3 .7-.8 1.7-1.2 3-1.2.7 0 1.2.1 1.7.4.5.2 1 .5 1.3.9l.7 1.3.2 1.6zm12 11.4v-13.4l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V206h4.4v-11.8c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V206h4.5zm22.2.3c.9 0 1.7-.2 2.6-.4.8-.3 1.5-.7 2.2-1.3l-1.3-2.1-.3-.3-.3-.1h-.4a6.3 6.3 0 01-.8.5 2 2 0 01-.7.1c-.5 0-.9-.2-1.2-.5a2 2 0 01-.4-1.3v-10h4.9v-3.2h-4.9v-5.8h-2.3a1 1 0 00-.6.2l-.4.5-1 5-2.9.6v1.7c0 .3.1.6.3.7l.6.3h1.9v10.3c0 1.6.4 2.8 1.3 3.7.8 1 2 1.4 3.7 1.4zm9.5 0a2.8 2.8 0 002-.8 2.8 2.8 0 00.5-3 2.7 2.7 0 00-1.4-1.4c-.4-.2-.7-.3-1.1-.3a2.7 2.7 0 00-2 .8l-.5.9a2.8 2.8 0 00.5 3 2.7 2.7 0 002 .8zm10.9 5.7v-7.5a6.6 6.6 0 004.6 1.8 7.3 7.3 0 005.8-2.8c.7-.8 1.2-1.9 1.6-3 .3-1.2.5-2.5.5-3.9 0-1.5-.2-2.8-.5-4a8.8 8.8 0 00-1.4-3 6 6 0 00-5-2.5 7 7 0 00-3.3.9c-1 .5-1.8 1.2-2.6 2l-.3-1.7a1 1 0 00-.4-.6l-.8-.2h-2.7V212h4.5zm3.5-9.1a4.2 4.2 0 01-3.6-1.7V193c.6-.7 1.2-1.2 1.9-1.6a4.5 4.5 0 014-.3c.4.2.8.6 1.1 1 .3.5.6 1.1.8 1.9a12.3 12.3 0 010 5.5l-1 2c-.3.5-.8.9-1.3 1.1-.6.3-1.2.4-1.9.4zm16.8 3.1v-11.5c.5-1 1-1.7 1.7-2.2.7-.6 1.5-.8 2.4-.8l1.2.1.8.1h.4l.3-.5.6-3.4c-.6-.4-1.4-.6-2.3-.6-1.1 0-2.1.3-3 1-.9.6-1.7 1.5-2.3 
2.7l-.3-2.3c0-.4-.2-.6-.4-.8-.1-.2-.5-.3-1-.3h-2.5V206h4.4zm18.1.3a14.6 14.6 0 004-.7c.7-.2 1.4-.5 2-1 .6-.3 1.2-.8 1.6-1.4l-1.2-1.6a1 1 0 00-.4-.3 1 1 0 00-.5 0c-.2 0-.5 0-.8.2a31.8 31.8 0 01-2.4 1c-.5.2-1.1.3-1.9.3-1.4 0-2.6-.4-3.5-1.3-.9-.9-1.4-2.3-1.5-4.2h11.9l.3-.3.2-.5v-.9a10 10 0 00-.6-3.5c-.4-1-1-2-1.6-2.7a7 7 0 00-2.6-1.6 9 9 0 00-3.2-.6c-1.4 0-2.6.3-3.7.8a8.3 8.3 0 00-4.5 4.8c-.5 1.1-.7 2.3-.7 3.6 0 1.6.3 3 .7 4.2a8.9 8.9 0 002 3.1 8 8 0 002.9 2c1 .4 2.3.6 3.5.6zm3.8-11.7h-8.3c.2-1.3.6-2.3 1.4-3 .7-.8 1.7-1.2 3-1.2.7 0 1.2.1 1.7.4.5.2 1 .5 1.3.9l.7 1.3.2 1.6zm13.2 11.7a7.5 7.5 0 003.4-.8l1.3-1 1.2-1 .4 1.7c.2.5.5.8 1.1.8h2.7v-26.7H947v9.7a6.4 6.4 0 00-4.6-1.8 7.2 7.2 0 00-5.8 2.7c-.7 1-1.2 2-1.6 3.1a15 15 0 000 7.8c.3 1.2.7 2.2 1.3 3a6.1 6.1 0 005 2.5zm1.5-3.6c-.6 0-1.1 0-1.6-.3-.4-.2-.8-.5-1.2-1-.3-.5-.5-1-.7-1.8a12.3 12.3 0 010-5.6c.3-.7.5-1.4 1-2 .3-.4.8-.8 1.3-1 .6-.3 1.2-.4 1.8-.4a4.4 4.4 0 013.6 1.7v8.2a7 7 0 01-1.8 1.7c-.7.4-1.5.5-2.4.5zm16-17.8c.4 0 .7 0 1-.2a3 3 0 001.6-1.5c.2-.3.2-.7.2-1 0-.4 0-.8-.2-1.2a2.9 2.9 0 00-2.6-1.7c-.4 0-.8 0-1.1.2a2.9 2.9 0 00-1.5 1.5 2.8 2.8 0 001.5 3.7c.3.2.7.2 1 .2zm2.2 21.1v-18.5h-4.5V206h4.5zm12 .3a13.6 13.6 0 004-.7 8.7 8.7 0 003.3-2.4l-1.2-1.6a.9.9 0 00-.9-.4c-.2 0-.5.1-.7.3l-.8.5c-.3.3-.7.5-1 .6-.5.2-1.1.3-1.8.3s-1.4-.1-2-.4a4 4 0 01-1.4-1.2c-.4-.5-.7-1.2-.9-2a10 10 0 01-.3-2.6c0-1 .1-1.8.3-2.5.2-.8.5-1.4.9-2 .4-.5.9-.9 1.4-1.1.6-.3 1.3-.5 2.1-.5a5 5 0 012.6.7l.8.5.6.2c.3 0 .5 0 .6-.2l.4-.4 1.2-1.6a8.9 8.9 0 00-2.8-1.9c-1-.4-2.3-.7-3.7-.7-1.4 0-2.7.3-3.8.8a7.9 7.9 0 00-4.6 5c-.3 1.1-.5 2.4-.5 3.7 0 1.5.2 2.9.6 4a9 9 0 001.8 3c.8.9 1.6 1.5 2.6 2 1 .4 2.1.6 3.3.6zm16.2 0c1 0 1.8-.2 2.7-.4.8-.3 1.5-.7 2.2-1.3l-1.3-2.1-.3-.3-.3-.1h-.4a6.3 6.3 0 01-.8.5 2 2 0 01-.7.1c-.5 0-.9-.2-1.2-.5a2 2 0 01-.4-1.3v-10h4.9v-3.2h-4.9v-5.8h-2.3a1 1 0 00-.7.2l-.3.5-1 5-2.9.6v1.7c0 .3 0 .6.3.7.1.2.3.3.6.3h1.9v10.3c0 1.6.4 2.8 1.3 3.7.8 1 2 1.4 3.6 1.4zM1303.7 202.3c2.1 0 4-.4 5.5-1 1.6-.8 2.9-1.8 4-3l-2-2.1a1 1 0 00-1.1-.2l-.4.2a8 8 
0 01-2.5 1.5l-1.4.4h-1.8c-1.1 0-2.2-.1-3.1-.5-1-.5-1.8-1-2.5-1.8a8.6 8.6 0 01-1.7-3c-.4-1-.6-2.4-.6-3.8s.2-2.7.6-3.8c.4-1.2 1-2.1 1.7-3 .8-.7 1.6-1.3 2.7-1.8 1-.4 2.1-.6 3.3-.6a8.7 8.7 0 015.4 1.7c.3.3.6.4.8.4l.7-.2.3-.4 1.7-2.2a11.8 11.8 0 00-8.8-3.4c-2 0-3.8.3-5.5 1a12.4 12.4 0 00-7 7c-.6 1.5-.9 3.3-.9 5.3s.3 3.7.9 5.3a11.5 11.5 0 006.5 7c1.5.7 3.2 1 5.2 1zm20.7 0c1.4 0 2.6-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.5-1-1-2-1.9-2.9a8 8 0 00-2.8-1.9c-1.2-.4-2.4-.7-3.8-.7s-2.6.3-3.8.7a8.2 8.2 0 00-4.7 4.9c-.4 1.2-.7 2.5-.7 4 0 1.4.3 2.7.7 3.9a8 8 0 004.7 5c1.2.3 2.4.6 3.8.6zm0-3.5c-1.6 0-2.7-.5-3.5-1.5-.7-1-1-2.6-1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6s2.7.6 3.4 1.6c.7 1 1.1 2.6 1.1 4.6 0 2-.4 3.5-1 4.5-.8 1-2 1.5-3.5 1.5zm17 3.2v-13.5c.5-.6 1-1 1.6-1.3.5-.3 1.1-.5 1.8-.5 1 0 1.7.3 2.2.8.5.6.7 1.5.7 2.7V202h4.5v-11.8c0-.5 0-1 .2-1.5.2-.4.4-.8.7-1a3 3 0 011-.8l1.3-.2c1 0 1.8.3 2.3.9.5.6.8 1.5.8 2.6V202h4.4v-11.8c0-1-.1-2-.4-3a6 6 0 00-1.2-2.1c-.6-.6-1.2-1-2-1.4a8 8 0 00-6.1.3 5.9 5.9 0 00-2.6 2.7c-.3-1-.9-1.9-1.6-2.5-.8-.6-1.7-1-3-1a5.8 5.8 0 00-3.9 1.5l-1 1-.3-1.4c-.1-.5-.5-.8-1.1-.8h-2.7V202h4.4zm30.3 6v-7.5c.6.5 1.3 1 2 1.3a7.3 7.3 0 008.4-2.3c.7-.8 1.3-1.9 1.6-3a14.9 14.9 0 000-7.9 8.8 8.8 0 00-1.3-3 6 6 0 00-5-2.5 7 7 0 00-3.3.9c-1 .5-1.9 1.2-2.6 2l-.4-1.7a1 1 0 00-.4-.6l-.7-.2h-2.7V208h4.4zm3.6-9.1a4.2 4.2 0 01-3.6-1.7V189c.6-.7 1.2-1.2 1.9-1.6a4.5 4.5 0 014-.3c.4.2.8.6 1 1 .4.5.7 1.1.8 1.9a12.3 12.3 0 010 5.5c-.2.8-.5 1.4-.9 2-.4.5-.8.9-1.4 1.1-.5.3-1.1.4-1.8.4zm20.2 3.4c1.4 0 2.7-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.4-1-1-2-1.8-2.9a8 8 0 00-3-1.9 10.2 10.2 0 00-7.5 0 8.2 8.2 0 00-4.7 4.9c-.4 1.2-.6 2.5-.6 4 0 1.4.2 2.7.6 3.9.5 1.2 1 2.2 1.9 3a8 8 0 002.9 1.9c1.1.4 2.4.7 3.7.7zm0-3.5c-1.5 0-2.7-.5-3.4-1.5-.8-1-1.1-2.6-1.1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6 1.6 0 2.7.6 3.4 1.6.8 1 1.2 2.6 1.2 4.6 0 2-.4 3.5-1.2 4.5-.7 1-1.8 1.5-3.4 1.5zm17 3.2v-13.4l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 
2.5V202h4.4v-11.8c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V202h4.5zm24 .3a14.6 14.6 0 004-.7c.6-.2 1.3-.5 1.9-1 .6-.3 1.2-.8 1.6-1.4l-1.3-1.6a1 1 0 00-.3-.3 1 1 0 00-.5 0c-.2 0-.5 0-.8.2a31.8 31.8 0 01-2.4 1c-.5.2-1.2.3-1.9.3-1.4 0-2.6-.4-3.5-1.3-.9-.9-1.4-2.3-1.5-4.2h11.9l.3-.3.2-.5v-.9a10 10 0 00-.6-3.5c-.4-1-1-2-1.6-2.7a7 7 0 00-2.6-1.6 9 9 0 00-3.2-.6c-1.4 0-2.6.3-3.7.8a8.3 8.3 0 00-4.5 4.8c-.5 1.1-.7 2.3-.7 3.6 0 1.6.3 3 .7 4.2a8.9 8.9 0 002 3.1 8 8 0 002.8 2c1.2.4 2.3.6 3.6.6zm3.7-11.7h-8.3c.2-1.3.6-2.3 1.4-3 .7-.8 1.7-1.2 3-1.2.7 0 1.2.1 1.7.4.5.2 1 .5 1.3.9l.7 1.3.2 1.6zm12 11.4v-13.4l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V202h4.4v-11.8c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V202h4.5zm22.2.3c.9 0 1.7-.2 2.6-.4.8-.3 1.5-.7 2.2-1.3l-1.3-2.1-.3-.3-.3-.1h-.4a6.3 6.3 0 01-.8.5 2 2 0 01-.7.1c-.5 0-.9-.2-1.2-.5a2 2 0 01-.4-1.3v-10h4.9v-3.2h-4.9v-5.8h-2.3a1 1 0 00-.6.2l-.4.5-1 5-2.9.6v1.7c0 .3.1.6.3.7l.6.3h1.9v10.3c0 1.6.4 2.8 1.3 3.7.8 1 2 1.4 3.7 1.4zm9.5 0a2.8 2.8 0 002-.8 2.8 2.8 0 00.5-3 2.7 2.7 0 00-1.4-1.4c-.4-.2-.7-.3-1.1-.3a2.7 2.7 0 00-2 .8l-.5.9a2.8 2.8 0 00.5 3 2.7 2.7 0 002 .8zm18.2 5v-2.9H1488v2.9h14.2zm14.2 0v-2.9h-14.2v2.9h14.2zm9.4-5a13.6 13.6 0 004-.7 8.7 8.7 0 003.3-2.4l-1.3-1.6a.9.9 0 00-.8-.4c-.2 0-.5.1-.7.3l-.8.5c-.3.3-.7.5-1.1.6-.4.2-1 .3-1.7.3s-1.4-.1-2-.4a4 4 0 01-1.4-1.2c-.4-.5-.7-1.2-.9-2a10 10 0 01-.3-2.6c0-1 .1-1.8.3-2.5.2-.8.5-1.4.9-2 .4-.5.8-.9 1.4-1.1a5 5 0 014.7.3c.3 0 .5.2.8.4l.6.2c.3 0 .5 0 .6-.2l.4-.4 1.1-1.6a8.9 8.9 0 00-2.8-1.9c-1-.4-2.2-.7-3.6-.7s-2.7.3-3.8.8a7.9 7.9 0 00-4.6 5c-.4 1.1-.6 2.4-.6 3.7 0 1.5.3 2.9.7 4a9 9 0 001.8 3c.7.9 1.6 1.5 2.6 2 1 .4 2.1.6 3.2.6zm14.6 0c.7 0 1.3 0 1.8-.2a7.1 7.1 0 003-1.3l1.3-1 .4 1.2c.1.4.3.7.5.8l1 .2h2v-11.7c0-1-.1-2-.4-2.8-.3-1-.8-1.7-1.4-2.3a6.1 6.1 0 
00-2.1-1.5 11 11 0 00-10.6 2.4l.8 1.4.5.6c.3.2.5.2.8.2.4 0 .7 0 1-.2a45.7 45.7 0 002.2-1.1c.5-.2 1.1-.3 1.8-.3 1 0 1.8.3 2.3.9.5.6.8 1.5.8 2.7v1.1a24 24 0 00-5.3.6c-1.4.4-2.5.8-3.4 1.4-.8.5-1.4 1.1-1.8 1.8-.4.7-.6 1.4-.6 2 0 1 .2 1.6.4 2.3a4.3 4.3 0 002.8 2.5 7 7 0 002.2.3zm1.4-3a3 3 0 01-1.8-.6c-.5-.3-.7-.9-.7-1.6 0-.4.1-.8.3-1.1.2-.4.6-.7 1.2-1l2-.6 3.3-.3v3.1a9 9 0 01-1 .9c-.3.2-.6.5-1 .6l-1 .4-1.3.1zm17.7 2.7v-26.7h-4.4V202h4.4zm9.8 0v-26.7h-4.4V202h4.4zm16.8 5.3v-2.9h-14.2v2.9h14.2zm14.2 0v-2.9H1586v2.9h14.2zM1303.7 286.3c2.1 0 4-.4 5.5-1 1.6-.8 2.9-1.8 4-3l-2-2.1a1 1 0 00-1.1-.2l-.4.2a8 8 0 01-2.5 1.5l-1.4.4h-1.8c-1.1 0-2.2-.1-3.1-.5-1-.5-1.8-1-2.5-1.8a8.6 8.6 0 01-1.7-3c-.4-1-.6-2.4-.6-3.8s.2-2.7.6-3.8c.4-1.2 1-2.1 1.7-3 .8-.7 1.6-1.3 2.7-1.8 1-.4 2.1-.6 3.3-.6a8.7 8.7 0 015.4 1.7c.3.3.6.4.8.4l.7-.2.3-.4 1.7-2.2a11.8 11.8 0 00-8.8-3.4c-2 0-3.8.3-5.5 1a12.4 12.4 0 00-7 7c-.6 1.5-.9 3.3-.9 5.3s.3 3.7.9 5.3a11.5 11.5 0 006.5 7c1.5.7 3.2 1 5.2 1zm20.7 0c1.4 0 2.6-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.5-1-1-2-1.9-2.9a8 8 0 00-2.8-1.9c-1.2-.4-2.4-.7-3.8-.7s-2.6.3-3.8.7a8.2 8.2 0 00-4.7 4.9c-.4 1.2-.7 2.5-.7 4 0 1.4.3 2.7.7 3.9a8 8 0 004.7 5c1.2.3 2.4.6 3.8.6zm0-3.4c-1.6 0-2.7-.6-3.5-1.6-.7-1-1-2.6-1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6s2.7.6 3.4 1.6c.7 1 1.1 2.6 1.1 4.6 0 2-.4 3.5-1 4.5-.8 1-2 1.6-3.5 1.6zm17 3.1v-13.5c.5-.6 1-1 1.6-1.3.5-.3 1.1-.5 1.8-.5 1 0 1.7.3 2.2.8.5.6.7 1.5.7 2.7V286h4.5v-11.8c0-.5 0-1 .2-1.5.2-.4.4-.8.7-1a3 3 0 011-.8l1.3-.2c1 0 1.8.3 2.3.9.5.6.8 1.5.8 2.6V286h4.4v-11.8c0-1-.1-2-.4-3a6 6 0 00-1.2-2.1c-.6-.6-1.2-1-2-1.4a8 8 0 00-6.1.3 5.9 5.9 0 00-2.6 2.7c-.3-1-.9-1.9-1.6-2.5-.8-.6-1.7-1-3-1a5.8 5.8 0 00-3.9 1.5l-1 1-.3-1.4c-.1-.5-.5-.8-1.1-.8h-2.7V286h4.4zm30.3 6v-7.5c.6.5 1.3 1 2 1.3a7.3 7.3 0 008.4-2.3c.7-.8 1.3-1.9 1.6-3a14.9 14.9 0 000-7.9 8.8 8.8 0 00-1.3-3 6 6 0 00-5-2.5 7 7 0 00-3.3.9c-1 .5-1.9 1.2-2.6 2l-.4-1.7a1 1 0 00-.4-.6l-.7-.2h-2.7V292h4.4zm3.6-9.1a4.2 4.2 0 01-3.6-1.7V273c.6-.7 1.2-1.2 1.9-1.6a4.5 
4.5 0 014-.3c.4.2.8.6 1 1 .4.5.7 1.1.8 1.9a12.3 12.3 0 010 5.5c-.2.8-.5 1.4-.9 2-.4.5-.8.9-1.4 1.1-.5.3-1.1.4-1.8.4zm20.2 3.4c1.4 0 2.7-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.4-1-1-2-1.8-2.9a8 8 0 00-3-1.9 10.2 10.2 0 00-7.5 0 8.2 8.2 0 00-4.7 4.9c-.4 1.2-.6 2.5-.6 4 0 1.4.2 2.7.6 3.9.5 1.2 1 2.2 1.9 3a8 8 0 002.9 1.9c1.1.4 2.4.7 3.7.7zm0-3.4c-1.5 0-2.7-.6-3.4-1.6-.8-1-1.1-2.6-1.1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6 1.6 0 2.7.6 3.4 1.6.8 1 1.2 2.6 1.2 4.6 0 2-.4 3.5-1.2 4.5-.7 1-1.8 1.6-3.4 1.6zm17 3.1v-13.4l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V286h4.4v-11.8c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V286h4.5zm24 .3a14.6 14.6 0 004-.7c.6-.2 1.3-.5 1.9-1 .6-.3 1.2-.8 1.6-1.4l-1.3-1.6a1 1 0 00-.3-.3 1 1 0 00-.5 0c-.2 0-.5 0-.8.2a31.8 31.8 0 01-2.4 1c-.5.2-1.2.3-1.9.3-1.4 0-2.6-.4-3.5-1.3-.9-.9-1.4-2.3-1.5-4.2h11.9l.3-.3.2-.5v-.9a10 10 0 00-.6-3.5c-.4-1-1-2-1.6-2.7a7 7 0 00-2.6-1.6 9 9 0 00-3.2-.6c-1.4 0-2.6.3-3.7.8a8.3 8.3 0 00-4.5 4.8c-.5 1.1-.7 2.3-.7 3.6 0 1.6.3 3 .7 4.2a8.9 8.9 0 002 3.1 8 8 0 002.8 2c1.2.4 2.3.6 3.6.6zm3.7-11.7h-8.3c.2-1.3.6-2.3 1.4-3 .7-.8 1.7-1.2 3-1.2.7 0 1.2.1 1.7.4.5.2 1 .5 1.3.9l.7 1.3.2 1.6zm12 11.4v-13.4l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V286h4.4v-11.8c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V286h4.5zm22.2.3c.9 0 1.7-.2 2.6-.4.8-.3 1.5-.7 2.2-1.3l-1.3-2.1-.3-.3-.3-.1h-.4a6.3 6.3 0 01-.8.5 2 2 0 01-.7.1c-.5 0-.9-.2-1.2-.5a2 2 0 01-.4-1.3v-10h4.9v-3.2h-4.9v-5.8h-2.3a1 1 0 00-.6.2l-.4.5-1 5-2.9.6v1.7c0 .3.1.6.3.7l.6.3h1.9v10.3c0 1.6.4 2.8 1.3 3.7.8 1 2 1.4 3.7 1.4zm9.5 0a2.8 2.8 0 002-.8 2.8 2.8 0 00.5-3 2.7 2.7 0 00-1.4-1.4c-.4-.2-.7-.3-1.1-.3a2.7 2.7 0 00-2 .8l-.5.9a2.8 2.8 0 00.5 3 2.7 2.7 0 002 .8zm10.9 5.7v-7.5a6.6 6.6 0 004.6 1.8 7.3 7.3 0 
005.8-2.8c.7-.8 1.2-1.9 1.6-3 .3-1.2.5-2.5.5-3.9 0-1.5-.2-2.8-.5-4a8.8 8.8 0 00-1.4-3 6 6 0 00-5-2.5 7 7 0 00-3.3.9c-1 .5-1.8 1.2-2.6 2l-.3-1.7a1 1 0 00-.4-.6l-.8-.2h-2.7V292h4.5zm3.5-9.1a4.2 4.2 0 01-3.6-1.7V273c.6-.7 1.2-1.2 1.9-1.6a4.5 4.5 0 014-.3c.4.2.8.6 1.1 1 .3.5.6 1.1.8 1.9a12.3 12.3 0 010 5.5l-1 2c-.3.5-.8.9-1.3 1.1-.6.3-1.2.4-1.9.4zm14.9-18c.4 0 .8 0 1.1-.2a3 3 0 001.5-1.5c.2-.3.3-.7.3-1l-.3-1.2a2.9 2.9 0 00-2.6-1.7c-.4 0-.7 0-1 .2a2.9 2.9 0 00-1.6 1.5 2.8 2.8 0 001.5 3.7c.4.2.7.2 1.1.2zm2.2 21.1v-18.5h-4.4V286h4.4zm9.5 6v-7.5c.6.5 1.2 1 2 1.3a7.3 7.3 0 008.4-2.3c.7-.8 1.2-1.9 1.6-3 .3-1.2.5-2.5.5-3.9 0-1.5-.1-2.8-.5-4a8.8 8.8 0 00-1.4-3 6 6 0 00-5-2.5 7 7 0 00-3.3.9c-1 .5-1.8 1.2-2.6 2l-.3-1.7a1 1 0 00-.4-.6l-.7-.2h-2.8V292h4.5zm3.5-9.1a4.2 4.2 0 01-3.6-1.7V273c.6-.7 1.3-1.2 2-1.6a4.5 4.5 0 013.9-.3c.4.2.8.6 1.1 1 .4.5.6 1.1.8 1.9a12.3 12.3 0 010 5.5l-1 2c-.3.5-.8.9-1.3 1.1-.6.3-1.2.4-1.9.4zm20.2 3.4a14.6 14.6 0 004-.7c.7-.2 1.4-.5 2-1 .6-.3 1.2-.8 1.7-1.4l-1.3-1.6a1 1 0 00-.4-.3 1 1 0 00-.4 0c-.3 0-.6 0-.9.2a31.8 31.8 0 01-2.4 1c-.5.2-1.1.3-1.9.3-1.4 0-2.6-.4-3.5-1.3-.9-.9-1.4-2.3-1.5-4.2h11.9l.3-.3.2-.5v-.9a10 10 0 00-.6-3.5c-.4-1-1-2-1.6-2.7a7 7 0 00-2.6-1.6 9 9 0 00-3.2-.6c-1.4 0-2.6.3-3.7.8a8.3 8.3 0 00-4.5 4.8 12 12 0 000 7.8 8.9 8.9 0 002 3.1 8 8 0 002.9 2c1 .4 2.3.6 3.5.6zm3.8-11.7h-8.3c.2-1.3.6-2.3 1.4-3 .7-.8 1.7-1.2 3-1.2.7 0 1.3.1 1.7.4.5.2 1 .5 1.3.9l.7 1.3.2 1.6zM694.7 290.3c2.1 0 4-.4 5.5-1 1.6-.8 2.9-1.8 4-3l-2-2.1a1 1 0 00-1.1-.2l-.4.2a8 8 0 01-2.5 1.5l-1.4.4H695c-1.1 0-2.2-.1-3.1-.5-1-.5-1.8-1-2.5-1.8a8.6 8.6 0 01-1.7-3c-.4-1-.6-2.4-.6-3.8s.2-2.7.6-3.8c.4-1.2 1-2.1 1.7-3 .8-.7 1.6-1.3 2.7-1.8 1-.4 2.1-.6 3.3-.6a8.7 8.7 0 015.4 1.7c.3.3.6.4.8.4l.7-.2.3-.4 1.7-2.2a11.8 11.8 0 00-8.8-3.4c-2 0-3.8.3-5.5 1a12.4 12.4 0 00-7 7c-.6 1.5-.9 3.3-.9 5.3s.3 3.7.9 5.3a11.5 11.5 0 006.5 7c1.5.7 3.2 1 5.2 1zm20.7 0c1.4 0 2.6-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.5-1-1-2-1.9-2.9a8 8 0 
00-2.8-1.9c-1.2-.4-2.4-.7-3.8-.7s-2.6.3-3.8.7a8.2 8.2 0 00-4.7 4.9c-.4 1.2-.7 2.5-.7 4 0 1.4.3 2.7.7 3.9a8 8 0 004.7 5c1.2.3 2.4.6 3.8.6zm0-3.4c-1.6 0-2.7-.6-3.5-1.6-.7-1-1-2.6-1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6s2.7.6 3.4 1.6c.7 1 1.1 2.6 1.1 4.6 0 2-.4 3.5-1 4.5-.8 1-2 1.6-3.5 1.6zm17 3.1v-13.5c.5-.6 1-1 1.6-1.3.5-.3 1.1-.5 1.8-.5 1 0 1.7.3 2.2.8.5.6.7 1.5.7 2.7V290h4.5v-11.8c0-.5 0-1 .2-1.5.2-.4.4-.8.7-1a3 3 0 011-.8l1.3-.2c1 0 1.8.3 2.3.9.5.6.8 1.5.8 2.6V290h4.4v-11.8c0-1-.1-2-.4-3a6 6 0 00-1.2-2.1c-.6-.6-1.2-1-2-1.4a8 8 0 00-6.1.3 5.9 5.9 0 00-2.6 2.7c-.3-1-.9-1.9-1.6-2.5-.8-.6-1.7-1-3-1a5.8 5.8 0 00-3.9 1.5l-1 1-.3-1.4c-.1-.5-.5-.8-1.1-.8H728V290h4.4zm30.3 6v-7.5c.6.5 1.3 1 2 1.3a7.3 7.3 0 008.4-2.3c.7-.8 1.3-1.9 1.6-3a14.9 14.9 0 000-7.9 8.8 8.8 0 00-1.3-3 6 6 0 00-5-2.5 7 7 0 00-3.3.9c-1 .5-1.9 1.2-2.6 2l-.4-1.7a1 1 0 00-.4-.6l-.7-.2h-2.7V296h4.4zm3.6-9.1a4.2 4.2 0 01-3.6-1.7V277c.6-.7 1.2-1.2 1.9-1.6a4.5 4.5 0 014-.3c.4.2.8.6 1 1 .4.5.7 1.1.8 1.9a12.3 12.3 0 010 5.5c-.2.8-.5 1.4-.9 2-.4.5-.8.9-1.4 1.1-.5.3-1.1.4-1.8.4zm20.2 3.4c1.4 0 2.7-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.4-1-1-2-1.8-2.9a8 8 0 00-3-1.9 10.2 10.2 0 00-7.5 0 8.2 8.2 0 00-4.7 4.9c-.4 1.2-.6 2.5-.6 4 0 1.4.2 2.7.6 3.9.5 1.2 1 2.2 1.9 3a8 8 0 002.9 1.9c1.1.4 2.4.7 3.7.7zm0-3.4c-1.5 0-2.7-.6-3.4-1.6-.8-1-1.1-2.6-1.1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6 1.6 0 2.7.6 3.4 1.6.8 1 1.2 2.6 1.2 4.6 0 2-.4 3.5-1.2 4.5-.7 1-1.8 1.6-3.4 1.6zm17 3.1v-13.4l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V290h4.4v-11.8c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V290h4.5zm24 .3a14.6 14.6 0 004-.7c.6-.2 1.3-.5 1.9-1 .6-.3 1.2-.8 1.6-1.4l-1.3-1.6a1 1 0 00-.3-.3 1 1 0 00-.5 0c-.2 0-.5 0-.8.2a31.8 31.8 0 01-2.4 1c-.5.2-1.2.3-1.9.3-1.4 0-2.6-.4-3.5-1.3-.9-.9-1.4-2.3-1.5-4.2h11.9l.3-.3.2-.5v-.9a10 10 0 00-.6-3.5c-.4-1-1-2-1.6-2.7a7 7 0 00-2.6-1.6 9 9 0 00-3.2-.6c-1.4 
0-2.6.3-3.7.8a8.3 8.3 0 00-4.5 4.8c-.5 1.1-.7 2.3-.7 3.6 0 1.6.3 3 .7 4.2a8.9 8.9 0 002 3.1 8 8 0 002.8 2c1.2.4 2.3.6 3.6.6zm3.7-11.7H823c.2-1.3.6-2.3 1.4-3 .7-.8 1.7-1.2 3-1.2.7 0 1.2.1 1.7.4.5.2 1 .5 1.3.9l.7 1.3.2 1.6zm12 11.4v-13.4l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V290h4.4v-11.8c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V290h4.5zm22.2.3c.9 0 1.7-.2 2.6-.4.8-.3 1.5-.7 2.2-1.3l-1.3-2.1-.3-.3-.3-.1h-.4a6.3 6.3 0 01-.8.5 2 2 0 01-.7.1c-.5 0-.9-.2-1.2-.5a2 2 0 01-.4-1.3v-10h4.9v-3.2h-4.9v-5.8h-2.3a1 1 0 00-.6.2l-.4.5-1 5-2.9.6v1.7c0 .3.1.6.3.7l.6.3h1.9v10.3c0 1.6.4 2.8 1.3 3.7.8 1 2 1.4 3.7 1.4zm9.5 0a2.8 2.8 0 002-.8 2.8 2.8 0 00.5-3 2.7 2.7 0 00-1.4-1.4c-.4-.2-.7-.3-1.1-.3a2.7 2.7 0 00-2 .8l-.5.9a2.8 2.8 0 00.5 3 2.7 2.7 0 002 .8zm11.2 0a10 10 0 003.2-.5c.9-.3 1.7-.7 2.3-1.2a6.1 6.1 0 002-4.4c-.1-.8-.2-1.4-.5-2-.3-.5-.6-1-1-1.3-.4-.4-1-.7-1.5-1a18 18 0 00-3.3-1.1l-1.4-.6c-.5-.2-.8-.4-1-.7-.3-.3-.4-.6-.4-1 0-.6.2-1 .7-1.5.5-.3 1.2-.5 2-.5a5.3 5.3 0 012.7.6 47.3 47.3 0 011.5.6c.2 0 .4 0 .5-.2l.4-.4 1-1.6a7.8 7.8 0 00-2.6-1.6c-1-.4-2.2-.7-3.5-.7a9 9 0 00-3 .5c-.9.3-1.6.7-2.2 1.2a5 5 0 00-1.8 4c0 .8.2 1.5.4 2 .3.6.6 1.1 1 1.5l1.5 1a15.5 15.5 0 003.4 1.2l1.4.6c.4.2.8.4 1 .7.3.3.4.6.4 1l-.1.9-.6.7c-.2.2-.5.4-1 .5l-1.3.2a5 5 0 01-2.8-.8l-.8-.5-.8-.2c-.3 0-.5 0-.7.2-.2 0-.3.2-.5.4l-1 1.7 1.3 1a9.7 9.7 0 003.2 1.1l2 .2zm18.8 0a14.6 14.6 0 004-.7c.7-.2 1.3-.5 2-1 .6-.3 1.1-.8 1.6-1.4l-1.3-1.6a1 1 0 00-.3-.3 1 1 0 00-.5 0c-.3 0-.6 0-.8.2a31.8 31.8 0 01-2.4 1c-.6.2-1.2.3-1.9.3-1.5 0-2.6-.4-3.5-1.3-1-.9-1.4-2.3-1.6-4.2h12l.3-.3.1-.5v-.9a10 10 0 00-.5-3.5 7 7 0 00-4.2-4.3 9 9 0 00-3.2-.6c-1.4 0-2.6.3-3.7.8a8.3 8.3 0 00-4.6 4.8 12 12 0 000 7.8 8.9 8.9 0 002 3.1 8 8 0 003 2c1 .4 2.2.6 3.5.6zm3.8-11.7h-8.4c.2-1.3.7-2.3 1.4-3 .8-.8 1.8-1.2 3-1.2.7 0 1.3.1 1.8.4.5.2.9.5 1.2.9.4.4.6.8.7 1.3.2.5.3 1 .3 1.6zm13.7 11.7c.9 0 1.7-.2 2.6-.4.8-.3 1.6-.7 
2.2-1.3l-1.3-2.1-.3-.3-.3-.1h-.3a6.3 6.3 0 01-.9.5 2 2 0 01-.7.1c-.5 0-.9-.2-1.1-.5a2 2 0 01-.5-1.3v-10h4.9v-3.2h-4.9v-5.8h-2.3a1 1 0 00-.6.2l-.4.5-1 5-2.9.6v1.7c0 .3.1.6.3.7l.6.3h1.9v10.3c0 1.6.4 2.8 1.3 3.7.8 1 2 1.4 3.7 1.4zm19.6 5v-2.9H928v2.9H942zm6.9-5c.6 0 1.2 0 1.8-.2a7.1 7.1 0 002.9-1.3l1.4-1 .4 1.2c0 .4.3.7.5.8l1 .2h2v-11.7c0-1-.2-2-.5-2.8-.3-1-.7-1.7-1.3-2.3a6.1 6.1 0 00-2.2-1.5 11 11 0 00-10.6 2.4l.9 1.4c.1.3.3.4.5.6.2.2.5.2.8.2.3 0 .7 0 1-.2a45.7 45.7 0 002.2-1.1c.5-.2 1-.3 1.8-.3 1 0 1.7.3 2.3.9.5.6.7 1.5.7 2.7v1.1a24 24 0 00-5.2.6c-1.4.4-2.6.8-3.4 1.4a4.3 4.3 0 00-2.4 3.9c0 .8.1 1.5.4 2.2a4.3 4.3 0 002.8 2.5 7 7 0 002.2.3zm1.3-3a3 3 0 01-1.8-.6c-.4-.3-.7-.9-.7-1.6 0-.4.2-.8.4-1.1.2-.4.6-.7 1.1-1l2.1-.6 3.2-.3v3.1a9 9 0 01-1 .9c-.2.2-.6.5-1 .6-.2.2-.6.3-1 .4l-1.3.1zm17.5 2.7v-13.4c.6-.6 1.2-1 2-1.4.6-.3 1.3-.5 2-.5 1.1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V290h4.5v-11.8c0-1-.2-2-.5-2.8a5.3 5.3 0 00-3.1-3.7 7.8 7.8 0 00-5.9.2 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.5-.8-1.1-.8h-2.7V290h4.4zm20.5 0v-13.4c.6-.6 1.2-1 1.9-1.4.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V290h4.4v-11.8c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V290h4.5zm23.9.3c1.4 0 2.6-.3 3.7-.7a8 8 0 004.7-5c.5-1 .7-2.4.7-3.9 0-1.4-.2-2.7-.7-4-.4-1-1-2-1.8-2.9a8 8 0 00-2.9-1.9 10.2 10.2 0 00-7.5 0 8.2 8.2 0 00-4.7 4.9c-.5 1.2-.7 2.5-.7 4 0 1.4.2 2.7.7 3.9a8 8 0 004.7 5c1.1.3 2.4.6 3.8.6zm0-3.4c-1.6 0-2.7-.6-3.5-1.6-.7-1-1.1-2.6-1.1-4.5 0-2 .4-3.5 1.1-4.6.8-1 2-1.6 3.5-1.6s2.6.6 3.4 1.6c.7 1 1 2.6 1 4.6 0 2-.3 3.5-1 4.5-.8 1-1.9 1.6-3.4 1.6zm18.6 3.4c1 0 1.8-.2 2.6-.4.9-.3 1.6-.7 2.3-1.3l-1.4-2.1-.2-.3-.4-.1h-.3a6.3 6.3 0 01-.8.5 2 2 0 01-.7.1c-.5 0-.9-.2-1.2-.5a2 2 0 01-.4-1.3v-10h4.8v-3.2h-4.8v-5.8h-2.3a1 1 0 00-.7.2c-.2.1-.3.3-.3.5l-1 5-3 .6v1.7c0 .3.2.6.3.7.2.2.4.3.7.3h1.8v10.3c0 1.6.5 2.8 1.3 3.7.9 1 2.1 1.4 3.7 1.4zm12.3 0c.7 0 1.3 0 1.8-.2a7.1 7.1 0 003-1.3l1.3-1 .4 
1.2c.2.4.3.7.6.8l1 .2h2v-11.7c0-1-.2-2-.5-2.8-.3-1-.8-1.7-1.3-2.3a6.1 6.1 0 00-2.2-1.5 11 11 0 00-10.6 2.4l.8 1.4.6.6c.2.2.4.2.7.2.4 0 .7 0 1-.2a45.7 45.7 0 002.3-1.1c.4-.2 1-.3 1.8-.3 1 0 1.7.3 2.2.9.6.6.8 1.5.8 2.7v1.1a24 24 0 00-5.3.6c-1.4.4-2.5.8-3.4 1.4-.8.5-1.4 1.1-1.8 1.8-.3.7-.5 1.4-.5 2 0 1 .1 1.6.4 2.3a4.3 4.3 0 002.8 2.5 7 7 0 002.1.3zm1.4-3a3 3 0 01-1.8-.6c-.5-.3-.7-.9-.7-1.6 0-.4.1-.8.4-1.1.2-.4.5-.7 1-1l2.2-.6 3.2-.3v3.1a9 9 0 01-1 .9c-.3.2-.6.5-1 .6l-1 .4-1.3.1zm19.1 3c.9 0 1.8-.2 2.6-.4.9-.3 1.6-.7 2.2-1.3l-1.3-2.1-.3-.3-.3-.1h-.3a6.3 6.3 0 01-.9.5 2 2 0 01-.7.1c-.4 0-.8-.2-1.1-.5a2 2 0 01-.4-1.3v-10h4.8v-3.2h-4.8v-5.8h-2.3a1 1 0 00-.7.2c-.2.1-.3.3-.3.5l-1 5-3 .6v1.7c0 .3.1.6.3.7l.6.3h2v10.3c0 1.6.3 2.8 1.2 3.7.9 1 2.1 1.4 3.7 1.4zm10.4-21.4c.4 0 .8 0 1.1-.2a3 3 0 001.5-1.5c.2-.3.3-.7.3-1l-.3-1.2a2.9 2.9 0 00-2.6-1.7c-.4 0-.8 0-1.1.2a2.9 2.9 0 00-1.5 1.5 2.8 2.8 0 001.5 3.7c.3.2.7.2 1 .2zm2.2 21.1v-18.5h-4.4V290h4.4zm13 .3c1.3 0 2.5-.3 3.7-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.5-1-1-2-1.9-2.9a8 8 0 00-2.8-1.9c-1.2-.4-2.4-.7-3.8-.7s-2.6.3-3.8.7a8.2 8.2 0 00-4.7 4.9c-.4 1.2-.7 2.5-.7 4 0 1.4.3 2.7.7 3.9.4 1.2 1 2.2 1.9 3a8 8 0 002.8 1.9c1.2.4 2.4.7 3.8.7zm0-3.4c-1.7 0-2.8-.6-3.5-1.6-.8-1-1.2-2.6-1.2-4.5 0-2 .4-3.5 1.1-4.6.8-1 2-1.6 3.5-1.6s2.7.6 3.4 1.6c.8 1 1.1 2.6 1.1 4.6 0 2-.3 3.5-1 4.5-.8 1-2 1.6-3.5 1.6zm17 3.1v-13.4c.5-.6 1.2-1 1.8-1.4.7-.3 1.4-.5 2.2-.5 1 0 1.8.3 2.4 1 .5.5.7 1.4.7 2.5V290h4.5v-11.8c0-1-.2-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.3.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.1-.5-.5-.8-1-.8h-2.8V290h4.4zm20.7.3a10 10 0 003.2-.5 5.4 5.4 0 003.7-3.2c.4-.7.5-1.6.5-2.5 0-.7-.1-1.3-.4-1.9-.2-.5-.6-1-1-1.3l-1.4-1a18 18 0 00-3.4-1.1l-1.4-.6c-.4-.2-.8-.4-1-.7-.3-.3-.4-.6-.4-1 0-.6.2-1 .7-1.5a5.3 5.3 0 014.6 0 47.3 47.3 0 011.6.7c.2 0 .4 0 .5-.2.2 0 .3-.2.4-.4l1-1.6a7.8 7.8 0 00-2.6-1.6c-1-.4-2.2-.7-3.5-.7a9 9 0 00-3 .5c-.8.3-1.6.7-2.2 1.2a5 5 0 00-1.7 4c0 .8.1 1.5.4 2 .2.6.6 1.1 1 
1.5l1.4 1a15.5 15.5 0 003.4 1.2l1.5.6 1 .7c.2.3.4.6.4 1 0 .3 0 .6-.2.9l-.5.7-1 .5-1.3.2a5 5 0 01-2.8-.8c-.4-.1-.7-.3-.9-.5l-.8-.2c-.3 0-.5 0-.7.2-.2 0-.3.2-.4.4l-1 1.7 1.2 1a9.7 9.7 0 003.3 1.1l1.8.2zM694.7 404.3c2.1 0 4-.4 5.5-1 1.6-.8 2.9-1.8 4-3l-2-2.1a1 1 0 00-1.1-.2l-.4.2a8 8 0 01-2.5 1.5l-1.4.4H695c-1.1 0-2.2-.1-3.1-.5-1-.5-1.8-1-2.5-1.8a8.6 8.6 0 01-1.7-3c-.4-1-.6-2.4-.6-3.8s.2-2.7.6-3.8c.4-1.2 1-2.1 1.7-3 .8-.7 1.6-1.3 2.7-1.8 1-.4 2.1-.6 3.3-.6a8.7 8.7 0 015.4 1.7c.3.3.6.4.8.4l.7-.2.3-.4 1.7-2.2a11.8 11.8 0 00-8.8-3.4c-2 0-3.8.3-5.5 1a12.4 12.4 0 00-7 7c-.6 1.5-.9 3.3-.9 5.3s.3 3.7.9 5.3a11.5 11.5 0 006.5 7c1.5.7 3.2 1 5.2 1zm20.7 0c1.4 0 2.6-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.5-1-1-2-1.9-2.9a8 8 0 00-2.8-1.9c-1.2-.4-2.4-.7-3.8-.7s-2.6.3-3.8.7a8.2 8.2 0 00-4.7 4.9c-.4 1.2-.7 2.5-.7 4 0 1.4.3 2.7.7 3.9a8 8 0 004.7 5c1.2.3 2.4.6 3.8.6zm0-3.4c-1.6 0-2.7-.6-3.5-1.6-.7-1-1-2.6-1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6s2.7.6 3.4 1.6c.7 1 1.1 2.6 1.1 4.6 0 2-.4 3.5-1 4.5-.8 1-2 1.6-3.5 1.6zm17 3.1v-13.5c.5-.6 1-1 1.6-1.3.5-.3 1.1-.5 1.8-.5 1 0 1.7.3 2.2.8.5.6.7 1.5.7 2.7V404h4.5v-11.8c0-.5 0-1 .2-1.5.2-.4.4-.8.7-1a3 3 0 011-.8l1.3-.2c1 0 1.8.3 2.3.9.5.6.8 1.5.8 2.6V404h4.4v-11.8c0-1-.1-2-.4-3a6 6 0 00-1.2-2.1c-.6-.6-1.2-1-2-1.4a8 8 0 00-6.1.3 5.9 5.9 0 00-2.6 2.7c-.3-1-.9-1.9-1.6-2.5-.8-.6-1.7-1-3-1a5.8 5.8 0 00-3.9 1.5l-1 1-.3-1.4c-.1-.5-.5-.8-1.1-.8H728V404h4.4zm30.3 6v-7.5c.6.5 1.3 1 2 1.3a7.3 7.3 0 008.4-2.3c.7-.8 1.3-1.9 1.6-3a14.9 14.9 0 000-7.9 8.8 8.8 0 00-1.3-3 6 6 0 00-5-2.5 7 7 0 00-3.3.9c-1 .5-1.9 1.2-2.6 2l-.4-1.7a1 1 0 00-.4-.6l-.7-.2h-2.7V410h4.4zm3.6-9.1a4.2 4.2 0 01-3.6-1.7V391c.6-.7 1.2-1.2 1.9-1.6a4.5 4.5 0 014-.3c.4.2.8.6 1 1 .4.5.7 1.1.8 1.9a12.3 12.3 0 010 5.5c-.2.8-.5 1.4-.9 2-.4.5-.8.9-1.4 1.1-.5.3-1.1.4-1.8.4zm20.2 3.4c1.4 0 2.7-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.4-1-1-2-1.8-2.9a8 8 0 00-3-1.9 10.2 10.2 0 00-7.5 0 8.2 8.2 0 00-4.7 4.9c-.4 1.2-.6 2.5-.6 4 0 1.4.2 2.7.6 3.9.5 1.2 1 2.2 1.9 
3a8 8 0 002.9 1.9c1.1.4 2.4.7 3.7.7zm0-3.4c-1.5 0-2.7-.6-3.4-1.6-.8-1-1.1-2.6-1.1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6 1.6 0 2.7.6 3.4 1.6.8 1 1.2 2.6 1.2 4.6 0 2-.4 3.5-1.2 4.5-.7 1-1.8 1.6-3.4 1.6zm17 3.1v-13.4l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V404h4.4v-11.8c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V404h4.5zm24 .3a14.6 14.6 0 004-.7c.6-.2 1.3-.5 1.9-1 .6-.3 1.2-.8 1.6-1.4l-1.3-1.6a1 1 0 00-.3-.3 1 1 0 00-.5 0c-.2 0-.5 0-.8.2a31.8 31.8 0 01-2.4 1c-.5.2-1.2.3-1.9.3-1.4 0-2.6-.4-3.5-1.3-.9-.9-1.4-2.3-1.5-4.2h11.9l.3-.3.2-.5v-.9a10 10 0 00-.6-3.5c-.4-1-1-2-1.6-2.7a7 7 0 00-2.6-1.6 9 9 0 00-3.2-.6c-1.4 0-2.6.3-3.7.8a8.3 8.3 0 00-4.5 4.8c-.5 1.1-.7 2.3-.7 3.6 0 1.6.3 3 .7 4.2a8.9 8.9 0 002 3.1 8 8 0 002.8 2c1.2.4 2.3.6 3.6.6zm3.7-11.7H823c.2-1.3.6-2.3 1.4-3 .7-.8 1.7-1.2 3-1.2.7 0 1.2.1 1.7.4.5.2 1 .5 1.3.9l.7 1.3.2 1.6zm12 11.4v-13.4l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V404h4.4v-11.8c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V404h4.5zm22.2.3c.9 0 1.7-.2 2.6-.4.8-.3 1.5-.7 2.2-1.3l-1.3-2.1-.3-.3-.3-.1h-.4a6.3 6.3 0 01-.8.5 2 2 0 01-.7.1c-.5 0-.9-.2-1.2-.5a2 2 0 01-.4-1.3v-10h4.9v-3.2h-4.9v-5.8h-2.3a1 1 0 00-.6.2l-.4.5-1 5-2.9.6v1.7c0 .3.1.6.3.7l.6.3h1.9v10.3c0 1.6.4 2.8 1.3 3.7.8 1 2 1.4 3.7 1.4zm9.5 0a2.8 2.8 0 002-.8 2.8 2.8 0 00.5-3 2.7 2.7 0 00-1.4-1.4c-.4-.2-.7-.3-1.1-.3a2.7 2.7 0 00-2 .8l-.5.9a2.8 2.8 0 00.5 3 2.7 2.7 0 002 .8zm9-21.4c.3 0 .7 0 1-.2a3 3 0 001.6-1.5l.2-1c0-.4 0-.8-.2-1.2a2.9 2.9 0 00-2.7-1.7c-.3 0-.7 0-1 .2a2.9 2.9 0 00-1.5 1.5 2.8 2.8 0 001.5 3.7c.3.2.7.2 1 .2zM886 404v-18.5h-4.5V404h4.5zm9.4 0v-13.4c.6-.6 1.3-1 2-1.4.6-.3 1.3-.5 2-.5 1.1 0 2 .3 2.4 1 .6.5.8 1.4.8 2.5V404h4.5v-11.8c0-1-.2-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.8-.4-1.6-.6-2.7-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 
00-2.5 1.9l-.3-1.5c-.1-.5-.5-.8-1.1-.8h-2.7V404h4.4zm18.6-21c.4 0 .7-.1 1-.3a3 3 0 001.6-1.5c.2-.3.2-.7.2-1 0-.4 0-.8-.2-1.2a2.9 2.9 0 00-2.6-1.7c-.4 0-.8 0-1.1.2a2.9 2.9 0 00-1.5 1.5 2.8 2.8 0 001.5 3.7c.3.2.7.2 1 .2zm2.2 21v-18.5h-4.5V404h4.5zm11 .3c1 0 1.8-.2 2.7-.4.8-.3 1.6-.7 2.2-1.3l-1.3-2.1-.3-.3-.3-.1h-.3a6.3 6.3 0 01-.9.5 2 2 0 01-.7.1c-.5 0-.9-.2-1.1-.5a2 2 0 01-.5-1.3v-10h4.9v-3.2h-4.9v-5.8h-2.3a1 1 0 00-.6.2l-.4.5-.9 5-3 .6v1.7c0 .3.1.6.3.7l.6.3h1.9v10.3c0 1.6.4 2.8 1.3 3.7.8 1 2 1.4 3.7 1.4zm10.5-21.4c.3 0 .7 0 1-.2a3 3 0 001.6-1.5c.2-.3.2-.7.2-1 0-.4 0-.8-.2-1.2a2.9 2.9 0 00-2.6-1.7c-.4 0-.8 0-1.1.2a2.9 2.9 0 00-1.5 1.5 2.8 2.8 0 001.5 3.7c.3.2.7.2 1 .2zM940 404v-18.5h-4.5V404h4.5zm9.4.3c.7 0 1.3 0 1.8-.2a7.1 7.1 0 003-1.3l1.3-1 .4 1.2c.2.4.3.7.6.8l1 .2h2v-11.7c0-1-.2-2-.5-2.8-.3-1-.8-1.7-1.3-2.3a6.1 6.1 0 00-2.2-1.5 11 11 0 00-10.6 2.4l.8 1.4.6.6c.2.2.4.2.7.2.4 0 .7 0 1-.2a45.7 45.7 0 002.3-1.1c.4-.2 1-.3 1.8-.3 1 0 1.7.3 2.2.9.6.6.8 1.5.8 2.7v1.1a24 24 0 00-5.3.6c-1.4.4-2.5.8-3.4 1.4-.8.5-1.4 1.1-1.8 1.8-.3.7-.5 1.4-.5 2 0 1 .1 1.6.4 2.3a4.3 4.3 0 002.8 2.5 7 7 0 002.1.3zm1.4-3a3 3 0 01-1.8-.6c-.5-.3-.7-.9-.7-1.6 0-.4.1-.8.4-1.1.2-.4.5-.7 1-1l2.2-.6 3.2-.3v3.1a9 9 0 01-1 .9c-.3.2-.6.5-1 .6l-1 .4-1.3.1zm17.8 2.7v-26.7H964V404h4.5zm7.5-21c.4 0 .8-.1 1.1-.3a3 3 0 001.6-1.5l.2-1c0-.4 0-.8-.2-1.2a2.9 2.9 0 00-2.7-1.7c-.4 0-.7 0-1 .2a2.9 2.9 0 00-1.6 1.5 2.8 2.8 0 001.5 3.7c.4.2.7.2 1.1.2zm2.2 21v-18.5H974V404h4.4zm18.2 0v-3.4h-9.2l8.8-11.6a3 3 0 00.6-1.6v-1.9h-14v3.4h9l-9 11.7a2.6 2.6 0 00-.5 1.5v1.9h14.3zm11.6.3a14.6 14.6 0 004-.7c.7-.2 1.3-.5 2-1 .6-.3 1.1-.8 1.6-1.4l-1.3-1.6a1 1 0 00-.3-.3 1 1 0 00-.5 0c-.3 0-.6 0-.9.2a31.8 31.8 0 01-2.3 1c-.6.2-1.2.3-2 .3-1.4 0-2.5-.4-3.4-1.3-1-.9-1.4-2.3-1.6-4.2h12l.3-.3.1-.5v-.9a10 10 0 00-.5-3.5 7 7 0 00-4.2-4.3 9 9 0 00-3.2-.6c-1.4 0-2.6.3-3.7.8a8.3 8.3 0 00-4.6 4.8 12 12 0 000 7.8 8.9 8.9 0 002 3.1 8 8 0 003 2c1 .4 2.2.6 3.5.6zm3.8-11.7h-8.4c.2-1.3.7-2.3 1.4-3 .8-.8 1.8-1.2 3-1.2.7 0 1.3.1 1.8.4.5.2.9.5 
1.2.9.3.4.6.8.7 1.3.2.5.3 1 .3 1.6zM694.7 95.3c2.1 0 4-.4 5.5-1 1.6-.8 2.9-1.8 4-3l-2-2.1a1 1 0 00-1.1-.2l-.4.2a8 8 0 01-2.5 1.5l-1.4.4H695c-1.1 0-2.2-.1-3.1-.5-1-.5-1.8-1-2.5-1.8a8.6 8.6 0 01-1.7-3c-.4-1-.6-2.3-.6-3.8 0-1.4.2-2.7.6-3.8.4-1.2 1-2.1 1.7-3 .8-.7 1.6-1.3 2.7-1.8 1-.4 2.1-.6 3.3-.6a8.7 8.7 0 015.4 1.7c.3.3.6.4.8.4l.7-.2.3-.4 1.7-2.2a11.8 11.8 0 00-8.8-3.4c-2 0-3.8.3-5.5 1a12.4 12.4 0 00-7 7c-.6 1.5-.9 3.3-.9 5.3s.3 3.7.9 5.3a11.5 11.5 0 006.5 7c1.5.7 3.2 1 5.2 1zm20.7 0c1.4 0 2.6-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.5-1-1-2-1.9-2.9a8 8 0 00-2.8-1.9c-1.2-.4-2.4-.7-3.8-.7s-2.6.3-3.8.7a8.2 8.2 0 00-4.7 4.9c-.4 1.2-.7 2.5-.7 4 0 1.4.3 2.7.7 3.9a8 8 0 004.7 5c1.2.3 2.4.6 3.8.6zm0-3.5c-1.6 0-2.7-.5-3.5-1.5-.7-1-1-2.6-1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6s2.7.6 3.4 1.6c.7 1 1.1 2.6 1.1 4.6 0 2-.4 3.5-1 4.5-.8 1-2 1.5-3.5 1.5zm17 3.2V81.5c.5-.6 1-1 1.6-1.3.5-.3 1.1-.5 1.8-.5 1 0 1.7.3 2.2.8.5.6.7 1.5.7 2.7V95h4.5V83.2c0-.5 0-1 .2-1.5.2-.4.4-.8.7-1a3 3 0 011-.8l1.3-.2c1 0 1.8.3 2.3.9.5.6.8 1.5.8 2.6V95h4.4V83.2c0-1-.1-2-.4-3a6 6 0 00-1.2-2.1c-.6-.6-1.2-1-2-1.4a8 8 0 00-6.1.3 5.9 5.9 0 00-2.6 2.7c-.3-1-.9-1.9-1.6-2.5-.8-.6-1.7-1-3-1a5.8 5.8 0 00-3.9 1.5l-1 1-.3-1.4c-.1-.5-.5-.8-1.1-.8H728V95h4.4zm30.3 6v-7.5c.6.5 1.3 1 2 1.3a7.3 7.3 0 008.4-2.3c.7-.8 1.3-1.9 1.6-3a14.9 14.9 0 000-7.9 8.8 8.8 0 00-1.3-3 6 6 0 00-5-2.5 7 7 0 00-3.3.9c-1 .5-1.9 1.2-2.6 2l-.4-1.7a1 1 0 00-.4-.6l-.7-.2h-2.7V101h4.4zm3.6-9.1a4.2 4.2 0 01-3.6-1.7V82c.6-.7 1.2-1.2 1.9-1.6a4.5 4.5 0 014-.3c.4.2.8.6 1 1 .4.5.7 1.1.8 1.9a12.3 12.3 0 010 5.5c-.2.8-.5 1.4-.9 2-.4.5-.8.9-1.4 1.1-.5.3-1.1.4-1.8.4zm20.2 3.4c1.4 0 2.7-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.4-1-1-2-1.8-2.9a8 8 0 00-3-1.9 10.2 10.2 0 00-7.5 0 8.2 8.2 0 00-4.7 4.9c-.4 1.2-.6 2.5-.6 4 0 1.4.2 2.7.6 3.9.5 1.2 1 2.2 1.9 3a8 8 0 002.9 1.9c1.1.4 2.4.7 3.7.7zm0-3.5c-1.5 0-2.7-.5-3.4-1.5-.8-1-1.1-2.6-1.1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6 1.6 0 2.7.6 3.4 1.6.8 1 1.2 2.6 1.2 4.6 0 2-.4 
3.5-1.2 4.5-.7 1-1.8 1.5-3.4 1.5zm17 3.2V81.6l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V95h4.4V83.2c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V95h4.5zm24 .3a14.6 14.6 0 004-.7c.6-.2 1.3-.5 1.9-1 .6-.3 1.2-.8 1.6-1.4l-1.3-1.6a1 1 0 00-.3-.3 1 1 0 00-.5 0c-.2 0-.5 0-.8.2a31.8 31.8 0 01-2.4 1c-.5.2-1.2.3-1.9.3-1.4 0-2.6-.4-3.5-1.3-.9-.9-1.4-2.3-1.5-4.2h11.9l.3-.3.2-.5v-.9a10 10 0 00-.6-3.5c-.4-1-1-2-1.6-2.7a7 7 0 00-2.6-1.6 9 9 0 00-3.2-.6c-1.4 0-2.6.3-3.7.8a8.3 8.3 0 00-4.5 4.8c-.5 1.1-.7 2.3-.7 3.6 0 1.6.3 3 .7 4.2a8.9 8.9 0 002 3.1 8 8 0 002.8 2c1.2.4 2.3.6 3.6.6zm3.7-11.7H823c.2-1.3.6-2.3 1.4-3 .7-.8 1.7-1.2 3-1.2.7 0 1.2.1 1.7.4.5.2 1 .5 1.3.9l.7 1.3.2 1.6zm12 11.4V81.6l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V95h4.4V83.2c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V95h4.5zm22.2.3c.9 0 1.7-.1 2.6-.4.8-.3 1.5-.7 2.2-1.3l-1.3-2.1-.3-.3-.3-.1h-.4a6.3 6.3 0 01-.8.5 2 2 0 01-.7.1c-.5 0-.9-.2-1.2-.5a2 2 0 01-.4-1.3v-10h4.9v-3.2h-4.9v-5.8h-2.3a1 1 0 00-.6.2l-.4.5-1 5-2.9.6v1.7c0 .3.1.6.3.7l.6.3h1.9v10.3c0 1.6.4 2.8 1.3 3.7.8 1 2 1.4 3.7 1.4zm9.5 0a2.8 2.8 0 002-.8 2.8 2.8 0 00.5-3A2.7 2.7 0 00876 90l-1.1-.2a2.7 2.7 0 00-2 .8l-.5.9a2.8 2.8 0 00.5 3 2.7 2.7 0 002 .8zm18.2 5v-2.9H879v2.9h14.2zm14.2 0v-2.9h-14.2v2.9h14.2zm5-26.4c.3 0 .7 0 1-.2a3 3 0 001.6-1.5l.2-1c0-.4 0-.8-.2-1.2a2.9 2.9 0 00-2.7-1.7c-.4 0-.7 0-1 .2a2.9 2.9 0 00-1.6 1.5 2.8 2.8 0 001.5 3.7c.4.2.7.2 1.1.2zm2.1 21.1V76.5H910V95h4.4zm9.5 0V81.6c.6-.6 1.2-1 1.9-1.4.7-.3 1.4-.5 2.2-.5 1 0 1.8.3 2.3 1 .5.5.8 1.4.8 2.5V95h4.4V83.2c0-1-.1-2-.4-2.8-.2-.8-.6-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.1-.8h-2.8V95h4.5zm18.5-21c.4 0 .8-.1 1.1-.3a3 3 0 001.6-1.5l.2-1c0-.4 0-.8-.2-1.2a2.9 2.9 0 
00-2.7-1.7c-.4 0-.7 0-1 .2a2.9 2.9 0 00-1.6 1.5 2.8 2.8 0 001.5 3.7c.4.2.7.2 1.1.2zm2.2 21V76.5h-4.4V95h4.4zm11.1.3c1 0 1.8-.1 2.6-.4.9-.3 1.6-.7 2.3-1.3l-1.4-2.1-.2-.3-.4-.1h-.3a6.3 6.3 0 01-.8.5 2 2 0 01-.7.1c-.5 0-.9-.2-1.2-.5a2 2 0 01-.4-1.3v-10h4.9v-3.2h-5v-5.8H953a1 1 0 00-.7.2c-.1.1-.3.3-.3.5l-1 5-3 .6v1.7c0 .3.2.6.4.7.1.2.3.3.6.3h1.8v10.3c0 1.6.5 2.8 1.3 3.7 1 1 2.1 1.4 3.7 1.4zm19.7 5v-2.9h-14.2v2.9h14.2zm14.2 0v-2.9h-14.2v2.9h14.2zM694.7 488.3c2.1 0 4-.4 5.5-1 1.6-.8 2.9-1.8 4-3l-2-2.1a1 1 0 00-1.1-.2l-.4.2a8 8 0 01-2.5 1.5l-1.4.4H695c-1.1 0-2.2-.1-3.1-.5-1-.5-1.8-1-2.5-1.8a8.6 8.6 0 01-1.7-3c-.4-1-.6-2.4-.6-3.8s.2-2.7.6-3.8c.4-1.2 1-2.1 1.7-3 .8-.7 1.6-1.3 2.7-1.8 1-.4 2.1-.6 3.3-.6a8.7 8.7 0 015.4 1.7c.3.3.6.4.8.4l.7-.2.3-.4 1.7-2.2a11.8 11.8 0 00-8.8-3.4c-2 0-3.8.3-5.5 1a12.4 12.4 0 00-7 7c-.6 1.5-.9 3.3-.9 5.3s.3 3.7.9 5.3a11.5 11.5 0 006.5 7c1.5.7 3.2 1 5.2 1zm20.7 0c1.4 0 2.6-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.5-1-1-2-1.9-2.9a8 8 0 00-2.8-1.9c-1.2-.4-2.4-.7-3.8-.7s-2.6.3-3.8.7a8.2 8.2 0 00-4.7 4.9c-.4 1.2-.7 2.5-.7 4 0 1.4.3 2.7.7 3.9a8 8 0 004.7 5c1.2.3 2.4.6 3.8.6zm0-3.4c-1.6 0-2.7-.6-3.5-1.6-.7-1-1-2.6-1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6s2.7.6 3.4 1.6c.7 1 1.1 2.6 1.1 4.6 0 2-.4 3.5-1 4.5-.8 1-2 1.6-3.5 1.6zm17 3.1v-13.5c.5-.6 1-1 1.6-1.3.5-.3 1.1-.5 1.8-.5 1 0 1.7.3 2.2.8.5.6.7 1.5.7 2.7V488h4.5v-11.8c0-.5 0-1 .2-1.5.2-.4.4-.8.7-1a3 3 0 011-.8l1.3-.2c1 0 1.8.3 2.3.9.5.6.8 1.5.8 2.6V488h4.4v-11.8c0-1-.1-2-.4-3a6 6 0 00-1.2-2.1c-.6-.6-1.2-1-2-1.4a8 8 0 00-6.1.3 5.9 5.9 0 00-2.6 2.7c-.3-1-.9-1.9-1.6-2.5-.8-.6-1.7-1-3-1a5.8 5.8 0 00-3.9 1.5l-1 1-.3-1.4c-.1-.5-.5-.8-1.1-.8H728V488h4.4zm30.3 6v-7.5c.6.5 1.3 1 2 1.3a7.3 7.3 0 008.4-2.3c.7-.8 1.3-1.9 1.6-3a14.9 14.9 0 000-7.9 8.8 8.8 0 00-1.3-3 6 6 0 00-5-2.5 7 7 0 00-3.3.9c-1 .5-1.9 1.2-2.6 2l-.4-1.7a1 1 0 00-.4-.6l-.7-.2h-2.7V494h4.4zm3.6-9.1a4.2 4.2 0 01-3.6-1.7V475c.6-.7 1.2-1.2 1.9-1.6a4.5 4.5 0 014-.3c.4.2.8.6 1 1 .4.5.7 1.1.8 1.9a12.3 12.3 0 010 5.5c-.2.8-.5 1.4-.9 
2-.4.5-.8.9-1.4 1.1-.5.3-1.1.4-1.8.4zm20.2 3.4c1.4 0 2.7-.3 3.8-.7a8 8 0 004.7-5c.4-1 .6-2.4.6-3.9 0-1.4-.2-2.7-.6-4-.4-1-1-2-1.8-2.9a8 8 0 00-3-1.9 10.2 10.2 0 00-7.5 0 8.2 8.2 0 00-4.7 4.9c-.4 1.2-.6 2.5-.6 4 0 1.4.2 2.7.6 3.9.5 1.2 1 2.2 1.9 3a8 8 0 002.9 1.9c1.1.4 2.4.7 3.7.7zm0-3.4c-1.5 0-2.7-.6-3.4-1.6-.8-1-1.1-2.6-1.1-4.5 0-2 .3-3.5 1-4.6.8-1 2-1.6 3.5-1.6 1.6 0 2.7.6 3.4 1.6.8 1 1.2 2.6 1.2 4.6 0 2-.4 3.5-1.2 4.5-.7 1-1.8 1.6-3.4 1.6zm17 3.1v-13.4l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V488h4.4v-11.8c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V488h4.5zm24 .3a14.6 14.6 0 004-.7c.6-.2 1.3-.5 1.9-1 .6-.3 1.2-.8 1.6-1.4l-1.3-1.6a1 1 0 00-.3-.3 1 1 0 00-.5 0c-.2 0-.5 0-.8.2a31.8 31.8 0 01-2.4 1c-.5.2-1.2.3-1.9.3-1.4 0-2.6-.4-3.5-1.3-.9-.9-1.4-2.3-1.5-4.2h11.9l.3-.3.2-.5v-.9a10 10 0 00-.6-3.5c-.4-1-1-2-1.6-2.7a7 7 0 00-2.6-1.6 9 9 0 00-3.2-.6c-1.4 0-2.6.3-3.7.8a8.3 8.3 0 00-4.5 4.8c-.5 1.1-.7 2.3-.7 3.6 0 1.6.3 3 .7 4.2a8.9 8.9 0 002 3.1 8 8 0 002.8 2c1.2.4 2.3.6 3.6.6zm3.7-11.7H823c.2-1.3.6-2.3 1.4-3 .7-.8 1.7-1.2 3-1.2.7 0 1.2.1 1.7.4.5.2 1 .5 1.3.9l.7 1.3.2 1.6zm12 11.4v-13.4l2-1.4c.6-.3 1.4-.5 2.1-.5 1 0 1.9.3 2.4 1 .5.5.8 1.4.8 2.5V488h4.4v-11.8c0-1-.1-2-.4-2.8-.3-.8-.7-1.6-1.2-2.2a5.3 5.3 0 00-2-1.4c-.7-.4-1.6-.6-2.6-.6a7.8 7.8 0 00-3.2.7 8.2 8.2 0 00-2.5 1.9l-.3-1.5c-.2-.5-.6-.8-1.2-.8h-2.7V488h4.5zm22.2.3c.9 0 1.7-.2 2.6-.4.8-.3 1.5-.7 2.2-1.3l-1.3-2.1-.3-.3-.3-.1h-.4a6.3 6.3 0 01-.8.5 2 2 0 01-.7.1c-.5 0-.9-.2-1.2-.5a2 2 0 01-.4-1.3v-10h4.9v-3.2h-4.9v-5.8h-2.3a1 1 0 00-.6.2l-.4.5-1 5-2.9.6v1.7c0 .3.1.6.3.7l.6.3h1.9v10.3c0 1.6.4 2.8 1.3 3.7.8 1 2 1.4 3.7 1.4zm9.5 0a2.8 2.8 0 002-.8 2.8 2.8 0 00.5-3 2.7 2.7 0 00-1.4-1.4c-.4-.2-.7-.3-1.1-.3a2.7 2.7 0 00-2 .8l-.5.9a2.8 2.8 0 00.5 3 2.7 2.7 0 002 .8zm12.3 0a7.5 7.5 0 005.7-2.6l.3 1.5c.2.5.5.8 1.1.8h2.7v-18.5h-4.4V483c-.6.6-1.2 1-2 1.4-.6.3-1.3.5-2 .5-1.1 0-1.9-.3-2.4-1a4 4 0 
01-.8-2.5v-11.8h-4.5v11.8c0 1 .2 2 .4 2.8.3.9.7 1.6 1.2 2.2.6.6 1.2 1.1 2 1.5.8.3 1.7.5 2.7.5zm19 5.7v-7.5c.6.5 1.3 1 2 1.3a7.3 7.3 0 008.4-2.3c.7-.8 1.3-1.9 1.6-3a14.9 14.9 0 000-7.9 8.8 8.8 0 00-1.3-3 6 6 0 00-5-2.5 7 7 0 00-3.3.9c-1 .5-1.9 1.2-2.6 2l-.4-1.7a1 1 0 00-.4-.6l-.7-.2h-2.7V494h4.4zm3.6-9.1a4.2 4.2 0 01-3.6-1.7V475c.6-.7 1.2-1.2 1.9-1.6a4.5 4.5 0 013.9-.3c.5.2.9.6 1.2 1 .3.5.6 1.1.7 1.9a12.3 12.3 0 010 5.5c-.2.8-.5 1.4-.9 2-.4.5-.8.9-1.4 1.1-.5.3-1.1.4-1.8.4zm18 3.4a7.5 7.5 0 003.3-.8c.5-.3 1-.6 1.3-1l1.2-1 .4 1.7c.2.5.6.8 1.1.8h2.8v-26.7h-4.5v9.7a6.4 6.4 0 00-4.6-1.8 7.2 7.2 0 00-5.8 2.7c-.7 1-1.2 2-1.6 3.1a15 15 0 000 7.8c.3 1.2.8 2.2 1.4 3 .6.8 1.3 1.4 2.1 1.8.9.5 1.8.7 2.8.7zm1.4-3.6c-.6 0-1 0-1.5-.3-.5-.2-1-.5-1.2-1-.4-.5-.6-1-.8-1.8a12.3 12.3 0 010-5.6c.3-.7.6-1.4 1-1.9.3-.5.8-.9 1.3-1.1a4.9 4.9 0 013.7 0c.7.2 1.2.6 1.7 1.2v8.3a7 7 0 01-1.8 1.7c-.7.4-1.5.5-2.4.5zm18 3.6c.6 0 1.2 0 1.7-.2a7.1 7.1 0 003-1.3l1.3-1 .4 1.2c.1.4.3.7.5.8l1 .2h2v-11.7c0-1-.1-2-.4-2.8-.3-1-.8-1.7-1.4-2.3a6.1 6.1 0 00-2.1-1.5 11 11 0 00-10.6 2.4l.8 1.4.5.6c.3.2.5.2.8.2.4 0 .7 0 1-.2a45.7 45.7 0 002.2-1.1c.5-.2 1.1-.3 1.8-.3 1 0 1.8.3 2.3.9.5.6.8 1.5.8 2.7v1.1a24 24 0 00-5.3.6c-1.4.4-2.5.8-3.4 1.4-.8.5-1.4 1.1-1.8 1.8-.4.7-.6 1.4-.6 2 0 1 .2 1.6.4 2.3a4.3 4.3 0 002.9 2.5 7 7 0 002.1.3zm1.3-3a3 3 0 01-1.8-.6c-.5-.3-.7-.9-.7-1.6 0-.4.1-.8.3-1.1.3-.4.6-.7 1.2-1l2-.6 3.3-.3v3.1a9 9 0 01-1 .9c-.3.2-.6.5-1 .6l-1 .4-1.3.1zm19 3c1 0 1.8-.2 2.7-.4.8-.3 1.6-.7 2.2-1.3l-1.3-2.1-.3-.3-.3-.1h-.3a6.3 6.3 0 01-.9.5 2 2 0 01-.7.1c-.5 0-.8-.2-1.1-.5a2 2 0 01-.5-1.3v-10h4.9v-3.2H967v-5.8h-2.3a1 1 0 00-.6.2l-.4.5-.9 5-3 .6v1.7c0 .3.1.6.3.7l.6.3h1.9v10.3c0 1.6.4 2.8 1.3 3.7.9 1 2 1.4 3.7 1.4zm15.8 0a14.6 14.6 0 004-.7c.7-.2 1.3-.5 2-1 .6-.3 1.1-.8 1.6-1.4l-1.3-1.6a1 1 0 00-.3-.3 1 1 0 00-.5 0c-.3 0-.6 0-.9.2a31.8 31.8 0 01-2.3 1c-.6.2-1.2.3-2 .3-1.4 0-2.6-.4-3.4-1.3-1-.9-1.4-2.3-1.6-4.2h11.9l.4-.3.1-.5v-.9a10 10 0 00-.5-3.5 7 7 0 00-4.2-4.3 9 9 0 00-3.2-.6c-1.4 0-2.6.3-3.7.8a8.3 8.3 0 00-4.6 
4.8 12 12 0 000 7.8 8.9 8.9 0 002 3.1 8 8 0 003 2c1 .4 2.2.6 3.5.6zm3.8-11.7h-8.4c.2-1.3.7-2.3 1.4-3 .8-.8 1.8-1.2 3-1.2.7 0 1.3.1 1.8.4.5.2.9.5 1.2.9.3.4.6.8.7 1.3.2.5.3 1 .3 1.6zM1627.4 196l1.8-.3a3.7 3.7 0 002.3-2.1c.1-.5.3-1.1.3-1.7V187c0-.6-.2-1.1-.4-1.6a3.7 3.7 0 00-2.2-2.2l-1.8-.3h-4v13h4zm0-2h-1.8V185h1.8c.6 0 1.1.2 1.5.6.4.3.6.8.6 1.4v4.8a2 2 0 01-.6 1.5c-.4.3-.9.5-1.5.5zm10.3 2.2c.7 0 1.3-.1 1.8-.3a3.5 3.5 0 002.2-2c.3-.5.4-1 .4-1.6v-2.5a4 4 0 00-.4-1.6 3.5 3.5 0 00-2.2-2c-.5-.2-1.1-.3-1.8-.3-.6 0-1.2.1-1.7.3a3.5 3.5 0 00-2.2 2 4 4 0 00-.4 1.6v2.5c0 .5.1 1 .4 1.6.2.4.4.8.8 1.2.4.3.8.6 1.4.8.5.2 1.1.3 1.8.3zm0-2a2 2 0 01-1.5-.5c-.3-.4-.5-.8-.5-1.4v-2.5c0-.6.2-1 .5-1.4a2 2 0 011.5-.5 2 2 0 011.6.5c.3.4.5.8.5 1.4v2.5c0 .6-.2 1-.5 1.4a2 2 0 01-1.5.5zm10.5 2a5 5 0 001.6-.3 4.1 4.1 0 002.2-1.8l.5-1.4h-2.3c0 .4-.3.8-.7 1-.3.4-.8.5-1.3.5-.7 0-1.2-.1-1.5-.5-.4-.3-.6-.8-.6-1.4v-2.5c0-.6.2-1 .6-1.4.3-.3.8-.5 1.5-.5.5 0 1 .1 1.3.4.4.3.6.7.7 1.1h2.3a4.1 4.1 0 00-1.4-2.5 4 4 0 00-1.3-.7 5.3 5.3 0 00-3.4 0 4 4 0 00-1.4.8 4 4 0 00-1.2 2.8v2.5c0 .6.2 1.1.4 1.6s.5.9.8 1.2c.4.4.9.6 1.4.8.5.2 1.1.3 1.8.3z"/>
+    <g fill-rule="nonzero">
+      <path fill="#3D4251" d="M1591.4 278v-2h-2.8v-9h2.8v-2h-7.9v2h2.8v9h-2.8v2h7.9zm10.3 0v-2h-2.5a1 1 0 01-.8-.3 1 1 0 01-.3-.8v-4.8h3.8v-2h-3.8v-3h-2.2v3h-2.7v2h2.7v4.8c0 1 .3 1.7.8 2.3.6.5 1.4.8 2.3.8h2.7zm6.4.2c.5 0 1 0 1.4-.2l1.3-.5.9-.9c.3-.3.4-.7.5-1.1h-2.2c-.1.3-.4.5-.7.6a3 3 0 01-1.3.3 2 2 0 01-1.5-.6c-.3-.4-.5-.9-.5-1.5v-.7h6.3v-1.8a4 4 0 00-.3-1.6c-.2-.5-.5-.9-.9-1.2a4 4 0 00-1.3-.8c-.5-.2-1.1-.3-1.8-.3-.6 0-1.2.1-1.7.3-.6.2-1 .4-1.4.8-.4.3-.6.7-.8 1.2a4 4 0 00-.4 1.6v2.5c0 .6.1 1.1.4 1.6a3.5 3.5 0 002.2 2c.5.2 1.1.3 1.7.3zm2-6.1h-4.1v-.3c0-.7.2-1.2.5-1.5a2 2 0 011.5-.6 2 2 0 011.6.6c.3.3.5.8.5 1.5v.3zm6.8 5.9v-6.2c0-.6.2-1 .5-1.4.3-.4.8-.6 1.4-.6.6 0 1 .2 1.4.6.4.3.5.8.5 1.4v.6h2.4v-.8a4 4 0 00-1-2.7c-.5-.7-1.4-1-2.4-1-.8 0-1.4.2-1.9.5-.4.3-.7.8-.9 1.4h-.1v-1.7h-2.1v9.9h2.2zm10.5.2c.8 0 1.5-.2 2-.6.6-.4 1-.9 1-1.5h.2v1.9h2.2v-6.8a3 3 0 00-1.1-2.4l-1.4-.7a6.2 6.2 0 00-3.3 0l-1.2.6c-.4.2-.6.5-.9.8a3 3 0 00-.4 1.2h2.2c0-.3.3-.5.6-.7.4-.2.8-.3 1.3-.3.6 0 1 .1 1.4.4.3.3.5.6.5 1.1v.8h-2.5c-.6 0-1.1.1-1.6.3-.4.1-.9.3-1.2.6l-.8 1-.2 1.3c0 1 .3 1.6.8 2.2.6.5 1.4.8 2.4.8zm.8-1.8a2 2 0 01-1.3-.4c-.3-.2-.5-.6-.5-1 0-.5.2-.9.4-1.1.3-.3.7-.4 1.2-.4h2.5v1.2c0 .5-.2.9-.7 1.2-.4.4-1 .5-1.6.5zm14.7 1.6v-2h-2.5a1 1 0 01-.8-.3 1 1 0 01-.3-.8v-4.8h3.8v-2h-3.8v-3h-2.2v3h-2.7v2h2.7v4.8c0 1 .3 1.7.8 2.3.6.5 1.4.8 2.3.8h2.7zm6.3.2c.7 0 1.3-.1 1.8-.3a3.5 3.5 0 002.2-2c.2-.5.4-1 .4-1.6v-2.5a4 4 0 00-.4-1.6 3.5 3.5 0 00-2.2-2c-.5-.2-1.1-.3-1.8-.3-.6 0-1.2.1-1.7.3a3.5 3.5 0 00-2.2 2 4 4 0 00-.4 1.6v2.5c0 .5.1 1 .4 1.6.2.4.4.8.8 1.2.4.3.8.6 1.4.8.5.2 1.1.3 1.7.3zm0-2a2 2 0 01-1.5-.5c-.3-.4-.5-.8-.5-1.4v-2.5c0-.6.2-1 .5-1.4a2 2 0 011.5-.5 2 2 0 011.6.5c.3.4.5.8.5 1.4v2.5c0 .6-.2 1-.5 1.4a2 2 0 01-1.6.5zm9 1.8v-6.2c0-.6 0-1 .4-1.4.3-.4.8-.6 1.4-.6.6 0 1 .2 1.4.6.4.3.5.8.5 1.4v.6h2.4v-.8a4 4 0 00-1-2.7c-.5-.7-1.4-1-2.4-1-.8 0-1.4.2-1.9.5-.4.3-.7.8-.9 1.4h-.1v-1.7h-2.1v9.9h2.2z"/>
+      <path fill="#67708A" d="M1672.8 280v-2.1h-2.8v-12.7h2.8V263h-5V280z"/>
+      <path fill="#3D4251" d="M1680 278c.7 0 1.3-.1 1.9-.3a3.7 3.7 0 002.3-2.1c.1-.5.3-1.1.3-1.7V269c0-.6-.2-1.1-.4-1.6a3.7 3.7 0 00-2.2-2.2l-1.8-.3h-4v13h4zm0-2h-1.7V267h1.8c.6 0 1.1.2 1.5.6.4.3.6.8.6 1.4v4.8a2 2 0 01-.6 1.5c-.4.3-.9.5-1.5.5zm10.5 2.2c.6 0 1.2-.1 1.7-.3a3.5 3.5 0 002.2-2c.3-.5.4-1 .4-1.6v-2.5a4 4 0 00-.4-1.6 3.5 3.5 0 00-2.2-2 5.4 5.4 0 00-3.5 0 3.5 3.5 0 00-2.2 2 4 4 0 00-.4 1.6v2.5c0 .5.2 1 .4 1.6.2.4.4.8.8 1.2.4.3.8.6 1.4.8.5.2 1.1.3 1.8.3zm0-2a2 2 0 01-1.6-.5c-.3-.4-.5-.8-.5-1.4v-2.5c0-.6.2-1 .5-1.4a2 2 0 011.6-.5 2 2 0 011.5.5c.3.4.5.8.5 1.4v2.5c0 .6-.2 1-.5 1.4a2 2 0 01-1.5.5zm10.4 2a5 5 0 001.6-.3 4.1 4.1 0 002.2-1.8l.5-1.4h-2.3c0 .4-.3.8-.7 1-.3.4-.8.5-1.3.5-.7 0-1.2-.2-1.5-.5-.4-.3-.6-.8-.6-1.4v-2.5c0-.6.2-1 .6-1.4.3-.4.8-.5 1.5-.5.5 0 1 .1 1.3.4.4.3.6.7.7 1.1h2.3a4.1 4.1 0 00-1.4-2.5 4 4 0 00-1.3-.7 5.3 5.3 0 00-3.4 0 4 4 0 00-1.4.8 4 4 0 00-1.2 2.8v2.5c0 .6.2 1.1.4 1.6s.5.9.8 1.2c.4.4.9.6 1.4.8.5.2 1.1.3 1.8.3z"/>
+      <path fill="#67708A" d="M1713.1 280v-17h-5v2.2h2.7v12.7h-2.8v2z"/>
+    </g>
+    <path fill="#3D4251" fill-rule="nonzero" d="M1031.9 199.2l1.8-.3a4 4 0 001.3-.8c.4-.4.7-.8.9-1.3.2-.5.3-1 .3-1.6v-5.4c0-.6-.1-1.1-.3-1.6a3.6 3.6 0 00-2.2-2l-1.8-.3c-.6 0-1.2 0-1.8.2l-1.3.8c-.4.4-.7.8-.9 1.3-.2.5-.3 1-.3 1.6v5.4c0 .6.1 1.1.3 1.6a3.6 3.6 0 002.2 2.1l1.8.3zm0-2a2 2 0 01-1.5-.5 2 2 0 01-.6-1.5v-5.4c0-.6.2-1 .6-1.4a2 2 0 011.5-.6 2 2 0 011.5.6c.4.3.6.8.6 1.4v5.4a2 2 0 01-.6 1.5 2 2 0 01-1.5.5zm10.3 2c1.3 0 2.3-.4 3-1 .7-.7 1-1.7 1-3v-6.1h-2.2v6.2c0 1.3-.6 2-1.8 2-1.2 0-1.8-.7-1.8-2V189h-2.3v6.2c0 1.2.4 2.2 1.1 2.9.7.6 1.7 1 3 1zm14.3-.2v-2h-2.5a1 1 0 01-.8-.3 1 1 0 01-.3-.8v-4.8h3.8v-2h-3.8v-3h-2.3v3h-2.7v2h2.7v4.8c0 1 .3 1.7.9 2.3.6.5 1.3.8 2.3.8h2.7zm4.5 3v-3a8.5 8.5 0 000-1 6.3 6.3 0 00-.1-.7h.2c0 .6.4 1 .8 1.4.5.3 1 .5 1.8.5.5 0 1-.1 1.4-.3.4-.2.8-.4 1-.8.4-.3.6-.7.8-1.2l.2-1.6v-2.5a5 5 0 00-.2-1.6c-.2-.5-.4-.9-.7-1.2a3 3 0 00-1.1-.8c-.4-.2-.9-.3-1.4-.3a3 3 0 00-1.8.5c-.4.4-.7.8-.8 1.4h-.2v-1.7h-2.2v13h2.3zm1.9-4.8c-.6 0-1-.2-1.4-.5a2 2 0 01-.5-1.4v-2.5c0-.6.2-1 .5-1.4.4-.4.8-.5 1.4-.5.6 0 1 .1 1.4.5.4.3.6.8.6 1.4v2.4a2 2 0 01-.6 1.5c-.3.3-.8.5-1.4.5zm10.2 2c1.3 0 2.3-.4 3-1 .7-.7 1-1.7 1-3v-6.1h-2.2v6.2c0 1.3-.6 2-1.8 2-1.2 0-1.8-.7-1.8-2V189h-2.3v6.2c0 1.2.4 2.2 1.1 2.9.7.6 1.7 1 3 1zm14.3-.2v-2h-2.5a1 1 0 01-.8-.3 1 1 0 01-.3-.8v-4.8h3.8v-2h-3.8v-3h-2.3v3h-2.7v2h2.7v4.8c0 1 .3 1.7.9 2.3.6.5 1.3.8 2.3.8h2.7zm7.4 0v-10.9h3.6v-2h-9.4v2h3.6V199h2.2zm9.3 3l4.7-12.9h-2.4l-1.8 5.3a5.9 5.9 0 00-.3 1.3l-.1.7h-.1a13.3 13.3 0 00-.5-2l-1.9-5.3h-2.5l3.8 9.3-1.3 3.7h2.4zm8.4 0v-3a8.5 8.5 0 000-1 6.3 6.3 0 00-.1-.7h.2c0 .6.4 1 .8 1.4.5.3 1 .5 1.8.5.5 0 1-.1 1.4-.3.4-.2.8-.4 1-.8.4-.3.6-.7.8-1.2l.2-1.6v-2.5a5 5 0 00-.2-1.6c-.2-.5-.4-.9-.7-1.2a3 3 0 00-1.1-.8c-.4-.2-.9-.3-1.4-.3a3 3 0 00-1.8.5c-.4.4-.7.8-.8 1.4h-.2v-1.7h-2.2v13h2.3zm1.9-4.8c-.6 0-1-.2-1.4-.5a2 2 0 01-.5-1.4v-2.5c0-.6.2-1 .5-1.4.4-.4.8-.5 1.4-.5.6 0 1 .1 1.4.5.4.3.6.8.6 1.4v2.4a2 2 0 01-.6 1.5c-.3.3-.8.5-1.4.5zm10.2 2c.5 0 1 0 1.5-.2l1.2-.5 1-.9.5-1.1h-2.2c-.2.3-.4.5-.8.6a3 3 0 01-1.2.3 2 2 0 
01-1.5-.6c-.4-.4-.6-.9-.6-1.5v-.7h6.4v-1.8a4 4 0 00-.3-1.6c-.2-.5-.5-.9-.9-1.2a4 4 0 00-1.3-.8l-1.8-.3c-.7 0-1.3.1-1.8.3-.5.2-1 .4-1.3.8-.4.3-.7.7-.9 1.2a4 4 0 00-.3 1.6v2.5c0 .6.1 1.1.3 1.6a3.5 3.5 0 002.2 2l1.8.3zm2-6.1h-4v-.3c0-.7.1-1.2.5-1.5a2 2 0 011.5-.6 2 2 0 011.5.6c.4.3.6.8.6 1.5v.3z"/>
+  </g>
+</svg>
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 24c7bf1cf..7fa60e0f1 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -646,7 +646,9 @@ get_candidates = model.attrs["get_candidates"]
 
 To use our new relation extraction model as part of a custom
 [trainable component](/usage/processing-pipelines#trainable-components), we
-create a subclass of [`Pipe`](/api/pipe) that holds the model:
+create a subclass of [`Pipe`](/api/pipe) that holds the model.
+
+![Illustration of Pipe methods](../images/trainable_component.svg)
 
 ```python
 ### Pipeline component skeleton
@@ -826,7 +828,7 @@ def __call__(self, Doc doc):
 
 Once our `Pipe` subclass is fully implemented, we can
 [register](/usage/processing-pipelines#custom-components-factories) the
-component with the [`@Language.factory`](/api/lnguage#factory) decorator. This
+component with the [`@Language.factory`](/api/language#factory) decorator. This
 assigns it a name and lets you create the component with
 [`nlp.add_pipe`](/api/language#add_pipe) and via the
 [config](/usage/training#config).
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index c8224dfc9..8b4e39ee9 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1172,13 +1172,15 @@ doc = nlp("This is a text...")
 spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
 components that have their own model instance, make predictions over `Doc`
 objects and can be updated using [`spacy train`](/api/cli#train). This lets you
-plug fully custom machine learning components into your pipeline. You'll need
-the following:
+plug fully custom machine learning components into your pipeline.
+
+![Illustration of Pipe methods](../images/trainable_component.svg)
+
+You'll need the following:
 
 1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
-   can be a model implemented in
-   [Thinc](/usage/layers-architectures#thinc), or a
-   [wrapped model](/usage/layers-architectures#frameworks) implemented in
+   can be a model implemented in [Thinc](/usage/layers-architectures#thinc), or
+   a [wrapped model](/usage/layers-architectures#frameworks) implemented in
    PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a
    list of [`Doc`](/api/doc) objects as input and can have any type of output.
 2. **Pipe subclass:** A subclass of [`Pipe`](/api/pipe) that implements at least
@@ -1283,7 +1285,7 @@ loss is calculated and to add evaluation scores to the training output.
 For more details on how to implement your own trainable components and model
 architectures, and plug existing models implemented in PyTorch or TensorFlow
 into your spaCy pipeline, see the usage guide on
-[layers and model architectures](/usage/layers-architectures).
+[layers and model architectures](/usage/layers-architectures#components).
 
 </Infobox>
 
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index 5d7c7d7a5..c315c5f76 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -404,8 +404,73 @@ import Training101 from 'usage/101/\_training.md'
 <Infobox title="Training pipelines and models" emoji="📖">
 
 To learn more about **training and updating** pipelines, how to create training
-data and how to improve spaCy's named entity recognition models, see the usage
-guides on [training](/usage/training).
+data and how to improve spaCy's named entity recognition models, see the usage
+guides on [training](/usage/training).
+
+</Infobox>
+
+### Training config and lifecycle {#training-config}
+
+Training config files include all **settings and hyperparameters** for training
+your pipeline. Instead of providing lots of arguments on the command line, you
+only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train).
+This also makes it easy to integrate custom models and architectures, written in
+your framework of choice. A pipeline's `config.cfg` is considered the "single
+source of truth", both at **training** and **runtime**.
+
+> ```ini
+> ### config.cfg (excerpt)
+> [training]
+> accumulate_gradient = 3
+>
+> [training.optimizer]
+> @optimizers = "Adam.v1"
+>
+> [training.optimizer.learn_rate]
+> @schedules = "warmup_linear.v1"
+> warmup_steps = 250
+> total_steps = 20000
+> initial_rate = 0.01
+> ```
+
+![Illustration of pipeline lifecycle](../images/lifecycle.svg)
+
+<Infobox title="Training configuration system" emoji="📖">
+
+For more details on spaCy's **configuration system** and how to use it to
+customize your pipeline components, component models, training settings and
+hyperparameters, see the [training config](/usage/training#config) usage guide.
+
+</Infobox>
+
+### Trainable components {#training-components}
+
+spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
+components that have their own model instance, make predictions over `Doc`
+objects and can be updated using [`spacy train`](/api/cli#train). This lets you
+plug fully custom machine learning components into your pipeline that can be
+configured via a single training config.
+
+> #### config.cfg (excerpt)
+>
+> ```ini
+> [components.my_component]
+> factory = "my_component"
+>
+> [components.my_component.model]
+> @architectures = "my_model.v1"
+> width = 128
+> ```
+
+![Illustration of Pipe methods](../images/trainable_component.svg)
+
+<Infobox title="Custom trainable components" emoji="📖">
+
+To learn more about how to implement your own **model architectures** and use
+them to power custom **trainable components**, see the usage guides on the
+[trainable component API](/usage/processing-pipelines#trainable-components) and
+implementing [layers and architectures](/usage/layers-architectures#components)
+for trainable components.
 
 </Infobox>
 

From cfb9770a94980db9e385724568b434b7790e8bc2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 6 Oct 2020 14:15:41 +0200
Subject: [PATCH 451/516] Fix empty input into StaticVectors layer (#6211)

* Add test for empty doc(s)

* Fix empty check in staticvectors

* Remove xfail

* Update spacy/ml/staticvectors.py
---
 spacy/ml/staticvectors.py  |  2 +-
 spacy/tests/test_models.py | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index c77247d33..da731dadb 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -34,7 +34,7 @@ def StaticVectors(
 def forward(
     model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool
 ) -> Tuple[Ragged, Callable]:
-    if not len(docs):
+    if not sum(len(doc) for doc in docs):
         return _handle_empty(model.ops, model.get_dim("nO"))
     key_attr = model.attrs["key_attr"]
     W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index 17408f7e8..8ca7f8b66 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -7,6 +7,7 @@ import numpy
 
 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
+from spacy.ml.staticvectors import StaticVectors
 from spacy.lang.en import English
 from spacy.lang.en.examples import sentences as EN_SENTENCES
 
@@ -185,3 +186,22 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
     model1 = get_updated_model()
     model2 = get_updated_model()
     assert_array_equal(get_all_params(model1), get_all_params(model2))
+
+
+@pytest.mark.parametrize(
+    "model_func,kwargs",
+    [
+        (StaticVectors, {"nO": 128, "nM": 300}),
+    ]
+)
+def test_empty_docs(model_func, kwargs):
+    nlp = English()
+    model = model_func(**kwargs).initialize()
+    # Test the layer can be called successfully with 0, 1 and 2 empty docs.
+    for n_docs in range(3):
+        docs = [nlp("") for _ in range(n_docs)]
+        # Test predict
+        _ = model.predict(docs)
+        # Test backprop
+        output, backprop = model.begin_update(docs)
+        _ = backprop(output)

From fff3f8ccfaec48dcfdd6b19e6811070724d33c80 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 6 Oct 2020 14:16:05 +0200
Subject: [PATCH 452/516] Fix packaging pin (#6212)

* pin packaging to >=20.0

* ignore spacy-pkuseg in requirements unit test
---
 requirements.txt                         | 2 +-
 setup.cfg                                | 2 +-
 spacy/tests/package/test_requirements.py | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 29695e9b4..3f3886a60 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,7 @@ pydantic>=1.5.0,<2.0.0
 pytokenizations
 # Official Python utilities
 setuptools
-packaging
+packaging>=20.0
 importlib_metadata>=0.20; python_version < "3.8"
 typing_extensions>=3.7.4; python_version < "3.8"
 # Development dependencies
diff --git a/setup.cfg b/setup.cfg
index e77bda2fc..eef4fcf67 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -55,7 +55,7 @@ install_requires =
     pytokenizations
     # Official Python utilities
     setuptools
-    packaging
+    packaging>=20.0
     importlib_metadata>=0.20; python_version < "3.8"
     typing_extensions>=3.7.4; python_version < "3.8"
 
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index 6cc8fa6a8..8145beba9 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -10,12 +10,14 @@ def test_build_dependencies():
         "mock",
         "flake8",
     ]
+    # ignore language-specific packages that shouldn't be installed by all
     libs_ignore_setup = [
         "fugashi",
         "natto-py",
         "pythainlp",
         "sudachipy",
         "sudachidict_core",
+        "spacy-pkuseg",
     ]
 
     # check requirements.txt

From 1a500f9717bd92b2d376bde6aec387e3dfd92878 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 6 Oct 2020 14:19:07 +0200
Subject: [PATCH 453/516] Set version to v3.0.0a35

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 373d1d2b0..108689074 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a34"
+__version__ = "3.0.0a35"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From ce14520789eae6123589423613ce513ae74ead1e Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 6 Oct 2020 14:35:17 +0200
Subject: [PATCH 454/516] Update docs [ci skip]

---
 website/docs/usage/v3.md | 39 +++++++++++++++++----------------------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index a10fc6321..1024a2551 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -168,9 +168,13 @@ follow the same unified [`Model`](https://thinc.ai/docs/api-model) API and each
 `Model` can also be used as a sublayer of a larger network, allowing you to
 freely combine implementations from different frameworks into a single model.
 
+![Illustration of Pipe methods](../images/trainable_component.svg)
+
 <Infobox title="Details & Documentation" emoji="📖" list>
 
-- **Usage: ** [Layers and architectures](/usage/layers-architectures)
+- **Usage: ** [Layers and architectures](/usage/layers-architectures),
+  [Trainable component API](/usage/processing-pipelines#trainable-components),
+  [Trainable components and models](/usage/layers-architectures#components)
 - **Thinc: **
   [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks),
   [`Model` API](https://thinc.ai/docs/api-model)
@@ -503,36 +507,27 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 - Pipeline package symlinks, the `link` command and shortcut names are now
   deprecated. There can be many [different trained pipelines](/models) and not
   just one "English model", so you should always use the full package name like
-  [`en_core_web_sm`](/models/en) explicitly.
-- A pipeline's [`meta.json`](/api/data-formats#meta) is now only used to provide
-  meta information like the package name, author, license and labels. It's
-  **not** used to construct the processing pipeline anymore. This is all defined
-  in the [`config.cfg`](/api/data-formats#config), which also includes all
-  settings used to train the pipeline.
-- The [`train`](/api/cli#train) and [`pretrain`](/api/cli#pretrain) commands now
-  only take a `config.cfg` file containing the full
-  [training config](/usage/training#config).
+  `en_core_web_sm` explicitly.
+- A pipeline's `meta.json` is now only used to provide meta information like the
+  package name, author, license and labels. It's **not** used to construct the
+  processing pipeline anymore. This is all defined in the
+  [`config.cfg`](/api/data-formats#config), which also includes all settings
+  used to train the pipeline.
+- The `train`, `pretrain` and `debug data` commands now only take a
+  `config.cfg`.
 - [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
   the component factory instead of the component function.
 - **Custom pipeline components** now need to be decorated with the
   [`@Language.component`](/api/language#component) or
   [`@Language.factory`](/api/language#factory) decorator.
-- [`Language.update`](/api/language#update) now takes a batch of
-  [`Example`](/api/example) objects instead of raw texts and annotations, or
-  `Doc` and `GoldParse` objects.
-- The `Language.disable_pipes` context manager has been replaced by
-  [`Language.select_pipes`](/api/language#select_pipes), which can explicitly
-  disable or enable components.
 - The [`Language.update`](/api/language#update),
   [`Language.evaluate`](/api/language#evaluate) and
   [`Pipe.update`](/api/pipe#update) methods now all take batches of
   [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
   raw text and a dictionary of annotations.
-  [`Language.initialize`](/api/language#initialize) and
-  [`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
-  sequence of `Example` objects to initialize the model instead of a list of
-  tuples.
-- The `begin_training` methods have been renamed to `initialize`.
+- The `begin_training` methods have been renamed to `initialize` and now take a
+  function that returns a sequence of `Example` objects to initialize the model
+  instead of a list of tuples.
 - [`Matcher.add`](/api/matcher#add) and
   [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
   patterns as the second argument (instead of a variable number of arguments).
@@ -557,7 +552,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 
 | Removed                                                                                      | Replacement                                                                                                                                                                                                              |
 | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `Language.disable_pipes`                                                                     | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe)                                                                                                             |
+| `Language.disable_pipes`                                                                     | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe)                                                        |
 | `Language.begin_training`, `Pipe.begin_training`, ...                                        | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ...                                                                                                                        |
 | `Doc.is_tagged`, `Doc.is_parsed`, ...                                                        | [`Doc.has_annotation`](/api/doc#has_annotation)                                                                                                                                                                          |
 | `GoldParse`                                                                                  | [`Example`](/api/example)                                                                                                                                                                                                |

From bcaad28edae481197366988f1061f6479c2c9dc5 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 7 Oct 2020 13:05:37 +0200
Subject: [PATCH 455/516] fix typos

---
 website/docs/usage/saving-loading.md | 2 +-
 website/docs/usage/v3.md             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index c19ff39eb..968689baf 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -297,7 +297,7 @@ packages. This lets one application easily customize the behavior of another, by
 exposing an entry point in its `setup.py`. For a quick and fun intro to entry
 points in Python, check out
 [this excellent blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/).
-spaCy can load custom function from several different entry points to add
+spaCy can load custom functions from several different entry points to add
 pipeline component factories, language classes and other settings. To make spaCy
 use your entry points, your package needs to expose them and it needs to be
 installed in the same environment – that's it.
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 1024a2551..b0d9ca84c 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -395,7 +395,7 @@ type-check model definitions.
 For data validation, spaCy v3.0 adopts
 [`pydantic`](https://github.com/samuelcolvin/pydantic). It also powers the data
 validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
-lets you to register **custom functions with typed arguments**, reference them
+lets you register **custom functions with typed arguments**, reference them
 in your config and see validation errors if the argument values don't match.
 
 <Infobox title="Details & Documentation" emoji="📖" list>

From b79a420c208f9495de6af2bd24f3ff15514dcfaa Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 7 Oct 2020 13:16:56 +0200
Subject: [PATCH 456/516] Adjust version pin [ci skip]

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index eef4fcf67..53171a346 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -65,7 +65,7 @@ console_scripts =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data==1.0.0rc0
+    spacy_lookups_data>=1.0.0rc0,<1.0.0
 transformers =
     spacy_transformers>=1.0.0a17,<1.0.0
 cuda =

From 33c2d4af16d8019e36916244e0bb02c19f87e8d2 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 7 Oct 2020 14:56:00 +0200
Subject: [PATCH 457/516] move kb_loader to initialize for NEL instead of
 constructor

---
 spacy/pipeline/entity_linker.py | 36 +++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 2a5f3962d..b371ca9a4 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -8,6 +8,7 @@ from thinc.api import set_dropout_rate
 import warnings
 
 from ..kb import KnowledgeBase, Candidate
+from ..ml import empty_kb
 from ..tokens import Doc
 from .pipe import Pipe, deserialize_config
 from ..language import Language
@@ -41,11 +42,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
     requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
     assigns=["token.ent_kb_id"],
     default_config={
-        "kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 64},
         "model": DEFAULT_NEL_MODEL,
         "labels_discard": [],
         "incl_prior": True,
         "incl_context": True,
+        "entity_vector_length": 64,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
     },
     default_score_weights={
@@ -58,11 +59,11 @@ def make_entity_linker(
     nlp: Language,
     name: str,
     model: Model,
-    kb_loader: Callable[[Vocab], KnowledgeBase],
     *,
     labels_discard: Iterable[str],
     incl_prior: bool,
     incl_context: bool,
+    entity_vector_length: int,
     get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
 ):
     """Construct an EntityLinker component.
@@ -70,19 +71,21 @@ def make_entity_linker(
     model (Model[List[Doc], Floats2d]): A model that learns document vector
         representations. Given a batch of Doc objects, it should return a single
         array, with one row per item in the batch.
-    kb (KnowledgeBase): The knowledge-base to link entities to.
     labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
     incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
     incl_context (bool): Whether or not to include the local context in the model.
+    entity_vector_length (int): Size of encoding vectors in the KB.
+    get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
+        produces a list of candidates, given a certain knowledge base and a textual mention.
     """
     return EntityLinker(
         nlp.vocab,
         model,
         name,
-        kb_loader=kb_loader,
         labels_discard=labels_discard,
         incl_prior=incl_prior,
         incl_context=incl_context,
+        entity_vector_length=entity_vector_length,
         get_candidates=get_candidates,
     )
 
@@ -101,10 +104,10 @@ class EntityLinker(Pipe):
         model: Model,
         name: str = "entity_linker",
         *,
-        kb_loader: Callable[[Vocab], KnowledgeBase],
         labels_discard: Iterable[str],
         incl_prior: bool,
         incl_context: bool,
+        entity_vector_length: int,
         get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]],
     ) -> None:
         """Initialize an entity linker.
@@ -113,10 +116,12 @@ class EntityLinker(Pipe):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
         labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
         incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
         incl_context (bool): Whether or not to include the local context in the model.
+        entity_vector_length (int): Size of encoding vectors in the KB.
+        get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
+            produces a list of candidates, given a certain knowledge base and a textual mention.
 
         DOCS: https://nightly.spacy.io/api/entitylinker#init
         """
@@ -127,15 +132,17 @@ class EntityLinker(Pipe):
             "labels_discard": list(labels_discard),
             "incl_prior": incl_prior,
             "incl_context": incl_context,
+            "entity_vector_length": entity_vector_length,
         }
-        self.kb = kb_loader(self.vocab)
         self.get_candidates = get_candidates
         self.cfg = dict(cfg)
         self.distance = CosineDistance(normalize=False)
         # how many neightbour sentences to take into account
         self.n_sents = cfg.get("n_sents", 0)
+        # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
+        self.kb = empty_kb(entity_vector_length)(self.vocab)
 
-    def _require_kb(self) -> None:
+    def validate_kb(self) -> None:
         # Raise an error if the knowledge base is not initialized.
         if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))
@@ -145,6 +152,7 @@ class EntityLinker(Pipe):
         get_examples: Callable[[], Iterable[Example]],
         *,
         nlp: Optional[Language] = None,
+        kb_loader: Callable[[Vocab], KnowledgeBase] = None,
     ):
         """Initialize the pipe for training, using a representative set
         of data examples.
@@ -152,11 +160,17 @@ class EntityLinker(Pipe):
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
         nlp (Language): The current nlp object the component is part of.
+        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
+            Note that providing this argument, will overwrite all data accumulated in the current KB.
+            Use this only when loading a KB as-such from file.
 
         DOCS: https://nightly.spacy.io/api/entitylinker#initialize
         """
         self._ensure_examples(get_examples)
-        self._require_kb()
+        if kb_loader is not None:
+            self.kb = kb_loader(self.vocab)
+            self.cfg["entity_vector_length"] = self.kb.entity_vector_length
+        self.validate_kb()
         nO = self.kb.entity_vector_length
         doc_sample = []
         vector_sample = []
@@ -192,7 +206,7 @@ class EntityLinker(Pipe):
 
         DOCS: https://nightly.spacy.io/api/entitylinker#update
         """
-        self._require_kb()
+        self.validate_kb()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -303,7 +317,7 @@ class EntityLinker(Pipe):
 
         DOCS: https://nightly.spacy.io/api/entitylinker#predict
         """
-        self._require_kb()
+        self.validate_kb()
         entity_count = 0
         final_kb_ids = []
         if not docs:

From 6b8bdb2d390c4d26577754c213170a0190bb2cc5 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 7 Oct 2020 14:58:16 +0200
Subject: [PATCH 458/516] add init_config to nlp.create_pipe

---
 spacy/language.py                          | 23 +++++++++++++++++++++-
 spacy/tests/pipeline/test_entity_linker.py | 22 +++++++++++++--------
 2 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index ba244617e..e3b2285fb 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -600,6 +600,7 @@ class Language:
         *,
         config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
         raw_config: Optional[Config] = None,
+        init_config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
         validate: bool = True,
     ) -> Callable[[Doc], Doc]:
         """Create a pipeline component. Mostly used internally. To create and
@@ -611,6 +612,9 @@ class Language:
         config (Optional[Dict[str, Any]]): Config parameters to use for this
             component. Will be merged with default config, if available.
         raw_config (Optional[Config]): Internals: the non-interpolated config.
+        init_config (Optional[Dict[str, Any]]): Config parameters to use to
+            initialize this component. Will be used to update the internal
+            'initialize' config.
         validate (bool): Whether to validate the component config against the
             arguments and types expected by the factory.
         RETURNS (Callable[[Doc], Doc]): The pipeline component.
@@ -621,8 +625,13 @@ class Language:
         if not isinstance(config, dict):
             err = Errors.E962.format(style="config", name=name, cfg_type=type(config))
             raise ValueError(err)
+        if not isinstance(init_config, dict):
+            err = Errors.E962.format(style="init_config", name=name, cfg_type=type(init_config))
+            raise ValueError(err)
         if not srsly.is_json_serializable(config):
             raise ValueError(Errors.E961.format(config=config))
+        if not srsly.is_json_serializable(init_config):
+            raise ValueError(Errors.E961.format(config=init_config))
         if not self.has_factory(factory_name):
             err = Errors.E002.format(
                 name=factory_name,
@@ -634,6 +643,8 @@ class Language:
             raise ValueError(err)
         pipe_meta = self.get_factory_meta(factory_name)
         config = config or {}
+        if init_config:
+            self._config["initialize"]["components"][name] = init_config
         # This is unideal, but the alternative would mean you always need to
         # specify the full config settings, which is not really viable.
         if pipe_meta.default_config:
@@ -708,6 +719,7 @@ class Language:
         source: Optional["Language"] = None,
         config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
         raw_config: Optional[Config] = None,
+        init_config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
         validate: bool = True,
     ) -> Callable[[Doc], Doc]:
         """Add a component to the processing pipeline. Valid components are
@@ -730,6 +742,9 @@ class Language:
         config (Optional[Dict[str, Any]]): Config parameters to use for this
             component. Will be merged with default config, if available.
         raw_config (Optional[Config]): Internals: the non-interpolated config.
+        init_config (Optional[Dict[str, Any]]): Config parameters to use to
+            initialize this component. Will be used to update the internal
+            'initialize' config.
         validate (bool): Whether to validate the component config against the
             arguments and types expected by the factory.
         RETURNS (Callable[[Doc], Doc]): The pipeline component.
@@ -763,6 +778,7 @@ class Language:
                 name=name,
                 config=config,
                 raw_config=raw_config,
+                init_config=init_config,
                 validate=validate,
             )
         pipe_index = self._get_pipe_index(before, after, first, last)
@@ -842,6 +858,7 @@ class Language:
         factory_name: str,
         *,
         config: Dict[str, Any] = SimpleFrozenDict(),
+        init_config: Dict[str, Any] = SimpleFrozenDict(),
         validate: bool = True,
     ) -> None:
         """Replace a component in the pipeline.
@@ -850,6 +867,9 @@ class Language:
         factory_name (str): Factory name of replacement component.
         config (Optional[Dict[str, Any]]): Config parameters to use for this
             component. Will be merged with default config, if available.
+        init_config (Optional[Dict[str, Any]]): Config parameters to use to
+            initialize this component. Will be used to update the internal
+            'initialize' config.
         validate (bool): Whether to validate the component config against the
             arguments and types expected by the factory.
 
@@ -866,13 +886,14 @@ class Language:
         self.remove_pipe(name)
         if not len(self._components) or pipe_index == len(self._components):
             # we have no components to insert before/after, or we're replacing the last component
-            self.add_pipe(factory_name, name=name, config=config, validate=validate)
+            self.add_pipe(factory_name, name=name, config=config, init_config=init_config, validate=validate)
         else:
             self.add_pipe(
                 factory_name,
                 name=name,
                 before=pipe_index,
                 config=config,
+                init_config=init_config,
                 validate=validate,
             )
 
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 66de54c06..cf9fce2a7 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -110,7 +110,7 @@ def test_kb_invalid_entity_vector(nlp):
 
 
 def test_kb_default(nlp):
-    """Test that the default (empty) KB is loaded when not providing a config"""
+    """Test that the default (empty) KB is loaded upon construction"""
     entity_linker = nlp.add_pipe("entity_linker", config={})
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
@@ -122,7 +122,7 @@ def test_kb_default(nlp):
 def test_kb_custom_length(nlp):
     """Test that the default (empty) KB can be configured with a custom entity length"""
     entity_linker = nlp.add_pipe(
-        "entity_linker", config={"kb_loader": {"entity_vector_length": 35}}
+        "entity_linker", config={"entity_vector_length": 35}
     )
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
@@ -140,7 +140,7 @@ def test_kb_undefined(nlp):
 def test_kb_empty(nlp):
     """Test that the EL can't train with an empty KB"""
     config = {"kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
-    entity_linker = nlp.add_pipe("entity_linker", config=config)
+    entity_linker = nlp.add_pipe("entity_linker", init_config=config)
     assert len(entity_linker.kb) == 0
     with pytest.raises(ValueError):
         entity_linker.initialize(lambda: [])
@@ -217,8 +217,10 @@ def test_el_pipe_configuration(nlp):
     # run an EL pipe without a trained context encoder, to check the candidate generation step only
     nlp.add_pipe(
         "entity_linker",
-        config={"kb_loader": {"@misc": "myAdamKB.v1"}, "incl_context": False},
+        config={"incl_context": False},
+        init_config={"kb_loader": {"@misc": "myAdamKB.v1"}},
     )
+    nlp.initialize()
     # With the default get_candidates function, matching is case-sensitive
     text = "Douglas and douglas are not the same."
     doc = nlp(text)
@@ -238,11 +240,14 @@ def test_el_pipe_configuration(nlp):
         "entity_linker",
         "entity_linker",
         config={
-            "kb_loader": {"@misc": "myAdamKB.v1"},
             "incl_context": False,
             "get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
         },
+        init_config={
+            "kb_loader": {"@misc": "myAdamKB.v1"},
+        },
     )
+    nlp.initialize()
     doc = nlp(text)
     assert doc[0].ent_kb_id_ == "Q2"
     assert doc[1].ent_kb_id_ == ""
@@ -356,8 +361,9 @@ def test_preserving_links_asdoc(nlp):
     ]
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
-    el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
-    entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
+    config = {"incl_prior": False}
+    init_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}}
+    entity_linker = nlp.add_pipe("entity_linker", config=config, init_config=init_config, last=True)
     nlp.initialize()
     assert entity_linker.model.get_dim("nO") == vector_length
 
@@ -456,7 +462,7 @@ def test_overfitting_IO():
     # Create the Entity Linker component and add it to the pipeline
     entity_linker = nlp.add_pipe(
         "entity_linker",
-        config={"kb_loader": {"@misc": "myOverfittingKB.v1"}},
+        init_config={"kb_loader": {"@misc": "myOverfittingKB.v1"}},
         last=True,
     )
 

From efedccea8da3a71b2383e53feba552628d3ad770 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 7 Oct 2020 15:29:52 +0200
Subject: [PATCH 459/516] fix tests

---
 spacy/tests/regression/test_issue5230.py   |  4 ++--
 spacy/tests/serialize/test_serialize_kb.py | 13 +++++++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index 5e320996a..aa4cc9be1 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -80,8 +80,8 @@ def entity_linker():
 
         return create_kb
 
-    config = {"kb_loader": {"@misc": "TestIssue5230KB.v1"}}
-    entity_linker = nlp.add_pipe("entity_linker", config=config)
+    init_config = {"kb_loader": {"@misc": "TestIssue5230KB.v1"}}
+    entity_linker = nlp.add_pipe("entity_linker", init_config=init_config)
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py
index 63736418b..84e7c8ec2 100644
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@@ -6,6 +6,7 @@ from spacy.util import ensure_path, registry
 from spacy.kb import KnowledgeBase
 
 from ..util import make_tempdir
+from numpy import zeros
 
 
 def test_serialize_kb_disk(en_vocab):
@@ -90,11 +91,13 @@ def test_serialize_subclassed_kb():
         entity_vector_length: int, custom_field: int
     ) -> Callable[["Vocab"], KnowledgeBase]:
         def custom_kb_factory(vocab):
-            return SubKnowledgeBase(
+            kb = SubKnowledgeBase(
                 vocab=vocab,
                 entity_vector_length=entity_vector_length,
                 custom_field=custom_field,
             )
+            kb.add_entity("random_entity", 0.0, zeros(entity_vector_length))
+            return kb
 
         return custom_kb_factory
 
@@ -106,7 +109,8 @@ def test_serialize_subclassed_kb():
             "custom_field": 666,
         }
     }
-    entity_linker = nlp.add_pipe("entity_linker", config=config)
+    entity_linker = nlp.add_pipe("entity_linker", init_config=config)
+    nlp.initialize()
     assert type(entity_linker.kb) == SubKnowledgeBase
     assert entity_linker.kb.entity_vector_length == 342
     assert entity_linker.kb.custom_field == 666
@@ -116,6 +120,7 @@ def test_serialize_subclassed_kb():
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         entity_linker2 = nlp2.get_pipe("entity_linker")
-        assert type(entity_linker2.kb) == SubKnowledgeBase
+        # After IO, the KB is the standard one
+        assert type(entity_linker2.kb) == KnowledgeBase
         assert entity_linker2.kb.entity_vector_length == 342
-        assert entity_linker2.kb.custom_field == 666
+        assert not hasattr(entity_linker2.kb, "custom_field")

From 654ce9bae84ed2372eeee0761b07a20a7ca07dcc Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 7 Oct 2020 21:02:22 +0200
Subject: [PATCH 460/516] Fix makefile

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 3f10e79cc..741366063 100644
--- a/Makefile
+++ b/Makefile
@@ -29,7 +29,7 @@ dist/$(SPACY_BIN) : $(WHEELHOUSE)/spacy-$(PYVER)-$(version).stamp
 		--disable-cache \
 		-o $@ \
 		$(package)==$(version) \
-		$(SPACY_EXTRAS)
+		"$(SPACY_EXTRAS)"
 	chmod a+rx $@
 	cp $@ dist/spacy.pex
 

From 010956d4933e8a09e165d35abdf32f020931717f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 8 Oct 2020 09:51:31 +0200
Subject: [PATCH 461/516] Clear rule-based components on initialize

---
 spacy/pipeline/attributeruler.py            | 11 ++++++++++-
 spacy/pipeline/entityruler.py               |  2 +-
 spacy/tests/pipeline/test_attributeruler.py | 10 ++++++++++
 spacy/tests/pipeline/test_entity_ruler.py   |  9 +++++++++
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index 9e6174d07..0ab1ac9bf 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -53,10 +53,18 @@ class AttributeRuler(Pipe):
         self.name = name
         self.vocab = vocab
         self.matcher = Matcher(self.vocab, validate=validate)
+        self.validate = validate
         self.attrs = []
         self._attrs_unnormed = []  # store for reference
         self.indices = []
 
+    def clear(self) -> None:
+        """Reset all patterns."""
+        self.matcher = Matcher(self.vocab, validate=self.validate)
+        self.attrs = []
+        self._attrs_unnormed = []
+        self.indices = []
+
     def initialize(
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]],
@@ -65,13 +73,14 @@ class AttributeRuler(Pipe):
         patterns: Optional[Iterable[AttributeRulerPatternType]] = None,
         tag_map: Optional[TagMapType] = None,
         morph_rules: Optional[MorphRulesType] = None,
-    ):
+    ) -> None:
         """Initialize the attribute ruler by adding zero or more patterns.
 
         Rules can be specified as a sequence of dicts using the `patterns`
         keyword argument. You can also provide rules using the "tag map" or
         "morph rules" formats supported by spaCy prior to v3.
         """
+        self.clear()
         if patterns:
             self.add_patterns(patterns)
         if tag_map:
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 6ca586d05..dfaddad74 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -201,10 +201,10 @@ class EntityRuler(Pipe):
 
         DOCS: https://nightly.spacy.io/api/entityruler#initialize
         """
+        self.clear()
         if patterns:
             self.add_patterns(patterns)
 
-
     @property
     def ent_ids(self) -> Tuple[str, ...]:
         """All entity ids present in the match patterns `id` properties
diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py
index fedeb192f..6c66469cc 100644
--- a/spacy/tests/pipeline/test_attributeruler.py
+++ b/spacy/tests/pipeline/test_attributeruler.py
@@ -136,6 +136,16 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
     assert doc.has_annotation("MORPH")
 
 
+def test_attributeruler_init_clear(nlp, pattern_dicts):
+    """Test that initialization clears patterns."""
+    ruler = nlp.add_pipe("attribute_ruler")
+    assert not len(ruler.matcher)
+    ruler.add_patterns(pattern_dicts)
+    assert len(ruler.matcher)
+    ruler.initialize(lambda: [])
+    assert not len(ruler.matcher)
+
+
 def test_attributeruler_score(nlp, pattern_dicts):
     # initialize with patterns
     ruler = nlp.add_pipe("attribute_ruler")
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 96deab24b..206f44719 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -68,6 +68,15 @@ def test_entity_ruler_init_patterns(nlp, patterns):
     assert doc.ents[1].label_ == "BYE"
 
 
+def test_entity_ruler_init_clear(nlp, patterns):
+    """Test that initialization clears patterns."""
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
+    assert len(ruler.labels) == 4
+    ruler.initialize(lambda: [])
+    assert len(ruler.labels) == 0
+
+
 def test_entity_ruler_existing(nlp, patterns):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)

From eaf5c265cbc9cbe2a294a862e97fd5a388002dda Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 8 Oct 2020 10:34:01 +0200
Subject: [PATCH 462/516] set_kb method for entity_linker

---
 spacy/language.py                          |  41 ++++----
 spacy/pipeline/entity_linker.py            |   9 +-
 spacy/tests/pipeline/test_entity_linker.py | 105 ++++++++-------------
 spacy/tests/regression/test_issue5230.py   |  16 ++--
 spacy/tests/serialize/test_serialize_kb.py |  39 +++++---
 website/docs/api/language.md               |   5 +-
 6 files changed, 100 insertions(+), 115 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index e3b2285fb..3a0ea783e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -600,7 +600,6 @@ class Language:
         *,
         config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
         raw_config: Optional[Config] = None,
-        init_config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
         validate: bool = True,
     ) -> Callable[[Doc], Doc]:
         """Create a pipeline component. Mostly used internally. To create and
@@ -612,9 +611,6 @@ class Language:
         config (Optional[Dict[str, Any]]): Config parameters to use for this
             component. Will be merged with default config, if available.
         raw_config (Optional[Config]): Internals: the non-interpolated config.
-        init_config (Optional[Dict[str, Any]]): Config parameters to use to
-            initialize this component. Will be used to update the internal
-            'initialize' config.
         validate (bool): Whether to validate the component config against the
             arguments and types expected by the factory.
         RETURNS (Callable[[Doc], Doc]): The pipeline component.
@@ -625,13 +621,9 @@ class Language:
         if not isinstance(config, dict):
             err = Errors.E962.format(style="config", name=name, cfg_type=type(config))
             raise ValueError(err)
-        if not isinstance(init_config, dict):
-            err = Errors.E962.format(style="init_config", name=name, cfg_type=type(init_config))
             raise ValueError(err)
         if not srsly.is_json_serializable(config):
             raise ValueError(Errors.E961.format(config=config))
-        if not srsly.is_json_serializable(init_config):
-            raise ValueError(Errors.E961.format(config=init_config))
         if not self.has_factory(factory_name):
             err = Errors.E002.format(
                 name=factory_name,
@@ -643,8 +635,6 @@ class Language:
             raise ValueError(err)
         pipe_meta = self.get_factory_meta(factory_name)
         config = config or {}
-        if init_config:
-            self._config["initialize"]["components"][name] = init_config
         # This is unideal, but the alternative would mean you always need to
         # specify the full config settings, which is not really viable.
         if pipe_meta.default_config:
@@ -719,7 +709,6 @@ class Language:
         source: Optional["Language"] = None,
         config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
         raw_config: Optional[Config] = None,
-        init_config: Optional[Dict[str, Any]] = SimpleFrozenDict(),
         validate: bool = True,
     ) -> Callable[[Doc], Doc]:
         """Add a component to the processing pipeline. Valid components are
@@ -742,9 +731,6 @@ class Language:
         config (Optional[Dict[str, Any]]): Config parameters to use for this
             component. Will be merged with default config, if available.
         raw_config (Optional[Config]): Internals: the non-interpolated config.
-        init_config (Optional[Dict[str, Any]]): Config parameters to use to
-            initialize this component. Will be used to update the internal
-            'initialize' config.
         validate (bool): Whether to validate the component config against the
             arguments and types expected by the factory.
         RETURNS (Callable[[Doc], Doc]): The pipeline component.
@@ -778,7 +764,6 @@ class Language:
                 name=name,
                 config=config,
                 raw_config=raw_config,
-                init_config=init_config,
                 validate=validate,
             )
         pipe_index = self._get_pipe_index(before, after, first, last)
@@ -858,20 +843,17 @@ class Language:
         factory_name: str,
         *,
         config: Dict[str, Any] = SimpleFrozenDict(),
-        init_config: Dict[str, Any] = SimpleFrozenDict(),
         validate: bool = True,
-    ) -> None:
+    ) -> Callable[[Doc], Doc]:
         """Replace a component in the pipeline.
 
         name (str): Name of the component to replace.
         factory_name (str): Factory name of replacement component.
         config (Optional[Dict[str, Any]]): Config parameters to use for this
             component. Will be merged with default config, if available.
-        init_config (Optional[Dict[str, Any]]): Config parameters to use to
-            initialize this component. Will be used to update the internal
-            'initialize' config.
         validate (bool): Whether to validate the component config against the
             arguments and types expected by the factory.
+        RETURNS (Callable[[Doc], Doc]): The new pipeline component.
 
         DOCS: https://nightly.spacy.io/api/language#replace_pipe
         """
@@ -886,14 +868,15 @@ class Language:
         self.remove_pipe(name)
         if not len(self._components) or pipe_index == len(self._components):
             # we have no components to insert before/after, or we're replacing the last component
-            self.add_pipe(factory_name, name=name, config=config, init_config=init_config, validate=validate)
+            return self.add_pipe(
+                factory_name, name=name, config=config, validate=validate
+            )
         else:
-            self.add_pipe(
+            return self.add_pipe(
                 factory_name,
                 name=name,
                 before=pipe_index,
                 config=config,
-                init_config=init_config,
                 validate=validate,
             )
 
@@ -1321,7 +1304,11 @@ class Language:
             kwargs.setdefault("batch_size", batch_size)
             # non-trainable components may have a pipe() implementation that refers to dummy
             # predict and set_annotations methods
-            if not hasattr(pipe, "pipe") or not hasattr(pipe, "is_trainable") or not pipe.is_trainable():
+            if (
+                not hasattr(pipe, "pipe")
+                or not hasattr(pipe, "is_trainable")
+                or not pipe.is_trainable()
+            ):
                 docs = _pipe(docs, pipe, kwargs)
             else:
                 docs = pipe.pipe(docs, **kwargs)
@@ -1433,7 +1420,11 @@ class Language:
             kwargs.setdefault("batch_size", batch_size)
             # non-trainable components may have a pipe() implementation that refers to dummy
             # predict and set_annotations methods
-            if hasattr(proc, "pipe") and hasattr(proc, "is_trainable") and proc.is_trainable():
+            if (
+                hasattr(proc, "pipe")
+                and hasattr(proc, "is_trainable")
+                and proc.is_trainable()
+            ):
                 f = functools.partial(proc.pipe, **kwargs)
             else:
                 # Apply the function, but yield the doc
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index b371ca9a4..eec591995 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -142,6 +142,12 @@ class EntityLinker(Pipe):
         # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
         self.kb = empty_kb(entity_vector_length)(self.vocab)
 
+    def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
+        """Define the KB of this pipe by providing a function that will
+        create it using this object's vocab."""
+        self.kb = kb_loader(self.vocab)
+        self.cfg["entity_vector_length"] = self.kb.entity_vector_length
+
     def validate_kb(self) -> None:
         # Raise an error if the knowledge base is not initialized.
         if len(self.kb) == 0:
@@ -168,8 +174,7 @@ class EntityLinker(Pipe):
         """
         self._ensure_examples(get_examples)
         if kb_loader is not None:
-            self.kb = kb_loader(self.vocab)
-            self.cfg["entity_vector_length"] = self.kb.entity_vector_length
+            self.set_kb(kb_loader)
         self.validate_kb()
         nO = self.kb.entity_vector_length
         doc_sample = []
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index cf9fce2a7..e77be74ad 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -130,18 +130,9 @@ def test_kb_custom_length(nlp):
     assert entity_linker.kb.entity_vector_length == 35
 
 
-def test_kb_undefined(nlp):
-    """Test that the EL can't train without defining a KB"""
-    entity_linker = nlp.add_pipe("entity_linker", config={})
-    with pytest.raises(ValueError):
-        entity_linker.initialize(lambda: [])
-
-
-def test_kb_empty(nlp):
-    """Test that the EL can't train with an empty KB"""
-    config = {"kb_loader": {"@misc": "spacy.EmptyKB.v1", "entity_vector_length": 342}}
-    entity_linker = nlp.add_pipe("entity_linker", init_config=config)
-    assert len(entity_linker.kb) == 0
+def test_kb_initialize_empty(nlp):
+    """Test that the EL can't initialize without examples"""
+    entity_linker = nlp.add_pipe("entity_linker")
     with pytest.raises(ValueError):
         entity_linker.initialize(lambda: [])
 
@@ -201,26 +192,21 @@ def test_el_pipe_configuration(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns([pattern])
 
-    @registry.misc.register("myAdamKB.v1")
-    def mykb() -> Callable[["Vocab"], KnowledgeBase]:
-        def create_kb(vocab):
-            kb = KnowledgeBase(vocab, entity_vector_length=1)
-            kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
-            kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
-            kb.add_alias(
-                alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]
-            )
-            return kb
-
-        return create_kb
+    def create_kb(vocab):
+        kb = KnowledgeBase(vocab, entity_vector_length=1)
+        kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
+        kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
+        kb.add_alias(
+            alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]
+        )
+        return kb
 
     # run an EL pipe without a trained context encoder, to check the candidate generation step only
-    nlp.add_pipe(
+    entity_linker = nlp.add_pipe(
         "entity_linker",
         config={"incl_context": False},
-        init_config={"kb_loader": {"@misc": "myAdamKB.v1"}},
     )
-    nlp.initialize()
+    entity_linker.set_kb(create_kb)
     # With the default get_candidates function, matching is case-sensitive
     text = "Douglas and douglas are not the same."
     doc = nlp(text)
@@ -236,18 +222,15 @@ def test_el_pipe_configuration(nlp):
         return get_lowercased_candidates
 
     # replace the pipe with a new one with with a different candidate generator
-    nlp.replace_pipe(
+    entity_linker = nlp.replace_pipe(
         "entity_linker",
         "entity_linker",
         config={
             "incl_context": False,
             "get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
         },
-        init_config={
-            "kb_loader": {"@misc": "myAdamKB.v1"},
-        },
     )
-    nlp.initialize()
+    entity_linker.set_kb(create_kb)
     doc = nlp(text)
     assert doc[0].ent_kb_id_ == "Q2"
     assert doc[1].ent_kb_id_ == ""
@@ -339,19 +322,15 @@ def test_preserving_links_asdoc(nlp):
     """Test that Span.as_doc preserves the existing entity links"""
     vector_length = 1
 
-    @registry.misc.register("myLocationsKB.v1")
-    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
-        def create_kb(vocab):
-            mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
-            # adding entities
-            mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
-            mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
-            # adding aliases
-            mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7])
-            mykb.add_alias(alias="Denver", entities=["Q2"], probabilities=[0.6])
-            return mykb
-
-        return create_kb
+    def create_kb(vocab):
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        # adding entities
+        mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
+        mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
+        # adding aliases
+        mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7])
+        mykb.add_alias(alias="Denver", entities=["Q2"], probabilities=[0.6])
+        return mykb
 
     # set up pipeline with NER (Entity Ruler) and NEL (prior probability only, model not trained)
     nlp.add_pipe("sentencizer")
@@ -362,8 +341,8 @@ def test_preserving_links_asdoc(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     config = {"incl_prior": False}
-    init_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}}
-    entity_linker = nlp.add_pipe("entity_linker", config=config, init_config=init_config, last=True)
+    entity_linker = nlp.add_pipe("entity_linker", config=config, last=True)
+    entity_linker.set_kb(create_kb)
     nlp.initialize()
     assert entity_linker.model.get_dim("nO") == vector_length
 
@@ -441,30 +420,26 @@ def test_overfitting_IO():
         doc = nlp(text)
         train_examples.append(Example.from_dict(doc, annotation))
 
-    @registry.misc.register("myOverfittingKB.v1")
-    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
-        def create_kb(vocab):
-            # create artificial KB - assign same prior weight to the two russ cochran's
-            # Q2146908 (Russ Cochran): American golfer
-            # Q7381115 (Russ Cochran): publisher
-            mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
-            mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
-            mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
-            mykb.add_alias(
-                alias="Russ Cochran",
-                entities=["Q2146908", "Q7381115"],
-                probabilities=[0.5, 0.5],
-            )
-            return mykb
-
-        return create_kb
+    def create_kb(vocab):
+        # create artificial KB - assign same prior weight to the two russ cochran's
+        # Q2146908 (Russ Cochran): American golfer
+        # Q7381115 (Russ Cochran): publisher
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+        mykb.add_alias(
+            alias="Russ Cochran",
+            entities=["Q2146908", "Q7381115"],
+            probabilities=[0.5, 0.5],
+        )
+        return mykb
 
     # Create the Entity Linker component and add it to the pipeline
     entity_linker = nlp.add_pipe(
         "entity_linker",
-        init_config={"kb_loader": {"@misc": "myOverfittingKB.v1"}},
         last=True,
     )
+    entity_linker.set_kb(create_kb)
 
     # train the NEL pipe
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index aa4cc9be1..9fda413a3 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -71,17 +71,13 @@ def tagger():
 def entity_linker():
     nlp = Language()
 
-    @registry.misc.register("TestIssue5230KB.v1")
-    def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
-        def create_kb(vocab):
-            kb = KnowledgeBase(vocab, entity_vector_length=1)
-            kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
-            return kb
+    def create_kb(vocab):
+        kb = KnowledgeBase(vocab, entity_vector_length=1)
+        kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
+        return kb
 
-        return create_kb
-
-    init_config = {"kb_loader": {"@misc": "TestIssue5230KB.v1"}}
-    entity_linker = nlp.add_pipe("entity_linker", init_config=init_config)
+    entity_linker = nlp.add_pipe("entity_linker")
+    entity_linker.set_kb(create_kb)
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py
index 84e7c8ec2..352c335ea 100644
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@@ -1,9 +1,9 @@
 from typing import Callable
 
 from spacy import util
-from spacy.lang.en import English
-from spacy.util import ensure_path, registry
+from spacy.util import ensure_path, registry, load_model_from_config
 from spacy.kb import KnowledgeBase
+from thinc.api import Config
 
 from ..util import make_tempdir
 from numpy import zeros
@@ -81,6 +81,28 @@ def _check_kb(kb):
 def test_serialize_subclassed_kb():
     """Check that IO of a custom KB works fine as part of an EL pipe."""
 
+    config_string = """
+    [nlp]
+    lang = "en"
+    pipeline = ["entity_linker"]
+
+    [components]
+
+    [components.entity_linker]
+    factory = "entity_linker"
+
+    [initialize]
+
+    [initialize.components]
+
+    [initialize.components.entity_linker]
+
+    [initialize.components.entity_linker.kb_loader]
+    @misc = "spacy.CustomKB.v1"
+    entity_vector_length = 342
+    custom_field = 666
+    """
+
     class SubKnowledgeBase(KnowledgeBase):
         def __init__(self, vocab, entity_vector_length, custom_field):
             super().__init__(vocab, entity_vector_length)
@@ -101,16 +123,11 @@ def test_serialize_subclassed_kb():
 
         return custom_kb_factory
 
-    nlp = English()
-    config = {
-        "kb_loader": {
-            "@misc": "spacy.CustomKB.v1",
-            "entity_vector_length": 342,
-            "custom_field": 666,
-        }
-    }
-    entity_linker = nlp.add_pipe("entity_linker", init_config=config)
+    config = Config().from_str(config_string)
+    nlp = load_model_from_config(config, auto_fill=True)
     nlp.initialize()
+
+    entity_linker = nlp.get_pipe("entity_linker")
     assert type(entity_linker.kb) == SubKnowledgeBase
     assert entity_linker.kb.entity_vector_length == 342
     assert entity_linker.kb.custom_field == 666
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 6257199c9..51e9a5e10 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -524,7 +524,7 @@ Get a pipeline component for a given component name.
 
 ## Language.replace_pipe {#replace_pipe tag="method" new="2"}
 
-Replace a component in the pipeline.
+Replace a component in the pipeline and return the new component.
 
 <Infobox title="Changed in v3.0" variant="warning">
 
@@ -538,7 +538,7 @@ and instead expects the **name of a component factory** registered using
 > #### Example
 >
 > ```python
-> nlp.replace_pipe("parser", my_custom_parser)
+> new_parser = nlp.replace_pipe("parser", "my_custom_parser")
 > ```
 
 | Name                                  | Description                                                                                                                                                        |
@@ -548,6 +548,7 @@ and instead expects the **name of a component factory** registered using
 | _keyword-only_                        |                                                                                                                                                                    |
 | `config` <Tag variant="new">3</Tag>   | Optional config parameters to use for the new component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
 | `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~                                     |
+| **RETURNS**                           | The new pipeline component. ~~Callable[[Doc], Doc]~~                                                                                                               |
 
 ## Language.rename_pipe {#rename_pipe tag="method" new="2"}
 

From 3e2e1fd323c2c822f864b5a4043c8464d986216a Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 8 Oct 2020 10:37:32 +0200
Subject: [PATCH 463/516] cleanup

---
 spacy/language.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index 3a0ea783e..b438936a6 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -621,7 +621,6 @@ class Language:
         if not isinstance(config, dict):
             err = Errors.E962.format(style="config", name=name, cfg_type=type(config))
             raise ValueError(err)
-            raise ValueError(err)
         if not srsly.is_json_serializable(config):
             raise ValueError(Errors.E961.format(config=config))
         if not self.has_factory(factory_name):

From 43e59bb22a5fdeb4dadc0572a1f51d6fb672e557 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 8 Oct 2020 10:58:50 +0200
Subject: [PATCH 464/516] Update docs and install extras [ci skip]

---
 setup.cfg                                     |  2 ++
 website/docs/api/transformer.md               | 24 ++++++++---------
 website/docs/usage/embeddings-transformers.md |  3 +--
 website/docs/usage/index.md                   | 27 ++++++++++---------
 website/docs/usage/linguistic-features.md     |  9 ++++---
 website/docs/usage/models.md                  |  4 +--
 website/docs/usage/projects.md                |  4 +--
 website/docs/usage/training.md                |  2 +-
 website/docs/usage/v3.md                      |  4 +--
 website/gatsby-config.js                      |  2 ++
 website/src/widgets/changelog.js              |  5 +++-
 website/src/widgets/landing.js                |  9 ++++---
 website/src/widgets/quickstart-install.js     | 11 +++++---
 13 files changed, 62 insertions(+), 44 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 53171a346..424b1ff8e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -68,6 +68,8 @@ lookups =
     spacy_lookups_data>=1.0.0rc0,<1.0.0
 transformers =
     spacy_transformers>=1.0.0a17,<1.0.0
+ray =
+    spacy_ray>=0.0.1,<1.0.0
 cuda =
     cupy>=5.0.0b4,<9.0.0
 cuda80 =
diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md
index abceeff4f..5754d2238 100644
--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@@ -11,7 +11,7 @@ api_string_name: transformer
 > #### Installation
 >
 > ```bash
-> $ pip install spacy-transformers
+> $ pip install -U %%SPACY_PKG_NAME[transformers]%%SPACY_PKG_FLAGS
 > ```
 
 <Infobox title="Important note" variant="warning">
@@ -385,12 +385,12 @@ are wrapped into the
 by this class. Instances of this class are typically assigned to the
 [`Doc._.trf_data`](/api/transformer#custom-attributes) extension attribute.
 
-| Name      | Description                                                                                                                                                                                                                                                                                                                                             |
-| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tokens`  | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~                                         |
+| Name      | Description                                                                                                                                                                                                                                                                                                                                               |
+| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tokens`  | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~                                            |
 | `tensors` | The activations for the `Doc` from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ |
-| `align`   | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~                                                                                                    |
-| `width`   | The width of the last hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                             |
+| `align`   | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~                                                                                                      |
+| `width`   | The width of the last hidden layer. ~~int~~                                                                                                                                                                                                                                                                                                               |
 
 ### TransformerData.empty {#transformerdata-emoty tag="classmethod"}
 
@@ -406,13 +406,13 @@ Holds a batch of input and output objects for a transformer model. The data can
 then be split to a list of [`TransformerData`](/api/transformer#transformerdata)
 objects to associate the outputs to each [`Doc`](/api/doc) in the batch.
 
-| Name       | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| Name       | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `spans`    | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each `Span` can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each `Span` may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
-| `tokens`   | The output of the tokenizer. ~~transformers.BatchEncoding~~                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| `tensors`  | The output of the transformer model. ~~List[torch.Tensor]~~                                                                                                                                                                                                                                                                                                                                                                                                                          |
-| `align`    | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~                                                                                                                                                                                                                                   |
-| `doc_data` | The outputs, split per `Doc` object. ~~List[TransformerData]~~                                                                                                                                                                                                                                                                                                                                                                                                                       |
+| `tokens`   | The output of the tokenizer. ~~transformers.BatchEncoding~~                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| `tensors`  | The output of the transformer model. ~~List[torch.Tensor]~~                                                                                                                                                                                                                                                                                                                                                                                                                              |
+| `align`    | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~                                                                                                                                                                                                                                       |
+| `doc_data` | The outputs, split per `Doc` object. ~~List[TransformerData]~~                                                                                                                                                                                                                                                                                                                                                                                                                           |
 
 ### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"}
 
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index c615097d6..c0611787b 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -216,8 +216,7 @@ in `/opt/nvidia/cuda`, you would run:
 ```bash
 ### Installation with CUDA
 $ export CUDA_PATH="/opt/nvidia/cuda"
-$ pip install cupy-cuda102
-$ pip install spacy-transformers
+$ pip install -U %%SPACY_PKG_NAME[cuda102,transformers]%%SPACY_PKG_FLAGS
 ```
 
 ### Runtime usage {#transformers-runtime}
diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index e0a4fdb07..398f97bb4 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -47,7 +47,7 @@ Before you install spaCy and its dependencies, make sure that your `pip`,
 
 ```bash
 $ pip install -U pip setuptools wheel
-$ pip install -U spacy
+$ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
 ```
 
 When using pip it is generally recommended to install packages in a virtual
@@ -57,7 +57,7 @@ environment to avoid modifying system state:
 $ python -m venv .env
 $ source .env/bin/activate
 $ pip install -U pip setuptools wheel
-$ pip install spacy
+$ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
 ```
 
 spaCy also lets you install extra dependencies by specifying the following
@@ -68,15 +68,16 @@ spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
 > #### Example
 >
 > ```bash
-> $ pip install spacy[lookups,transformers]
+> $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
 > ```
 
-| Name             | Description                                                                                                                                                                                                                                                    |
-| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lookups`        | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
-| `transformers`   | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                    |
-| `cuda`, ...      | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                |
-| `ja`, `ko`, `th` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages).                                                                                                                                                        |
+| Name                   | Description                                                                                                                                                                                                                                                    |
+| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `lookups`              | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
+| `transformers`         | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                    |
+| `ray`                  | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training).                                                                                                                  |
+| `cuda`, ...            | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                |
+| `ja`, `ko`, `th`, `zh` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages).                                                                                                                                                        |
 
 ### conda {#conda}
 
@@ -88,8 +89,8 @@ $ conda install -c conda-forge spacy
 ```
 
 For the feedstock including the build recipe and configuration, check out
-[this repository](https://github.com/conda-forge/spacy-feedstock). Improvements
-and pull requests to the recipe and setup are always appreciated.
+[this repository](https://github.com/conda-forge/spacy-feedstock). Note that we
+currently don't publish any [pre-releases](#changelog-pre) on conda.
 
 ### Upgrading spaCy {#upgrading}
 
@@ -116,7 +117,7 @@ are printed. It's recommended to run the command with `python -m` to make sure
 you're executing the correct version of spaCy.
 
 ```cli
-$ pip install -U spacy
+$ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
 $ python -m spacy validate
 ```
 
@@ -134,7 +135,7 @@ specifier allows cupy to be installed via wheel, saving some compilation time.
 The specifiers should install [`cupy`](https://cupy.chainer.org).
 
 ```bash
-$ pip install -U spacy[cuda92]
+$ pip install -U %%SPACY_PKG_NAME[cuda92]%%SPACY_PKG_FLAGS
 ```
 
 Once you have a GPU-enabled installation, the best way to activate it is to call
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 1964bac18..f669c0a84 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -166,7 +166,7 @@ lookup lemmatizer looks up the token surface form in the lookup table without
 reference to the token's part-of-speech or context.
 
 ```python
-# pip install spacy-lookups-data
+# pip install -U %%SPACY_PKG_NAME[lookups]%%SPACY_PKG_FLAGS
 import spacy
 
 nlp = spacy.blank("sv")
@@ -181,7 +181,7 @@ rule-based lemmatizer can be added using rule tables from
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data):
 
 ```python
-# pip install spacy-lookups-data
+# pip install -U %%SPACY_PKG_NAME[lookups]%%SPACY_PKG_FLAGS
 import spacy
 
 nlp = spacy.blank("de")
@@ -1801,7 +1801,10 @@ print(doc2[5].tag_, doc2[5].pos_)  # WP PRON
 
 <Infobox variant="warning" title="Migrating from spaCy v2.x">
 
-The [`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules** in the v2.x format via its built-in methods or when the component is initialized before training. See the [migration guide](/usage/v3#migrating-training-mappings-exceptions) for details.
+The [`AttributeRuler`](/api/attributeruler) can import a **tag map and morph
+rules** in the v2.x format via its built-in methods or when the component is
+initialized before training. See the
+[migration guide](/usage/v3#migrating-training-mappings-exceptions) for details.
 
 </Infobox>
 
diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md
index fe3ee6e04..8c8875b9e 100644
--- a/website/docs/usage/models.md
+++ b/website/docs/usage/models.md
@@ -54,7 +54,7 @@ contribute to development.
 > separately in the same environment:
 >
 > ```bash
-> $ pip install spacy[lookups]
+> $ pip install -U %%SPACY_PKG_NAME[lookups]%%SPACY_PKG_FLAGS
 > ```
 
 import Languages from 'widgets/languages.js'
@@ -287,7 +287,7 @@ The download command will [install the package](/usage/models#download-pip) via
 pip and place the package in your `site-packages` directory.
 
 ```cli
-$ pip install -U spacy
+$ pip install -U %%SPACY_PKG_NAME%%SPACY_PKG_FLAGS
 $ python -m spacy download en_core_web_sm
 ```
 
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 5fced922d..409236fbc 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -813,7 +813,7 @@ full embedded visualizer, as well as individual components.
 > #### Installation
 >
 > ```bash
-> $ pip install "spacy-streamlit>=1.0.0a0"
+> $ pip install spacy-streamlit --pre
 > ```
 
 ![](../images/spacy-streamlit.png)
@@ -911,7 +911,7 @@ https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main.
 > #### Installation
 >
 > ```cli
-> $ pip install spacy-ray
+> $ pip install -U %%SPACY_PKG_NAME[ray]%%SPACY_PKG_FLAGS
 > # Check that the CLI is registered
 > $ python -m spacy ray --help
 > ```
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index e63e25e52..04924a431 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -1249,7 +1249,7 @@ valid.
 > #### Installation
 >
 > ```cli
-> $ pip install spacy-ray
+> $ pip install -U %%SPACY_PKG_NAME[ray]%%SPACY_PKG_FLAGS
 > # Check that the CLI is registered
 > $ python -m spacy ray --help
 > ```
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 1024a2551..0f30029e7 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -236,7 +236,7 @@ treebank.
 > #### Example
 >
 > ```cli
-> $ pip install spacy-ray
+> $ pip install -U %%SPACY_PKG_NAME[ray]%%SPACY_PKG_FLAGS
 > # Check that the CLI is registered
 > $ python -m spacy ray --help
 > # Train a pipeline
@@ -272,7 +272,7 @@ add to your pipeline and customize for your use case:
 > #### Example
 >
 > ```python
-> # pip install spacy-lookups-data
+> # pip install -U %%SPACY_PKG_NAME[lookups]%%SPACY_PKG_FLAGS
 > nlp = spacy.blank("en")
 > nlp.add_pipe("lemmatizer")
 > ```
diff --git a/website/gatsby-config.js b/website/gatsby-config.js
index 4650711ac..5b11f56bc 100644
--- a/website/gatsby-config.js
+++ b/website/gatsby-config.js
@@ -30,6 +30,8 @@ const branch = isNightly ? 'develop' : 'master'
 const replacements = {
     GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`,
     GITHUB_PROJECTS: `https://github.com/${site.projectsRepo}`,
+    SPACY_PKG_NAME: isNightly ? 'spacy-nightly' : 'spacy',
+    SPACY_PKG_FLAGS: isNightly ? ' --pre' : '',
 }
 
 /**
diff --git a/website/src/widgets/changelog.js b/website/src/widgets/changelog.js
index 73890d320..c5aca9b62 100644
--- a/website/src/widgets/changelog.js
+++ b/website/src/widgets/changelog.js
@@ -97,7 +97,10 @@ const Changelog = () => {
             <p>
                 Pre-releases include alpha and beta versions, as well as release candidates. They
                 are not intended for production use. You can download spaCy pre-releases via the{' '}
-                <InlineCode>spacy-nightly</InlineCode> package on pip.
+                <Link to="https://pypi.org/project/spacy-nightly">
+                    <InlineCode>spacy-nightly</InlineCode>
+                </Link>{' '}
+                package on pip.
             </p>
 
             <p>
diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index 6fe7f4cdf..ac1d7c5c7 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -28,7 +28,8 @@ import irlBackground from '../images/spacy-irl.jpg'
 
 import Benchmarks from 'usage/_benchmarks-models.md'
 
-const CODE_EXAMPLE = `# pip install spacy
+function getCodeExample(nightly) {
+    return `# pip install -U ${nightly ? 'spacy-nightly --pre' : 'spacy'}
 # python -m spacy download en_core_web_sm
 import spacy
 
@@ -52,9 +53,11 @@ print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])
 for entity in doc.ents:
     print(entity.text, entity.label_)
 `
+}
 
 const Landing = ({ data }) => {
-    const { counts } = data
+    const { counts, nightly } = data
+    const codeExample = getCodeExample(nightly)
     return (
         <>
             <LandingHeader nightly={data.nightly}>
@@ -91,7 +94,7 @@ const Landing = ({ data }) => {
             </LandingGrid>
 
             <LandingGrid>
-                <LandingDemo title="Edit the code & try spaCy">{CODE_EXAMPLE}</LandingDemo>
+                <LandingDemo title="Edit the code &amp; try spaCy">{codeExample}</LandingDemo>
 
                 <LandingCol>
                     <H2>Features</H2>
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index ab91b8e30..37ae10da4 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -141,6 +141,11 @@ const QuickstartInstall = ({ id, title }) => {
                         setters={setters}
                         showDropdown={showDropdown}
                     >
+                        {nightly && (
+                            <QS package="conda" comment prompt={false}>
+                                # 🚨 Nightly releases are currently only available via pip
+                            </QS>
+                        )}
                         <QS config="venv">python -m venv .env</QS>
                         <QS config="venv" os="mac">
                             source .env/bin/activate
@@ -175,9 +180,9 @@ const QuickstartInstall = ({ id, title }) => {
                         </QS>
                         <QS package="source">pip install -r requirements.txt</QS>
                         <QS package="source">python setup.py build_ext --inplace</QS>
-                        <QS package="source" config="train">
-                            pip install -e '.[{pipExtras}]'
-                        </QS>
+                        {(train || hardware == 'gpu') && (
+                            <QS package="source">pip install -e '.[{pipExtras}]'</QS>
+                        )}
 
                         <QS config="train" package="conda">
                             conda install -c conda-forge spacy-transformers

From 1e7560f3277e987f16745437d92ca67ef12096e2 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 8 Oct 2020 11:10:48 +0200
Subject: [PATCH 465/516] Update pin [ci skip]

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 424b1ff8e..e44d01026 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -69,7 +69,7 @@ lookups =
 transformers =
     spacy_transformers>=1.0.0a17,<1.0.0
 ray =
-    spacy_ray>=0.0.1,<1.0.0
+    spacy_ray>=0.1.0,<1.0.0
 cuda =
     cupy>=5.0.0b4,<9.0.0
 cuda80 =

From d1602e1ecea2d3ff14288df65e8f1a25ec634516 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 8 Oct 2020 11:56:50 +0200
Subject: [PATCH 466/516] Update docs [ci skip]

---
 website/docs/usage/embeddings-transformers.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index c0611787b..549c3bcc4 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -235,7 +235,7 @@ The `Transformer` component sets the
 which lets you access the transformers outputs at runtime.
 
 ```cli
-$ python -m spacy download en_core_trf_lg
+$ python -m spacy download en_core_web_trf
 ```
 
 ```python
@@ -249,7 +249,7 @@ from thinc.api import use_pytorch_for_gpu_memory, require_gpu
 use_pytorch_for_gpu_memory()
 require_gpu(0)
 
-nlp = spacy.load("en_core_trf_lg")
+nlp = spacy.load("en_core_web_trf")
 for doc in nlp.pipe(["some text", "some other text"]):
     tokvecs = doc._.trf_data.tensors[-1]
 ```
@@ -269,7 +269,7 @@ def custom_annotation_setter(docs, trf_data):
     for doc, data in zip(docs, doc_data):
         doc._.custom_attr = data
 
-nlp = spacy.load("en_core_trf_lg")
+nlp = spacy.load("en_core_web_trf")
 nlp.get_pipe("transformer").set_extra_annotations = custom_annotation_setter
 doc = nlp("This is a text")
 assert isinstance(doc._.custom_attr, TransformerData)
@@ -286,7 +286,7 @@ of objects by referring to creation functions, including functions you register
 yourself. For details on how to get started with training your own model, check
 out the [training quickstart](/usage/training#quickstart).
 
-<!-- TODO: <Project id="en_core_trf_lg">
+<!-- TODO: <Project id="pipelines/transformers">
 
 The easiest way to get started is to clone a transformers-based project
 template. Swap in your data, edit the settings and hyperparameters and train,

From 741796e5007700622b2c80e5a7f0b3bc7a060d7d Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 8 Oct 2020 14:31:34 +0200
Subject: [PATCH 467/516] Update docs [ci skip]

---
 website/meta/languages.json     | 2 +-
 website/src/templates/models.js | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/meta/languages.json b/website/meta/languages.json
index 5b54c1977..a7ab28f03 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -145,7 +145,7 @@
         {
             "code": "pt",
             "name": "Portuguese",
-            "models": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg"],
+            "models": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg", "pt_dep_news_trf"],
             "example": "Esta é uma frase.",
             "has_examples": true
         },
diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index f9895334d..82dc554fe 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -44,7 +44,7 @@ const MODEL_META = {
     las: 'Labelled dependencies',
     token_acc: 'Tokenization',
     tok: 'Tokenization',
-    lemma: 'Statistical lemmatization',
+    lemma: 'Lemmatization',
     morph: 'Morphological analysis',
     tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
     tag: 'Part-of-speech tags (fine grained tags, Token.tag)',

From 8ff73f04dbc6deba6fd051751b82bdf610e8772f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 8 Oct 2020 14:44:35 +0200
Subject: [PATCH 468/516] Fix morph in Doc.to_json

---
 spacy/tests/doc/test_to_json.py | 10 +++++++++-
 spacy/tokens/doc.pyx            |  4 +---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py
index 9abe5779d..9ebee6c88 100644
--- a/spacy/tests/doc/test_to_json.py
+++ b/spacy/tests/doc/test_to_json.py
@@ -10,8 +10,16 @@ def doc(en_vocab):
     heads = [0, 0, 0]
     deps = ["ROOT", "dobj", "dobj"]
     ents = ["O", "B-ORG", "O"]
+    morphs = ["Feat1=A", "Feat1=B", "Feat1=A|Feat2=D"]
     return Doc(
-        en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
+        en_vocab,
+        words=words,
+        pos=pos,
+        tags=tags,
+        heads=heads,
+        deps=deps,
+        ents=ents,
+        morphs=morphs,
     )
 
 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 0499dc4a7..4a57e4c83 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1398,8 +1398,6 @@ cdef class Doc:
         attributes. Attribute values need to be JSON-serializable. Values will
         be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
         RETURNS (dict): The data in spaCy's JSON format.
-
-        DOCS: https://nightly.spacy.io/api/doc#to_json
         """
         data = {"text": self.text}
         if self.has_annotation("ENT_IOB"):
@@ -1421,7 +1419,7 @@ cdef class Doc:
             if include_annotation["POS"]:
                 token_data["pos"] = token.pos_
             if include_annotation["MORPH"]:
-                token_data["morph"] = token.morph_
+                token_data["morph"] = token.morph
             if include_annotation["LEMMA"]:
                 token_data["lemma"] = token.lemma_
             if include_annotation["DEP"]:

From 5ebd1fc2cfc5e53611b496eec27375570410b03b Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 8 Oct 2020 16:23:12 +0200
Subject: [PATCH 469/516] Update docs [ci skip]

---
 website/docs/models/index.md              | 28 ++++++-----------------
 website/docs/usage/_benchmarks-models.md  |  4 ++--
 website/docs/usage/linguistic-features.md | 10 ++++----
 website/src/templates/models.js           |  4 ++--
 4 files changed, 17 insertions(+), 29 deletions(-)

diff --git a/website/docs/models/index.md b/website/docs/models/index.md
index 64e719f37..5b17d7f83 100644
--- a/website/docs/models/index.md
+++ b/website/docs/models/index.md
@@ -6,32 +6,18 @@ menu:
   - ['Conventions', 'conventions']
 ---
 
-<!-- Update page, refer to new /api/architectures and training docs -->
-
-This directory includes two types of packages:
-
-1. **Trained pipelines:** General-purpose spaCy pipelines to predict named
-   entities, part-of-speech tags and syntactic dependencies. Can be used
-   out-of-the-box and fine-tuned on more specific data.
-2. **Starters:** Transfer learning starter packs with pretrained weights you can
-   initialize your pipeline models with to achieve better accuracy. They can
-   include word vectors (which will be used as features during training) or
-   other pretrained representations like BERT. These packages don't include
-   components for specific tasks like NER or text classification and are
-   intended to be used as base models when training your own models.
+<!-- TODO: include interactive demo -->
 
 ### Quickstart {hidden="true"}
 
+> #### 📖 Installation and usage
+>
+> For more details on how to use trained pipelines with spaCy, see the
+> [usage guide](/usage/models).
+
 import QuickstartModels from 'widgets/quickstart-models.js'
 
-<QuickstartModels title="Quickstart" id="quickstart" description="Install a default model, get the code to load it from within spaCy and test it." />
-
-<Infobox title="Installation and usage" emoji="📖">
-
-For more details on how to use trained pipelines with spaCy, see the
-[usage guide](/usage/models).
-
-</Infobox>
+<QuickstartModels id="quickstart" />
 
 ## Package naming conventions {#conventions}
 
diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index 88e79112f..a604c4b57 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -1,13 +1,13 @@
 import { Help } from 'components/typography'; import Link from 'components/link'
 
-<!-- TODO: update numbers -->
+<!-- TODO: update numbers, add note on previous NER evaluation issues -->
 
 <figure>
 
 | Pipeline                                                   | Parser | Tagger |  NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
 | ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
 | [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) |        |        |      |                                                                     |                                                                 6k |
-| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)   |   92.1 |   97.4 | 87.0 |                                                                  7k |                                                                    |
+| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)   |   92.2 |   97.4 | 85.8 |                                                                  7k |                                                                    |
 | `en_core_web_lg` (spaCy v2)                                |   91.9 |   97.2 | 85.9 |                                                                 10k |                                                                    |
 
 <figcaption class="caption">
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index f669c0a84..6dbf2525e 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -970,8 +970,8 @@ import spacy
 from spacy.tokenizer import Tokenizer
 
 special_cases = {":)": [{"ORTH": ":)"}]}
-prefix_re = re.compile(r'''^[\[\("']''')
-suffix_re = re.compile(r'''[\]\)"']$''')
+prefix_re = re.compile(r'''^[\\[\\("']''')
+suffix_re = re.compile(r'''[\\]\\)"']$''')
 infix_re = re.compile(r'''[-~]''')
 simple_url_re = re.compile(r'''^https?://''')
 
@@ -1592,7 +1592,9 @@ print("After:", [(token.text, token._.is_musician) for token in doc])
 A [`Doc`](/api/doc) object's sentences are available via the `Doc.sents`
 property. To view a `Doc`'s sentences, you can iterate over the `Doc.sents`, a
 generator that yields [`Span`](/api/span) objects. You can check whether a `Doc`
-has sentence boundaries with the `doc.is_sentenced` attribute.
+has sentence boundaries by calling
+[`Doc.has_annotation`](/api/doc#has_annotation) with the attribute name
+`"SENT_START"`.
 
 ```python
 ### {executable="true"}
@@ -1600,7 +1602,7 @@ import spacy
 
 nlp = spacy.load("en_core_web_sm")
 doc = nlp("This is a sentence. This is another sentence.")
-assert doc.is_sentenced
+assert doc.has_annotation("SENT_START")
 for sent in doc.sents:
     print(sent.text)
 ```
diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 82dc554fe..9c6f595da 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -403,8 +403,8 @@ const Models = ({ pageContext, repo, children }) => {
                 <Section>
                     <p>
                         Starter packs are pretrained weights you can initialize your models with to
-                        achieve better accuracy. They can include word vectors (which will be used
-                        as features during training) or other pretrained representations like BERT.
+                        achieve better accuracy, like word vectors (which will be used as features
+                        during training).
                     </p>
                 </Section>
             )}

From d093d6343b3cd8ab4814037e5e75bbff3177690b Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 8 Oct 2020 21:33:49 +0200
Subject: [PATCH 470/516] TrainablePipe (#6213)

* rename Pipe to TrainablePipe

* split functionality between Pipe and TrainablePipe

* remove unnecessary methods from certain components

* cleanup

* hasattr(component, "pipe") should be sufficient again

* remove serialization and vocab/cfg from Pipe

* unify _ensure_examples and validate_examples

* small fixes

* hasattr checks for self.cfg and self.vocab

* make is_resizable and is_trainable properties

* serialize strings.json instead of vocab

* fix KB IO + tests

* fix typos

* more typos

* _added_strings as a set

* few more tests specifically for _added_strings field

* bump to 3.0.0a36
---
 setup.py                                      |   1 +
 spacy/about.py                                |   2 +-
 spacy/errors.py                               |  10 +-
 spacy/kb.pxd                                  |   1 +
 spacy/kb.pyx                                  |  49 +--
 spacy/language.py                             |  44 +--
 spacy/pipeline/__init__.py                    |   2 +
 spacy/pipeline/attributeruler.py              |  30 +-
 spacy/pipeline/entity_linker.py               |  13 +-
 spacy/pipeline/entityruler.py                 |   6 -
 spacy/pipeline/lemmatizer.py                  |   4 -
 spacy/pipeline/morphologizer.pyx              |   9 +-
 spacy/pipeline/multitask.pyx                  |   4 +-
 spacy/pipeline/pipe.pxd                       |   3 -
 spacy/pipeline/pipe.pyx                       | 319 ++---------------
 spacy/pipeline/sentencizer.pyx                |   9 -
 spacy/pipeline/senter.pyx                     |   6 +-
 spacy/pipeline/tagger.pyx                     |  88 +----
 spacy/pipeline/textcat.py                     |  19 +-
 spacy/pipeline/tok2vec.py                     |   9 +-
 spacy/pipeline/trainable_pipe.pxd             |   8 +
 spacy/pipeline/trainable_pipe.pyx             | 322 ++++++++++++++++++
 spacy/pipeline/transition_parser.pxd          |   4 +-
 spacy/pipeline/transition_parser.pyx          |  19 +-
 spacy/schemas.py                              |   2 +-
 spacy/tests/pipeline/test_entity_linker.py    |  27 +-
 spacy/tests/pipeline/test_morphologizer.py    |   5 +-
 spacy/tests/pipeline/test_pipe_methods.py     |   4 +-
 spacy/tests/pipeline/test_senter.py           |   5 +-
 spacy/tests/pipeline/test_tagger.py           |   8 +-
 spacy/tests/pipeline/test_textcat.py          |   6 +-
 spacy/tests/regression/test_issue4001-4500.py |   4 +-
 spacy/tests/regression/test_issue5230.py      |   7 +-
 .../serialize/test_serialize_pipeline.py      |  49 ++-
 spacy/training/__init__.py                    |   2 +-
 spacy/training/example.pyx                    |  18 +
 spacy/training/loggers.py                     |   2 +-
 spacy/training/loop.py                        |   2 +-
 spacy/util.py                                 |  13 +
 website/docs/api/pipe.md                      | 102 +++---
 website/docs/usage/101/_architecture.md       |   3 +-
 website/docs/usage/layers-architectures.md    |  20 +-
 website/docs/usage/processing-pipelines.md    |  26 +-
 website/docs/usage/v3.md                      |  24 +-
 44 files changed, 687 insertions(+), 623 deletions(-)
 create mode 100644 spacy/pipeline/trainable_pipe.pxd
 create mode 100644 spacy/pipeline/trainable_pipe.pyx

diff --git a/setup.py b/setup.py
index 4a4b99f22..604d65745 100755
--- a/setup.py
+++ b/setup.py
@@ -37,6 +37,7 @@ MOD_NAMES = [
     "spacy.pipeline.multitask",
     "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
+    "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
     "spacy.pipeline.senter",
     "spacy.pipeline.tagger",
diff --git a/spacy/about.py b/spacy/about.py
index 108689074..095d726a0 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a35"
+__version__ = "3.0.0a36"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/errors.py b/spacy/errors.py
index bf3628ce9..2bc2f3e20 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -522,14 +522,12 @@ class Errors:
     E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
             "but the provided argument {loc} points to a file.")
     E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
-    E930 = ("Received invalid get_examples callback in `{name}.initialize`. "
+    E930 = ("Received invalid get_examples callback in `{method}`. "
             "Expected function that returns an iterable of Example objects but "
             "got: {obj}")
-    E931 = ("Encountered Pipe subclass without `Pipe.{method}` method in component "
-            "'{name}'. If the component is trainable and you want to use this "
-            "method, make sure it's overwritten on the subclass. If your "
-            "component isn't trainable, add a method that does nothing or "
-            "don't use the Pipe base class.")
+    E931 = ("Encountered {parent} subclass without `{parent}.{method}` "
+            "method in component '{name}'. If you want to use this "
+            "method, make sure it's overwritten on the subclass.")
     E940 = ("Found NaN values in scores.")
     E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
             "model from a shortcut, which is deprecated as of spaCy v3.0. To "
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 4a71b26a2..d61bd43fa 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -30,6 +30,7 @@ cdef class KnowledgeBase:
     cdef Pool mem
     cpdef readonly Vocab vocab
     cdef int64_t entity_vector_length
+    cdef public set _added_strings
 
     # This maps 64bit keys (hash of unique entity string)
     # to 64bit values (position of the _KBEntryC struct in the _entries vector).
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index bdf652766..478579d71 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,5 +1,7 @@
 # cython: infer_types=True, profile=True
-from typing import Iterator
+from typing import Iterator, Iterable
+
+import srsly
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from cpython.exc cimport PyErr_SetFromErrno
@@ -10,13 +12,10 @@ from libcpp.vector cimport vector
 from pathlib import Path
 import warnings
 
-from spacy.strings import StringStore
-
-from spacy import util
-
 from .typedefs cimport hash_t
 from .errors import Errors, Warnings
-
+from . import util
+from .util import SimpleFrozenList, ensure_path
 
 cdef class Candidate:
     """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
@@ -85,9 +84,6 @@ cdef class KnowledgeBase:
     DOCS: https://nightly.spacy.io/api/kb
     """
 
-    contents_loc = "contents"
-    strings_loc = "strings.json"
-
     def __init__(self, Vocab vocab, entity_vector_length):
         """Create a KnowledgeBase."""
         self.mem = Pool()
@@ -95,8 +91,8 @@ cdef class KnowledgeBase:
         self._entry_index = PreshMap()
         self._alias_index = PreshMap()
         self.vocab = vocab
-        self.vocab.strings.add("")
         self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
+        self._added_strings = set()
 
     @property
     def entity_vector_length(self):
@@ -118,12 +114,16 @@ cdef class KnowledgeBase:
     def get_alias_strings(self):
         return [self.vocab.strings[x] for x in self._alias_index]
 
+    def add_string(self, string: str):
+        self._added_strings.add(string)
+        return self.vocab.strings.add(string)
+
     def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
         """
         Add an entity to the KB, optionally specifying its log probability based on corpus frequency
         Return the hash of the entity ID/name at the end.
         """
-        cdef hash_t entity_hash = self.vocab.strings.add(entity)
+        cdef hash_t entity_hash = self.add_string(entity)
 
         # Return if this entity was added before
         if entity_hash in self._entry_index:
@@ -157,7 +157,7 @@ cdef class KnowledgeBase:
         cdef hash_t entity_hash
         while i < len(entity_list):
             # only process this entity if its unique ID hadn't been added before
-            entity_hash = self.vocab.strings.add(entity_list[i])
+            entity_hash = self.add_string(entity_list[i])
             if entity_hash in self._entry_index:
                 warnings.warn(Warnings.W018.format(entity=entity_list[i]))
 
@@ -203,7 +203,7 @@ cdef class KnowledgeBase:
         if prob_sum > 1.00001:
             raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
 
-        cdef hash_t alias_hash = self.vocab.strings.add(alias)
+        cdef hash_t alias_hash = self.add_string(alias)
 
         # Check whether this alias was added before
         if alias_hash in self._alias_index:
@@ -324,26 +324,27 @@ cdef class KnowledgeBase:
 
         return 0.0
 
-    def to_disk(self, path):
-        path = util.ensure_path(path)
+    def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+        path = ensure_path(path)
         if not path.exists():
             path.mkdir(parents=True)
         if not path.is_dir():
             raise ValueError(Errors.E928.format(loc=path))
-        self.write_contents(path / self.contents_loc)
-        self.vocab.strings.to_disk(path / self.strings_loc)
+        serialize = {}
+        serialize["contents"] = lambda p: self.write_contents(p)
+        serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings)
+        util.to_disk(path, serialize, exclude)
 
-    def from_disk(self, path):
-        path = util.ensure_path(path)
+    def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+        path = ensure_path(path)
         if not path.exists():
             raise ValueError(Errors.E929.format(loc=path))
         if not path.is_dir():
             raise ValueError(Errors.E928.format(loc=path))
-        self.read_contents(path / self.contents_loc)
-        kb_strings = StringStore()
-        kb_strings.from_disk(path / self.strings_loc)
-        for string in kb_strings:
-            self.vocab.strings.add(string)
+        deserialize = {}
+        deserialize["contents"] = lambda p: self.read_contents(p)
+        deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)]
+        util.from_disk(path, deserialize, exclude)
 
     def write_contents(self, file_path):
         cdef Writer writer = Writer(file_path)
diff --git a/spacy/language.py b/spacy/language.py
index b438936a6..1fb559657 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -20,7 +20,7 @@ from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .training import Example, validate_examples
 from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
-from .util import registry, SimpleFrozenList
+from .util import registry, SimpleFrozenList, _pipe
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
 from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
@@ -1095,7 +1095,7 @@ class Language:
                 if (
                     name not in exclude
                     and hasattr(proc, "is_trainable")
-                    and proc.is_trainable()
+                    and proc.is_trainable
                     and proc.model not in (True, False, None)
                 ):
                     proc.finish_update(sgd)
@@ -1194,8 +1194,8 @@ class Language:
             doc = Doc(self.vocab, words=["x", "y", "z"])
             get_examples = lambda: [Example.from_dict(doc, {})]
         if not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(name="Language", obj=type(get_examples))
-            raise ValueError(err)
+            err = Errors.E930.format(method="Language.initialize", obj=type(get_examples))
+            raise TypeError(err)
         # Make sure the config is interpolated so we can resolve subsections
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
@@ -1301,16 +1301,7 @@ class Language:
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("batch_size", batch_size)
-            # non-trainable components may have a pipe() implementation that refers to dummy
-            # predict and set_annotations methods
-            if (
-                not hasattr(pipe, "pipe")
-                or not hasattr(pipe, "is_trainable")
-                or not pipe.is_trainable()
-            ):
-                docs = _pipe(docs, pipe, kwargs)
-            else:
-                docs = pipe.pipe(docs, **kwargs)
+            docs = _pipe(docs, pipe, kwargs)
         # iterate over the final generator
         if len(self.pipeline):
             docs = list(docs)
@@ -1417,17 +1408,7 @@ class Language:
             kwargs = component_cfg.get(name, {})
             # Allow component_cfg to overwrite the top-level kwargs.
             kwargs.setdefault("batch_size", batch_size)
-            # non-trainable components may have a pipe() implementation that refers to dummy
-            # predict and set_annotations methods
-            if (
-                hasattr(proc, "pipe")
-                and hasattr(proc, "is_trainable")
-                and proc.is_trainable()
-            ):
-                f = functools.partial(proc.pipe, **kwargs)
-            else:
-                # Apply the function, but yield the doc
-                f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
+            f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
             pipes.append(f)
 
         if n_process != 1:
@@ -1826,19 +1807,6 @@ class DisabledPipes(list):
         self[:] = []
 
 
-def _pipe(
-    examples: Iterable[Example], proc: Callable[[Doc], Doc], kwargs: Dict[str, Any]
-) -> Iterator[Example]:
-    # We added some args for pipe that __call__ doesn't expect.
-    kwargs = dict(kwargs)
-    for arg in ["batch_size"]:
-        if arg in kwargs:
-            kwargs.pop(arg)
-    for eg in examples:
-        eg = proc(eg, **kwargs)
-        yield eg
-
-
 def _apply_pipes(
     make_doc: Callable[[str], Doc],
     pipes: Iterable[Callable[[Doc], Doc]],
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 656182088..cec5b4eb5 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -6,6 +6,7 @@ from .entityruler import EntityRuler
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
 from .pipe import Pipe
+from .trainable_pipe import TrainablePipe
 from .senter import SentenceRecognizer
 from .sentencizer import Sentencizer
 from .tagger import Tagger
@@ -21,6 +22,7 @@ __all__ = [
     "EntityRuler",
     "Morphologizer",
     "Lemmatizer",
+    "TrainablePipe",
     "Pipe",
     "SentenceRecognizer",
     "Sentencizer",
diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index 0ab1ac9bf..7a6a1de5b 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -57,6 +57,7 @@ class AttributeRuler(Pipe):
         self.attrs = []
         self._attrs_unnormed = []  # store for reference
         self.indices = []
+        self._added_strings = set()
 
     def clear(self) -> None:
         """Reset all patterns."""
@@ -123,21 +124,6 @@ class AttributeRuler(Pipe):
             set_token_attrs(span[index], attrs)
         return doc
 
-    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
-        """Apply the pipe to a stream of documents. This usually happens under
-        the hood when the nlp object is called on a text and all components are
-        applied to the Doc.
-
-        stream (Iterable[Doc]): A stream of documents.
-        batch_size (int): The number of documents to buffer.
-        YIELDS (Doc): Processed documents in order.
-
-        DOCS: https://spacy.io/attributeruler/pipe#pipe
-        """
-        for doc in stream:
-            doc = self(doc)
-            yield doc
-
     def load_from_tag_map(
         self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]]
     ) -> None:
@@ -201,12 +187,16 @@ class AttributeRuler(Pipe):
         # We need to make a string here, because otherwise the ID we pass back
         # will be interpreted as the hash of a string, rather than an ordinal.
         key = str(len(self.attrs))
-        self.matcher.add(self.vocab.strings.add(key), patterns)
+        self.matcher.add(self.add_string(key), patterns)
         self._attrs_unnormed.append(attrs)
         attrs = normalize_token_attrs(self.vocab, attrs)
         self.attrs.append(attrs)
         self.indices.append(index)
 
+    def add_string(self, string: str):
+        self._added_strings.add(string)
+        return self.vocab.strings.add(string)
+
     def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None:
         """Add patterns from a list of pattern dicts with the keys as the
         arguments to AttributeRuler.add.
@@ -266,8 +256,8 @@ class AttributeRuler(Pipe):
         DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes
         """
         serialize = {}
-        serialize["vocab"] = self.vocab.to_bytes
         serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns)
+        serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings))
         return util.to_bytes(serialize, exclude)
 
     def from_bytes(
@@ -286,7 +276,7 @@ class AttributeRuler(Pipe):
             self.add_patterns(srsly.msgpack_loads(b))
 
         deserialize = {
-            "vocab": lambda b: self.vocab.from_bytes(b),
+            "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)],
             "patterns": load_patterns,
         }
         util.from_bytes(bytes_data, deserialize, exclude)
@@ -303,7 +293,7 @@ class AttributeRuler(Pipe):
         DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
         """
         serialize = {
-            "vocab": lambda p: self.vocab.to_disk(p),
+            "strings.json": lambda p: srsly.write_json(p, self._added_strings),
             "patterns": lambda p: srsly.write_msgpack(p, self.patterns),
         }
         util.to_disk(path, serialize, exclude)
@@ -324,7 +314,7 @@ class AttributeRuler(Pipe):
             self.add_patterns(srsly.read_msgpack(p))
 
         deserialize = {
-            "vocab": lambda p: self.vocab.from_disk(p),
+            "strings.json": lambda p: [self.add_string(s) for s in srsly.read_json(p)],
             "patterns": load_patterns,
         }
         util.from_disk(path, deserialize, exclude)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index eec591995..881e98785 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -10,10 +10,11 @@ import warnings
 from ..kb import KnowledgeBase, Candidate
 from ..ml import empty_kb
 from ..tokens import Doc
-from .pipe import Pipe, deserialize_config
+from .pipe import deserialize_config
+from .trainable_pipe import TrainablePipe
 from ..language import Language
 from ..vocab import Vocab
-from ..training import Example, validate_examples
+from ..training import Example, validate_examples, validate_get_examples
 from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList
 from .. import util
@@ -90,7 +91,7 @@ def make_entity_linker(
     )
 
 
-class EntityLinker(Pipe):
+class EntityLinker(TrainablePipe):
     """Pipeline component for named entity linking.
 
     DOCS: https://nightly.spacy.io/api/entitylinker
@@ -172,7 +173,7 @@ class EntityLinker(Pipe):
 
         DOCS: https://nightly.spacy.io/api/entitylinker#initialize
         """
-        self._ensure_examples(get_examples)
+        validate_get_examples(get_examples, "EntityLinker.initialize")
         if kb_loader is not None:
             self.set_kb(kb_loader)
         self.validate_kb()
@@ -453,7 +454,6 @@ class EntityLinker(Pipe):
         """
         serialize = {}
         serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
-        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["kb"] = lambda p: self.kb.to_disk(p)
         serialize["model"] = lambda p: self.model.to_disk(p)
         util.to_disk(path, serialize, exclude)
@@ -477,11 +477,12 @@ class EntityLinker(Pipe):
                 raise ValueError(Errors.E149) from None
 
         deserialize = {}
-        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
         deserialize["kb"] = lambda p: self.kb.from_disk(p)
         deserialize["model"] = load_model
         util.from_disk(path, deserialize, exclude)
+        for s in self.kb._added_strings:
+            self.vocab.strings.add(s)
         return self
 
     def rehearse(self, examples, *, sgd=None, losses=None, **config):
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index dfaddad74..382ca338d 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -342,12 +342,6 @@ class EntityRuler(Pipe):
         validate_examples(examples, "EntityRuler.score")
         return Scorer.score_spans(examples, "ents", **kwargs)
 
-    def predict(self, docs):
-        pass
-
-    def set_annotations(self, docs, scores):
-        pass
-
     def from_bytes(
         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> "EntityRuler":
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 9be596868..7f5370753 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -281,7 +281,6 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
         """
         serialize = {}
-        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["lookups"] = lambda p: self.lookups.to_disk(p)
         util.to_disk(path, serialize, exclude)
 
@@ -297,7 +296,6 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
         """
         deserialize = {}
-        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
         util.from_disk(path, deserialize, exclude)
         self._validate_tables()
@@ -312,7 +310,6 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
         """
         serialize = {}
-        serialize["vocab"] = self.vocab.to_bytes
         serialize["lookups"] = self.lookups.to_bytes
         return util.to_bytes(serialize, exclude)
 
@@ -328,7 +325,6 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
         """
         deserialize = {}
-        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
         util.from_bytes(bytes_data, deserialize, exclude)
         self._validate_tables()
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 6d97b062f..a456b7a0f 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -16,7 +16,7 @@ from .pipe import deserialize_config
 from .tagger import Tagger
 from .. import util
 from ..scorer import Scorer
-from ..training import validate_examples
+from ..training import validate_examples, validate_get_examples
 
 
 default_model_config = """
@@ -95,6 +95,7 @@ class Morphologizer(Tagger):
         # add mappings for empty morph
         self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
         self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
+        self._added_strings = set()
 
     @property
     def labels(self):
@@ -128,6 +129,7 @@ class Morphologizer(Tagger):
             label_dict.pop(self.POS_FEAT)
         # normalize morph string and add to morphology table
         norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+        self.add_string(norm_morph)
         # add label mappings
         if norm_label not in self.cfg["labels_morph"]:
             self.cfg["labels_morph"][norm_label] = norm_morph
@@ -144,7 +146,7 @@ class Morphologizer(Tagger):
 
         DOCS: https://nightly.spacy.io/api/morphologizer#initialize
         """
-        self._ensure_examples(get_examples)
+        validate_get_examples(get_examples, "Morphologizer.initialize")
         if labels is not None:
             self.cfg["labels_morph"] = labels["morph"]
             self.cfg["labels_pos"] = labels["pos"]
@@ -159,6 +161,7 @@ class Morphologizer(Tagger):
                     if pos:
                         morph_dict[self.POS_FEAT] = pos
                     norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
+                    self.add_string(norm_label)
                     # add label->morph and label->POS mappings
                     if norm_label not in self.cfg["labels_morph"]:
                         self.cfg["labels_morph"][norm_label] = morph
@@ -176,6 +179,7 @@ class Morphologizer(Tagger):
                 if pos:
                     morph_dict[self.POS_FEAT] = pos
                 norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
+                self.add_string(norm_label)
                 gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels])
             doc_sample.append(example.x)
             label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
@@ -234,6 +238,7 @@ class Morphologizer(Tagger):
                 if pos:
                     label_dict[self.POS_FEAT] = pos
                 label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                self.add_string(label)
                 eg_truths.append(label)
             truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
index fa304b842..e1ea49849 100644
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@@ -6,7 +6,7 @@ from thinc.api import set_dropout_rate
 
 from ..tokens.doc cimport Doc
 
-from .pipe import Pipe
+from .trainable_pipe import TrainablePipe
 from .tagger import Tagger
 from ..training import validate_examples
 from ..language import Language
@@ -164,7 +164,7 @@ class MultitaskObjective(Tagger):
             return "I-SENT"
 
 
-class ClozeMultitask(Pipe):
+class ClozeMultitask(TrainablePipe):
     def __init__(self, vocab, model, **cfg):
         self.vocab = vocab
         self.model = model
diff --git a/spacy/pipeline/pipe.pxd b/spacy/pipeline/pipe.pxd
index bca94d528..bb97f79d0 100644
--- a/spacy/pipeline/pipe.pxd
+++ b/spacy/pipeline/pipe.pxd
@@ -1,5 +1,2 @@
 cdef class Pipe:
-    cdef public object vocab
-    cdef public object model
     cdef public str name
-    cdef public object cfg
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 50e5108b9..afb59fdb3 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -1,38 +1,22 @@
 # cython: infer_types=True, profile=True
 import warnings
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Iterable, Iterator, Callable, Union, Dict
 import srsly
-from thinc.api import set_dropout_rate, Model
 
 from ..tokens.doc cimport Doc
 
-from ..training import validate_examples
+from ..training import Example
 from ..errors import Errors, Warnings
-from .. import util
-
+from ..language import Language
 
 cdef class Pipe:
-    """This class is a base class and not instantiated directly. Trainable
-    pipeline components like the EntityRecognizer or TextCategorizer inherit
-    from it and it defines the interface that components should follow to
-    function as trainable components in a spaCy pipeline.
+    """This class is a base class and not instantiated directly. It provides
+    an interface for pipeline components to implement.
+    Trainable pipeline components like the EntityRecognizer or TextCategorizer
+    should inherit from the subclass 'TrainablePipe'.
 
     DOCS: https://nightly.spacy.io/api/pipe
     """
-    def __init__(self, vocab, model, name, **cfg):
-        """Initialize a pipeline component.
-
-        vocab (Vocab): The shared vocabulary.
-        model (thinc.api.Model): The Thinc Model powering the pipeline component.
-        name (str): The component instance name.
-        **cfg: Additonal settings and config parameters.
-
-        DOCS: https://nightly.spacy.io/api/pipe#init
-        """
-        self.vocab = vocab
-        self.model = model
-        self.name = name
-        self.cfg = dict(cfg)
 
     @classmethod
     def __init_subclass__(cls, **kwargs):
@@ -41,18 +25,7 @@ cdef class Pipe:
         if hasattr(cls, "begin_training"):
             warnings.warn(Warnings.W088.format(name=cls.__name__))
 
-    @property
-    def labels(self) -> Optional[Tuple[str]]:
-        return []
-
-    @property
-    def label_data(self):
-        """Optional JSON-serializable data that would be sufficient to recreate
-        the label set if provided to the `pipe.initialize()` method.
-        """
-        return None
-
-    def __call__(self, Doc doc):
+    def __call__(self, Doc doc) -> Doc:
         """Apply the pipe to one document. The document is modified in place,
         and returned. This usually happens under the hood when the nlp object
         is called on a text and all components are applied to the Doc.
@@ -62,11 +35,9 @@ cdef class Pipe:
 
         DOCS: https://nightly.spacy.io/api/pipe#call
         """
-        scores = self.predict([doc])
-        self.set_annotations([doc], scores)
-        return doc
+        raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name))
 
-    def pipe(self, stream, *, batch_size=128):
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -77,137 +48,17 @@ cdef class Pipe:
 
         DOCS: https://nightly.spacy.io/api/pipe#pipe
         """
-        for docs in util.minibatch(stream, size=batch_size):
-            scores = self.predict(docs)
-            self.set_annotations(docs, scores)
-            yield from docs
+        for doc in stream:
+            doc = self(doc)
+            yield doc
 
-    def predict(self, docs):
-        """Apply the pipeline's model to a batch of docs, without modifying them.
-        Returns a single tensor for a batch of documents.
-
-        docs (Iterable[Doc]): The documents to predict.
-        RETURNS: Vector representations for each token in the documents.
-
-        DOCS: https://nightly.spacy.io/api/pipe#predict
-        """
-        raise NotImplementedError(Errors.E931.format(method="predict", name=self.name))
-
-    def set_annotations(self, docs, scores):
-        """Modify a batch of documents, using pre-computed scores.
-
-        docs (Iterable[Doc]): The documents to modify.
-        scores: The scores to assign.
-
-        DOCS: https://nightly.spacy.io/api/pipe#set_annotations
-        """
-        raise NotImplementedError(Errors.E931.format(method="set_annotations", name=self.name))
-
-    def update(self, examples, *, drop=0.0, set_annotations=False, sgd=None, losses=None):
-        """Learn from a batch of documents and gold-standard information,
-        updating the pipe's model. Delegates to predict and get_loss.
-
-        examples (Iterable[Example]): A batch of Example objects.
-        drop (float): The dropout rate.
-        set_annotations (bool): Whether or not to update the Example objects
-            with the predictions.
-        sgd (thinc.api.Optimizer): The optimizer.
-        losses (Dict[str, float]): Optional record of the loss during training.
-            Updated using the component name as the key.
-        RETURNS (Dict[str, float]): The updated losses dictionary.
-
-        DOCS: https://nightly.spacy.io/api/pipe#update
-        """
-        if losses is None:
-            losses = {}
-        if not hasattr(self, "model") or self.model in (None, True, False):
-            return losses
-        losses.setdefault(self.name, 0.0)
-        validate_examples(examples, "Pipe.update")
-        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
-            # Handle cases where there are no tokens in any docs.
-            return
-        set_dropout_rate(self.model, drop)
-        scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
-        loss, d_scores = self.get_loss(examples, scores)
-        bp_scores(d_scores)
-        if sgd not in (None, False):
-            self.finish_update(sgd)
-        losses[self.name] += loss
-        if set_annotations:
-            docs = [eg.predicted for eg in examples]
-            self.set_annotations(docs, scores=scores)
-        return losses
-
-    def rehearse(self, examples, *, sgd=None, losses=None, **config):
-        """Perform a "rehearsal" update from a batch of data. Rehearsal updates
-        teach the current model to make predictions similar to an initial model,
-        to try to address the "catastrophic forgetting" problem. This feature is
-        experimental.
-
-        examples (Iterable[Example]): A batch of Example objects.
-        drop (float): The dropout rate.
-        sgd (thinc.api.Optimizer): The optimizer.
-        losses (Dict[str, float]): Optional record of the loss during training.
-            Updated using the component name as the key.
-        RETURNS (Dict[str, float]): The updated losses dictionary.
-
-        DOCS: https://nightly.spacy.io/api/pipe#rehearse
-        """
-        pass
-
-    def get_loss(self, examples, scores):
-        """Find the loss and gradient of loss for the batch of documents and
-        their predicted scores.
-
-        examples (Iterable[Examples]): The batch of examples.
-        scores: Scores representing the model's predictions.
-        RETURNS (Tuple[float, float]): The loss and the gradient.
-
-        DOCS: https://nightly.spacy.io/api/pipe#get_loss
-        """
-        raise NotImplementedError(Errors.E931.format(method="get_loss", name=self.name))
-
-    def add_label(self, label):
-        """Add an output label, to be predicted by the model. It's possible to
-        extend pretrained models with new labels, but care should be taken to
-        avoid the "catastrophic forgetting" problem.
-
-        label (str): The label to add.
-        RETURNS (int): 0 if label is already present, otherwise 1.
-
-        DOCS: https://nightly.spacy.io/api/pipe#add_label
-        """
-        raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
-
-
-    def _require_labels(self) -> None:
-        """Raise an error if the component's model has no labels defined."""
-        if not self.labels or list(self.labels) == [""]:
-            raise ValueError(Errors.E143.format(name=self.name))
-
-
-    def _allow_extra_label(self) -> None:
-        """Raise an error if the component can not add any more labels."""
-        if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels):
-            if not self.is_resizable():
-                raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO")))
-
-
-    def create_optimizer(self):
-        """Create an optimizer for the pipeline component.
-
-        RETURNS (thinc.api.Optimizer): The optimizer.
-
-        DOCS: https://nightly.spacy.io/api/pipe#create_optimizer
-        """
-        return util.create_default_optimizer()
-
-    def initialize(self, get_examples, *, nlp=None):
-        """Initialize the pipe for training, using data examples if available.
-        This method needs to be implemented by each Pipe component,
-        ensuring the internal model (if available) is initialized properly
-        using the provided sample of Example objects.
+    def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
+        """Initialize the pipe. For non-trainable components, this method
+        is optional. For trainable components, which should inherit
+        from the subclass TrainablePipe, the provided data examples
+        should be used to ensure that the internal model is initialized
+        properly and all input/output dimensions throughout the network are
+        inferred.
 
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
@@ -217,49 +68,7 @@ cdef class Pipe:
         """
         pass
 
-    def _ensure_examples(self, get_examples):
-        if get_examples is None or not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(name=self.name, obj=type(get_examples))
-            raise ValueError(err)
-        if not get_examples():
-            err = Errors.E930.format(name=self.name, obj=get_examples())
-            raise ValueError(err)
-
-    def is_resizable(self):
-        return hasattr(self, "model") and "resize_output" in self.model.attrs
-
-    def is_trainable(self):
-        return hasattr(self, "model") and isinstance(self.model, Model)
-
-    def set_output(self, nO):
-        if self.is_resizable():
-            self.model.attrs["resize_output"](self.model, nO)
-        else:
-            raise NotImplementedError(Errors.E921)
-
-    def use_params(self, params):
-        """Modify the pipe's model, to use the given parameter values. At the
-        end of the context, the original parameters are restored.
-
-        params (dict): The parameter values to use in the model.
-
-        DOCS: https://nightly.spacy.io/api/pipe#use_params
-        """
-        with self.model.use_params(params):
-            yield
-
-    def finish_update(self, sgd):
-        """Update parameters using the current parameter gradients.
-        The Optimizer instance contains the functionality to perform
-        the stochastic gradient descent.
-
-        sgd (thinc.api.Optimizer): The optimizer.
-
-        DOCS: https://nightly.spacy.io/api/pipe#finish_update
-        """
-        self.model.finish_update(sgd)
-
-    def score(self, examples, **kwargs):
+    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Union[float, Dict[str, float]]]:
         """Score a batch of examples.
 
         examples (Iterable[Example]): The examples to score.
@@ -269,81 +78,25 @@ cdef class Pipe:
         """
         return {}
 
-    def to_bytes(self, *, exclude=tuple()):
-        """Serialize the pipe to a bytestring.
+    @property
+    def is_trainable(self) -> bool:
+        return False
 
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized object.
+    @property
+    def labels(self) -> Optional[Tuple[str]]:
+        return tuple()
 
-        DOCS: https://nightly.spacy.io/api/pipe#to_bytes
+    @property
+    def label_data(self):
+        """Optional JSON-serializable data that would be sufficient to recreate
+        the label set if provided to the `pipe.initialize()` method.
         """
-        serialize = {}
-        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-        serialize["model"] = self.model.to_bytes
-        if hasattr(self, "vocab"):
-            serialize["vocab"] = self.vocab.to_bytes
-        return util.to_bytes(serialize, exclude)
-
-    def from_bytes(self, bytes_data, *, exclude=tuple()):
-        """Load the pipe from a bytestring.
-
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Pipe): The loaded object.
-
-        DOCS: https://nightly.spacy.io/api/pipe#from_bytes
-        """
-
-        def load_model(b):
-            try:
-                self.model.from_bytes(b)
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize = {}
-        if hasattr(self, "vocab"):
-            deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
-        deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
-        deserialize["model"] = load_model
-        util.from_bytes(bytes_data, deserialize, exclude)
-        return self
-
-    def to_disk(self, path, *, exclude=tuple()):
-        """Serialize the pipe to disk.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-
-        DOCS: https://nightly.spacy.io/api/pipe#to_disk
-        """
-        serialize = {}
-        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
-        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
-        serialize["model"] = lambda p: self.model.to_disk(p)
-        util.to_disk(path, serialize, exclude)
-
-    def from_disk(self, path, *, exclude=tuple()):
-        """Load the pipe from disk.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Pipe): The loaded object.
-
-        DOCS: https://nightly.spacy.io/api/pipe#from_disk
-        """
-
-        def load_model(p):
-            try:
-                self.model.from_bytes(p.open("rb").read())
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize = {}
-        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
-        deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
-        deserialize["model"] = load_model
-        util.from_disk(path, deserialize, exclude)
-        return self
+        return None
 
+    def _require_labels(self) -> None:
+        """Raise an error if this component has no labels defined."""
+        if not self.labels or list(self.labels) == [""]:
+            raise ValueError(Errors.E143.format(name=self.name))
 
 def deserialize_config(path):
     if path.exists():
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 13fcd15e2..7656b330c 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -58,9 +58,6 @@ class Sentencizer(Pipe):
         else:
             self.punct_chars = set(self.default_punct_chars)
 
-    def initialize(self, get_examples, nlp=None):
-        pass
-
     def __call__(self, doc):
         """Apply the sentencizer to a Doc and set Token.is_sent_start.
 
@@ -204,9 +201,3 @@ class Sentencizer(Pipe):
         cfg = srsly.read_json(path)
         self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
         return self
-
-    def get_loss(self, examples, scores):
-        raise NotImplementedError
-
-    def add_label(self, label):
-        raise NotImplementedError
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 8fb1e664f..8ea4ed1b3 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -6,12 +6,11 @@ from thinc.api import Model, SequenceCategoricalCrossentropy, Config
 
 from ..tokens.doc cimport Doc
 
-from .pipe import deserialize_config
 from .tagger import Tagger
 from ..language import Language
 from ..errors import Errors
 from ..scorer import Scorer
-from ..training import validate_examples
+from ..training import validate_examples, validate_get_examples
 from .. import util
 
 
@@ -62,6 +61,7 @@ class SentenceRecognizer(Tagger):
         self.name = name
         self._rehearsal_model = None
         self.cfg = {}
+        self._added_strings = set()
 
     @property
     def labels(self):
@@ -138,7 +138,7 @@ class SentenceRecognizer(Tagger):
 
         DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
         """
-        self._ensure_examples(get_examples)
+        validate_get_examples(get_examples, "SentenceRecognizer.initialize")
         doc_sample = []
         label_sample = []
         assert self.labels, Errors.E924.format(name=self.name)
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index dd10c5670..535b71270 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -11,13 +11,14 @@ from ..tokens.doc cimport Doc
 from ..morphology cimport Morphology
 from ..vocab cimport Vocab
 
-from .pipe import Pipe, deserialize_config
+from .trainable_pipe import TrainablePipe
+from .pipe import deserialize_config
 from ..language import Language
 from ..attrs import POS, ID
 from ..parts_of_speech import X
 from ..errors import Errors, Warnings
 from ..scorer import Scorer
-from ..training import validate_examples
+from ..training import validate_examples, validate_get_examples
 from .. import util
 
 
@@ -55,7 +56,7 @@ def make_tagger(nlp: Language, name: str, model: Model):
     return Tagger(nlp.vocab, model, name)
 
 
-class Tagger(Pipe):
+class Tagger(TrainablePipe):
     """Pipeline component for part-of-speech tagging.
 
     DOCS: https://nightly.spacy.io/api/tagger
@@ -77,6 +78,7 @@ class Tagger(Pipe):
         self._rehearsal_model = None
         cfg = {"labels": labels or []}
         self.cfg = dict(sorted(cfg.items()))
+        self._added_strings = set()
 
     @property
     def labels(self):
@@ -274,7 +276,7 @@ class Tagger(Pipe):
 
         DOCS: https://nightly.spacy.io/api/tagger#initialize
         """
-        self._ensure_examples(get_examples)
+        validate_get_examples(get_examples, "Tagger.initialize")
         if labels is not None:
             for tag in labels:
                 self.add_label(tag)
@@ -311,7 +313,7 @@ class Tagger(Pipe):
             return 0
         self._allow_extra_label()
         self.cfg["labels"].append(label)
-        self.vocab.strings.add(label)
+        self.add_string(label)
         return 1
 
     def score(self, examples, **kwargs):
@@ -325,79 +327,3 @@ class Tagger(Pipe):
         """
         validate_examples(examples, "Tagger.score")
         return Scorer.score_token_attr(examples, "tag", **kwargs)
-
-    def to_bytes(self, *, exclude=tuple()):
-        """Serialize the pipe to a bytestring.
-
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized object.
-
-        DOCS: https://nightly.spacy.io/api/tagger#to_bytes
-        """
-        serialize = {}
-        serialize["model"] = self.model.to_bytes
-        serialize["vocab"] = self.vocab.to_bytes
-        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-        return util.to_bytes(serialize, exclude)
-
-    def from_bytes(self, bytes_data, *, exclude=tuple()):
-        """Load the pipe from a bytestring.
-
-        bytes_data (bytes): The serialized pipe.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Tagger): The loaded Tagger.
-
-        DOCS: https://nightly.spacy.io/api/tagger#from_bytes
-        """
-        def load_model(b):
-            try:
-                self.model.from_bytes(b)
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize = {
-            "vocab": lambda b: self.vocab.from_bytes(b),
-            "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
-            "model": lambda b: load_model(b),
-        }
-        util.from_bytes(bytes_data, deserialize, exclude)
-        return self
-
-    def to_disk(self, path, *, exclude=tuple()):
-        """Serialize the pipe to disk.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-
-        DOCS: https://nightly.spacy.io/api/tagger#to_disk
-        """
-        serialize = {
-            "vocab": lambda p: self.vocab.to_disk(p),
-            "model": lambda p: self.model.to_disk(p),
-            "cfg": lambda p: srsly.write_json(p, self.cfg),
-        }
-        util.to_disk(path, serialize, exclude)
-
-    def from_disk(self, path, *, exclude=tuple()):
-        """Load the pipe from disk. Modifies the object in place and returns it.
-
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Tagger): The modified Tagger object.
-
-        DOCS: https://nightly.spacy.io/api/tagger#from_disk
-        """
-        def load_model(p):
-            with p.open("rb") as file_:
-                try:
-                    self.model.from_bytes(file_.read())
-                except AttributeError:
-                    raise ValueError(Errors.E149) from None
-
-        deserialize = {
-            "vocab": lambda p: self.vocab.from_disk(p),
-            "cfg": lambda p: self.cfg.update(deserialize_config(p)),
-            "model": load_model,
-        }
-        util.from_disk(path, deserialize, exclude)
-        return self
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index cc7a76288..e57954184 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -4,9 +4,9 @@ from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Conf
 from thinc.types import Floats2d
 import numpy
 
-from .pipe import Pipe
+from .trainable_pipe import TrainablePipe
 from ..language import Language
-from ..training import Example, validate_examples
+from ..training import Example, validate_examples, validate_get_examples
 from ..errors import Errors
 from ..scorer import Scorer
 from .. import util
@@ -85,7 +85,7 @@ def make_textcat(
     return TextCategorizer(nlp.vocab, model, name, threshold=threshold)
 
 
-class TextCategorizer(Pipe):
+class TextCategorizer(TrainablePipe):
     """Pipeline component for text classification.
 
     DOCS: https://nightly.spacy.io/api/textcategorizer
@@ -110,6 +110,7 @@ class TextCategorizer(Pipe):
         self._rehearsal_model = None
         cfg = {"labels": [], "threshold": threshold, "positive_label": None}
         self.cfg = dict(cfg)
+        self._added_strings = set()
 
     @property
     def labels(self) -> Tuple[str]:
@@ -119,13 +120,6 @@ class TextCategorizer(Pipe):
         """
         return tuple(self.cfg["labels"])
 
-    @labels.setter
-    def labels(self, value: List[str]) -> None:
-        # TODO: This really shouldn't be here. I had a look and I added it when
-        # I added the labels property, but it's pretty nasty to have this, and
-        # will lead to problems.
-        self.cfg["labels"] = tuple(value)
-
     @property
     def label_data(self) -> List[str]:
         """RETURNS (List[str]): Information about the component's labels."""
@@ -306,7 +300,8 @@ class TextCategorizer(Pipe):
         if label in self.labels:
             return 0
         self._allow_extra_label()
-        self.labels = tuple(list(self.labels) + [label])
+        self.cfg["labels"].append(label)
+        self.add_string(label)
         return 1
 
     def initialize(
@@ -329,7 +324,7 @@ class TextCategorizer(Pipe):
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
         """
-        self._ensure_examples(get_examples)
+        validate_get_examples(get_examples, "TextCategorizer.initialize")
         if labels is None:
             for example in get_examples():
                 for cat in example.y.cats:
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 0f309326e..b4625291b 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -2,8 +2,8 @@ from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List
 from thinc.api import Model, set_dropout_rate, Optimizer, Config
 from itertools import islice
 
-from .pipe import Pipe
-from ..training import Example, validate_examples
+from .trainable_pipe import TrainablePipe
+from ..training import Example, validate_examples, validate_get_examples
 from ..tokens import Doc
 from ..vocab import Vocab
 from ..language import Language
@@ -32,7 +32,7 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
     return Tok2Vec(nlp.vocab, model, name)
 
 
-class Tok2Vec(Pipe):
+class Tok2Vec(TrainablePipe):
     """Apply a "token-to-vector" model and set its outputs in the doc.tensor
     attribute. This is mostly useful to share a single subnetwork between multiple
     components, e.g. to have one embedding and CNN network shared between a
@@ -64,6 +64,7 @@ class Tok2Vec(Pipe):
         self.name = name
         self.listeners = []
         self.cfg = {}
+        self._added_strings = set()
 
     def add_listener(self, listener: "Tok2VecListener") -> None:
         """Add a listener for a downstream component. Usually internals."""
@@ -218,7 +219,7 @@ class Tok2Vec(Pipe):
 
         DOCS: https://nightly.spacy.io/api/tok2vec#initialize
         """
-        self._ensure_examples(get_examples)
+        validate_get_examples(get_examples, "Tok2Vec.initialize")
         doc_sample = []
         for example in islice(get_examples(), 10):
             doc_sample.append(example.x)
diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd
new file mode 100644
index 000000000..8df5cb775
--- /dev/null
+++ b/spacy/pipeline/trainable_pipe.pxd
@@ -0,0 +1,8 @@
+from .pipe cimport Pipe
+from ..vocab cimport Vocab
+
+cdef class TrainablePipe(Pipe):
+    cdef public Vocab vocab
+    cdef public object model
+    cdef public object cfg
+    cdef public set _added_strings
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
new file mode 100644
index 000000000..07a308953
--- /dev/null
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -0,0 +1,322 @@
+# cython: infer_types=True, profile=True
+from typing import Iterable, Iterator, Optional, Dict, Tuple, Callable
+import srsly
+from thinc.api import set_dropout_rate, Model, Optimizer
+
+from ..tokens.doc cimport Doc
+
+from ..training import validate_examples
+from ..errors import Errors
+from .pipe import Pipe, deserialize_config
+from .. import util
+from ..vocab import Vocab
+from ..language import Language
+from ..training import Example
+
+cdef class TrainablePipe(Pipe):
+    """This class is a base class and not instantiated directly. Trainable
+    pipeline components like the EntityRecognizer or TextCategorizer inherit
+    from it and it defines the interface that components should follow to
+    function as trainable components in a spaCy pipeline.
+
+    DOCS: https://nightly.spacy.io/api/pipe
+    """
+    def __init__(self, vocab: Vocab, model: Model, name: str, **cfg):
+        """Initialize a pipeline component.
+
+        vocab (Vocab): The shared vocabulary.
+        model (thinc.api.Model): The Thinc Model powering the pipeline component.
+        name (str): The component instance name.
+        **cfg: Additional settings and config parameters.
+
+        DOCS: https://nightly.spacy.io/api/pipe#init
+        """
+        self.vocab = vocab
+        self.model = model
+        self.name = name
+        self.cfg = dict(cfg)
+        self._added_strings = set()
+
+    def __call__(self, Doc doc) -> Doc:
+        """Apply the pipe to one document. The document is modified in place,
+        and returned. This usually happens under the hood when the nlp object
+        is called on a text and all components are applied to the Doc.
+
+        doc (Doc): The Doc to process.
+        RETURNS (Doc): The processed Doc.
+
+        DOCS: https://nightly.spacy.io/api/pipe#call
+        """
+        scores = self.predict([doc])
+        self.set_annotations([doc], scores)
+        return doc
+
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
+        """Apply the pipe to a stream of documents. This usually happens under
+        the hood when the nlp object is called on a text and all components are
+        applied to the Doc.
+
+        stream (Iterable[Doc]): A stream of documents.
+        batch_size (int): The number of documents to buffer.
+        YIELDS (Doc): Processed documents in order.
+
+        DOCS: https://nightly.spacy.io/api/pipe#pipe
+        """
+        for docs in util.minibatch(stream, size=batch_size):
+            scores = self.predict(docs)
+            self.set_annotations(docs, scores)
+            yield from docs
+
+    def predict(self, docs: Iterable[Doc]):
+        """Apply the pipeline's model to a batch of docs, without modifying them.
+        Returns a single tensor for a batch of documents.
+
+        docs (Iterable[Doc]): The documents to predict.
+        RETURNS: Vector representations of the predictions.
+
+        DOCS: https://nightly.spacy.io/api/pipe#predict
+        """
+        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="predict", name=self.name))
+
+    def set_annotations(self, docs: Iterable[Doc], scores):
+        """Modify a batch of documents, using pre-computed scores.
+
+        docs (Iterable[Doc]): The documents to modify.
+        scores: The scores to assign.
+
+        DOCS: https://nightly.spacy.io/api/pipe#set_annotations
+        """
+        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="set_annotations", name=self.name))
+
+    def update(self,
+               examples: Iterable["Example"],
+               *, drop: float=0.0,
+               set_annotations: bool=False,
+               sgd: Optimizer=None,
+               losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
+        """Learn from a batch of documents and gold-standard information,
+        updating the pipe's model. Delegates to predict and get_loss.
+
+        examples (Iterable[Example]): A batch of Example objects.
+        drop (float): The dropout rate.
+        set_annotations (bool): Whether or not to update the Example objects
+            with the predictions.
+        sgd (thinc.api.Optimizer): The optimizer.
+        losses (Dict[str, float]): Optional record of the loss during training.
+            Updated using the component name as the key.
+        RETURNS (Dict[str, float]): The updated losses dictionary.
+
+        DOCS: https://nightly.spacy.io/api/pipe#update
+        """
+        if losses is None:
+            losses = {}
+        if not hasattr(self, "model") or self.model in (None, True, False):
+            return losses
+        losses.setdefault(self.name, 0.0)
+        validate_examples(examples, "TrainablePipe.update")
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
+            # No tokens in any doc: nothing to learn, but still return the dict.
+            return losses
+        set_dropout_rate(self.model, drop)
+        scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
+        loss, d_scores = self.get_loss(examples, scores)
+        bp_scores(d_scores)
+        if sgd not in (None, False):
+            self.finish_update(sgd)
+        losses[self.name] += loss
+        if set_annotations:
+            docs = [eg.predicted for eg in examples]
+            self.set_annotations(docs, scores=scores)
+        return losses
+
+    def rehearse(self,
+                 examples: Iterable[Example],
+                 *,
+                 sgd: Optimizer=None,
+                 losses: Dict[str, float]=None,
+                 **config) -> Dict[str, float]:
+        """Perform a "rehearsal" update from a batch of data. Rehearsal updates
+        teach the current model to make predictions similar to an initial model,
+        to try to address the "catastrophic forgetting" problem. This feature is
+        experimental.
+
+        examples (Iterable[Example]): A batch of Example objects.
+        sgd (thinc.api.Optimizer): The optimizer.
+        losses (Dict[str, float]): Optional record of the loss during training.
+            Updated using the component name as the key.
+        RETURNS (Dict[str, float]): The updated losses dictionary.
+
+        DOCS: https://nightly.spacy.io/api/pipe#rehearse
+        """
+        pass
+
+    def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
+        """Find the loss and gradient of loss for the batch of documents and
+        their predicted scores.
+
+        examples (Iterable[Example]): The batch of examples.
+        scores: Scores representing the model's predictions.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+
+        DOCS: https://nightly.spacy.io/api/pipe#get_loss
+        """
+        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
+
+    def create_optimizer(self) -> Optimizer:
+        """Create an optimizer for the pipeline component.
+
+        RETURNS (thinc.api.Optimizer): The optimizer.
+
+        DOCS: https://nightly.spacy.io/api/pipe#create_optimizer
+        """
+        return util.create_default_optimizer()
+
+    def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
+        """Initialize the pipe for training, using data examples if available.
+        This method needs to be implemented by each TrainablePipe component,
+        ensuring the internal model (if available) is initialized properly
+        using the provided sample of Example objects.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+
+        DOCS: https://nightly.spacy.io/api/pipe#initialize
+        """
+        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="initialize", name=self.name))
+
+    def add_label(self, label: str) -> int:
+        """Add an output label.
+        For TrainablePipe components, it is possible to
+        extend pretrained models with new labels, but care should be taken to
+        avoid the "catastrophic forgetting" problem.
+
+        label (str): The label to add.
+        RETURNS (int): 0 if label is already present, otherwise 1.
+
+        DOCS: https://nightly.spacy.io/api/pipe#add_label
+        """
+        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="add_label", name=self.name))
+
+    def add_string(self, string: str):
+        self._added_strings.add(string)
+        return self.vocab.strings.add(string)
+
+    @property
+    def is_trainable(self) -> bool:
+        return True
+
+    @property
+    def is_resizable(self) -> bool:
+        return bool(getattr(self, "model", None) and "resize_output" in self.model.attrs)
+
+    def _allow_extra_label(self) -> None:
+        """Raise an error if the component can not add any more labels."""
+        if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels):
+            if not self.is_resizable:
+                raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO")))
+
+    def set_output(self, nO: int) -> None:
+        if self.is_resizable:
+            self.model.attrs["resize_output"](self.model, nO)
+        else:
+            raise NotImplementedError(Errors.E921)
+
+    def use_params(self, params: dict):
+        """Modify the pipe's model, to use the given parameter values. At the
+        end of the context, the original parameters are restored.
+
+        params (dict): The parameter values to use in the model.
+
+        DOCS: https://nightly.spacy.io/api/pipe#use_params
+        """
+        with self.model.use_params(params):
+            yield
+
+    def finish_update(self, sgd: Optimizer) -> None:
+        """Update parameters using the current parameter gradients.
+        The Optimizer instance contains the functionality to perform
+        the stochastic gradient descent.
+
+        sgd (thinc.api.Optimizer): The optimizer.
+
+        DOCS: https://nightly.spacy.io/api/pipe#finish_update
+        """
+        self.model.finish_update(sgd)
+
+    def to_bytes(self, *, exclude=tuple()):
+        """Serialize the pipe to a bytestring.
+
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (bytes): The serialized object.
+
+        DOCS: https://nightly.spacy.io/api/pipe#to_bytes
+        """
+        serialize = {}
+        if hasattr(self, "cfg"):
+            serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
+        serialize["model"] = self.model.to_bytes
+        serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings))
+        return util.to_bytes(serialize, exclude)
+
+    def from_bytes(self, bytes_data, *, exclude=tuple()):
+        """Load the pipe from a bytestring.
+
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (TrainablePipe): The loaded object.
+
+        DOCS: https://nightly.spacy.io/api/pipe#from_bytes
+        """
+
+        def load_model(b):
+            try:
+                self.model.from_bytes(b)
+            except AttributeError:
+                raise ValueError(Errors.E149) from None
+
+        deserialize = {}
+        deserialize["strings.json"] = lambda b: [self.add_string(s) for s in srsly.json_loads(b)]
+        if hasattr(self, "cfg"):
+            deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
+        deserialize["model"] = load_model
+        util.from_bytes(bytes_data, deserialize, exclude)
+        return self
+
+    def to_disk(self, path, *, exclude=tuple()):
+        """Serialize the pipe to disk.
+
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+
+        DOCS: https://nightly.spacy.io/api/pipe#to_disk
+        """
+        serialize = {}
+        if hasattr(self, "cfg"):
+            serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
+        serialize["strings.json"] = lambda p: srsly.write_json(p, sorted(self._added_strings))
+        serialize["model"] = lambda p: self.model.to_disk(p)
+        util.to_disk(path, serialize, exclude)
+
+    def from_disk(self, path, *, exclude=tuple()):
+        """Load the pipe from disk.
+
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (TrainablePipe): The loaded object.
+
+        DOCS: https://nightly.spacy.io/api/pipe#from_disk
+        """
+
+        def load_model(p):
+            try:
+                self.model.from_bytes(p.read_bytes())
+            except AttributeError:
+                raise ValueError(Errors.E149) from None
+
+        deserialize = {}
+        deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)]
+        if hasattr(self, "cfg"):
+            deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
+        deserialize["model"] = load_model
+        util.from_disk(path, deserialize, exclude)
+        return self
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index 67bc01f97..bd5bad334 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -1,13 +1,13 @@
 from cymem.cymem cimport Pool
 
 from ..vocab cimport Vocab
-from .pipe cimport Pipe
+from .trainable_pipe cimport TrainablePipe
 from ._parser_internals.transition_system cimport Transition, TransitionSystem
 from ._parser_internals._state cimport StateC
 from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
 
 
-cdef class Parser(Pipe):
+cdef class Parser(TrainablePipe):
     cdef public object _rehearsal_model
     cdef readonly TransitionSystem moves
     cdef public object _multitasks
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 3b4406757..3743e1018 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -21,13 +21,14 @@ from ..ml.parser_model cimport predict_states, arg_max_if_valid
 from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
 from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
+from .trainable_pipe import TrainablePipe
 
-from ..training import validate_examples
+from ..training import validate_examples, validate_get_examples
 from ..errors import Errors, Warnings
 from .. import util
 
 
-cdef class Parser(Pipe):
+cdef class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
@@ -75,6 +76,7 @@ cdef class Parser(Pipe):
             self.add_multitask_objective(multitask)
 
         self._rehearsal_model = None
+        self._added_strings = set()
 
     def __getnewargs_ex__(self):
         """This allows pickling the Parser and its keyword-only init arguments"""
@@ -118,6 +120,7 @@ cdef class Parser(Pipe):
                 resized = True
         if resized:
             self._resize()
+            self.add_string(label)
             return 1
         return 0
 
@@ -411,7 +414,7 @@ cdef class Parser(Pipe):
         self.model.attrs["resize_output"](self.model, nO)
 
     def initialize(self, get_examples, nlp=None, labels=None):
-        self._ensure_examples(get_examples)
+        validate_get_examples(get_examples, "Parser.initialize")
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
         if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
             langs = ", ".join(util.LEXEME_NORM_LANGS)
@@ -439,7 +442,7 @@ cdef class Parser(Pipe):
                     break
                 # non-trainable components may have a pipe() implementation that refers to dummy
                 # predict and set_annotations methods
-                if hasattr(component, "pipe") and hasattr(component, "is_trainable") and component.is_trainable():
+                if hasattr(component, "pipe"):
                     doc_sample = list(component.pipe(doc_sample, batch_size=8))
                 else:
                     doc_sample = [component(doc) for doc in doc_sample]
@@ -454,7 +457,7 @@ cdef class Parser(Pipe):
     def to_disk(self, path, exclude=tuple()):
         serializers = {
             'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
-            'vocab': lambda p: self.vocab.to_disk(p),
+            'strings.json': lambda p: srsly.write_json(p, self._added_strings),
             'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]),
             'cfg': lambda p: srsly.write_json(p, self.cfg)
         }
@@ -462,7 +465,7 @@ cdef class Parser(Pipe):
 
     def from_disk(self, path, exclude=tuple()):
         deserializers = {
-            'vocab': lambda p: self.vocab.from_disk(p),
+            'strings.json': lambda p: [self.add_string(s) for s in srsly.read_json(p)],
             'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
             'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
             'model': lambda p: None,
@@ -482,7 +485,7 @@ cdef class Parser(Pipe):
     def to_bytes(self, exclude=tuple()):
         serializers = {
             "model": lambda: (self.model.to_bytes()),
-            "vocab": lambda: self.vocab.to_bytes(),
+            "strings.json": lambda: srsly.json_dumps(sorted(self._added_strings)),
             "moves": lambda: self.moves.to_bytes(exclude=["strings"]),
             "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
         }
@@ -490,7 +493,7 @@ cdef class Parser(Pipe):
 
     def from_bytes(self, bytes_data, exclude=tuple()):
         deserializers = {
-            "vocab": lambda b: self.vocab.from_bytes(b),
+            "strings.json": lambda b: [self.add_string(s) for s in  srsly.json_loads(b)],
             "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
             "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
             "model": lambda b: None,
diff --git a/spacy/schemas.py b/spacy/schemas.py
index dc7a86b06..07d17d193 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -368,7 +368,7 @@ class ConfigSchemaInit(BaseModel):
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
-    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for Pipe.initialize methods of pipeline components, keyed by component")
+    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
     # fmt: on
 
     class Config:
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index e77be74ad..71496327b 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -133,7 +133,7 @@ def test_kb_custom_length(nlp):
 def test_kb_initialize_empty(nlp):
     """Test that the EL can't initialize without examples"""
     entity_linker = nlp.add_pipe("entity_linker")
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         entity_linker.initialize(lambda: [])
 
 
@@ -153,6 +153,23 @@ def test_kb_serialize(nlp):
             mykb.from_disk(d / "unknown" / "kb")
 
 
+def test_kb_serialize_vocab(nlp):
+    """Test serialization of the KB and custom strings"""
+    entity = "MyFunnyID"
+    assert entity not in nlp.vocab.strings
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+    assert not mykb.contains_entity(entity)
+    mykb.add_entity(entity, freq=342, entity_vector=[3])
+    assert mykb.contains_entity(entity)
+    assert entity in mykb.vocab.strings
+    with make_tempdir() as d:
+        # normal read-write behaviour
+        mykb.to_disk(d / "kb")
+        mykb_new = KnowledgeBase(Vocab(), entity_vector_length=1)
+        mykb_new.from_disk(d / "kb")
+        assert entity in mykb_new.vocab.strings
+
+
 def test_candidate_generation(nlp):
     """Test correct candidate generation"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -413,6 +430,7 @@ def test_overfitting_IO():
     # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
     nlp = English()
     vector_length = 3
+    assert "Q2146908" not in nlp.vocab.strings
 
     # Convert the texts to docs to make sure we have doc.ents set for the training examples
     train_examples = []
@@ -440,6 +458,9 @@ def test_overfitting_IO():
         last=True,
     )
     entity_linker.set_kb(create_kb)
+    assert "Q2146908" in entity_linker.vocab.strings
+    assert "Q2146908" in entity_linker.kb.vocab.strings
+    assert "Q2146908" in entity_linker.kb._added_strings
 
     # train the NEL pipe
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
@@ -474,6 +495,10 @@ def test_overfitting_IO():
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         assert nlp2.pipe_names == nlp.pipe_names
+        assert "Q2146908" in nlp2.vocab.strings
+        entity_linker2 = nlp2.get_pipe("entity_linker")
+        assert "Q2146908" in entity_linker2.vocab.strings
+        assert "Q2146908" in entity_linker2.kb.vocab.strings
         predictions = []
         for text, annotation in TRAIN_DATA:
             doc2 = nlp2(text)
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index af81129c0..ce9c0fa54 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -66,9 +66,9 @@ def test_initialize_examples():
     # you shouldn't really call this more than once, but for testing it should be fine
     nlp.initialize()
     nlp.initialize(get_examples=lambda: train_examples)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=lambda: None)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=train_examples)
 
 
@@ -101,3 +101,4 @@ def test_overfitting_IO():
         doc2 = nlp2(test_text)
         assert [str(t.morph) for t in doc2] == gold_morphs
         assert [t.pos_ for t in doc2] == gold_pos_tags
+        assert nlp.get_pipe("morphologizer")._added_strings == nlp2.get_pipe("morphologizer")._added_strings
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 4b96992e1..c693a7487 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -1,6 +1,6 @@
 import pytest
 from spacy.language import Language
-from spacy.pipeline import Pipe
+from spacy.pipeline import TrainablePipe
 from spacy.util import SimpleFrozenList, get_arg_names
 
 
@@ -376,7 +376,7 @@ def test_pipe_label_data_no_labels(pipe):
 def test_warning_pipe_begin_training():
     with pytest.warns(UserWarning, match="begin_training"):
 
-        class IncompatPipe(Pipe):
+        class IncompatPipe(TrainablePipe):
             def __init__(self):
                 ...
 
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index c64dfcbd6..472216512 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -40,9 +40,9 @@ def test_initialize_examples():
     # you shouldn't really call this more than once, but for testing it should be fine
     nlp.initialize()
     nlp.initialize(get_examples=lambda: train_examples)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=lambda: None)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=train_examples)
 
 
@@ -80,3 +80,4 @@ def test_overfitting_IO():
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
         assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts
+        assert nlp.get_pipe("senter")._added_strings == nlp2.get_pipe("senter")._added_strings
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index b32925d84..590c22233 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -74,13 +74,13 @@ def test_initialize_examples():
     # you shouldn't really call this more than once, but for testing it should be fine
     nlp.initialize()
     nlp.initialize(get_examples=lambda: train_examples)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=lambda: None)
     with pytest.raises(TypeError):
         nlp.initialize(get_examples=lambda: train_examples[0])
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=lambda: [])
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=train_examples)
 
 
@@ -98,6 +98,7 @@ def test_overfitting_IO():
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses["tagger"] < 0.00001
+    assert tagger._added_strings == {"J", "N", "V"}
 
     # test the trained model
     test_text = "I like blue eggs"
@@ -116,6 +117,7 @@ def test_overfitting_IO():
         assert doc2[1].tag_ is "V"
         assert doc2[2].tag_ is "J"
         assert doc2[3].tag_ is "N"
+        assert nlp2.get_pipe("tagger")._added_strings == {"J", "N", "V"}
 
 
 def test_tagger_requires_labels():
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index e950c81c6..7eb7ff658 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -127,9 +127,9 @@ def test_initialize_examples():
     nlp.initialize()
     get_examples = make_get_examples(nlp)
     nlp.initialize(get_examples=get_examples)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=lambda: None)
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         nlp.initialize(get_examples=get_examples())
 
 
@@ -146,6 +146,7 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert textcat.model.get_dim("nO") == 2
+    assert textcat._added_strings == {"NEGATIVE", "POSITIVE"}
 
     for i in range(50):
         losses = {}
@@ -167,6 +168,7 @@ def test_overfitting_IO():
         cats2 = doc2.cats
         assert cats2["POSITIVE"] > 0.9
         assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)
+        assert nlp2.get_pipe("textcat")._added_strings == {"NEGATIVE", "POSITIVE"}
 
     # Test scoring
     scores = nlp.evaluate(train_examples)
diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py
index 0e2579ac4..73aea5b4b 100644
--- a/spacy/tests/regression/test_issue4001-4500.py
+++ b/spacy/tests/regression/test_issue4001-4500.py
@@ -1,5 +1,5 @@
 import pytest
-from spacy.pipeline import Pipe
+from spacy.pipeline import TrainablePipe
 from spacy.matcher import PhraseMatcher, Matcher
 from spacy.tokens import Doc, Span, DocBin
 from spacy.training import Example, Corpus
@@ -271,7 +271,7 @@ def test_issue4272():
 
 
 def test_multiple_predictions():
-    class DummyPipe(Pipe):
+    class DummyPipe(TrainablePipe):
         def __init__(self):
             self.model = "dummy_model"
 
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index 9fda413a3..02d0c70dd 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -1,4 +1,3 @@
-from typing import Callable
 import warnings
 from unittest import TestCase
 import pytest
@@ -7,8 +6,7 @@ from numpy import zeros
 from spacy.kb import KnowledgeBase, Writer
 from spacy.vectors import Vectors
 from spacy.language import Language
-from spacy.pipeline import Pipe
-from spacy.util import registry
+from spacy.pipeline import TrainablePipe
 
 from ..util import make_tempdir
 
@@ -45,14 +43,13 @@ def custom_pipe():
         def from_disk(self, path, exclude=tuple(), **kwargs):
             return self
 
-    class MyPipe(Pipe):
+    class MyPipe(TrainablePipe):
         def __init__(self, vocab, model=True, **cfg):
             if cfg:
                 self.cfg = cfg
             else:
                 self.cfg = None
             self.model = SerializableDummy()
-            self.vocab = SerializableDummy()
 
     return MyPipe(None)
 
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index f90531dbb..dfd7f6bd4 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -1,5 +1,6 @@
 import pytest
-from spacy import registry
+import srsly
+from spacy import registry, Vocab
 from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
 from spacy.pipeline import TextCategorizer, SentenceRecognizer
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
@@ -69,6 +70,29 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
     assert bytes_2 == bytes_3
 
 
+@pytest.mark.parametrize("Parser", test_parsers)
+def test_serialize_parser_strings(Parser):
+    """Labels added to a parser must survive a bytes roundtrip that excludes the vocab."""
+    label = "FunnyLabel"
+    source_vocab = Vocab()
+    assert label not in source_vocab.strings
+    config = {
+        "learn_tokens": False,
+        "min_action_freq": 0,
+        "update_with_oracle_cut_size": 100,
+    }
+    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
+    parser1 = Parser(source_vocab, model, **config)
+    parser1.add_label(label)
+    assert label in parser1.vocab.strings
+    target_vocab = Vocab()
+    assert label not in target_vocab.strings
+    parser2 = Parser(target_vocab, model, **config)
+    parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"]))
+    assert parser1._added_strings == parser2._added_strings == {label}
+    assert label in parser2.vocab.strings
+
+
 @pytest.mark.parametrize("Parser", test_parsers)
 def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
     config = {
@@ -132,6 +156,29 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
         assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
 
 
+def test_serialize_tagger_strings(en_vocab, de_vocab, taggers):
+    label = "SomeWeirdLabel"
+    assert label not in en_vocab.strings
+    assert label not in de_vocab.strings
+    tagger = taggers[0]
+    assert label not in tagger.vocab.strings
+    with make_tempdir() as d:
+        # check that custom labels are serialized as part of the component's strings.json
+        tagger.add_label(label)
+        assert label in tagger.vocab.strings
+        assert tagger._added_strings == {label}
+        file_path = d / "tagger1"
+        tagger.to_disk(file_path)
+        strings = srsly.read_json(file_path / "strings.json")
+        assert strings == ["SomeWeirdLabel"]
+        # ensure that the custom strings are loaded back in when using the tagger in another pipeline
+        cfg = {"model": DEFAULT_TAGGER_MODEL}
+        model = registry.resolve(cfg, validate=True)["model"]
+        tagger2 = Tagger(de_vocab, model).from_disk(file_path)
+        assert label in tagger2.vocab.strings
+        assert tagger2._added_strings == {label}
+
+
 def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     cfg = {"model": DEFAULT_TEXTCAT_MODEL}
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index f71a5f521..86341dd9a 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,5 +1,5 @@
 from .corpus import Corpus  # noqa: F401
-from .example import Example, validate_examples  # noqa: F401
+from .example import Example, validate_examples, validate_get_examples  # noqa: F401
 from .align import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 1f3a36b33..a8da49c61 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -44,6 +44,24 @@ def validate_examples(examples, method):
         raise TypeError(err)
 
 
+def validate_get_examples(get_examples, method):
+    """Check that a generator of a batch of examples received during processing is valid:
+    the callable produces a non-empty list of Example objects. A TypeError is
+    raised if get_examples is not callable or returns a falsy value (e.g. None
+    or an empty list); the batch itself is then passed to validate_examples.
+    This function lives here to prevent circular imports.
+
+    get_examples (Callable[[], Iterable[Example]]): A function that produces a batch of examples.
+    method (str): The method name to show in error messages.
+    """
+    if get_examples is None or not hasattr(get_examples, "__call__"):
+        raise TypeError(Errors.E930.format(method=method, obj=type(get_examples)))
+    batch = get_examples()
+    if not batch:
+        raise TypeError(Errors.E930.format(method=method, obj=batch))
+    validate_examples(batch, method)
+
+
 cdef class Example:
     def __init__(self, Doc predicted, Doc reference, *, alignment=None):
         if predicted is None:
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index 3a133a0df..b431ecf06 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -21,7 +21,7 @@ def console_logger(progress_bar: bool = False):
         logged_pipes = [
             name
             for name, proc in nlp.pipeline
-            if hasattr(proc, "is_trainable") and proc.is_trainable()
+            if hasattr(proc, "is_trainable") and proc.is_trainable
         ]
         eval_frequency = nlp.config["training"]["eval_frequency"]
         score_weights = nlp.config["training"]["score_weights"]
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 48cf582e6..242113cc6 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -188,7 +188,7 @@ def train_while_improving(
             if (
                 name not in exclude
                 and hasattr(proc, "is_trainable")
-                and proc.is_trainable()
+                and proc.is_trainable
                 and proc.model not in (True, False, None)
             ):
                 proc.finish_update(optimizer)
diff --git a/spacy/util.py b/spacy/util.py
index aa321b22f..bf4ea0c92 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1356,3 +1356,16 @@ def check_bool_env_var(env_var: str) -> bool:
     if value == "0":
         return False
     return bool(value)
+
+
+def _pipe(docs, proc, kwargs):
+    """Yield the docs processed by proc, using proc.pipe if available, else proc(doc)."""
+    if hasattr(proc, "pipe"):
+        yield from proc.pipe(docs, **kwargs)
+        # Stop here: falling through would run proc on a re-iterable docs a second time.
+        return
+    # We added some args for pipe that __call__ doesn't expect.
+    kwargs = dict(kwargs)
+    kwargs.pop("batch_size", None)
+    for doc in docs:
+        yield proc(doc, **kwargs)
diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index b98768dcf..e7adcdd75 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -1,5 +1,5 @@
 ---
-title: Pipe
+title: TrainablePipe
 tag: class
 teaser: Base class for trainable pipeline components
 ---
@@ -10,30 +10,32 @@ components like the [`EntityRecognizer`](/api/entityrecognizer) or
 interface that components should follow to function as trainable components in a
 spaCy pipeline. See the docs on
 [writing trainable components](/usage/processing-pipelines#trainable-components)
-for how to use the `Pipe` base class to implement custom components.
+for how to use the `TrainablePipe` base class to implement custom components.
 
-> #### Why is Pipe implemented in Cython?
+<!-- TODO: Pipe vs TrainablePipe, check methods below (all renamed to TrainablePipe for now) -->
+
+> #### Why is TrainablePipe implemented in Cython?
 >
-> The `Pipe` class is implemented in a `.pyx` module, the extension used by
-> [Cython](/api/cython). This is needed so that **other** Cython classes, like
-> the [`EntityRecognizer`](/api/entityrecognizer) can inherit from it. But it
-> doesn't mean you have to implement trainable components in Cython – pure
-> Python components like the [`TextCategorizer`](/api/textcategorizer) can also
-> inherit from `Pipe`.
+> The `TrainablePipe` class is implemented in a `.pyx` module, the extension
+> used by [Cython](/api/cython). This is needed so that **other** Cython
+> classes, like the [`EntityRecognizer`](/api/entityrecognizer), can inherit from
+> it. But it doesn't mean you have to implement trainable components in Cython –
+> pure Python components like the [`TextCategorizer`](/api/textcategorizer) can
+> also inherit from `TrainablePipe`.
 
 ```python
-%%GITHUB_SPACY/spacy/pipeline/pipe.pyx
+%%GITHUB_SPACY/spacy/pipeline/trainable_pipe.pyx
 ```
 
-## Pipe.\_\_init\_\_ {#init tag="method"}
+## TrainablePipe.\_\_init\_\_ {#init tag="method"}
 
 > #### Example
 >
 > ```python
-> from spacy.pipeline import Pipe
+> from spacy.pipeline import TrainablePipe
 > from spacy.language import Language
 >
-> class CustomPipe(Pipe):
+> class CustomPipe(TrainablePipe):
 >     ...
 >
 > @Language.factory("your_custom_pipe", default_config={"model": MODEL})
@@ -45,14 +47,14 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#create_pipe).
 
-| Name    | Description                                                                                                                     |
-| ------- | ------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | The shared vocabulary. ~~Vocab~~                                                                                                |
-| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], Any]~~                 |
-| `name`  | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                             |
-| `**cfg` | Additional config parameters and settings. Will be available as the dictionary `Pipe.cfg` and is serialized with the component. |
+| Name    | Description                                                                                                                |
+| ------- | -------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~                                                                                           |
+| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], Any]~~            |
+| `name`  | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                        |
+| `**cfg` | Additional config parameters and settings. Will be available as the dictionary `cfg` and is serialized with the component. |
 
-## Pipe.\_\_call\_\_ {#call tag="method"}
+## TrainablePipe.\_\_call\_\_ {#call tag="method"}
 
 Apply the pipe to one document. The document is modified in place, and returned.
 This usually happens under the hood when the `nlp` object is called on a text
@@ -75,7 +77,7 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |
 
-## Pipe.pipe {#pipe tag="method"}
+## TrainablePipe.pipe {#pipe tag="method"}
 
 Apply the pipe to a stream of documents. This usually happens under the hood
 when the `nlp` object is called on a text and all pipeline components are
@@ -98,7 +100,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## Pipe.initialize {#initialize tag="method" new="3"}
+## TrainablePipe.initialize {#initialize tag="method" new="3"}
 
 Initialize the component for training. `get_examples` should be a function that
 returns an iterable of [`Example`](/api/example) objects. The data examples are
@@ -128,7 +130,7 @@ This method was previously called `begin_training`.
 | _keyword-only_ |                                                                                                                                       |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
 
-## Pipe.predict {#predict tag="method"}
+## TrainablePipe.predict {#predict tag="method"}
 
 Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
 modifying them.
@@ -151,7 +153,7 @@ This method needs to be overwritten with your own custom `predict` method.
 | `docs`      | The documents to predict. ~~Iterable[Doc]~~ |
 | **RETURNS** | The model's prediction for each document.   |
 
-## Pipe.set_annotations {#set_annotations tag="method"}
+## TrainablePipe.set_annotations {#set_annotations tag="method"}
 
 Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
 
@@ -175,7 +177,7 @@ method.
 | `docs`   | The documents to modify. ~~Iterable[Doc]~~       |
 | `scores` | The scores to set, produced by `Tagger.predict`. |
 
-## Pipe.update {#update tag="method"}
+## TrainablePipe.update {#update tag="method"}
 
 Learn from a batch of [`Example`](/api/example) objects containing the
 predictions and gold-standard annotations, and update the component's model.
@@ -198,7 +200,7 @@ predictions and gold-standard annotations, and update the component's model.
 | `losses`          | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~           |
 | **RETURNS**       | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                              |
 
-## Pipe.rehearse {#rehearse tag="method,experimental" new="3"}
+## TrainablePipe.rehearse {#rehearse tag="method,experimental" new="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
 current model to make predictions similar to an initial model, to try to address
@@ -216,12 +218,11 @@ the "catastrophic forgetting" problem. This feature is experimental.
 | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                        |
 | _keyword-only_ |                                                                                                                          |
-| `drop`         | The dropout rate. ~~float~~                                                                                              |
 | `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~            |
 | `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
 | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                    |
 
-## Pipe.get_loss {#get_loss tag="method"}
+## TrainablePipe.get_loss {#get_loss tag="method"}
 
 Find the loss and gradient of loss for the batch of documents and their
 predicted scores.
@@ -246,7 +247,7 @@ This method needs to be overwritten with your own custom `get_loss` method.
 | `scores`    | Scores representing the model's predictions.                                |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
 
-## Pipe.score {#score tag="method" new="3"}
+## TrainablePipe.score {#score tag="method" new="3"}
 
 Score a batch of examples.
 
@@ -261,7 +262,7 @@ Score a batch of examples.
 | `examples`  | The examples to score. ~~Iterable[Example]~~                                                            |
 | **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
 
-## Pipe.create_optimizer {#create_optimizer tag="method"}
+## TrainablePipe.create_optimizer {#create_optimizer tag="method"}
 
 Create an optimizer for the pipeline component. Defaults to
 [`Adam`](https://thinc.ai/docs/api-optimizers#adam) with default settings.
@@ -277,7 +278,7 @@ Create an optimizer for the pipeline component. Defaults to
 | ----------- | ---------------------------- |
 | **RETURNS** | The optimizer. ~~Optimizer~~ |
 
-## Pipe.use_params {#use_params tag="method, contextmanager"}
+## TrainablePipe.use_params {#use_params tag="method, contextmanager"}
 
 Modify the pipe's model, to use the given parameter values. At the end of the
 context, the original parameters are restored.
@@ -294,7 +295,7 @@ context, the original parameters are restored.
 | -------- | -------------------------------------------------- |
 | `params` | The parameter values to use in the model. ~~dict~~ |
 
-## Pipe.finish_update {#finish_update tag="method"}
+## TrainablePipe.finish_update {#finish_update tag="method"}
 
 Update parameters using the current parameter gradients. Defaults to calling
 [`self.model.finish_update`](https://thinc.ai/docs/api-model#finish_update).
@@ -312,7 +313,7 @@ Update parameters using the current parameter gradients. Defaults to calling
 | ----- | ------------------------------------- |
 | `sgd` | An optimizer. ~~Optional[Optimizer]~~ |
 
-## Pipe.add_label {#add_label tag="method"}
+## TrainablePipe.add_label {#add_label tag="method"}
 
 > #### Example
 >
@@ -347,12 +348,12 @@ case, all labels found in the sample will be automatically added to the model,
 and the output dimension will be
 [inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
 
-## Pipe.is_resizable {#is_resizable tag="method"}
+## TrainablePipe.is_resizable {#is_resizable tag="property"}
 
 > #### Example
 >
 > ```python
-> can_resize = pipe.is_resizable()
+> can_resize = pipe.is_resizable
 > ```
 >
 > With custom resizing implemented by a component:
@@ -378,7 +379,7 @@ as an attribute to the component's model.
 | ----------- | ---------------------------------------------------------------------------------------------- |
 | **RETURNS** | Whether or not the output dimension of the model can be changed after initialization. ~~bool~~ |
 
-## Pipe.set_output {#set_output tag="method"}
+## TrainablePipe.set_output {#set_output tag="method"}
 
 Change the output dimension of the component's model. If the component is not
 [resizable](#is_resizable), this method will raise a `NotImplementedError`. If a
@@ -390,7 +391,7 @@ care should be taken to avoid the "catastrophic forgetting" problem.
 > #### Example
 >
 > ```python
-> if pipe.is_resizable():
+> if pipe.is_resizable:
 >     pipe.set_output(512)
 > ```
 
@@ -398,7 +399,7 @@ care should be taken to avoid the "catastrophic forgetting" problem.
 | ---- | --------------------------------- |
 | `nO` | The new output dimension. ~~int~~ |
 
-## Pipe.to_disk {#to_disk tag="method"}
+## TrainablePipe.to_disk {#to_disk tag="method"}
 
 Serialize the pipe to disk.
 
@@ -415,7 +416,7 @@ Serialize the pipe to disk.
 | _keyword-only_ |                                                                                                                                            |
 | `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~                                                |
 
-## Pipe.from_disk {#from_disk tag="method"}
+## TrainablePipe.from_disk {#from_disk tag="method"}
 
 Load the pipe from disk. Modifies the object in place and returns it.
 
@@ -431,9 +432,9 @@ Load the pipe from disk. Modifies the object in place and returns it.
 | `path`         | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
 | _keyword-only_ |                                                                                                 |
 | `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~     |
-| **RETURNS**    | The modified pipe. ~~Pipe~~                                                                     |
+| **RETURNS**    | The modified pipe. ~~TrainablePipe~~                                                            |
 
-## Pipe.to_bytes {#to_bytes tag="method"}
+## TrainablePipe.to_bytes {#to_bytes tag="method"}
 
 > #### Example
 >
@@ -450,7 +451,7 @@ Serialize the pipe to a bytestring.
 | `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
 | **RETURNS**    | The serialized form of the pipe. ~~bytes~~                                                  |
 
-## Pipe.from_bytes {#from_bytes tag="method"}
+## TrainablePipe.from_bytes {#from_bytes tag="method"}
 
 Load the pipe from a bytestring. Modifies the object in place and returns it.
 
@@ -467,16 +468,16 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
 | `bytes_data`   | The data to load from. ~~bytes~~                                                            |
 | _keyword-only_ |                                                                                             |
 | `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
-| **RETURNS**    | The pipe. ~~Pipe~~                                                                          |
+| **RETURNS**    | The pipe. ~~TrainablePipe~~                                                                 |
 
 ## Attributes {#attributes}
 
-| Name    | Description                                                                                                              |
-| ------- | ------------------------------------------------------------------------------------------------------------------------ |
-| `vocab` | The shared vocabulary that's passed in on initialization. ~~Vocab~~                                                      |
-| `model` | The model powering the component. ~~Model[List[Doc], Any]~~                                                              |
-| `name`  | The name of the component instance in the pipeline. Can be used in the losses. ~~str~~                                   |
-| `cfg`   | Keyword arguments passed to [`Pipe.__init__`](/api/pipe#init). Will be serialized with the component. ~~Dict[str, Any]~~ |
+| Name    | Description                                                                                                                       |
+| ------- | --------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary that's passed in on initialization. ~~Vocab~~                                                               |
+| `model` | The model powering the component. ~~Model[List[Doc], Any]~~                                                                       |
+| `name`  | The name of the component instance in the pipeline. Can be used in the losses. ~~str~~                                            |
+| `cfg`   | Keyword arguments passed to [`TrainablePipe.__init__`](/api/pipe#init). Will be serialized with the component. ~~Dict[str, Any]~~ |
 
 ## Serialization fields {#serialization-fields}
 
@@ -487,11 +488,10 @@ serialization by passing in the string names via the `exclude` argument.
 > #### Example
 >
 > ```python
-> data = pipe.to_disk("/path", exclude=["vocab"])
+> data = pipe.to_disk("/path")
 > ```
 
 | Name    | Description                                                    |
 | ------- | -------------------------------------------------------------- |
-| `vocab` | The shared [`Vocab`](/api/vocab).                              |
 | `cfg`   | The config file. You usually don't want to exclude this.       |
 | `model` | The binary model data. You usually don't want to exclude this. |
diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md
index 6e9120022..18203e204 100644
--- a/website/docs/usage/101/_architecture.md
+++ b/website/docs/usage/101/_architecture.md
@@ -57,7 +57,8 @@ components for different language processing tasks and also allows adding
 | [`Sentencizer`](/api/sentencizer)               | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
 | [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries.                                                                |
 | [Other functions](/api/pipeline-functions)      | Automatically apply something to the `Doc`, e.g. to merge spans of tokens.                  |
-| [`Pipe`](/api/pipe)                             | Base class that all trainable pipeline components inherit from.                             |
+| [`Pipe`](/api/pipe)                             | Base class that pipeline components may inherit from.                                       |
+| [`TrainablePipe`](/api/pipe)                    | Class that all trainable pipeline components inherit from.                                  |
 
 ### Matchers {#architecture-matchers}
 
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 7fa60e0f1..e348c4389 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -491,13 +491,14 @@ In addition to [swapping out](#swap-architectures) default models in built-in
 components, you can also implement an entirely new,
 [trainable](/usage/processing-pipelines#trainable-components) pipeline component
 from scratch. This can be done by creating a new class inheriting from
-[`Pipe`](/api/pipe), and linking it up to your custom model implementation.
+[`TrainablePipe`](/api/pipe), and linking it up to your custom model
+implementation.
 
 <Infobox title="Trainable component API" emoji="💡">
 
 For details on how to implement pipeline components, check out the usage guide
 on [custom components](/usage/processing-pipelines#custom-component) and the
-overview of the `Pipe` methods used by
+overview of the `TrainablePipe` methods used by
 [trainable components](/usage/processing-pipelines#trainable-components).
 
 </Infobox>
@@ -646,15 +647,15 @@ get_candidates = model.attrs["get_candidates"]
 
 To use our new relation extraction model as part of a custom
 [trainable component](/usage/processing-pipelines#trainable-components), we
-create a subclass of [`Pipe`](/api/pipe) that holds the model.
+create a subclass of [`TrainablePipe`](/api/pipe) that holds the model.
 
 ![Illustration of Pipe methods](../images/trainable_component.svg)
 
 ```python
 ### Pipeline component skeleton
-from spacy.pipeline import Pipe
+from spacy.pipeline import TrainablePipe
 
-class RelationExtractor(Pipe):
+class RelationExtractor(TrainablePipe):
      def __init__(self, vocab, model, name="rel"):
         """Create a component instance."""
         self.model = model
@@ -757,9 +758,10 @@ def update(
 
 When the internal model is trained, the component can be used to make novel
 **predictions**. The [`predict`](/api/pipe#predict) function needs to be
-implemented for each subclass of `Pipe`. In our case, we can simply delegate to
-the internal model's [predict](https://thinc.ai/docs/api-model#predict) function
-that takes a batch of `Doc` objects and returns a ~~Floats2d~~ array:
+implemented for each subclass of `TrainablePipe`. In our case, we can simply
+delegate to the internal model's
+[predict](https://thinc.ai/docs/api-model#predict) function that takes a batch
+of `Doc` objects and returns a ~~Floats2d~~ array:
 
 ```python
 ### The predict method
@@ -826,7 +828,7 @@ def __call__(self, Doc doc):
     return doc
 ```
 
-Once our `Pipe` subclass is fully implemented, we can
+Once our `TrainablePipe` subclass is fully implemented, we can
 [register](/usage/processing-pipelines#custom-components-factories) the
 component with the [`@Language.factory`](/api/language#factory) decorator. This
 assigns it a name and lets you create the component with
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 8b4e39ee9..e33ea6001 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1169,10 +1169,10 @@ doc = nlp("This is a text...")
 
 ## Trainable components {#trainable-components new="3"}
 
-spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
-components that have their own model instance, make predictions over `Doc`
-objects and can be updated using [`spacy train`](/api/cli#train). This lets you
-plug fully custom machine learning components into your pipeline.
+spaCy's [`TrainablePipe`](/api/pipe) class helps you implement your own
+trainable components that have their own model instance, make predictions over
+`Doc` objects and can be updated using [`spacy train`](/api/cli#train). This
+lets you plug fully custom machine learning components into your pipeline.
 
 ![Illustration of Pipe methods](../images/trainable_component.svg)
 
@@ -1183,9 +1183,9 @@ You'll need the following:
    a [wrapped model](/usage/layers-architectures#frameworks) implemented in
    PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a
    list of [`Doc`](/api/doc) objects as input and can have any type of output.
-2. **Pipe subclass:** A subclass of [`Pipe`](/api/pipe) that implements at least
-   two methods: [`Pipe.predict`](/api/pipe#predict) and
-   [`Pipe.set_annotations`](/api/pipe#set_annotations).
+2. **TrainablePipe subclass:** A subclass of [`TrainablePipe`](/api/pipe) that
+   implements at least two methods: [`TrainablePipe.predict`](/api/pipe#predict)
+   and [`TrainablePipe.set_annotations`](/api/pipe#set_annotations).
 3. **Component factory:** A component factory registered with
    [`@Language.factory`](/api/language#factory) that takes the `nlp` object and
    component `name` and optional settings provided by the config and returns an
@@ -1194,10 +1194,10 @@ You'll need the following:
 > #### Example
 >
 > ```python
-> from spacy.pipeline import Pipe
+> from spacy.pipeline import TrainablePipe
 > from spacy.language import Language
 >
-> class TrainableComponent(Pipe):
+> class TrainableComponent(TrainablePipe):
 >     def predict(self, docs):
 >         ...
 >
@@ -1214,11 +1214,11 @@ You'll need the following:
 | [`predict`](/api/pipe#predict)                 | Apply the component's model to a batch of [`Doc`](/api/doc) objects (without modifying them) and return the scores. |
 | [`set_annotations`](/api/pipe#set_annotations) | Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores generated by `predict`.                      |
 
-By default, [`Pipe.__init__`](/api/pipe#init) takes the shared vocab, the
-[`Model`](https://thinc.ai/docs/api-model) and the name of the component
+By default, [`TrainablePipe.__init__`](/api/pipe#init) takes the shared vocab,
+the [`Model`](https://thinc.ai/docs/api-model) and the name of the component
 instance in the pipeline, which you can use as a key in the losses. All other
-keyword arguments will become available as [`Pipe.cfg`](/api/pipe#cfg) and will
-also be serialized with the component.
+keyword arguments will become available as [`TrainablePipe.cfg`](/api/pipe#cfg)
+and will also be serialized with the component.
 
 <Accordion title="Why components should be passed a Model instance, not create it" spaced>
 
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index d9ab00b97..250fdb4f4 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -178,7 +178,8 @@ freely combine implementations from different frameworks into a single model.
 - **Thinc: **
   [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks),
   [`Model` API](https://thinc.ai/docs/api-model)
-- **API:** [Model architectures](/api/architectures), [`Pipe`](/api/pipe)
+- **API:** [Model architectures](/api/architectures),
+  [`TrainablePipe`](/api/pipe)
 
 </Infobox>
 
@@ -428,7 +429,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
 | [`Language.config`](/api/language#config)                                                                                       | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
 | [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes)                       | All available components and component names, including disabled components that are not run as part of the pipeline.                                                                            |
 | [`Language.disabled`](/api/language#attributes)                                                                                 | Names of disabled components that are not run as part of the pipeline.                                                                                                                           |
-| [`Pipe.score`](/api/pipe#score)                                                                                                 | Method on pipeline components that returns a dictionary of evaluation scores.                                                                                                                    |
+| [`TrainablePipe.score`](/api/pipe#score)                                                                                        | Method on pipeline components that returns a dictionary of evaluation scores.                                                                                                                    |
 | [`registry`](/api/top-level#registry)                                                                                           | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config).                                                                                  |
 | [`util.load_meta`](/api/top-level#util.load_meta), [`util.load_config`](/api/top-level#util.load_config)                        | Updated helpers for loading a pipeline's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config).                                                                     |
 | [`util.get_installed_models`](/api/top-level#util.get_installed_models)                                                         | Names of all pipeline packages installed in the environment.                                                                                                                                     |
@@ -483,7 +484,7 @@ format for documenting argument and return types.
   [`Morphologizer`](/api/morphologizer),
   [`AttributeRuler`](/api/attributeruler),
   [`SentenceRecognizer`](/api/sentencerecognizer),
-  [`DependencyMatcher`](/api/dependencymatcher), [`Pipe`](/api/pipe),
+  [`DependencyMatcher`](/api/dependencymatcher), [`TrainablePipe`](/api/pipe),
   [`Corpus`](/api/corpus)
 
 </Infobox>
@@ -522,7 +523,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
   [`@Language.factory`](/api/language#factory) decorator.
 - The [`Language.update`](/api/language#update),
   [`Language.evaluate`](/api/language#evaluate) and
-  [`Pipe.update`](/api/pipe#update) methods now all take batches of
+  [`TrainablePipe.update`](/api/pipe#update) methods now all take batches of
   [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
   raw text and a dictionary of annotations.
 - The `begin_training` methods have been renamed to `initialize` and now take a
@@ -947,7 +948,7 @@ annotations = {"entities": [(0, 15, "PERSON"), (30, 38, "ORG")]}
 
 The [`Language.update`](/api/language#update),
 [`Language.evaluate`](/api/language#evaluate) and
-[`Pipe.update`](/api/pipe#update) methods now all take batches of
+[`TrainablePipe.update`](/api/pipe#update) methods now all take batches of
 [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
 raw text and a dictionary of annotations.
 
@@ -967,12 +968,13 @@ for i in range(20):
         nlp.update(examples)
 ```
 
-`Language.begin_training` and `Pipe.begin_training` have been renamed to
-[`Language.initialize`](/api/language#initialize) and
-[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function
-that returns a sequence of `Example` objects to initialize the model instead of
-a list of tuples. The data examples are used to **initialize the models** of
-trainable pipeline components, which includes validating the network,
+`Language.begin_training` and `TrainablePipe.begin_training` have been renamed
+to [`Language.initialize`](/api/language#initialize) and
+[`TrainablePipe.initialize`](/api/pipe#initialize), and the methods now take a
+function that returns a sequence of `Example` objects to initialize the model
+instead of a list of tuples. The data examples are used to **initialize the
+models** of trainable pipeline components, which includes validating the
+network,
 [inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
 setting up the label scheme.
 

From 59558b1b8062cfd8e25770a3c02ba48e3186e187 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 8 Oct 2020 23:09:14 +0200
Subject: [PATCH 471/516] Update pin [ci skip]

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index e44d01026..69d4e6347 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -67,7 +67,7 @@ console_scripts =
 lookups =
     spacy_lookups_data>=1.0.0rc0,<1.0.0
 transformers =
-    spacy_transformers>=1.0.0a17,<1.0.0
+    spacy_transformers>=1.0.0a22,<1.0.0
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =

From 67652bcbb5530a1e0a262f862ad1daf5855dcc65 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 8 Oct 2020 23:18:02 +0200
Subject: [PATCH 472/516] Upd makefile

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 741366063..3f10e79cc 100644
--- a/Makefile
+++ b/Makefile
@@ -29,7 +29,7 @@ dist/$(SPACY_BIN) : $(WHEELHOUSE)/spacy-$(PYVER)-$(version).stamp
 		--disable-cache \
 		-o $@ \
 		$(package)==$(version) \
-		"$(SPACY_EXTRAS)"
+		$(SPACY_EXTRAS)
 	chmod a+rx $@
 	cp $@ dist/spacy.pex
 

From 2fad279a44b4ae2dbf9441ffc7d1cea0e4f2a3d8 Mon Sep 17 00:00:00 2001
From: Duygu Altinok <duygu.altinok12@gmail.com>
Date: Wed, 7 Oct 2020 11:07:52 +0200
Subject: [PATCH 473/516] Turkish language syntax iterators (#6191)

* added tr_vocab to config

* basic test

* added syntax iterator to Turkish lang class

* first version for Turkish syntax iter, without flat

* added simple tests with nmod, amod, det

* more tests to amod and nmod

* separated noun chunks and parser test

* rearrangement after nchunk parser separation

* added recursive NPs

* tests with complicated recursive NPs

* tests with conjed NPs

* additional tests for conj NP

* small modification for shaving off conj from NP

* added tests with flat

* more tests with flat

* added examples with flats conjed

* added inner func for flat trick

* corrected parse

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/lang/tr/__init__.py               |   2 +
 spacy/lang/tr/syntax_iterators.py       |  59 +++
 spacy/tests/conftest.py                 |   3 +
 spacy/tests/lang/tr/test_noun_chunks.py |  12 +
 spacy/tests/lang/tr/test_parser.py      | 570 ++++++++++++++++++++++++
 5 files changed, 646 insertions(+)
 create mode 100644 spacy/lang/tr/syntax_iterators.py
 create mode 100644 spacy/tests/lang/tr/test_noun_chunks.py
 create mode 100644 spacy/tests/lang/tr/test_parser.py

diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py
index 8bd0b93df..788adb6fb 100644
--- a/spacy/lang/tr/__init__.py
+++ b/spacy/lang/tr/__init__.py
@@ -1,5 +1,6 @@
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 
@@ -8,6 +9,7 @@ class TurkishDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
+    syntax_iterators = SYNTAX_ITERATORS
 
 
 class Turkish(Language):
diff --git a/spacy/lang/tr/syntax_iterators.py b/spacy/lang/tr/syntax_iterators.py
new file mode 100644
index 000000000..665ccb590
--- /dev/null
+++ b/spacy/lang/tr/syntax_iterators.py
@@ -0,0 +1,59 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+from ...errors import Errors
+
+
+def noun_chunks(doclike):
+    """Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    Yields (start, end, label) tuples; chunks are produced from right to left.
+    Please see documentation for Turkish NP structure.
+    """
+    labels = [
+        "nsubj",
+        "iobj",
+        "obj",
+        "obl",
+        "appos",
+        "orphan",
+        "dislocated",
+        "ROOT",
+    ]
+    doc = doclike.doc  # Ensure works on both Doc and Span.
+    if not doc.has_annotation("DEP"):
+        raise ValueError(Errors.E029)
+
+    np_deps = [doc.vocab.strings.add(label) for label in labels]
+    conj = doc.vocab.strings.add("conj")
+    flat = doc.vocab.strings.add("flat")
+    np_label = doc.vocab.strings.add("NP")
+
+    def extend_right(w):  # Playing a trick for flat
+        rindex = w.i + 1
+        for rdep in doc[w.i].rights:  # Extend the span to right if there is a flat
+            if rdep.dep == flat and rdep.pos in (NOUN, PROPN):
+                rindex = rdep.i + 1
+            else:
+                break
+        return rindex
+
+    prev_end = len(doc) + 1
+    for word in reversed(list(doclike)):  # Walk the tokens from right to left
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i >= prev_end:
+            continue
+        if word.dep in np_deps:
+            prev_end = word.left_edge.i
+            yield word.left_edge.i, extend_right(word), np_label
+        elif word.dep == conj:  # Shave off cc tokens from the NP
+            cc_token = word.left_edge
+            prev_end = cc_token.i
+            yield cc_token.right_edge.i + 1, extend_right(word), np_label
+
+
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 411397b42..7f8ab6768 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -239,6 +239,9 @@ def th_tokenizer():
 def tr_tokenizer():
     return get_lang_class("tr")().tokenizer
 
+@pytest.fixture(scope="session")
+def tr_vocab():
+    return get_lang_class("tr").Defaults.create_vocab()
 
 @pytest.fixture(scope="session")
 def tt_tokenizer():
diff --git a/spacy/tests/lang/tr/test_noun_chunks.py b/spacy/tests/lang/tr/test_noun_chunks.py
new file mode 100644
index 000000000..003e4f08e
--- /dev/null
+++ b/spacy/tests/lang/tr/test_noun_chunks.py
@@ -0,0 +1,12 @@
+import pytest
+
+
+def test_noun_chunks_is_parsed(tr_tokenizer):
+    """Test that noun_chunks raises ValueError for 'tr' if the Doc is not parsed.
+    The Doc used here comes straight from the tokenizer and no parser is
+    run on it, so iterating over doc.noun_chunks is expected to raise
+    rather than produce any chunks.
+    """
+    doc = tr_tokenizer("Dün seni gördüm.")
+    with pytest.raises(ValueError):
+        list(doc.noun_chunks)
diff --git a/spacy/tests/lang/tr/test_parser.py b/spacy/tests/lang/tr/test_parser.py
new file mode 100644
index 000000000..ff71ac3d4
--- /dev/null
+++ b/spacy/tests/lang/tr/test_parser.py
@@ -0,0 +1,570 @@
+from spacy.tokens import Doc
+
+
+def test_tr_noun_chunks_amod_simple(tr_tokenizer):
+    text = "sarı kedi"
+    heads = [1, 1]
+    deps = ["amod", "ROOT"]
+    pos = ["ADJ", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "sarı kedi "
+
+
+def test_tr_noun_chunks_nmod_simple(tr_tokenizer):
+    text = "arkadaşımın kedisi"  # my friend's cat
+    heads = [1, 1]
+    deps = ["nmod", "ROOT"]
+    pos = ["NOUN", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "arkadaşımın kedisi "
+
+
+def test_tr_noun_chunks_determiner_simple(tr_tokenizer):
+    text = "O kedi"  # that cat
+    heads = [1, 1]
+    deps = ["det", "ROOT"]
+    pos = ["DET", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "O kedi "
+
+
+def test_tr_noun_chunks_nmod_amod(tr_tokenizer):
+    text = "okulun eski müdürü"
+    heads = [2, 2, 2]
+    deps = ["nmod", "amod", "ROOT"]
+    pos = ["NOUN", "ADJ", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "okulun eski müdürü "
+
+
+def test_tr_noun_chunks_one_det_one_adj_simple(tr_tokenizer):
+    text = "O sarı kedi"
+    heads = [2, 2, 2]
+    deps = ["det", "amod", "ROOT"]
+    pos = ["DET", "ADJ", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "O sarı kedi "
+
+
+def test_tr_noun_chunks_two_adjs_simple(tr_tokenizer):
+    text = "beyaz tombik kedi"
+    heads = [2, 2, 2]
+    deps = ["amod", "amod", "ROOT"]
+    pos = ["ADJ", "ADJ", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "beyaz tombik kedi "
+
+
+def test_tr_noun_chunks_one_det_two_adjs_simple(tr_tokenizer):
+    text = "o beyaz tombik kedi"
+    heads = [3, 3, 3, 3]
+    deps = ["det", "amod", "amod", "ROOT"]
+    pos = ["DET", "ADJ", "ADJ", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "o beyaz tombik kedi "
+
+
+def test_tr_noun_chunks_nmod_two(tr_tokenizer):
+    text = "kızın saçının rengi"
+    heads = [1, 2, 2]
+    deps = ["nmod", "nmod", "ROOT"]
+    pos = ["NOUN", "NOUN", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "kızın saçının rengi "
+
+
+def test_tr_noun_chunks_chain_nmod_with_adj(tr_tokenizer):
+    text = "ev sahibinin tatlı köpeği"
+    heads = [1, 3, 3, 3]
+    deps = ["nmod", "nmod", "amod", "ROOT"]
+    pos = ["NOUN", "NOUN", "ADJ", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "ev sahibinin tatlı köpeği "
+
+
+def test_tr_noun_chunks_chain_nmod_with_acl(tr_tokenizer):
+    text = "ev sahibinin gelen köpeği"
+    heads = [1, 3, 3, 3]
+    deps = ["nmod", "nmod", "acl", "ROOT"]
+    pos = ["NOUN", "NOUN", "VERB", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "ev sahibinin gelen köpeği "
+
+
+def test_tr_noun_chunks_chain_nmod_head_with_amod_acl(tr_tokenizer):
+    text = "arabanın kırdığım sol aynası"
+    heads = [3, 3, 3, 3]
+    deps = ["nmod", "acl", "amod", "ROOT"]
+    pos = ["NOUN", "VERB", "ADJ", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "arabanın kırdığım sol aynası "
+
+
+def test_tr_noun_chunks_nmod_three(tr_tokenizer):
+    text = "güney Afrika ülkelerinden Mozambik"
+    heads = [1, 2, 3, 3]
+    deps = ["nmod", "nmod", "nmod", "ROOT"]
+    pos = ["NOUN", "PROPN", "NOUN", "PROPN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "güney Afrika ülkelerinden Mozambik "
+
+
+def test_tr_noun_chunks_det_amod_nmod(tr_tokenizer):
+    text = "bazı eski oyun kuralları"
+    heads = [3, 3, 3, 3]
+    deps = ["det", "nmod", "nmod", "ROOT"]
+    pos = ["DET", "ADJ", "NOUN", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "bazı eski oyun kuralları "
+
+
+def test_tr_noun_chunks_acl_simple(tr_tokenizer):
+    text = "bahçesi olan okul"
+    heads = [2, 0, 2]
+    deps = ["acl", "cop", "ROOT"]
+    pos = ["NOUN", "AUX", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "bahçesi olan okul "
+
+
+def test_tr_noun_chunks_acl_verb(tr_tokenizer):
+    text = "sevdiğim sanatçılar"
+    heads = [1, 1]
+    deps = ["acl", "ROOT"]
+    pos = ["VERB", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "sevdiğim sanatçılar "
+
+
+def test_tr_noun_chunks_acl_nmod(tr_tokenizer):
+    text = "en sevdiğim ses sanatçısı"
+    heads = [1, 3, 3, 3]
+    deps = ["advmod", "acl", "nmod", "ROOT"]
+    pos = ["ADV", "VERB", "NOUN", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "en sevdiğim ses sanatçısı "
+
+
+def test_tr_noun_chunks_acl_det_nmod(tr_tokenizer):  # unique name: was shadowing test_tr_noun_chunks_acl_nmod
+    text = "bildiğim bir turizm şirketi"
+    heads = [3, 3, 3, 3]
+    deps = ["acl", "det", "nmod", "ROOT"]
+    pos = ["VERB", "DET", "NOUN", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1  # acl, det and nmod all end up in one chunk
+    assert chunks[0].text_with_ws == "bildiğim bir turizm şirketi "
+
+
+def test_tr_noun_chunks_np_recursive_nsubj_to_root(tr_tokenizer):
+    text = "Simge'nin okuduğu kitap"
+    heads = [1, 2, 2]
+    deps = ["nsubj", "acl", "ROOT"]
+    pos = ["PROPN", "VERB", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "Simge'nin okuduğu kitap "
+
+
+def test_tr_noun_chunks_np_recursive_nsubj_attached_to_pron_root(tr_tokenizer):
+    text = "Simge'nin konuşabileceği birisi"
+    heads = [1, 2, 2]
+    deps = ["nsubj", "acl", "ROOT"]
+    pos = ["PROPN", "VERB", "PRON"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "Simge'nin konuşabileceği birisi "
+
+
+def test_tr_noun_chunks_np_recursive_nsubj_in_subnp(tr_tokenizer):
+    text = "Simge'nin yarın gideceği yer"
+    heads = [2, 2, 3, 3]
+    deps = ["nsubj", "obl", "acl", "ROOT"]
+    pos = ["PROPN", "NOUN", "VERB", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "Simge'nin yarın gideceği yer "
+
+
+def test_tr_noun_chunks_np_recursive_two_nmods(tr_tokenizer):
+    text = "ustanın kapısını degiştireceği çamasır makinası"
+    heads = [2, 2, 4, 4, 4]
+    deps = ["nsubj", "obj", "acl", "nmod", "ROOT"]
+    pos = ["NOUN", "NOUN", "VERB", "NOUN", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "ustanın kapısını degiştireceği çamasır makinası "
+
+
+def test_tr_noun_chunks_np_recursive_four_nouns(tr_tokenizer):
+    text = "kızına piyano dersi verdiğim hanım"
+    heads = [3, 2, 3, 4, 4]
+    deps = ["obl", "nmod", "obj", "acl", "ROOT"]
+    pos = ["NOUN", "NOUN", "NOUN", "VERB", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "kızına piyano dersi verdiğim hanım "
+
+    
+def test_tr_noun_chunks_np_recursive_no_nmod(tr_tokenizer):
+    text = "içine birkaç çiçek konmuş olan bir vazo"
+    heads = [3, 2, 3, 6, 3, 6, 6]
+    deps = ["obl", "det", "nsubj", "acl", "aux", "det", "ROOT"]
+    pos = ["ADP", "DET", "NOUN", "VERB", "AUX", "DET", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "içine birkaç çiçek konmuş olan bir vazo "
+
+
+def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer):
+    text = "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo"
+    heads = [6, 2, 3, 5, 5, 6, 9, 6, 9, 9]
+    deps = ["obl", "nmod" , "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"]
+    pos = ["ADP", "PROPN", "NOUN", "VERB", "DET", "NOUN", "VERB", "AUX", "DET", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo "
+
+
+def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
+    text = "kız ve erkek çocuklar"
+    heads = [3, 2, 0, 3]
+    deps = ["nmod", "cc", "conj", "ROOT"]
+    pos = ["NOUN", "CCONJ", "NOUN", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "kız ve erkek çocuklar "
+
+def test_tr_noun_chunks_two_adjs_simple(tr_tokenizer):  # unique name: the previous duplicate name shadowed the nmod test
+    text = "tatlı ve gürbüz çocuklar"
+    heads = [3, 2, 0, 3]
+    deps = ["amod", "cc", "conj", "ROOT"]  # conjoined modifiers attached via "amod" instead of "nmod"
+    pos = ["ADJ", "CCONJ", "NOUN", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1  # the whole phrase must come back as one chunk
+    assert chunks[0].text_with_ws == "tatlı ve gürbüz çocuklar "
+
+
+def test_tr_noun_chunks_conj_simple(tr_tokenizer):
+    text = "Sen ya da ben"
+    heads = [0, 3, 1, 0]
+    deps = ["ROOT", "cc", "fixed", "conj"]
+    pos = ["PRON", "CCONJ", "CCONJ", "PRON"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 2
+    assert chunks[0].text_with_ws == "ben "
+    assert chunks[1].text_with_ws == "Sen "
+
+def test_tr_noun_chunks_conj_three(tr_tokenizer):
+    text = "sen, ben ve ondan"
+    heads = [0, 2, 0, 4, 0]
+    deps = ["ROOT", "punct", "conj", "cc", "conj"]
+    pos = ["PRON", "PUNCT", "PRON", "CCONJ", "PRON"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 3
+    assert chunks[0].text_with_ws == "ondan "
+    assert chunks[1].text_with_ws == "ben "
+    assert chunks[2].text_with_ws == "sen "
+
+
+def test_tr_noun_chunks_conj_three2(tr_tokenizer):  # unique name: the previous duplicate name shadowed the comma/"ve" test
+    text = "ben ya da sen ya da onlar"
+    heads = [0, 3, 1, 0, 6, 4, 3]
+    deps = ["ROOT", "cc", "fixed", "conj", "cc", "fixed", "conj"]  # two-token "ya da" conjunctions (cc + fixed)
+    pos = ["PRON", "CCONJ", "CCONJ", "PRON", "CCONJ", "CCONJ", "PRON"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 3  # one chunk per pronoun, expected in right-to-left order
+    assert chunks[0].text_with_ws == "onlar "
+    assert chunks[1].text_with_ws == "sen "
+    assert chunks[2].text_with_ws == "ben "
+
+
+def test_tr_noun_chunks_conj_and_adj_phrase(tr_tokenizer):
+    text = "ben ve akıllı çocuk"
+    heads = [0, 3, 3, 0]
+    deps = ["ROOT", "cc", "amod", "conj"]
+    pos = ["PRON", "CCONJ", "ADJ", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 2
+    assert chunks[0].text_with_ws == "akıllı çocuk "
+    assert chunks[1].text_with_ws == "ben "
+
+
+def test_tr_noun_chunks_conj_fixed_adj_phrase(tr_tokenizer):
+    text = "ben ya da akıllı çocuk"
+    heads = [0, 4, 1, 4, 0]
+    deps = ["ROOT", "cc", "fixed", "amod", "conj"]
+    pos = ["PRON", "CCONJ", "CCONJ", "ADJ", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 2
+    assert chunks[0].text_with_ws == "akıllı çocuk "
+    assert chunks[1].text_with_ws == "ben "
+
+
+def test_tr_noun_chunks_conj_subject(tr_tokenizer):
+    text = "Sen ve ben iyi anlaşıyoruz"
+    heads = [4, 2, 0, 2, 4]
+    deps = ["nsubj", "cc", "conj", "adv", "ROOT"]
+    pos = ["PRON", "CCONJ", "PRON", "ADV", "VERB"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 2
+    assert chunks[0].text_with_ws == "ben "
+    assert chunks[1].text_with_ws == "Sen "
+
+
+def test_tr_noun_chunks_conj_noun_head_verb(tr_tokenizer):
+    text = "Simge babasını görmüyormuş, annesini değil"
+    heads = [2, 2, 2, 4, 2, 4]
+    deps = ["nsubj", "obj", "ROOT", "punct", "conj", "aux"]
+    pos = ["PROPN", "NOUN", "VERB", "PUNCT", "NOUN", "AUX"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 3
+    assert chunks[0].text_with_ws == "annesini "
+    assert chunks[1].text_with_ws == "babasını "
+    assert chunks[2].text_with_ws == "Simge "
+
+
+def test_tr_noun_chunks_flat_simple(tr_tokenizer):
+    text = "New York"
+    heads = [0, 0]
+    deps = ["ROOT", "flat"]
+    pos = ["PROPN", "PROPN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "New York "
+
+
+def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer):
+    text = "Gazi Mustafa Kemal"
+    heads = [1, 1, 1]
+    deps = ["nmod", "ROOT", "flat"]
+    pos = ["PROPN", "PROPN", "PROPN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "Gazi Mustafa Kemal "
+
+
+def test_tr_noun_chunks_flat_names_and_title2(tr_tokenizer):  # unique name: the previous duplicate name shadowed the "Gazi Mustafa Kemal" test
+    text = "Ahmet Vefik Paşa"
+    heads = [2, 0, 2]
+    deps = ["nmod", "flat", "ROOT"]  # here the last token is ROOT and the name tokens precede it
+    pos = ["PROPN", "PROPN", "PROPN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1  # the whole phrase must come back as one chunk
+    assert chunks[0].text_with_ws == "Ahmet Vefik Paşa "
+
+
+def test_tr_noun_chunks_flat_name_lastname_and_title(tr_tokenizer):
+    text = "Cumhurbaşkanı Ahmet Necdet Sezer"
+    heads = [1, 1, 1, 1]
+    deps = ["nmod", "ROOT", "flat", "flat"]
+    pos = ["NOUN", "PROPN", "PROPN", "PROPN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "Cumhurbaşkanı Ahmet Necdet Sezer "
+
+
+def test_tr_noun_chunks_flat_in_nmod(tr_tokenizer):
+    text = "Ahmet Sezer adında bir ögrenci"
+    heads = [2, 0, 4, 4, 4]
+    deps = ["nmod", "flat", "nmod", "det", "ROOT"]
+    pos = ["PROPN", "PROPN", "NOUN", "DET", "NOUN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "Ahmet Sezer adında bir ögrenci "
+
+
+def test_tr_noun_chunks_flat_and_chain_nmod(tr_tokenizer):
+    text = "Batı Afrika ülkelerinden Sierra Leone"
+    heads = [1, 2, 3, 3, 3]
+    deps = ["nmod", "nmod", "nmod", "ROOT", "flat"]
+    pos = ["NOUN", "PROPN", "NOUN", "PROPN", "PROPN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "Batı Afrika ülkelerinden Sierra Leone "
+
+
+def test_tr_noun_chunks_two_flats_conjed(tr_tokenizer):
+    text = "New York ve Sierra Leone"
+    heads = [0, 0, 3, 0, 3]
+    deps = ["ROOT", "flat", "cc", "conj", "flat"]
+    pos = ["PROPN", "PROPN", "CCONJ", "PROPN", "PROPN"]
+    tokens = tr_tokenizer(text)
+    doc = Doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 2
+    assert chunks[0].text_with_ws == "Sierra Leone "
+    assert chunks[1].text_with_ws == "New York "

From 80fb1bffc9a091c8416dc00a73aeaa1dce99a3e0 Mon Sep 17 00:00:00 2001
From: Duygu Altinok <duygu.altinok12@gmail.com>
Date: Wed, 7 Oct 2020 10:25:37 +0200
Subject: [PATCH 474/516] Ordinal numbers for Turkish (#6142)

* minor ordinal number addition

* fixed typo

* added corresponding lexical test
---
 spacy/lang/tr/lex_attrs.py       | 44 +++++++++++++++++++++++++++++++-
 spacy/tests/lang/tr/test_text.py | 29 +++++++++++++++++++++
 2 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/lang/tr/test_text.py

diff --git a/spacy/lang/tr/lex_attrs.py b/spacy/lang/tr/lex_attrs.py
index 3dbc1833a..3615f4b4c 100644
--- a/spacy/lang/tr/lex_attrs.py
+++ b/spacy/lang/tr/lex_attrs.py
@@ -32,6 +32,36 @@ _num_words = [
 ]
 
 
+_ordinal_words = [  # spelled-out ordinals; like_num looks up the lowercased token text here
+    "birinci",
+    "ikinci",
+    "üçüncü",
+    "dördüncü",
+    "beşinci",
+    "altıncı",
+    "yedinci",
+    "sekizinci",
+    "dokuzuncu",
+    "onuncu",
+    "yirminci",
+    "otuzuncu",
+    "kırkıncı",
+    "ellinci",
+    "altmışıncı",
+    "yetmişinci",
+    "sekseninci",
+    "doksanıncı",
+    "yüzüncü",
+    "bininci",
+    "milyonuncu",
+    "milyarıncı",
+    "trilyonuncu",
+    "katrilyonuncu",
+    "kentilyonuncu",
+]
+
+_ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü")
+
 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
@@ -42,8 +72,20 @@ def like_num(text):
         num, denom = text.split("/")
         if num.isdigit() and denom.isdigit():
             return True
-    if text.lower() in _num_words:
+
+    text_lower = text.lower()
+
+    #Check cardinal number
+    if text_lower in _num_words:
         return True
+
+    #Check ordinal number
+    if text_lower in _ordinal_words:
+        return True
+    if text_lower.endswith(_ordinal_endings):
+        if text_lower[:-3].isdigit() or text_lower[:-4].isdigit():
+            return True
+
     return False
 
 
diff --git a/spacy/tests/lang/tr/test_text.py b/spacy/tests/lang/tr/test_text.py
new file mode 100644
index 000000000..01e279d76
--- /dev/null
+++ b/spacy/tests/lang/tr/test_text.py
@@ -0,0 +1,29 @@
+import pytest
+from spacy.lang.tr.lex_attrs import like_num
+
+
+@pytest.mark.parametrize(
+    "word",
+    [
+        "bir",
+        "iki",
+        "dört",
+        "altı",
+        "milyon",
+        "100",
+        "birinci",
+        "üçüncü",
+        "beşinci",
+        "100üncü",
+        "8inci"
+    ]
+)
+def test_tr_lex_attrs_like_number_cardinal_ordinal(word):
+    assert like_num(word)
+
+
+@pytest.mark.parametrize("word", ["beş", "yedi", "yedinci", "birinci"])
+def test_tr_lex_attrs_capitals(word):
+    assert like_num(word)
+    assert like_num(word.upper())
+

From 668507be1b7bb98da029613f01720bda6b5fe653 Mon Sep 17 00:00:00 2001
From: delzac <delzac.jh@gmail.com>
Date: Tue, 6 Oct 2020 21:11:01 +0800
Subject: [PATCH 475/516] Reflect on usage doc that IS_SENT_START attribute
 exist (#6114)

* Reflect on usage doc that IS_SENT_START attribute exist

* Create delzac.md
---
 .github/contributors/delzac.md            | 106 ++++++++++++++++++++++
 website/docs/usage/rule-based-matching.md |   1 +
 2 files changed, 107 insertions(+)
 create mode 100644 .github/contributors/delzac.md

diff --git a/.github/contributors/delzac.md b/.github/contributors/delzac.md
new file mode 100644
index 000000000..0fcfe6f2f
--- /dev/null
+++ b/.github/contributors/delzac.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           |  Matthew Chin        |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2020-09-22           |
+| GitHub username                | delzac               |
+| Website (optional)             |                      |
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 256f4ccb4..a510398e6 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -167,6 +167,7 @@ rule-based matching are:
 |  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`             | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                          |
 |  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`             | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                |
 |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`              | Token is punctuation, whitespace, stop word. ~~bool~~                                                                     |
+|  `IS_SENT_START`                                | Token is start of sentence. ~~bool~~                                                                                      |
 |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`           | Token text resembles a number, URL, email. ~~bool~~                                                                       |
 |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~       |
 | `ENT_TYPE`                                      | The token's entity label. ~~str~~                                                                                         |

From 287ba94a2f2166d11f692741ae0216ae0c736a7b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?=
 <606346+zaibacu@users.noreply.github.com>
Date: Tue, 6 Oct 2020 12:19:36 +0300
Subject: [PATCH 476/516] Website (Universe): An entry for rita-dsl (#6138)

* Create zaibacu.md

* Add RITA-DSL entry

* Update agreement

* Fix formatting
---
 .github/contributors/zaibacu.md | 106 ++++++++++++++++++++++++++++++++
 website/meta/universe.json      |  36 +++++++++++
 2 files changed, 142 insertions(+)
 create mode 100644 .github/contributors/zaibacu.md

diff --git a/.github/contributors/zaibacu.md b/.github/contributors/zaibacu.md
new file mode 100644
index 000000000..365b89848
--- /dev/null
+++ b/.github/contributors/zaibacu.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Šarūnas Navickas     |
+| Company name (if applicable)   | TokenMill            |
+| Title or role (if applicable)  | Data Engineer        |
+| Date                           | 2020-09-24           |
+| GitHub username                | zaibacu              |
+| Website (optional)             |                      |
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 74c35bdb8..ffad74180 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2542,6 +2542,42 @@
             "author_links": {
                 "github": "abchapman93"
             }
+        },
+        {
+            "id": "rita-dsl",
+            "title": "RITA DSL",
+            "slogan": "Domain Specific Language for creating language rules",
+            "github": "zaibacu/rita-dsl",
+            "description": "A Domain Specific Language (DSL) for building language patterns. These can be later compiled into spaCy patterns, pure regex, or any other format",
+            "pip": "rita-dsl",
+            "thumb": "https://raw.githubusercontent.com/zaibacu/rita-dsl/master/docs/assets/logo-100px.png",
+            "code_language": "python",
+            "code_example": [
+                "import spacy",
+                "from rita.shortcuts import setup_spacy",
+                "",
+                "rules = \"\"\"",
+                "cuts = {\"fitted\", \"wide-cut\"}",
+                "lengths = {\"short\", \"long\", \"calf-length\", \"knee-length\"}",
+                "fabric_types = {\"soft\", \"airy\", \"crinkled\"}",
+                "fabrics = {\"velour\", \"chiffon\", \"knit\", \"woven\", \"stretch\"}",
+                "",
+                "{IN_LIST(cuts)?, IN_LIST(lengths), WORD(\"dress\")}->MARK(\"DRESS_TYPE\")",
+                "{IN_LIST(lengths), IN_LIST(cuts), WORD(\"dress\")}->MARK(\"DRESS_TYPE\")",
+                "{IN_LIST(fabric_types)?, IN_LIST(fabrics)}->MARK(\"DRESS_FABRIC\")",
+                "\"\"\"",
+                "",
+                "nlp = spacy.load(\"en\")",
+                "setup_spacy(nlp, rules_string=rules)",
+                "r = nlp(\"She was wearing a short wide-cut dress\")",
+                "print(list([{\"label\": e.label_, \"text\": e.text} for e in r.ents]))"
+            ],
+            "category": ["standalone"],
+            "tags": ["dsl", "language-patterns", "language-rules", "nlp"],
+            "author": "Šarūnas Navickas",
+            "author_links": {
+                "github": "zaibacu"
+            }
         }
     ],
 

From 18f5c309dc72366d792cb500032b157a93aedf85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Florijan=20Stamenkovi=C4=87?=
 <florijan.stamenkovic@gmail.com>
Date: Tue, 6 Oct 2020 11:17:37 +0200
Subject: [PATCH 477/516] Fix Issue 6207 (#6208)

* Regression test for issue 6207

* Fix issue 6207

* Sign contributor agreement

* Minor adjustments to test

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 .github/contributors/florijanstamenkovic.md | 106 ++++++++++++++++++++
 spacy/tests/regression/test_issue6207.py    |  15 +++
 spacy/util.py                               |   2 +-
 3 files changed, 122 insertions(+), 1 deletion(-)
 create mode 100644 .github/contributors/florijanstamenkovic.md
 create mode 100644 spacy/tests/regression/test_issue6207.py

diff --git a/.github/contributors/florijanstamenkovic.md b/.github/contributors/florijanstamenkovic.md
new file mode 100644
index 000000000..65da875b1
--- /dev/null
+++ b/.github/contributors/florijanstamenkovic.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Florijan Stamenkovic |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2020-10-05           |
+| GitHub username                | florijanstamenkovic  |
+| Website (optional)             |                      |
diff --git a/spacy/tests/regression/test_issue6207.py b/spacy/tests/regression/test_issue6207.py
new file mode 100644
index 000000000..47e3803e9
--- /dev/null
+++ b/spacy/tests/regression/test_issue6207.py
@@ -0,0 +1,15 @@
+from spacy.util import filter_spans
+
+
+def test_issue6207(en_tokenizer):
+    doc = en_tokenizer("zero one two three four five six")
+
+    # Make spans
+    s1 = doc[:4]
+    s2 = doc[3:6]   # overlaps with s1
+    s3 = doc[5:7]   # overlaps with s2, not s1
+
+    result = filter_spans((s1, s2, s3))
+    assert s1 in result
+    assert s2 not in result
+    assert s3 in result
diff --git a/spacy/util.py b/spacy/util.py
index bf4ea0c92..3d567a425 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1018,7 +1018,7 @@ def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
         # Check for end - 1 here because boundaries are inclusive
         if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
             result.append(span)
-        seen_tokens.update(range(span.start, span.end))
+            seen_tokens.update(range(span.start, span.end))
     result = sorted(result, key=lambda span: span.start)
     return result
 

From 329b61ee7bfdec06e884720660b9130e35b31cce Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 9 Oct 2020 10:36:06 +0200
Subject: [PATCH 478/516] Update docs [ci skip]

---
 website/docs/api/attributeruler.md      |  1 -
 website/docs/api/pipe.md                |  2 +-
 website/docs/api/sentencizer.md         |  7 ++--
 website/docs/usage/101/_architecture.md | 45 ++++++++++++-------------
 website/meta/sidebars.json              |  2 +-
 5 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md
index d60362a47..a253ca9f8 100644
--- a/website/docs/api/attributeruler.md
+++ b/website/docs/api/attributeruler.md
@@ -4,7 +4,6 @@ tag: class
 source: spacy/pipeline/attributeruler.py
 new: 3
 teaser: 'Pipeline component for rule-based token attribute assignment'
-api_base_class: /api/pipe
 api_string_name: attribute_ruler
 api_trainable: false
 ---
diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index e7adcdd75..1f7fab8aa 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -14,7 +14,7 @@ for how to use the `TrainablePipe` base class to implement custom components.
 
 <!-- TODO: Pipe vs TrainablePipe, check methods below (all renamed to TrainablePipe for now) -->
 
-> #### Why is TrainablePipe implemented in Cython?
+> #### Why is it implemented in Cython?
 >
 > The `TrainablePipe` class is implemented in a `.pyx` module, the extension
 > used by [Cython](/api/cython). This is needed so that **other** Cython
diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md
index 594a85f74..2cd49127d 100644
--- a/website/docs/api/sentencizer.md
+++ b/website/docs/api/sentencizer.md
@@ -3,7 +3,6 @@ title: Sentencizer
 tag: class
 source: spacy/pipeline/sentencizer.pyx
 teaser: 'Pipeline component for rule-based sentence boundary detection'
-api_base_class: /api/pipe
 api_string_name: sentencizer
 api_trainable: false
 ---
@@ -130,9 +129,9 @@ Score a batch of examples.
 
 ## Sentencizer.to_disk {#to_disk tag="method"}
 
-Save the sentencizer settings (punctuation characters) to a directory. Will create
-a file `sentencizer.json`. This also happens automatically when you save an
-`nlp` object with a sentencizer added to its pipeline.
+Save the sentencizer settings (punctuation characters) to a directory. Will
+create a file `sentencizer.json`. This also happens automatically when you save
+an `nlp` object with a sentencizer added to its pipeline.
 
 > #### Example
 >
diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md
index 18203e204..b012c4ec0 100644
--- a/website/docs/usage/101/_architecture.md
+++ b/website/docs/usage/101/_architecture.md
@@ -20,13 +20,13 @@ It also orchestrates training and serialization.
 
 | Name                        | Description                                                                                                                                             |
 | --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [`Language`](/api/language) | Processing class that turns text into `Doc` objects. Different languages implement their own subclasses of it. The variable is typically called `nlp`.  |
 | [`Doc`](/api/doc)           | A container for accessing linguistic annotations.                                                                                                       |
+| [`DocBin`](/api/docbin)     | A collection of `Doc` objects for efficient binary serialization. Also used for [training data](/api/data-formats#binary-training).                     |
+| [`Example`](/api/example)   | A collection of training annotations, containing two `Doc` objects: the reference data and the predictions.                                             |
+| [`Language`](/api/language) | Processing class that turns text into `Doc` objects. Different languages implement their own subclasses of it. The variable is typically called `nlp`.  |
+| [`Lexeme`](/api/lexeme)     | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. |
 | [`Span`](/api/span)         | A slice from a `Doc` object.                                                                                                                            |
 | [`Token`](/api/token)       | An individual token — i.e. a word, punctuation symbol, whitespace, etc.                                                                                 |
-| [`Lexeme`](/api/lexeme)     | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. |
-| [`Example`](/api/example)   | A collection of training annotations, containing two `Doc` objects: the reference data and the predictions.                                             |
-| [`DocBin`](/api/docbin)     | A collection of `Doc` objects for efficient binary serialization. Also used for [training data](/api/data-formats#binary-training).                     |
 
 ### Processing pipeline {#architecture-pipeline}
 
@@ -42,23 +42,22 @@ components for different language processing tasks and also allows adding
 
 | Name                                            | Description                                                                                 |
 | ----------------------------------------------- | ------------------------------------------------------------------------------------------- |
-| [`Tokenizer`](/api/tokenizer)                   | Segment raw text and create `Doc` objects from the words.                                   |
-| [`Tok2Vec`](/api/tok2vec)                       | Apply a "token-to-vector" model and set its outputs.                                        |
-| [`Transformer`](/api/transformer)               | Use a transformer model and set its outputs.                                                |
-| [`Lemmatizer`](/api/lemmatizer)                 | Determine the base forms of words.                                                          |
-| [`Morphologizer`](/api/morphologizer)           | Predict morphological features and coarse-grained part-of-speech tags.                      |
-| [`Tagger`](/api/tagger)                         | Predict part-of-speech tags.                                                                |
 | [`AttributeRuler`](/api/attributeruler)         | Set token attributes using matcher rules.                                                   |
 | [`DependencyParser`](/api/dependencyparser)     | Predict syntactic dependencies.                                                             |
+| [`EntityLinker`](/api/entitylinker)             | Disambiguate named entities to nodes in a knowledge base.                                   |
 | [`EntityRecognizer`](/api/entityrecognizer)     | Predict named entities, e.g. persons or products.                                           |
 | [`EntityRuler`](/api/entityruler)               | Add entity spans to the `Doc` using token-based rules or exact phrase matches.              |
-| [`EntityLinker`](/api/entitylinker)             | Disambiguate named entities to nodes in a knowledge base.                                   |
-| [`TextCategorizer`](/api/textcategorizer)       | Predict categories or labels over the whole document.                                       |
-| [`Sentencizer`](/api/sentencizer)               | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
+| [`Lemmatizer`](/api/lemmatizer)                 | Determine the base forms of words.                                                          |
+| [`Morphologizer`](/api/morphologizer)           | Predict morphological features and coarse-grained part-of-speech tags.                      |
 | [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries.                                                                |
-| [Other functions](/api/pipeline-functions)      | Automatically apply something to the `Doc`, e.g. to merge spans of tokens.                  |
-| [`Pipe`](/api/pipe)                             | Base class that pipeline components may inherit from.                                       |
+| [`Sentencizer`](/api/sentencizer)               | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
+| [`Tagger`](/api/tagger)                         | Predict part-of-speech tags.                                                                |
+| [`TextCategorizer`](/api/textcategorizer)       | Predict categories or labels over the whole document.                                       |
+| [`Tok2Vec`](/api/tok2vec)                       | Apply a "token-to-vector" model and set its outputs.                                        |
+| [`Tokenizer`](/api/tokenizer)                   | Segment raw text and create `Doc` objects from the words.                                   |
 | [`TrainablePipe`](/api/pipe)                    | Class that all trainable pipeline components inherit from.                                  |
+| [`Transformer`](/api/transformer)               | Use a transformer model and set its outputs.                                                |
+| [Other functions](/api/pipeline-functions)      | Automatically apply something to the `Doc`, e.g. to merge spans of tokens.                  |
 
 ### Matchers {#architecture-matchers}
 
@@ -68,20 +67,20 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
 
 | Name                                          | Description                                                                                                                                                                        |
 | --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using [Semgrex operators](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |
 | [`Matcher`](/api/matcher)                     | Match sequences of tokens, based on pattern rules, similar to regular expressions.                                                                                                 |
 | [`PhraseMatcher`](/api/phrasematcher)         | Match sequences of tokens based on phrases.                                                                                                                                        |
-| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using [Semgrex operators](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |
 
 ### Other classes {#architecture-other}
 
 | Name                                             | Description                                                                                        |
 | ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
-| [`Vocab`](/api/vocab)                            | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. |
+| [`Corpus`](/api/corpus)                          | Class for managing annotated corpora for training and evaluation data.                             |
+| [`KnowledgeBase`](/api/kb)                       | Storage for entities and aliases of a knowledge base for entity linking.                           |
+| [`Lookups`](/api/lookups)                        | Container for convenient access to large lookup tables and dictionaries.                           |
+| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis.                                                                          |
+| [`Morphology`](/api/morphology)                  | Store morphological analyses and map them to and from hash values.                                 |
+| [`Scorer`](/api/scorer)                          | Compute evaluation scores.                                                                         |
 | [`StringStore`](/api/stringstore)                | Map strings to and from hash values.                                                               |
 | [`Vectors`](/api/vectors)                        | Container class for vector data keyed by string.                                                   |
-| [`Lookups`](/api/lookups)                        | Container for convenient access to large lookup tables and dictionaries.                           |
-| [`Morphology`](/api/morphology)                  | Store morphological analyses and map them to and from hash values.                                 |
-| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis.                                                                          |
-| [`KnowledgeBase`](/api/kb)                       | Storage for entities and aliases of a knowledge base for entity linking.                           |
-| [`Scorer`](/api/scorer)                          | Compute evaluation scores.                                                                         |
-| [`Corpus`](/api/corpus)                          | Class for managing annotated corpora for training and evaluation data.                             |
+| [`Vocab`](/api/vocab)                            | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. |
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index c5404b68e..660309a20 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -94,13 +94,13 @@
                     { "text": "EntityRuler", "url": "/api/entityruler" },
                     { "text": "Lemmatizer", "url": "/api/lemmatizer" },
                     { "text": "Morphologizer", "url": "/api/morphologizer" },
-                    { "text": "Pipe", "url": "/api/pipe" },
                     { "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" },
                     { "text": "Sentencizer", "url": "/api/sentencizer" },
                     { "text": "Tagger", "url": "/api/tagger" },
                     { "text": "TextCategorizer", "url": "/api/textcategorizer" },
                     { "text": "Tok2Vec", "url": "/api/tok2vec" },
                     { "text": "Tokenizer", "url": "/api/tokenizer" },
+                    { "text": "TrainablePipe", "url": "/api/pipe" },
                     { "text": "Transformer", "url": "/api/transformer" },
                     { "text": "Other Functions", "url": "/api/pipeline-functions" }
                 ]

From 39aabf50ab23f4cadef5d5b459436a988f9fe677 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 9 Oct 2020 11:54:48 +0200
Subject: [PATCH 479/516] Also rename to include_static_vectors in CharEmbed

---
 spacy/ml/models/tok2vec.py           | 6 +++---
 spacy/pipeline/morphologizer.pyx     | 2 +-
 spacy/tests/pipeline/test_tok2vec.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 23cfe883b..6ef7b2325 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -177,7 +177,7 @@ def CharacterEmbed(
     rows: int,
     nM: int,
     nC: int,
-    also_use_static_vectors: bool,
+    include_static_vectors: bool,
     feature: Union[int, str] = "LOWER",
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedded representation based on character embeddings, using
@@ -204,13 +204,13 @@ def CharacterEmbed(
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values
         are between 3 and 8, although it may depend on the length of words in the
         language.
-    also_use_static_vectors (bool): Whether to also use static word vectors.
+    include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
     feature = intify_attr(feature)
     if feature is None:
         raise ValueError(Errors.E911(feat=feature))
-    if also_use_static_vectors:
+    if include_static_vectors:
         model = chain(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index a456b7a0f..00188a762 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -32,7 +32,7 @@ width = 128
 rows = 7000
 nM = 64
 nC = 8
-also_use_static_vectors = false
+include_static_vectors = false
 
 [model.tok2vec.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v1"
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 90882ae3f..ec4ed17dd 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
     [
         (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
         (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
     ],
 )
 # fmt: on

From e50dc2c1c9fb6b8b536c25d4eb2548771b92083b Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 9 Oct 2020 12:04:52 +0200
Subject: [PATCH 480/516] Update docs [ci skip]

---
 website/docs/api/cli.md                       |  2 +-
 website/docs/usage/embeddings-transformers.md |  4 +++-
 website/docs/usage/linguistic-features.md     | 12 +++++++-----
 website/docs/usage/processing-pipelines.md    |  3 ++-
 website/docs/usage/rule-based-matching.md     |  2 +-
 website/docs/usage/v3.md                      |  5 +++--
 6 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 138b4b94b..168465fab 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -643,7 +643,7 @@ Debug a Thinc [`Model`](https://thinc.ai/docs/api-model) by running it on a
 sample text and checking how it updates its internal weights and parameters.
 
 ```cli
-$ python -m spacy debug model [config_path] [component] [--layers] [-DIM] [-PAR] [-GRAD] [-ATTR] [-P0] [-P1] [-P2] [P3] [--gpu-id]
+$ python -m spacy debug model [config_path] [component] [--layers] [--dimensions] [--parameters] [--gradients] [--attributes] [--print-step0] [--print-step1] [--print-step2] [--print-step3] [--gpu-id]
 ```
 
 <Accordion title="Example outputs" spaced>
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 549c3bcc4..73540b3d3 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -232,7 +232,9 @@ transformers as subnetworks directly, you can also use them via the
 
 The `Transformer` component sets the
 [`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
-which lets you access the transformers outputs at runtime.
+which lets you access the transformers outputs at runtime. The trained
+transformer-based [pipelines](/models) provided by spaCy end on `_trf`, e.g.
+[`en_core_web_trf`](/models/en#en_core_web_trf).
 
 ```cli
 $ python -m spacy download en_core_web_trf
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 6dbf2525e..eb443c645 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1656,9 +1656,10 @@ because it only requires annotated sentence boundaries rather than full
 dependency parses. spaCy's [trained pipelines](/models) include both a parser
 and a trained sentence segmenter, which is
 [disabled](/usage/processing-pipelines#disabling) by default. If you only need
-sentence boundaries and no parser, you can use the `enable` and `disable`
-arguments on [`spacy.load`](/api/top-level#spacy.load) to enable the senter and
-disable the parser.
+sentence boundaries and no parser, you can use the `exclude` or `disable`
+argument on [`spacy.load`](/api/top-level#spacy.load) to load the pipeline
+without the parser and then enable the sentence recognizer explicitly with
+[`nlp.enable_pipe`](/api/language#enable_pipe).
 
 > #### senter vs. parser
 >
@@ -1670,7 +1671,8 @@ disable the parser.
 ### {executable="true"}
 import spacy
 
-nlp = spacy.load("en_core_web_sm", enable=["senter"], disable=["parser"])
+nlp = spacy.load("en_core_web_sm", exclude=["parser"])
+nlp.enable_pipe("senter")
 doc = nlp("This is a sentence. This is another sentence.")
 for sent in doc.sents:
     print(sent.text)
@@ -1734,7 +1736,7 @@ nlp = spacy.load("en_core_web_sm")
 doc = nlp(text)
 print("Before:", [sent.text for sent in doc.sents])
 
-@Language.component("set_custom_coundaries")
+@Language.component("set_custom_boundaries")
 def set_custom_boundaries(doc):
     for token in doc[:-1]:
         if token.text == "...":
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index e33ea6001..fdae6d3e5 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1159,7 +1159,8 @@ class DebugComponent:
         self.logger.info(f"Pipeline: {nlp.pipe_names}")
 
     def __call__(self, doc: Doc) -> Doc:
-        self.logger.debug(f"Doc: {len(doc)} tokens, is_tagged: {doc.is_tagged}")
+        is_tagged = doc.has_annotation("TAG")
+        self.logger.debug(f"Doc: {len(doc)} tokens, is tagged: {is_tagged}")
         return doc
 
 nlp = spacy.load("en_core_web_sm")
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index a510398e6..f5825f3a9 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -838,7 +838,7 @@ nlp = spacy.load("en_core_web_sm")
 matcher = Matcher(nlp.vocab)
 
 # Add pattern for valid hashtag, i.e. '#' plus any ASCII token
-matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
+matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])
 
 # Register token extension
 Token.set_extension("is_hashtag", default=False)
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 250fdb4f4..9191a7db2 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -285,6 +285,7 @@ add to your pipeline and customize for your use case:
 | [`Lemmatizer`](/api/lemmatizer)                 | Standalone component for rule-based and lookup lemmatization.                                                                                                                                                           |
 | [`AttributeRuler`](/api/attributeruler)         | Component for setting token attributes using match patterns.                                                                                                                                                            |
 | [`Transformer`](/api/transformer)               | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
+| [`TrainablePipe`](/api/pipe)                    | Base class for trainable pipeline components.                                                                                                                                                                           |
 
 <Infobox title="Details & Documentation" emoji="📖" list>
 
@@ -396,8 +397,8 @@ type-check model definitions.
 For data validation, spaCy v3.0 adopts
 [`pydantic`](https://github.com/samuelcolvin/pydantic). It also powers the data
 validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
-lets you register **custom functions with typed arguments**, reference them
-in your config and see validation errors if the argument values don't match.
+lets you register **custom functions with typed arguments**, reference them in
+your config and see validation errors if the argument values don't match.
 
 <Infobox title="Details & Documentation" emoji="📖" list>
 

From 18dfb279850adb00c3b3efa18bbb6d58c17bc453 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 9 Oct 2020 12:05:33 +0200
Subject: [PATCH 481/516] Add custom error when evaluation throws a KeyError

---
 spacy/errors.py        | 3 +++
 spacy/training/loop.py | 5 ++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 2bc2f3e20..06653edcf 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,6 +456,9 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
+    E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified "
+            "frozen components, make sure they were already trained and initialized. "
+            "You can also consider moving them to the 'disabled' list instead.")
     E901 = ("Failed to remove existing output directory: {path}. If your "
             "config and the components you train change between runs, a "
             "non-empty output directory can lead to stale pipeline data. To "
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 242113cc6..8e688a27d 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -249,7 +249,10 @@ def create_evaluation_callback(
 
     def evaluate() -> Tuple[float, Dict[str, float]]:
         dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
+        try:
+            scores = nlp.evaluate(dev_examples)
+        except KeyError as e:
+            raise KeyError(Errors.E900) from e
         # Calculate a weighted sum based on score_weights for the main score.
         # We can only consider scores that are ints/floats, not dicts like
         # entity scores per type etc.

From 8316bc7d4a6dbd989d53f97a8c7a06758c8d356c Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 9 Oct 2020 12:06:20 +0200
Subject: [PATCH 482/516] bugfix DisabledPipes

---
 spacy/language.py                         |  3 +++
 spacy/tests/pipeline/test_pipe_methods.py | 14 ++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/spacy/language.py b/spacy/language.py
index 1fb559657..24e593043 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1034,6 +1034,9 @@ class Language:
                     )
                 )
             disable = to_disable
+        # DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
+        # those pipes that were already disabled.
+        disable = [d for d in disable if d not in self._disabled]
         return DisabledPipes(self, disable)
 
     def make_doc(self, text: str) -> Doc:
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index c693a7487..cd18b0159 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -129,6 +129,7 @@ def test_enable_pipes_method(nlp, name):
 
 @pytest.mark.parametrize("name", ["my_component"])
 def test_disable_pipes_context(nlp, name):
+    """Test that an enabled component stays enabled after running the context manager."""
     nlp.add_pipe("new_pipe", name=name)
     assert nlp.has_pipe(name)
     with nlp.select_pipes(disable=name):
@@ -136,6 +137,19 @@ def test_disable_pipes_context(nlp, name):
     assert nlp.has_pipe(name)
 
 
+@pytest.mark.parametrize("name", ["my_component"])
+def test_disable_pipes_context_restore(nlp, name):
+    """Test that a disabled component stays disabled after running the context manager."""
+    nlp.add_pipe("new_pipe", name=name)
+    assert nlp.has_pipe(name)
+    nlp.disable_pipes(name)
+    assert not nlp.has_pipe(name)
+    with nlp.select_pipes(disable=name):
+        assert not nlp.has_pipe(name)
+    assert not nlp.has_pipe(name)
+
+
+
 def test_select_pipes_list_arg(nlp):
     for name in ["c1", "c2", "c3"]:
         nlp.add_pipe("new_pipe", name=name)

From cc3646b06ce21eb35edea34258a72e6b481af71f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 9 Oct 2020 12:10:25 +0200
Subject: [PATCH 483/516] Add xfailing test for peculiar spans failure [ci
 skip]

---
 spacy/tests/doc/test_doc_api.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index e3e056685..136affab2 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -606,3 +606,16 @@ def test_doc_init_iob():
     ents = [0, "B-", "O", "I-PERSON", "I-GPE"]
     with pytest.raises(ValueError):
         doc = Doc(Vocab(), words=words, ents=ents)
+
+
+@pytest.mark.xfail
+def test_doc_set_ents_spans(en_tokenizer):
+    doc = en_tokenizer("Some text about Colombia and the Czech Republic")
+    spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
+    with doc.retokenize() as retokenizer:
+        for span in spans:
+            retokenizer.merge(span)
+    # If this line is uncommented, it works:
+    # print(spans)
+    doc.ents = list(doc.ents) + spans
+    print([ent.text for ent in doc.ents])

From 4771a10503f6c2a9eadca80bda22dfd4e8c758ad Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 9 Oct 2020 12:15:26 +0200
Subject: [PATCH 484/516] Make test more explicit [ci skip]

---
 spacy/tests/doc/test_doc_api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 136affab2..ea832c136 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -617,5 +617,5 @@ def test_doc_set_ents_spans(en_tokenizer):
             retokenizer.merge(span)
     # If this line is uncommented, it works:
     # print(spans)
-    doc.ents = list(doc.ents) + spans
-    print([ent.text for ent in doc.ents])
+    doc.ents = spans
+    assert [ent.text for ent in doc.ents] == ["Colombia", "Czech Republic"]

From 2cafba5f50d83a93582bddea6bd1f569f98207f7 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 9 Oct 2020 12:17:35 +0200
Subject: [PATCH 485/516] shorten error message for clarity

---
 spacy/errors.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 06653edcf..3ab9661e0 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -457,8 +457,7 @@ class Errors:
 
     # TODO: fix numbering after merging develop into master
     E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified "
-            "frozen components, make sure they were already trained and initialized. "
-            "You can also consider moving them to the 'disabled' list instead.")
+            "frozen components, make sure they were already trained and initialized. ")
     E901 = ("Failed to remove existing output directory: {path}. If your "
             "config and the components you train change between runs, a "
             "non-empty output directory can lead to stale pipeline data. To "

From 06b9d213fd91397896a24dcf5fa4f90950570e9d Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 9 Oct 2020 12:19:47 +0200
Subject: [PATCH 486/516] formatting

---
 spacy/errors.py                           | 2 +-
 spacy/tests/pipeline/test_pipe_methods.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 3ab9661e0..0932ba0fd 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -457,7 +457,7 @@ class Errors:
 
     # TODO: fix numbering after merging develop into master
     E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified "
-            "frozen components, make sure they were already trained and initialized. ")
+            "frozen components, make sure they were already initialized and trained. ")
     E901 = ("Failed to remove existing output directory: {path}. If your "
             "config and the components you train change between runs, a "
             "non-empty output directory can lead to stale pipeline data. To "
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index cd18b0159..b744aed98 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -149,7 +149,6 @@ def test_disable_pipes_context_restore(nlp, name):
     assert not nlp.has_pipe(name)
 
 
-
 def test_select_pipes_list_arg(nlp):
     for name in ["c1", "c2", "c3"]:
         nlp.add_pipe("new_pipe", name=name)

From 853edace37af044e21b0631d8d35ede18d16a482 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 9 Oct 2020 14:11:06 +0200
Subject: [PATCH 487/516] fix MultiHashEmbed example in documentation

---
 spacy/ml/models/tok2vec.py                    | 2 +-
 website/docs/usage/embeddings-transformers.md | 8 +++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 23cfe883b..1a78cf75e 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -110,7 +110,7 @@ def MultiHashEmbed(
 
     The features used can be configured with the 'attrs' argument. The suggested
     attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
-    account some subword information, without construction a fully character-based
+    account some subword information, without constructing a fully character-based
     representation. If pretrained vectors are available, they can be included in
     the representation as well, with the vectors table will be kept static
     (i.e. it's not updated).
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 73540b3d3..856685dad 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -516,16 +516,14 @@ Many neural network models are able to use word vector tables as additional
 features, which sometimes results in significant improvements in accuracy.
 spaCy's built-in embedding layer,
 [MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use
-word vector tables using the `also_use_static_vectors` flag. This setting is
-also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN)
-layer, which builds the default token-to-vector encoding architecture.
+word vector tables using the `include_static_vectors` flag.
 
 ```ini
 [tagger.model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v1"
 width = 128
-rows = 7000
-also_embed_subwords = true
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+rows = [7000, 3500, 3500, 3500]
 also_use_static_vectors = true
 ```
 

From 2dd79454af73cb07d07ac1b9ad12644736e96bd5 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 9 Oct 2020 14:42:07 +0200
Subject: [PATCH 488/516] Update docs

---
 website/docs/usage/embeddings-transformers.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 549c3bcc4..942fc4e7b 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -514,7 +514,7 @@ Many neural network models are able to use word vector tables as additional
 features, which sometimes results in significant improvements in accuracy.
 spaCy's built-in embedding layer,
 [MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use
-word vector tables using the `also_use_static_vectors` flag. This setting is
+word vector tables using the `include_static_vectors` flag. This setting is
 also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN)
 layer, which builds the default token-to-vector encoding architecture.
 
@@ -522,9 +522,9 @@ layer, which builds the default token-to-vector encoding architecture.
 [tagger.model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v1"
 width = 128
-rows = 7000
-also_embed_subwords = true
-also_use_static_vectors = true
+attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"]
+rows = [5000,2500,2500,2500]
+include_static_vectors = true
 ```
 
 <Infobox title="How it works" emoji="💡">

From 727370c633b37457ddbedc80aecf07e1dc2c967d Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 9 Oct 2020 14:42:51 +0200
Subject: [PATCH 489/516] Remove Span._recalculate_indices

Remove `Span._recalculate_indices`, which is a remnant from the
deprecated `Span.merge`.
---
 spacy/tests/doc/test_doc_api.py          |  9 +++------
 spacy/tests/doc/test_retokenize_merge.py |  1 +
 spacy/tokens/span.pxd                    |  1 -
 spacy/tokens/span.pyx                    | 17 -----------------
 4 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index ea832c136..db8a6d1c4 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -608,14 +608,11 @@ def test_doc_init_iob():
         doc = Doc(Vocab(), words=words, ents=ents)
 
 
-@pytest.mark.xfail
-def test_doc_set_ents_spans(en_tokenizer):
+def test_doc_set_ents_invalid_spans(en_tokenizer):
     doc = en_tokenizer("Some text about Colombia and the Czech Republic")
     spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
-    # If this line is uncommented, it works:
-    # print(spans)
-    doc.ents = spans
-    assert [ent.text for ent in doc.ents] == ["Colombia", "Czech Republic"]
+    with pytest.raises(IndexError):
+        doc.ents = spans
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index cb886545a..b483255c8 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -336,6 +336,7 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
         attrs = {"lemma": "none", "ent_type": "none"}
         retokenizer.merge(doc[0:2], attrs=attrs)
         retokenizer.merge(doc[-2:], attrs=attrs)
+    sent1, sent2 = list(doc.sents)
     assert len(sent1) == init_len - 1
     assert len(sent2) == init_len2 - 1
 
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index f6f88a23e..cc6b908bb 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -16,5 +16,4 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm
 
-    cpdef int _recalculate_indices(self) except -1
     cpdef np.ndarray to_array(self, object features)
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 64c3c7df0..491ba0266 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -150,7 +150,6 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#len
         """
-        self._recalculate_indices()
         if self.end < self.start:
             return 0
         return self.end - self.start
@@ -167,7 +166,6 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#getitem
         """
-        self._recalculate_indices()
         if isinstance(i, slice):
             start, end = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self.doc, start + self.start, end + self.start)
@@ -188,7 +186,6 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#iter
         """
-        self._recalculate_indices()
         for i in range(self.start, self.end):
             yield self.doc[i]
 
@@ -339,19 +336,6 @@ cdef class Span:
                 output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
         return output
 
-    cpdef int _recalculate_indices(self) except -1:
-        if self.end > self.doc.length \
-        or self.doc.c[self.start].idx != self.start_char \
-        or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char:
-            start = token_by_start(self.doc.c, self.doc.length, self.start_char)
-            if self.start == -1:
-                raise IndexError(Errors.E036.format(start=self.start_char))
-            end = token_by_end(self.doc.c, self.doc.length, self.end_char)
-            if end == -1:
-                raise IndexError(Errors.E037.format(end=self.end_char))
-            self.start = start
-            self.end = end + 1
-
     @property
     def vocab(self):
         """RETURNS (Vocab): The Span's Doc's vocab."""
@@ -520,7 +504,6 @@ cdef class Span:
 
         DOCS: https://nightly.spacy.io/api/span#root
         """
-        self._recalculate_indices()
         if "root" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["root"](self)
         # This should probably be called 'head', and the other one called

From 040c7c054125d32da2af9c73f604b811e6ae0d97 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 9 Oct 2020 15:40:58 +0200
Subject: [PATCH 490/516] fix get_dim calls in build_simple_cnn_text_classifier

---
 spacy/ml/models/textcat.py | 4 ++--
 spacy/util.py              | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 1117b4fde..ec8998e2d 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -24,11 +24,11 @@ def build_simple_cnn_text_classifier(
     """
     with Model.define_operators({">>": chain}):
         if exclusive_classes:
-            output_layer = Softmax(nO=nO, nI=tok2vec.get_dim("nO"))
+            output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
             model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer
             model.set_ref("output_layer", output_layer)
         else:
-            linear_layer = Linear(nO=nO, nI=tok2vec.get_dim("nO"))
+            linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO"))
             model = (
                 tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic()
             )
diff --git a/spacy/util.py b/spacy/util.py
index 3d567a425..47fbcce1c 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -622,7 +622,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
     if not path.parent.exists():
         raise IOError(Errors.E052.format(path=path.parent))
     if not path.exists() or not path.is_file():
-        raise IOError(Errors.E053.format(path=path, name="meta.json"))
+        raise IOError(Errors.E053.format(path=path.parent, name="meta.json"))
     meta = srsly.read_json(path)
     for setting in ["lang", "name", "version"]:
         if setting not in meta or not meta[setting]:

From e972ecba727a35d59080dc0e217faa02044abb4e Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 9 Oct 2020 16:03:14 +0200
Subject: [PATCH 491/516] add utf8 encoding for opening file

---
 spacy/cli/convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index e4559929e..8413c639b 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -253,7 +253,7 @@ def _get_converter(msg, converter, input_path):
     if converter == "auto":
         converter = input_path.suffix[1:]
     if converter == "ner" or converter == "iob":
-        with input_path.open() as file_:
+        with input_path.open(encoding="utf8") as file_:
             input_data = file_.read()
         converter_autodetect = autodetect_ner_format(input_data)
         if converter_autodetect == "ner":

From 97ff090e495208a5944561e210c76ef77e93eab3 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 9 Oct 2020 16:03:57 +0200
Subject: [PATCH 492/516] Fix docs example [ci skip]

---
 website/docs/usage/processing-pipelines.md | 54 +++++++++-------------
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index fdae6d3e5..83134962b 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1403,9 +1403,9 @@ especially useful it you want to pass in a string instead of calling
 
 This example shows the implementation of a pipeline component that fetches
 country meta data via the [REST Countries API](https://restcountries.eu), sets
-entity annotations for countries, merges entities into one token and sets custom
-attributes on the `Doc`, `Span` and `Token` – for example, the capital,
-latitude/longitude coordinates and even the country flag.
+entity annotations for countries and sets custom attributes on the `Doc` and
+`Span` – for example, the capital, latitude/longitude coordinates and even the
+country flag.
 
 ```python
 ### {executable="true"}
@@ -1427,54 +1427,46 @@ class RESTCountriesComponent:
         # Set up the PhraseMatcher with Doc patterns for each country name
         self.matcher = PhraseMatcher(nlp.vocab)
         self.matcher.add("COUNTRIES", [nlp.make_doc(c) for c in self.countries.keys()])
-        # Register attribute on the Token. We'll be overwriting this based on
+        # Register attributes on the Span. We'll be overwriting these based on
         # the matches, so we're only setting a default value, not a getter.
-        Token.set_extension("is_country", default=False)
-        Token.set_extension("country_capital", default=False)
-        Token.set_extension("country_latlng", default=False)
-        Token.set_extension("country_flag", default=False)
-        # Register attributes on Doc and Span via a getter that checks if one of
-        # the contained tokens is set to is_country == True.
+        Span.set_extension("is_country", default=None)
+        Span.set_extension("country_capital", default=None)
+        Span.set_extension("country_latlng", default=None)
+        Span.set_extension("country_flag", default=None)
+        # Register attribute on Doc via a getter that checks if the Doc
+        # contains a country entity
         Doc.set_extension("has_country", getter=self.has_country)
-        Span.set_extension("has_country", getter=self.has_country)
 
     def __call__(self, doc):
         spans = []  # keep the spans for later so we can merge them afterwards
         for _, start, end in self.matcher(doc):
             # Generate Span representing the entity & set label
             entity = Span(doc, start, end, label=self.label)
+            # Set custom attributes on entity. Can be extended with other data
+            # returned by the API, like currencies, country code, calling code etc.
+            entity._.set("is_country", True)
+            entity._.set("country_capital", self.countries[entity.text]["capital"])
+            entity._.set("country_latlng", self.countries[entity.text]["latlng"])
+            entity._.set("country_flag", self.countries[entity.text]["flag"])
             spans.append(entity)
-            # Set custom attribute on each token of the entity
-            # Can be extended with other data returned by the API, like
-            # currencies, country code, flag, calling code etc.
-            for token in entity:
-                token._.set("is_country", True)
-                token._.set("country_capital", self.countries[entity.text]["capital"])
-                token._.set("country_latlng", self.countries[entity.text]["latlng"])
-                token._.set("country_flag", self.countries[entity.text]["flag"])
-        # Iterate over all spans and merge them into one token
-        with doc.retokenize() as retokenizer:
-            for span in spans:
-                retokenizer.merge(span)
         # Overwrite doc.ents and add entity – be careful not to replace!
         doc.ents = list(doc.ents) + spans
         return doc  # don't forget to return the Doc!
 
-    def has_country(self, tokens):
-        """Getter for Doc and Span attributes. Since the getter is only called
-        when we access the attribute, we can refer to the Token's 'is_country'
+    def has_country(self, doc):
+        """Getter for Doc attributes. Since the getter is only called
+        when we access the attribute, we can refer to the Span's 'is_country'
         attribute here, which is already set in the processing step."""
-        return any([t._.get("is_country") for t in tokens])
+        return any([entity._.get("is_country") for entity in doc.ents])
 
 nlp = English()
 nlp.add_pipe("rest_countries", config={"label": "GPE"})
 doc = nlp("Some text about Colombia and the Czech Republic")
 print("Pipeline", nlp.pipe_names)  # pipeline contains component name
 print("Doc has countries", doc._.has_country)  # Doc contains countries
-for token in doc:
-    if token._.is_country:
-        print(token.text, token._.country_capital, token._.country_latlng, token._.country_flag)
-print("Entities", [(e.text, e.label_) for e in doc.ents])
+for ent in doc.ents:
+    if ent._.is_country:
+        print(ent.text, ent.label_, ent._.country_capital, ent._.country_latlng, ent._.country_flag)
 ```
 
 In this case, all data can be fetched on initialization in one request. However,

From 8ac5f222531dcb602d08118693618598bc0c045d Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 9 Oct 2020 18:00:16 +0200
Subject: [PATCH 493/516] Adjust error message

---
 spacy/errors.py        | 5 +++--
 spacy/training/loop.py | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 0932ba0fd..be327a784 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,8 +456,9 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
-    E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified "
-            "frozen components, make sure they were already initialized and trained. ")
+    E900 = ("Could not run the full pipeline for evaluation. If you specified "
+            "frozen components, make sure they were already initialized and "
+            "trained. Full pipeline: {pipeline}")
     E901 = ("Failed to remove existing output directory: {path}. If your "
             "config and the components you train change between runs, a "
             "non-empty output directory can lead to stale pipeline data. To "
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 8e688a27d..c3fa83b39 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -252,7 +252,7 @@ def create_evaluation_callback(
         try:
             scores = nlp.evaluate(dev_examples)
         except KeyError as e:
-            raise KeyError(Errors.E900) from e
+            raise KeyError(Errors.E900.format(pipeline=nlp.pipe_names)) from e
         # Calculate a weighted sum based on score_weights for the main score.
         # We can only consider scores that are ints/floats, not dicts like
         # entity scores per type etc.

From 525f7988416f9d944f5993a793f999f91e8685f8 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 9 Oct 2020 18:00:21 +0200
Subject: [PATCH 494/516] Fix typo in test

---
 spacy/tests/pipeline/test_pipe_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index b744aed98..6a21ddfaa 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -142,7 +142,7 @@ def test_disable_pipes_context_restore(nlp, name):
     """Test that a disabled component stays disabled after running the context manager."""
     nlp.add_pipe("new_pipe", name=name)
     assert nlp.has_pipe(name)
-    nlp.disable_pipes(name)
+    nlp.disable_pipe(name)
     assert not nlp.has_pipe(name)
     with nlp.select_pipes(disable=name):
         assert not nlp.has_pipe(name)

From 796f8b9424737b51da81d35fe33b8383f1d5bdf7 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Fri, 9 Oct 2020 18:00:27 +0200
Subject: [PATCH 495/516] Increment version

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 095d726a0..763faa3eb 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a36"
+__version__ = "3.0.0a37"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From bfa3931c9dc9f1ab960d81c985ddaf4bb4a4d023 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 10 Oct 2020 18:55:07 +0200
Subject: [PATCH 496/516] Revert added_strings change (#6236)

---
 spacy/about.py                                |  2 +-
 spacy/errors.py                               |  8 ++++
 spacy/kb.pxd                                  |  1 -
 spacy/kb.pyx                                  | 15 ++----
 spacy/pipeline/attributeruler.py              | 17 +++----
 spacy/pipeline/entity_linker.py               |  3 +-
 spacy/pipeline/lemmatizer.py                  |  4 ++
 spacy/pipeline/morphologizer.pyx              |  5 --
 spacy/pipeline/senter.pyx                     |  1 -
 spacy/pipeline/tagger.pyx                     |  3 +-
 spacy/pipeline/textcat.py                     |  3 +-
 spacy/pipeline/tok2vec.py                     |  1 -
 spacy/pipeline/trainable_pipe.pxd             |  1 -
 spacy/pipeline/trainable_pipe.pyx             | 36 +++++++++------
 spacy/pipeline/transition_parser.pyx          | 27 ++++++-----
 spacy/tests/pipeline/test_entity_linker.py    | 19 ++------
 spacy/tests/pipeline/test_morphologizer.py    |  1 -
 spacy/tests/pipeline/test_senter.py           |  1 -
 spacy/tests/pipeline/test_tagger.py           |  2 -
 spacy/tests/pipeline/test_textcat.py          |  2 -
 spacy/tests/regression/test_issue5230.py      |  4 +-
 .../serialize/test_serialize_pipeline.py      | 46 ++++++++++++++++---
 spacy/util.py                                 |  2 +-
 23 files changed, 110 insertions(+), 94 deletions(-)

diff --git a/spacy/about.py b/spacy/about.py
index 763faa3eb..efdfd26c0 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a37"
+__version__ = "3.0.0a38"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/errors.py b/spacy/errors.py
index be327a784..5fab0bab1 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,6 +456,14 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
+    E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
+            "is not set or None. If you've implemented a custom component, make "
+            "sure to store the component model as `self.model` in your "
+            "component's __init__ method.")
+    E899 = ("Can't serialize trainable pipe '{name}': the `vocab` attribute "
+            "is not set or None. If you've implemented a custom component, make "
+            "sure to store the current `nlp` object's vocab as `self.vocab` in "
+            "your component's __init__ method.")
     E900 = ("Could not run the full pipeline for evaluation. If you specified "
             "frozen components, make sure they were already initialized and "
             "trained. Full pipeline: {pipeline}")
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index d61bd43fa..4a71b26a2 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -30,7 +30,6 @@ cdef class KnowledgeBase:
     cdef Pool mem
     cpdef readonly Vocab vocab
     cdef int64_t entity_vector_length
-    cdef public set _added_strings
 
     # This maps 64bit keys (hash of unique entity string)
     # to 64bit values (position of the _KBEntryC struct in the _entries vector).
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 478579d71..10aa377eb 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -92,7 +92,6 @@ cdef class KnowledgeBase:
         self._alias_index = PreshMap()
         self.vocab = vocab
         self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
-        self._added_strings = set()
 
     @property
     def entity_vector_length(self):
@@ -114,16 +113,12 @@ cdef class KnowledgeBase:
     def get_alias_strings(self):
         return [self.vocab.strings[x] for x in self._alias_index]
 
-    def add_string(self, string: str):
-        self._added_strings.add(string)
-        return self.vocab.strings.add(string)
-
     def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
         """
         Add an entity to the KB, optionally specifying its log probability based on corpus frequency
         Return the hash of the entity ID/name at the end.
         """
-        cdef hash_t entity_hash = self.add_string(entity)
+        cdef hash_t entity_hash = self.vocab.strings.add(entity)
 
         # Return if this entity was added before
         if entity_hash in self._entry_index:
@@ -157,7 +152,7 @@ cdef class KnowledgeBase:
         cdef hash_t entity_hash
         while i < len(entity_list):
             # only process this entity if its unique ID hadn't been added before
-            entity_hash = self.add_string(entity_list[i])
+            entity_hash = self.vocab.strings.add(entity_list[i])
             if entity_hash in self._entry_index:
                 warnings.warn(Warnings.W018.format(entity=entity_list[i]))
 
@@ -203,7 +198,7 @@ cdef class KnowledgeBase:
         if prob_sum > 1.00001:
             raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
 
-        cdef hash_t alias_hash = self.add_string(alias)
+        cdef hash_t alias_hash = self.vocab.strings.add(alias)
 
         # Check whether this alias was added before
         if alias_hash in self._alias_index:
@@ -332,7 +327,7 @@ cdef class KnowledgeBase:
             raise ValueError(Errors.E928.format(loc=path))
         serialize = {}
         serialize["contents"] = lambda p: self.write_contents(p)
-        serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings)
+        serialize["strings.json"] = lambda p: self.vocab.strings.to_disk(p)
         util.to_disk(path, serialize, exclude)
 
     def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
@@ -343,7 +338,7 @@ cdef class KnowledgeBase:
             raise ValueError(Errors.E928.format(loc=path))
         deserialize = {}
         deserialize["contents"] = lambda p: self.read_contents(p)
-        deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)]
+        deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
         util.from_disk(path, deserialize, exclude)
 
     def write_contents(self, file_path):
diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index 7a6a1de5b..e17d3be98 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Union, Iterable, Any, Optional, Callable, Iterator
+from typing import List, Dict, Union, Iterable, Any, Optional, Callable
 from typing import Tuple
 import srsly
 from pathlib import Path
@@ -57,7 +57,6 @@ class AttributeRuler(Pipe):
         self.attrs = []
         self._attrs_unnormed = []  # store for reference
         self.indices = []
-        self._added_strings = set()
 
     def clear(self) -> None:
         """Reset all patterns."""
@@ -187,16 +186,12 @@ class AttributeRuler(Pipe):
         # We need to make a string here, because otherwise the ID we pass back
         # will be interpreted as the hash of a string, rather than an ordinal.
         key = str(len(self.attrs))
-        self.matcher.add(self.add_string(key), patterns)
+        self.matcher.add(self.vocab.strings.add(key), patterns)
         self._attrs_unnormed.append(attrs)
         attrs = normalize_token_attrs(self.vocab, attrs)
         self.attrs.append(attrs)
         self.indices.append(index)
 
-    def add_string(self, string: str):
-        self._added_strings.add(string)
-        return self.vocab.strings.add(string)
-
     def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None:
         """Add patterns from a list of pattern dicts with the keys as the
         arguments to AttributeRuler.add.
@@ -256,8 +251,8 @@ class AttributeRuler(Pipe):
         DOCS: https://nightly.spacy.io/api/attributeruler#to_bytes
         """
         serialize = {}
+        serialize["vocab"] = self.vocab.to_bytes
         serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns)
-        serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings))
         return util.to_bytes(serialize, exclude)
 
     def from_bytes(
@@ -276,7 +271,7 @@ class AttributeRuler(Pipe):
             self.add_patterns(srsly.msgpack_loads(b))
 
         deserialize = {
-            "strings.json": lambda b: [self.add_string(s) for s in srsly.json_loads(b)],
+            "vocab": lambda b: self.vocab.from_bytes(b),
             "patterns": load_patterns,
         }
         util.from_bytes(bytes_data, deserialize, exclude)
@@ -293,7 +288,7 @@ class AttributeRuler(Pipe):
         DOCS: https://nightly.spacy.io/api/attributeruler#to_disk
         """
         serialize = {
-            "strings.json": lambda p: srsly.write_json(p, self._added_strings),
+            "vocab": lambda p: self.vocab.to_disk(p),
             "patterns": lambda p: srsly.write_msgpack(p, self.patterns),
         }
         util.to_disk(path, serialize, exclude)
@@ -314,7 +309,7 @@ class AttributeRuler(Pipe):
             self.add_patterns(srsly.read_msgpack(p))
 
         deserialize = {
-            "strings.json": lambda p: [self.add_string(s) for s in srsly.read_json(p)],
+            "vocab": lambda p: self.vocab.from_disk(p),
             "patterns": load_patterns,
         }
         util.from_disk(path, deserialize, exclude)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 881e98785..3bb449b4d 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -453,6 +453,7 @@ class EntityLinker(TrainablePipe):
         DOCS: https://nightly.spacy.io/api/entitylinker#to_disk
         """
         serialize = {}
+        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
         serialize["kb"] = lambda p: self.kb.to_disk(p)
         serialize["model"] = lambda p: self.model.to_disk(p)
@@ -481,8 +482,6 @@ class EntityLinker(TrainablePipe):
         deserialize["kb"] = lambda p: self.kb.from_disk(p)
         deserialize["model"] = load_model
         util.from_disk(path, deserialize, exclude)
-        for s in self.kb._added_strings:
-            self.vocab.strings.add(s)
         return self
 
     def rehearse(self, examples, *, sgd=None, losses=None, **config):
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 7f5370753..9be596868 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -281,6 +281,7 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
         """
         serialize = {}
+        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["lookups"] = lambda p: self.lookups.to_disk(p)
         util.to_disk(path, serialize, exclude)
 
@@ -296,6 +297,7 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
         """
         deserialize = {}
+        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
         util.from_disk(path, deserialize, exclude)
         self._validate_tables()
@@ -310,6 +312,7 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
         """
         serialize = {}
+        serialize["vocab"] = self.vocab.to_bytes
         serialize["lookups"] = self.lookups.to_bytes
         return util.to_bytes(serialize, exclude)
 
@@ -325,6 +328,7 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
         """
         deserialize = {}
+        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
         util.from_bytes(bytes_data, deserialize, exclude)
         self._validate_tables()
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 00188a762..ac111f28b 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -95,7 +95,6 @@ class Morphologizer(Tagger):
         # add mappings for empty morph
         self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
         self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
-        self._added_strings = set()
 
     @property
     def labels(self):
@@ -129,7 +128,6 @@ class Morphologizer(Tagger):
             label_dict.pop(self.POS_FEAT)
         # normalize morph string and add to morphology table
         norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)]
-        self.add_string(norm_morph)
         # add label mappings
         if norm_label not in self.cfg["labels_morph"]:
             self.cfg["labels_morph"][norm_label] = norm_morph
@@ -161,7 +159,6 @@ class Morphologizer(Tagger):
                     if pos:
                         morph_dict[self.POS_FEAT] = pos
                     norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
-                    self.add_string(norm_label)
                     # add label->morph and label->POS mappings
                     if norm_label not in self.cfg["labels_morph"]:
                         self.cfg["labels_morph"][norm_label] = morph
@@ -179,7 +176,6 @@ class Morphologizer(Tagger):
                 if pos:
                     morph_dict[self.POS_FEAT] = pos
                 norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
-                self.add_string(norm_label)
                 gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels])
             doc_sample.append(example.x)
             label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
@@ -238,7 +234,6 @@ class Morphologizer(Tagger):
                 if pos:
                     label_dict[self.POS_FEAT] = pos
                 label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
-                self.add_string(label)
                 eg_truths.append(label)
             truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 8ea4ed1b3..15a21902a 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -61,7 +61,6 @@ class SentenceRecognizer(Tagger):
         self.name = name
         self._rehearsal_model = None
         self.cfg = {}
-        self._added_strings = set()
 
     @property
     def labels(self):
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 535b71270..1b0f79cea 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -78,7 +78,6 @@ class Tagger(TrainablePipe):
         self._rehearsal_model = None
         cfg = {"labels": labels or []}
         self.cfg = dict(sorted(cfg.items()))
-        self._added_strings = set()
 
     @property
     def labels(self):
@@ -313,7 +312,7 @@ class Tagger(TrainablePipe):
             return 0
         self._allow_extra_label()
         self.cfg["labels"].append(label)
-        self.add_string(label)
+        self.vocab.strings.add(label)
         return 1
 
     def score(self, examples, **kwargs):
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index e57954184..5ebe0e104 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -110,7 +110,6 @@ class TextCategorizer(TrainablePipe):
         self._rehearsal_model = None
         cfg = {"labels": [], "threshold": threshold, "positive_label": None}
         self.cfg = dict(cfg)
-        self._added_strings = set()
 
     @property
     def labels(self) -> Tuple[str]:
@@ -301,7 +300,7 @@ class TextCategorizer(TrainablePipe):
             return 0
         self._allow_extra_label()
         self.cfg["labels"].append(label)
-        self.add_string(label)
+        self.vocab.strings.add(label)
         return 1
 
     def initialize(
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index b4625291b..0ad875035 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -64,7 +64,6 @@ class Tok2Vec(TrainablePipe):
         self.name = name
         self.listeners = []
         self.cfg = {}
-        self._added_strings = set()
 
     def add_listener(self, listener: "Tok2VecListener") -> None:
         """Add a listener for a downstream component. Usually internals."""
diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd
index 8df5cb775..d5cdbb511 100644
--- a/spacy/pipeline/trainable_pipe.pxd
+++ b/spacy/pipeline/trainable_pipe.pxd
@@ -5,4 +5,3 @@ cdef class TrainablePipe(Pipe):
     cdef public Vocab vocab
     cdef public object model
     cdef public object cfg
-    cdef public set _added_strings
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 07a308953..88e50e7c6 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -13,6 +13,7 @@ from ..vocab import Vocab
 from ..language import Language
 from ..training import Example
 
+
 cdef class TrainablePipe(Pipe):
     """This class is a base class and not instantiated directly. Trainable
     pipeline components like the EntityRecognizer or TextCategorizer inherit
@@ -35,7 +36,6 @@ cdef class TrainablePipe(Pipe):
         self.model = model
         self.name = name
         self.cfg = dict(cfg)
-        self._added_strings = set()
 
     def __call__(self, Doc doc) -> Doc:
         """Apply the pipe to one document. The document is modified in place,
@@ -198,10 +198,6 @@ cdef class TrainablePipe(Pipe):
         """
         raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
 
-    def add_string(self, string: str):
-        self._added_strings.add(string)
-        return self.vocab.strings.add(string)
-
     @property
     def is_trainable(self) -> bool:
         return True
@@ -244,6 +240,16 @@ cdef class TrainablePipe(Pipe):
         """
         self.model.finish_update(sgd)
 
+    def _validate_serialization_attrs(self):
+        """Check that the pipe implements the required attributes. If a subclass
+        implements a custom __init__ method but doesn't set these attributes,
+        they currently default to None, so we need to perform additional checks.
+        """
+        if not hasattr(self, "vocab") or self.vocab is None:
+            raise ValueError(Errors.E899.format(name=util.get_object_name(self)))
+        if not hasattr(self, "model") or self.model is None:
+            raise ValueError(Errors.E898.format(name=util.get_object_name(self)))
+
     def to_bytes(self, *, exclude=tuple()):
         """Serialize the pipe to a bytestring.
 
@@ -252,11 +258,12 @@ cdef class TrainablePipe(Pipe):
 
         DOCS: https://nightly.spacy.io/api/pipe#to_bytes
         """
+        self._validate_serialization_attrs()
         serialize = {}
-        if hasattr(self, "cfg"):
+        if hasattr(self, "cfg") and self.cfg is not None:
             serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
+        serialize["vocab"] = self.vocab.to_bytes
         serialize["model"] = self.model.to_bytes
-        serialize["strings.json"] = lambda: srsly.json_dumps(sorted(self._added_strings))
         return util.to_bytes(serialize, exclude)
 
     def from_bytes(self, bytes_data, *, exclude=tuple()):
@@ -267,6 +274,7 @@ cdef class TrainablePipe(Pipe):
 
         DOCS: https://nightly.spacy.io/api/pipe#from_bytes
         """
+        self._validate_serialization_attrs()
 
         def load_model(b):
             try:
@@ -275,9 +283,9 @@ cdef class TrainablePipe(Pipe):
                 raise ValueError(Errors.E149) from None
 
         deserialize = {}
-        deserialize["strings.json"] = lambda b: [self.add_string(s) for s in srsly.json_loads(b)]
-        if hasattr(self, "cfg"):
+        if hasattr(self, "cfg") and self.cfg is not None:
             deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
+        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["model"] = load_model
         util.from_bytes(bytes_data, deserialize, exclude)
         return self
@@ -290,10 +298,11 @@ cdef class TrainablePipe(Pipe):
 
         DOCS: https://nightly.spacy.io/api/pipe#to_disk
         """
+        self._validate_serialization_attrs()
         serialize = {}
-        if hasattr(self, "cfg"):
+        if hasattr(self, "cfg") and self.cfg is not None:
             serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
-        serialize["strings.json"] = lambda p: srsly.write_json(p, self._added_strings)
+        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["model"] = lambda p: self.model.to_disk(p)
         util.to_disk(path, serialize, exclude)
 
@@ -306,6 +315,7 @@ cdef class TrainablePipe(Pipe):
 
         DOCS: https://nightly.spacy.io/api/pipe#from_disk
         """
+        self._validate_serialization_attrs()
 
         def load_model(p):
             try:
@@ -314,9 +324,9 @@ cdef class TrainablePipe(Pipe):
                 raise ValueError(Errors.E149) from None
 
         deserialize = {}
-        deserialize["strings.json"] = lambda p: [self.add_string(s) for s in srsly.read_json(p)]
-        if hasattr(self, "cfg"):
+        if hasattr(self, "cfg") and self.cfg is not None:
             deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
+        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["model"] = load_model
         util.from_disk(path, deserialize, exclude)
         return self
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 3743e1018..63a8595cc 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -76,7 +76,6 @@ cdef class Parser(TrainablePipe):
             self.add_multitask_objective(multitask)
 
         self._rehearsal_model = None
-        self._added_strings = set()
 
     def __getnewargs_ex__(self):
         """This allows pickling the Parser and its keyword-only init arguments"""
@@ -120,7 +119,7 @@ cdef class Parser(TrainablePipe):
                 resized = True
         if resized:
             self._resize()
-            self.add_string(label)
+            self.vocab.strings.add(label)
             return 1
         return 0
 
@@ -456,24 +455,24 @@ cdef class Parser(TrainablePipe):
 
     def to_disk(self, path, exclude=tuple()):
         serializers = {
-            'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
-            'strings.json': lambda p: srsly.write_json(p, self._added_strings),
-            'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]),
-            'cfg': lambda p: srsly.write_json(p, self.cfg)
+            "model": lambda p: (self.model.to_disk(p) if self.model is not True else True),
+            "vocab": lambda p: self.vocab.to_disk(p),
+            "moves": lambda p: self.moves.to_disk(p, exclude=["strings"]),
+            "cfg": lambda p: srsly.write_json(p, self.cfg)
         }
         util.to_disk(path, serializers, exclude)
 
     def from_disk(self, path, exclude=tuple()):
         deserializers = {
-            'strings.json': lambda p: [self.add_string(s) for s in srsly.read_json(p)],
-            'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]),
-            'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
-            'model': lambda p: None,
+            "vocab": lambda p: self.vocab.from_disk(p),
+            "moves": lambda p: self.moves.from_disk(p, exclude=["strings"]),
+            "cfg": lambda p: self.cfg.update(srsly.read_json(p)),
+            "model": lambda p: None,
         }
         util.from_disk(path, deserializers, exclude)
-        if 'model' not in exclude:
+        if "model" not in exclude:
             path = util.ensure_path(path)
-            with (path / 'model').open('rb') as file_:
+            with (path / "model").open("rb") as file_:
                 bytes_data = file_.read()
             try:
                 self._resize()
@@ -485,7 +484,7 @@ cdef class Parser(TrainablePipe):
     def to_bytes(self, exclude=tuple()):
         serializers = {
             "model": lambda: (self.model.to_bytes()),
-            "strings.json": lambda: srsly.json_dumps(sorted(self._added_strings)),
+            "vocab": lambda: self.vocab.to_bytes(),
             "moves": lambda: self.moves.to_bytes(exclude=["strings"]),
             "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)
         }
@@ -493,7 +492,7 @@ cdef class Parser(TrainablePipe):
 
     def from_bytes(self, bytes_data, exclude=tuple()):
         deserializers = {
-            "strings.json": lambda b: [self.add_string(s) for s in  srsly.json_loads(b)],
+            "vocab": lambda b: self.vocab.from_bytes(b),
             "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]),
             "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
             "model": lambda b: None,
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 71496327b..ff2e33fc7 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -121,9 +121,7 @@ def test_kb_default(nlp):
 
 def test_kb_custom_length(nlp):
     """Test that the default (empty) KB can be configured with a custom entity length"""
-    entity_linker = nlp.add_pipe(
-        "entity_linker", config={"entity_vector_length": 35}
-    )
+    entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 35})
     assert len(entity_linker.kb) == 0
     assert entity_linker.kb.get_size_entities() == 0
     assert entity_linker.kb.get_size_aliases() == 0
@@ -213,16 +211,11 @@ def test_el_pipe_configuration(nlp):
         kb = KnowledgeBase(vocab, entity_vector_length=1)
         kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
         kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
-        kb.add_alias(
-            alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]
-        )
+        kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
         return kb
 
     # run an EL pipe without a trained context encoder, to check the candidate generation step only
-    entity_linker = nlp.add_pipe(
-        "entity_linker",
-        config={"incl_context": False},
-    )
+    entity_linker = nlp.add_pipe("entity_linker", config={"incl_context": False},)
     entity_linker.set_kb(create_kb)
     # With the default get_candidates function, matching is case-sensitive
     text = "Douglas and douglas are not the same."
@@ -453,14 +446,10 @@ def test_overfitting_IO():
         return mykb
 
     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.add_pipe(
-        "entity_linker",
-        last=True,
-    )
+    entity_linker = nlp.add_pipe("entity_linker", last=True,)
     entity_linker.set_kb(create_kb)
     assert "Q2146908" in entity_linker.vocab.strings
     assert "Q2146908" in entity_linker.kb.vocab.strings
-    assert "Q2146908" in entity_linker.kb._added_strings
 
     # train the NEL pipe
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index ce9c0fa54..fd7aa05be 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -101,4 +101,3 @@ def test_overfitting_IO():
         doc2 = nlp2(test_text)
         assert [str(t.morph) for t in doc2] == gold_morphs
         assert [t.pos_ for t in doc2] == gold_pos_tags
-        assert nlp.get_pipe("morphologizer")._added_strings == nlp2.get_pipe("morphologizer")._added_strings
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 472216512..c9722e5de 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -80,4 +80,3 @@ def test_overfitting_IO():
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
         assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts
-        assert nlp.get_pipe("senter")._added_strings == nlp2.get_pipe("senter")._added_strings
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 590c22233..b9db76cdf 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -98,7 +98,6 @@ def test_overfitting_IO():
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses["tagger"] < 0.00001
-    assert tagger._added_strings == {"J", "N", "V"}
 
     # test the trained model
     test_text = "I like blue eggs"
@@ -117,7 +116,6 @@ def test_overfitting_IO():
         assert doc2[1].tag_ is "V"
         assert doc2[2].tag_ is "J"
         assert doc2[3].tag_ is "N"
-        assert nlp2.get_pipe("tagger")._added_strings == {"J", "N", "V"}
 
 
 def test_tagger_requires_labels():
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 7eb7ff658..dd2f1070b 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -146,7 +146,6 @@ def test_overfitting_IO():
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
     assert textcat.model.get_dim("nO") == 2
-    assert textcat._added_strings == {"NEGATIVE", "POSITIVE"}
 
     for i in range(50):
         losses = {}
@@ -168,7 +167,6 @@ def test_overfitting_IO():
         cats2 = doc2.cats
         assert cats2["POSITIVE"] > 0.9
         assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)
-        assert nlp2.get_pipe("textcat")._added_strings == {"NEGATIVE", "POSITIVE"}
 
     # Test scoring
     scores = nlp.evaluate(train_examples)
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index 02d0c70dd..a00b2a688 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -7,6 +7,7 @@ from spacy.kb import KnowledgeBase, Writer
 from spacy.vectors import Vectors
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
+from spacy.vocab import Vocab
 
 from ..util import make_tempdir
 
@@ -50,8 +51,9 @@ def custom_pipe():
             else:
                 self.cfg = None
             self.model = SerializableDummy()
+            self.vocab = vocab
 
-    return MyPipe(None)
+    return MyPipe(Vocab())
 
 
 def tagger():
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index dfd7f6bd4..951dd3035 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -1,13 +1,13 @@
 import pytest
-import srsly
 from spacy import registry, Vocab
 from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
-from spacy.pipeline import TextCategorizer, SentenceRecognizer
+from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe
 from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
 from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
 from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
 from spacy.lang.en import English
+from thinc.api import Linear
 import spacy
 
 from ..util import make_tempdir
@@ -89,7 +89,6 @@ def test_serialize_parser_strings(Parser):
     assert label not in vocab2.strings
     parser2 = Parser(vocab2, model, **config)
     parser2 = parser2.from_bytes(parser1.to_bytes(exclude=["vocab"]))
-    assert parser1._added_strings == parser2._added_strings == {"FunnyLabel"}
     assert label in parser2.vocab.strings
 
 
@@ -166,17 +165,13 @@ def test_serialize_tagger_strings(en_vocab, de_vocab, taggers):
         # check that custom labels are serialized as part of the component's strings.jsonl
         tagger.add_label(label)
         assert label in tagger.vocab.strings
-        assert tagger._added_strings == {label}
         file_path = d / "tagger1"
         tagger.to_disk(file_path)
-        strings = srsly.read_json(file_path / "strings.json")
-        assert strings == ["SomeWeirdLabel"]
         # ensure that the custom strings are loaded back in when using the tagger in another pipeline
         cfg = {"model": DEFAULT_TAGGER_MODEL}
         model = registry.resolve(cfg, validate=True)["model"]
         tagger2 = Tagger(de_vocab, model).from_disk(file_path)
         assert label in tagger2.vocab.strings
-        assert tagger2._added_strings == {label}
 
 
 def test_serialize_textcat_empty(en_vocab):
@@ -253,3 +248,40 @@ def test_serialize_pipeline_disable_enable():
     assert nlp5.pipe_names == ["ner"]
     assert nlp5.component_names == ["ner"]
     assert nlp5.disabled == []
+
+
+def test_serialize_custom_trainable_pipe():
+    class BadCustomPipe1(TrainablePipe):
+        def __init__(self, vocab):
+            pass
+
+    class BadCustomPipe2(TrainablePipe):
+        def __init__(self, vocab):
+            self.vocab = vocab
+            self.model = None
+
+    class CustomPipe(TrainablePipe):
+        def __init__(self, vocab, model):
+            self.vocab = vocab
+            self.model = model
+
+    pipe = BadCustomPipe1(Vocab())
+    with pytest.raises(ValueError):
+        pipe.to_bytes()
+    with make_tempdir() as d:
+        with pytest.raises(ValueError):
+            pipe.to_disk(d)
+    pipe = BadCustomPipe2(Vocab())
+    with pytest.raises(ValueError):
+        pipe.to_bytes()
+    with make_tempdir() as d:
+        with pytest.raises(ValueError):
+            pipe.to_disk(d)
+    pipe = CustomPipe(Vocab(), Linear())
+    pipe_bytes = pipe.to_bytes()
+    new_pipe = CustomPipe(Vocab(), Linear()).from_bytes(pipe_bytes)
+    assert new_pipe.to_bytes() == pipe_bytes
+    with make_tempdir() as d:
+        pipe.to_disk(d)
+        new_pipe = CustomPipe(Vocab(), Linear()).from_disk(d)
+    assert new_pipe.to_bytes() == pipe_bytes
diff --git a/spacy/util.py b/spacy/util.py
index 47fbcce1c..58f951f86 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -821,7 +821,7 @@ def get_object_name(obj: Any) -> str:
     obj (Any): The Python object, typically a function or class.
     RETURNS (str): A human-readable name.
     """
-    if hasattr(obj, "name"):
+    if hasattr(obj, "name") and obj.name is not None:
         return obj.name
     if hasattr(obj, "__name__"):
         return obj.__name__

From 74972744e589969af8d0ebc83259d92c9e9a5f2f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 10 Oct 2020 19:08:57 +0200
Subject: [PATCH 497/516] Update Thinc

---
 pyproject.toml   | 2 +-
 requirements.txt | 2 +-
 setup.cfg        | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index d48886e0c..c175ded66 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a43,<8.0.0a50",
+    "thinc>=8.0.0a44,<8.0.0a50",
     "blis>=0.4.0,<0.8.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index 3f3886a60..d6b6267a9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a43,<8.0.0a50
+thinc>=8.0.0a44,<8.0.0a50
 blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 69d4e6347..d9414a4f4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a43,<8.0.0a50
+    thinc>=8.0.0a44,<8.0.0a50
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a43,<8.0.0a50
+    thinc>=8.0.0a44,<8.0.0a50
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.3.0,<3.0.0

From 539b0c10daef8bb5d6f7e4f230a02452c6569996 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 10 Oct 2020 19:14:48 +0200
Subject: [PATCH 498/516] Tidy up and auto-format

---
 spacy/lang/tr/lex_attrs.py                 |  5 +++--
 spacy/lang/tr/syntax_iterators.py          |  7 +++----
 spacy/language.py                          |  6 ++++--
 spacy/tests/conftest.py                    |  2 ++
 spacy/tests/lang/tr/test_parser.py         | 19 ++++++++++++-------
 spacy/tests/lang/tr/test_text.py           |  5 ++---
 spacy/tests/pipeline/test_entity_linker.py |  2 +-
 spacy/tests/regression/test_issue6207.py   |  4 ++--
 spacy/tests/test_models.py                 | 13 +++----------
 9 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/spacy/lang/tr/lex_attrs.py b/spacy/lang/tr/lex_attrs.py
index 3615f4b4c..d9e12c4aa 100644
--- a/spacy/lang/tr/lex_attrs.py
+++ b/spacy/lang/tr/lex_attrs.py
@@ -62,6 +62,7 @@ _ordinal_words = [
 
 _ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü")
 
+
 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
@@ -75,11 +76,11 @@ def like_num(text):
 
     text_lower = text.lower()
 
-    #Check cardinal number
+    # Check cardinal number
     if text_lower in _num_words:
         return True
 
-    #Check ordinal number
+    # Check ordinal number
     if text_lower in _ordinal_words:
         return True
     if text_lower.endswith(_ordinal_endings):
diff --git a/spacy/lang/tr/syntax_iterators.py b/spacy/lang/tr/syntax_iterators.py
index 665ccb590..d9b342949 100644
--- a/spacy/lang/tr/syntax_iterators.py
+++ b/spacy/lang/tr/syntax_iterators.py
@@ -49,11 +49,10 @@ def noun_chunks(doclike):
             prev_end = word.left_edge.i
             yield word.left_edge.i, extend_right(word), np_label
         elif word.dep == conj:
-            cc_token = word.left_edge  
+            cc_token = word.left_edge
             prev_end = cc_token.i
-            yield cc_token.right_edge.i + 1, extend_right(word), np_label  # Shave off cc tokens from the NP
-
-
+            # Shave off cc tokens from the NP
+            yield cc_token.right_edge.i + 1, extend_right(word), np_label
 
 
 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
diff --git a/spacy/language.py b/spacy/language.py
index 24e593043..dd790e85f 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,5 +1,5 @@
 from typing import Optional, Any, Dict, Callable, Iterable, Union, List, Pattern
-from typing import Tuple, Iterator
+from typing import Tuple
 from dataclasses import dataclass
 import random
 import itertools
@@ -1197,7 +1197,9 @@ class Language:
             doc = Doc(self.vocab, words=["x", "y", "z"])
             get_examples = lambda: [Example.from_dict(doc, {})]
         if not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(method="Language.initialize", obj=type(get_examples))
+            err = Errors.E930.format(
+                method="Language.initialize", obj=type(get_examples)
+            )
             raise TypeError(err)
         # Make sure the config is interpolated so we can resolve subsections
         config = self.config.interpolate()
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 7f8ab6768..3b0de899b 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -239,10 +239,12 @@ def th_tokenizer():
 def tr_tokenizer():
     return get_lang_class("tr")().tokenizer
 
+
 @pytest.fixture(scope="session")
 def tr_vocab():
     return get_lang_class("tr").Defaults.create_vocab()
 
+
 @pytest.fixture(scope="session")
 def tt_tokenizer():
     return get_lang_class("tt")().tokenizer
diff --git a/spacy/tests/lang/tr/test_parser.py b/spacy/tests/lang/tr/test_parser.py
index ff71ac3d4..b23d0869c 100644
--- a/spacy/tests/lang/tr/test_parser.py
+++ b/spacy/tests/lang/tr/test_parser.py
@@ -225,7 +225,7 @@ def test_tr_noun_chunks_acl_nmod(tr_tokenizer):
     assert chunks[0].text_with_ws == "en sevdiğim ses sanatçısı "
 
 
-def test_tr_noun_chunks_acl_nmod(tr_tokenizer):
+def test_tr_noun_chunks_acl_nmod2(tr_tokenizer):
     text = "bildiğim bir turizm şirketi"
     heads = [3, 3, 3, 3]
     deps = ["acl", "det", "nmod", "ROOT"]
@@ -308,7 +308,7 @@ def test_tr_noun_chunks_np_recursive_four_nouns(tr_tokenizer):
     assert len(chunks) == 1
     assert chunks[0].text_with_ws == "kızına piyano dersi verdiğim hanım "
 
-    
+
 def test_tr_noun_chunks_np_recursive_no_nmod(tr_tokenizer):
     text = "içine birkaç çiçek konmuş olan bir vazo"
     heads = [3, 2, 3, 6, 3, 6, 6]
@@ -326,7 +326,7 @@ def test_tr_noun_chunks_np_recursive_no_nmod(tr_tokenizer):
 def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer):
     text = "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo"
     heads = [6, 2, 3, 5, 5, 6, 9, 6, 9, 9]
-    deps = ["obl", "nmod" , "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"]
+    deps = ["obl", "nmod", "obl", "acl", "det", "nsubj", "acl", "aux", "det", "ROOT"]
     pos = ["ADP", "PROPN", "NOUN", "VERB", "DET", "NOUN", "VERB", "AUX", "DET", "NOUN"]
     tokens = tr_tokenizer(text)
     doc = Doc(
@@ -334,7 +334,10 @@ def test_tr_noun_chunks_np_recursive_long_two_acls(tr_tokenizer):
     )
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 1
-    assert chunks[0].text_with_ws == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo "
+    assert (
+        chunks[0].text_with_ws
+        == "içine Simge'nin bahçesinden toplanmış birkaç çiçeğin konmuş olduğu bir vazo "
+    )
 
 
 def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
@@ -350,7 +353,8 @@ def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
     assert len(chunks) == 1
     assert chunks[0].text_with_ws == "kız ve erkek çocuklar "
 
-def test_tr_noun_chunks_two_nouns_in_nmod(tr_tokenizer):
+
+def test_tr_noun_chunks_two_nouns_in_nmod2(tr_tokenizer):
     text = "tatlı ve gürbüz çocuklar"
     heads = [3, 2, 0, 3]
     deps = ["amod", "cc", "conj", "ROOT"]
@@ -378,6 +382,7 @@ def test_tr_noun_chunks_conj_simple(tr_tokenizer):
     assert chunks[0].text_with_ws == "ben "
     assert chunks[1].text_with_ws == "Sen "
 
+
 def test_tr_noun_chunks_conj_three(tr_tokenizer):
     text = "sen, ben ve ondan"
     heads = [0, 2, 0, 4, 0]
@@ -394,7 +399,7 @@ def test_tr_noun_chunks_conj_three(tr_tokenizer):
     assert chunks[2].text_with_ws == "sen "
 
 
-def test_tr_noun_chunks_conj_three(tr_tokenizer):
+def test_tr_noun_chunks_conj_three2(tr_tokenizer):
     text = "ben ya da sen ya da onlar"
     heads = [0, 3, 1, 0, 6, 4, 3]
     deps = ["ROOT", "cc", "fixed", "conj", "cc", "fixed", "conj"]
@@ -499,7 +504,7 @@ def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer):
     assert chunks[0].text_with_ws == "Gazi Mustafa Kemal "
 
 
-def test_tr_noun_chunks_flat_names_and_title(tr_tokenizer):
+def test_tr_noun_chunks_flat_names_and_title2(tr_tokenizer):
     text = "Ahmet Vefik Paşa"
     heads = [2, 0, 2]
     deps = ["nmod", "flat", "ROOT"]
diff --git a/spacy/tests/lang/tr/test_text.py b/spacy/tests/lang/tr/test_text.py
index 01e279d76..ed7dbb805 100644
--- a/spacy/tests/lang/tr/test_text.py
+++ b/spacy/tests/lang/tr/test_text.py
@@ -15,8 +15,8 @@ from spacy.lang.tr.lex_attrs import like_num
         "üçüncü",
         "beşinci",
         "100üncü",
-        "8inci"
-    ]
+        "8inci",
+    ],
 )
 def test_tr_lex_attrs_like_number_cardinal_ordinal(word):
     assert like_num(word)
@@ -26,4 +26,3 @@ def test_tr_lex_attrs_like_number_cardinal_ordinal(word):
 def test_tr_lex_attrs_capitals(word):
     assert like_num(word)
     assert like_num(word.upper())
-
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index ff2e33fc7..e0c63d09e 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -446,7 +446,7 @@ def test_overfitting_IO():
         return mykb
 
     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.add_pipe("entity_linker", last=True,)
+    entity_linker = nlp.add_pipe("entity_linker", last=True)
     entity_linker.set_kb(create_kb)
     assert "Q2146908" in entity_linker.vocab.strings
     assert "Q2146908" in entity_linker.kb.vocab.strings
diff --git a/spacy/tests/regression/test_issue6207.py b/spacy/tests/regression/test_issue6207.py
index 47e3803e9..9d8b047bf 100644
--- a/spacy/tests/regression/test_issue6207.py
+++ b/spacy/tests/regression/test_issue6207.py
@@ -6,8 +6,8 @@ def test_issue6207(en_tokenizer):
 
     # Make spans
     s1 = doc[:4]
-    s2 = doc[3:6]   # overlaps with s1
-    s3 = doc[5:7]   # overlaps with s2, not s1
+    s2 = doc[3:6]  # overlaps with s1
+    s3 = doc[5:7]  # overlaps with s2, not s1
 
     result = filter_spans((s1, s2, s3))
     assert s1 in result
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index 8ca7f8b66..e8884e6b2 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -1,10 +1,8 @@
 from typing import List
-
 import pytest
 from thinc.api import fix_random_seed, Adam, set_dropout_rate
 from numpy.testing import assert_array_equal
 import numpy
-
 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
 from spacy.ml.staticvectors import StaticVectors
@@ -188,12 +186,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
     assert_array_equal(get_all_params(model1), get_all_params(model2))
 
 
-@pytest.mark.parametrize(
-    "model_func,kwargs",
-    [
-        (StaticVectors, {"nO": 128, "nM": 300}),
-    ]
-)
+@pytest.mark.parametrize("model_func,kwargs", [(StaticVectors, {"nO": 128, "nM": 300})])
 def test_empty_docs(model_func, kwargs):
     nlp = English()
     model = model_func(**kwargs).initialize()
@@ -201,7 +194,7 @@ def test_empty_docs(model_func, kwargs):
     for n_docs in range(3):
         docs = [nlp("") for _ in range(n_docs)]
         # Test predict
-        _ = model.predict(docs)
+        model.predict(docs)
         # Test backprop
         output, backprop = model.begin_update(docs)
-        _ = backprop(output)
+        backprop(output)

From 68d79796c65d83b934e785bb3d8ffbea16fe832f Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sat, 10 Oct 2020 20:59:48 +0200
Subject: [PATCH 499/516] add test for vocab after serializing KB

---
 spacy/pipeline/trainable_pipe.pyx          |  2 +-
 spacy/tests/pipeline/test_entity_linker.py | 28 +++++++++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 88e50e7c6..07cb01059 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -243,7 +243,7 @@ cdef class TrainablePipe(Pipe):
     def _validate_serialization_attrs(self):
         """Check that the pipe implements the required attributes. If a subclass
         implements a custom __init__ method but doesn't set these attributes,
-        the currently default to None, so we need to perform additonal checks.
+        they currently default to None, so we need to perform additional checks.
         """
         if not hasattr(self, "vocab") or self.vocab is None:
             raise ValueError(Errors.E899.format(name=util.get_object_name(self)))
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index e0c63d09e..673a354dd 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -5,6 +5,7 @@ from spacy.kb import KnowledgeBase, get_candidates, Candidate
 from spacy.vocab import Vocab
 
 from spacy import util, registry
+from spacy.ml import load_kb
 from spacy.scorer import Scorer
 from spacy.training import Example
 from spacy.lang.en import English
@@ -215,7 +216,7 @@ def test_el_pipe_configuration(nlp):
         return kb
 
     # run an EL pipe without a trained context encoder, to check the candidate generation step only
-    entity_linker = nlp.add_pipe("entity_linker", config={"incl_context": False},)
+    entity_linker = nlp.add_pipe("entity_linker", config={"incl_context": False})
     entity_linker.set_kb(create_kb)
     # With the default get_candidates function, matching is case-sensitive
     text = "Douglas and douglas are not the same."
@@ -496,6 +497,31 @@ def test_overfitting_IO():
         assert predictions == GOLD_entities
 
 
+def test_kb_serialization():
+    # Test that the KB can be used in a pipeline with a different vocab
+    vector_length = 3
+    with make_tempdir() as tmp_dir:
+        kb_dir = tmp_dir / "kb"
+        nlp1 = English()
+        assert "Q2146908" not in nlp1.vocab.strings
+        mykb = KnowledgeBase(nlp1.vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
+        assert "Q2146908" in nlp1.vocab.strings
+        mykb.to_disk(kb_dir)
+
+        nlp2 = English()
+        nlp2.vocab.strings.add("RandomWord")
+        assert "RandomWord" in nlp2.vocab.strings
+        assert "Q2146908" not in nlp2.vocab.strings
+
+        # Create the Entity Linker component with the KB from file, and check the final vocab
+        entity_linker = nlp2.add_pipe("entity_linker", last=True)
+        entity_linker.set_kb(load_kb(kb_dir))
+        assert "Q2146908" in nlp2.vocab.strings
+        assert "RandomWord" in nlp2.vocab.strings
+
+
 def test_scorer_links():
     train_examples = []
     nlp = English()

From 3a505e7e14acf70e82910ca285b762259f20d5d4 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Sat, 10 Oct 2020 21:05:28 +0200
Subject: [PATCH 500/516] small edit to ensure the new word was indeed new

---
 spacy/tests/pipeline/test_entity_linker.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 673a354dd..f2e6defcb 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -511,6 +511,7 @@ def test_kb_serialization():
         mykb.to_disk(kb_dir)
 
         nlp2 = English()
+        assert "RandomWord" not in nlp2.vocab.strings
         nlp2.vocab.strings.add("RandomWord")
         assert "RandomWord" in nlp2.vocab.strings
         assert "Q2146908" not in nlp2.vocab.strings

From 99606e46fe90a8cb813a10d62d2d234ebdf4540f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 11 Oct 2020 12:30:57 +0200
Subject: [PATCH 501/516] Relax meta.json schema [ci skip]

---
 spacy/schemas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/schemas.py b/spacy/schemas.py
index 07d17d193..f3664acff 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -282,7 +282,7 @@ class ModelMetaSchema(BaseModel):
     sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
     vectors: Dict[str, Any] = Field({}, title="Included word vectors")
     labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
-    performance: Dict[str, Union[float, Dict[str, Union[float, dict]]]] = Field({}, title="Accuracy and speed numbers")
+    performance: Dict[str, Any] = Field({}, title="Accuracy and speed numbers")
     spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
     # fmt: on
 

From ab890a35f9b54c625d423930cf81e75a27bfa69d Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 11 Oct 2020 12:55:46 +0200
Subject: [PATCH 502/516] Make console logger table more compact

---
 spacy/training/loggers.py | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index b431ecf06..79459a89b 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -11,11 +11,25 @@ if TYPE_CHECKING:
     from ..language import Language  # noqa: F401
 
 
+def setup_table(
+    *, cols: List[str], widths: List[int], max_width: int = 13
+) -> Tuple[List[str], List[int], List[str]]:
+    final_cols = []
+    final_widths = []
+    for col, width in zip(cols, widths):
+        if len(col) > max_width:
+            col = col[: max_width - 3] + "..."  # shorten column if too long
+        final_cols.append(col.upper())
+        final_widths.append(max(len(col), width))
+    return final_cols, final_widths, ["r" for _ in final_widths]
+
+
 @registry.loggers("spacy.ConsoleLogger.v1")
 def console_logger(progress_bar: bool = False):
     def setup_printer(
         nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
     ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]:
+        write = lambda text: stdout.write(f"{text}\n")
         msg = Printer(no_print=True)
         # ensure that only trainable components are logged
         logged_pipes = [
@@ -26,15 +40,14 @@ def console_logger(progress_bar: bool = False):
         eval_frequency = nlp.config["training"]["eval_frequency"]
         score_weights = nlp.config["training"]["score_weights"]
         score_cols = [col for col, value in score_weights.items() if value is not None]
-        score_widths = [max(len(col), 6) for col in score_cols]
         loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
-        loss_widths = [max(len(col), 8) for col in loss_cols]
-        table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
-        table_header = [col.upper() for col in table_header]
-        table_widths = [3, 6] + loss_widths + score_widths + [6]
-        table_aligns = ["r" for _ in table_widths]
-        stdout.write(msg.row(table_header, widths=table_widths) + "\n")
-        stdout.write(msg.row(["-" * width for width in table_widths]) + "\n")
+        spacing = 2
+        table_header, table_widths, table_aligns = setup_table(
+            cols=["E", "#"] + loss_cols + score_cols + ["Score"],
+            widths=[3, 6] + [8 for _ in loss_cols] + [6 for _ in score_cols] + [6],
+        )
+        write(msg.row(table_header, widths=table_widths, spacing=spacing))
+        write(msg.row(["-" * width for width in table_widths], spacing=spacing))
         progress = None
 
         def log_step(info: Optional[Dict[str, Any]]) -> None:
@@ -70,7 +83,9 @@ def console_logger(progress_bar: bool = False):
             )
             if progress is not None:
                 progress.close()
-            stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns) + "\n")
+            write(
+                msg.row(data, widths=table_widths, aligns=table_aligns, spacing=spacing)
+            )
             if progress_bar:
                 # Set disable=None, so that it disables on non-TTY
                 progress = tqdm.tqdm(

From 4fa967ea843c2b1db0147a2b4d303266e5563f73 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sun, 11 Oct 2020 13:10:58 +0200
Subject: [PATCH 503/516] Increment version [ci skip]

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index efdfd26c0..38efce3e9 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a38"
+__version__ = "3.0.0a39"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From 40276fd3be231be6969f8c51889c13e77a726fa8 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 12 Oct 2020 11:41:27 +0200
Subject: [PATCH 504/516] update NEL docs after latest refactor

---
 spacy/ml/models/entity_linker.py  |  3 +-
 website/docs/api/architectures.md | 19 +++----
 website/docs/api/entitylinker.md  | 84 ++++++++++++++++++++-----------
 3 files changed, 68 insertions(+), 38 deletions(-)

diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index d945e5fba..f37203b1b 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Optional, Callable, Iterable
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear
@@ -25,7 +26,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
 
 
 @registry.misc.register("spacy.KBFromFile.v1")
-def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]:
+def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
     def kb_from_file(vocab):
         kb = KnowledgeBase(vocab, entity_vector_length=1)
         kb.from_disk(kb_path)
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 5246a3ed6..3157c261a 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -637,13 +637,6 @@ into the "real world". This requires 3 main components:
 > window_size = 1
 > maxout_pieces = 3
 > subword_features = true
->
-> [kb_loader]
-> @misc = "spacy.EmptyKB.v1"
-> entity_vector_length = 64
->
-> [get_candidates]
-> @misc = "spacy.CandidateGenerator.v1"
 > ```
 
 The `EntityLinker` model architecture is a Thinc `Model` with a
@@ -657,13 +650,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
 
 ### spacy.EmptyKB.v1 {#EmptyKB}
 
-A function that creates a default, empty `KnowledgeBase` from a
-[`Vocab`](/api/vocab) instance.
+A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
+instance. This is the default when a new entity linker component is created.
 
 | Name                   | Description                                                                         |
 | ---------------------- | ----------------------------------------------------------------------------------- |
 | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
 
+### spacy.KBFromFile.v1 {#KBFromFile}
+
+A function that reads an existing `KnowledgeBase` from file.
+
+| Name      | Description                                              |
+| --------- | -------------------------------------------------------- |
+| `kb_path` | The location of the KB that was stored to file. ~~Path~~ |
+
 ### spacy.CandidateGenerator.v1 {#CandidateGenerator}
 
 A function that takes as input a [`KnowledgeBase`](/api/kb) and a
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 169a175e2..0904bbf72 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -34,20 +34,20 @@ architectures and their arguments and hyperparameters.
 >    "incl_prior": True,
 >    "incl_context": True,
 >    "model": DEFAULT_NEL_MODEL,
->    "kb_loader": {'@misc': 'spacy.EmptyKB.v1', 'entity_vector_length': 64},
+>    "entity_vector_length": 64,
 >    "get_candidates": {'@misc': 'spacy.CandidateGenerator.v1'},
 > }
 > nlp.add_pipe("entity_linker", config=config)
 > ```
 
-| Setting          | Description                                                                                                                                                                                                                                                              |
-| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                           |
-| `incl_prior`     | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                     |
-| `incl_context`   | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                   |
-| `model`          | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                   |
-| `kb_loader`      | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. Defaults to [EmptyKB](/api/architectures#EmptyKB), a function returning an empty `KnowledgeBase` with an `entity_vector_length` of `64`. ~~Callable[[Vocab], KnowledgeBase]~~                |
-| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
+| Setting                | Description                                                                                                                                                                                                                                                              |
+| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `labels_discard`       | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                           |
+| `incl_prior`           | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                     |
+| `incl_context`         | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                   |
+| `model`                | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                   |
+| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to 64. ~~int~~                                                                                                                                                                                                              |
+| `get_candidates`       | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/entity_linker.py
@@ -65,10 +65,6 @@ architectures and their arguments and hyperparameters.
 > config = {"model": {"@architectures": "my_el.v1"}}
 > entity_linker = nlp.add_pipe("entity_linker", config=config)
 >
-> # Construction via add_pipe with custom KB and candidate generation
-> config = {"kb": {"@misc": "my_kb.v1"}}
-> entity_linker = nlp.add_pipe("entity_linker", config=config)
->
 > # Construction from class
 > from spacy.pipeline import EntityLinker
 > entity_linker = EntityLinker(nlp.vocab, model)
@@ -76,21 +72,25 @@ architectures and their arguments and hyperparameters.
 
 Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
-[`nlp.add_pipe`](/api/language#add_pipe). Note that both the internal
-`KnowledgeBase` as well as the Candidate generator can be customized by
-providing custom registered functions.
+[`nlp.add_pipe`](/api/language#add_pipe).
 
-| Name             | Description                                                                                                                      |
-| ---------------- | -------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`          | The shared vocabulary. ~~Vocab~~                                                                                                 |
-| `model`          | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~                                        |
-| `name`           | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                              |
-| _keyword-only_   |                                                                                                                                  |
-| `kb_loader`      | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~                 |
-| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
-| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~                                                   |
-| `incl_prior`     | Whether or not to include prior probabilities from the KB in the model. ~~bool~~                                                 |
-| `incl_context`   | Whether or not to include the local context in the model. ~~bool~~                                                               |
+Upon construction of the entity linker component, an empty knowledge base is
+constructed with the provided `entity_vector_length`. If you want to use a
+custom knowledge base, you should either call
+[`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the
+[`initialize`](/api/entitylinker#initialize) call.
+
+| Name                   | Description                                                                                                                      |
+| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab`                | The shared vocabulary. ~~Vocab~~                                                                                                 |
+| `model`                | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~                                        |
+| `name`                 | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                              |
+| _keyword-only_         |                                                                                                                                  |
+| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~                                                                                      |
+| `get_candidates`       | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
+| `labels_discard`       | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~                                                   |
+| `incl_prior`           | Whether or not to include prior probabilities from the KB in the model. ~~bool~~                                                 |
+| `incl_context`         | Whether or not to include the local context in the model. ~~bool~~                                                               |
 
 ## EntityLinker.\_\_call\_\_ {#call tag="method"}
 
@@ -139,6 +139,28 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
+## EntityLinker.set_kb {#set_kb tag="method" new="3"}
+
+The `kb_loader` should be a function that takes a `Vocab` instance and creates
+the `KnowledgeBase`, ensuring that the strings of the knowledge base are synced
+with the current vocab.
+
+> #### Example
+>
+> ```python
+> def create_kb(vocab):
+>     kb = KnowledgeBase(vocab, entity_vector_length=128)
+>     kb.add_entity(...)
+>     kb.add_alias(...)
+>     return kb
+> entity_linker = nlp.add_pipe("entity_linker")
+> entity_linker.set_kb(create_kb)
+> ```
+
+| Name        | Description                                                                                                      |
+| ----------- | ---------------------------------------------------------------------------------------------------------------- |
+| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ |
+
 ## EntityLinker.initialize {#initialize tag="method" new="3"}
 
 Initialize the component for training. `get_examples` should be a function that
@@ -150,6 +172,11 @@ network,
 setting up the label scheme based on the data. This method is typically called
 by [`Language.initialize`](/api/language#initialize).
 
+Optionally, a `kb_loader` argument may be specified to change the internal
+knowledge base. This argument should be a function that takes a `Vocab` instance
+and creates the `KnowledgeBase`, ensuring that the strings of the knowledge base
+are synced with the current vocab.
+
 <Infobox variant="warning" title="Changed in v3.0" id="begin_training">
 
 This method was previously called `begin_training`.
@@ -160,7 +187,7 @@ This method was previously called `begin_training`.
 >
 > ```python
 > entity_linker = nlp.add_pipe("entity_linker")
-> entity_linker.initialize(lambda: [], nlp=nlp)
+> entity_linker.initialize(lambda: [], nlp=nlp, kb_loader=my_kb)
 > ```
 
 | Name           | Description                                                                                                                           |
@@ -168,6 +195,7 @@ This method was previously called `begin_training`.
 | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
 | _keyword-only_ |                                                                                                                                       |
 | `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                  |
+| `kb_loader`    | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~                      |
 
 ## EntityLinker.predict {#predict tag="method"}
 

From 1f465bea185d6aff3f4320b84f6a006b72b71917 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 13 Oct 2020 09:27:19 +0200
Subject: [PATCH 505/516] if-else

---
 spacy/util.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/spacy/util.py b/spacy/util.py
index 58f951f86..8335a4fcc 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1361,11 +1361,12 @@ def check_bool_env_var(env_var: str) -> bool:
 def _pipe(docs, proc, kwargs):
     if hasattr(proc, "pipe"):
         yield from proc.pipe(docs, **kwargs)
-    # We added some args for pipe that __call__ doesn't expect.
-    kwargs = dict(kwargs)
-    for arg in ["batch_size"]:
-        if arg in kwargs:
-            kwargs.pop(arg)
-    for doc in docs:
-        doc = proc(doc, **kwargs)
-        yield doc
+    else:
+        # We added some args for pipe that __call__ doesn't expect.
+        kwargs = dict(kwargs)
+        for arg in ["batch_size"]:
+            if arg in kwargs:
+                kwargs.pop(arg)
+        for doc in docs:
+            doc = proc(doc, **kwargs)
+            yield doc

From a0e12c136b5864e7c0390a70902b2b158118d9b8 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 13 Oct 2020 10:00:53 +0200
Subject: [PATCH 506/516] Increment version [ci skip]

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 38efce3e9..2aeef3c8d 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a39"
+__version__ = "3.0.0a40"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From 4d99d2b94a73d7d950f92526efc5a5f6f9b98121 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 13 Oct 2020 11:38:52 +0200
Subject: [PATCH 507/516] Update docs [ci skip]

---
 website/docs/api/entitylinker.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 0904bbf72..683927b1c 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -46,7 +46,7 @@ architectures and their arguments and hyperparameters.
 | `incl_prior`           | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                     |
 | `incl_context`         | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                   |
 | `model`                | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                   |
-| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to 64. ~~int~~                                                                                                                                                                                                              |
+| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                            |
 | `get_candidates`       | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
 
 ```python

From 86d648740fc4f1fea9ac5c779c2d578c2431cafe Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 13 Oct 2020 11:39:03 +0200
Subject: [PATCH 508/516] Fix morph representation in Doc.to_json

---
 spacy/tokens/doc.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 4a57e4c83..abc82030d 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1419,7 +1419,7 @@ cdef class Doc:
             if include_annotation["POS"]:
                 token_data["pos"] = token.pos_
             if include_annotation["MORPH"]:
-                token_data["morph"] = token.morph
+                token_data["morph"] = token.morph.to_json()
             if include_annotation["LEMMA"]:
                 token_data["lemma"] = token.lemma_
             if include_annotation["DEP"]:

From f8a1c1afd6fff111b4434e6d19a2b1aec5b55501 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 13 Oct 2020 14:39:59 +0200
Subject: [PATCH 509/516] avoid dropout at runtime (#6247)

---
 spacy/about.py            | 2 +-
 spacy/ml/staticvectors.py | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/spacy/about.py b/spacy/about.py
index 2aeef3c8d..9c5dd0b4f 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a40"
+__version__ = "3.0.0a41"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index da731dadb..f0213a9b8 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -39,7 +39,6 @@ def forward(
     key_attr = model.attrs["key_attr"]
     W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
     V = cast(Floats2d, docs[0].vocab.vectors.data)
-    mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate"))
     rows = model.ops.flatten(
         [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
     )
@@ -47,8 +46,11 @@ def forward(
         model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True),
         model.ops.asarray([len(doc) for doc in docs], dtype="i"),
     )
-    if mask is not None:
-        output.data *= mask
+    mask = None
+    if is_train:
+        mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate"))
+        if mask is not None:
+            output.data *= mask
 
     def backprop(d_output: Ragged) -> List[Doc]:
         if mask is not None:

From 1f4930086209128876e2804ae070ded54471e6f2 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 13 Oct 2020 15:41:17 +0200
Subject: [PATCH 510/516] Update transformer recommendations [ci skip]

---
 .../quickstart_training_recommendations.yml   | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/templates/quickstart_training_recommendations.yml b/spacy/cli/templates/quickstart_training_recommendations.yml
index 206e69954..54aec2e31 100644
--- a/spacy/cli/templates/quickstart_training_recommendations.yml
+++ b/spacy/cli/templates/quickstart_training_recommendations.yml
@@ -32,10 +32,10 @@ es:
   word_vectors: null
   transformer:
     efficiency:
-      name: mrm8488/RuPERTa-base
+      name: dccuchile/bert-base-spanish-wwm-cased
       size_factor: 3
     accuracy:
-      name: mrm8488/RuPERTa-base
+      name: dccuchile/bert-base-spanish-wwm-cased
       size_factor: 3
 sv:
   word_vectors: null
@@ -101,3 +101,21 @@ pl:
     accuracy:
       name: dkleczek/bert-base-polish-cased-v1
       size_factor: 3
+nl:
+  word_vectors: null
+  transformer:
+    efficiency:
+      name: pdelobelle/robbert-v2-dutch-base
+      size_factor: 3
+    accuracy:
+      name: pdelobelle/robbert-v2-dutch-base
+      size_factor: 3
+pt:
+  word_vectors: null
+  transformer:
+    efficiency:
+      name: neuralmind/bert-base-portuguese-cased
+      size_factor: 3
+    accuracy:
+      name: neuralmind/bert-base-portuguese-cased
+      size_factor: 3

From 03e3bab64b96beb563e1d5bb8071c4f2b0fd43f3 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 14 Oct 2020 14:58:15 +0200
Subject: [PATCH 511/516] Update README.md [ci skip]

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5d310492d..55e4c6512 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ state-of-the-art speed, convolutional **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management.
 spaCy is commercial open-source software, released under the MIT license.
 
-💫 **Version 3.0 out now!**
+💫 **Version 3.0 (nightly) out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)

From 1aa8e8f2af7c180294bb47047e913fa655f278a4 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 14 Oct 2020 14:58:45 +0200
Subject: [PATCH 512/516] Update docs [ci skip]

---
 website/docs/usage/_benchmarks-models.md | 12 ++++++------
 website/docs/usage/facts-figures.md      |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index a604c4b57..becd313f4 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -1,14 +1,14 @@
 import { Help } from 'components/typography'; import Link from 'components/link'
 
-<!-- TODO: update numbers, add note on previous NER evaluation issues -->
+<!-- TODO: update speed and v2 NER numbers -->
 
 <figure>
 
 | Pipeline                                                   | Parser | Tagger |  NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
 | ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
-| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) |        |        |      |                                                                     |                                                                 6k |
+| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) |   95.5 |   98.3 | 89.7 |                                                                  1k |                                                                 8k |
 | [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)   |   92.2 |   97.4 | 85.8 |                                                                  7k |                                                                    |
-| `en_core_web_lg` (spaCy v2)                                |   91.9 |   97.2 | 85.9 |                                                                 10k |                                                                    |
+| `en_core_web_lg` (spaCy v2)                                |   91.9 |   97.2 |      |                                                                 10k |                                                                    |
 
 <figcaption class="caption">
 
@@ -23,9 +23,9 @@ import { Help } from 'components/typography'; import Link from 'components/link'
 
 | Named Entity Recognition System                                                | OntoNotes | CoNLL '03 |
 | ------------------------------------------------------------------------------ | --------: | --------: |
-| spaCy RoBERTa (2020)                                                           |           |      92.2 |
-| spaCy CNN (2020)                                                               |      85.3 |      88.4 |
-| spaCy CNN (2017)                                                               |      86.4 |           |
+| spaCy RoBERTa (2020)                                                           |      89.7 |      91.6 |
+| spaCy CNN (2020)                                                               |      84.5 |           |
+| spaCy CNN (2017)                                                               |           |           |
 | [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup>      |      88.8 |      92.1 |
 | <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link><sup>2</sup> |      89.7 |      93.1 |
 | BERT Base<sup>3</sup>                                                          |         - |      92.4 |
diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md
index a31559b04..2707f68fa 100644
--- a/website/docs/usage/facts-figures.md
+++ b/website/docs/usage/facts-figures.md
@@ -65,8 +65,8 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
 
 | Dependency Parsing System                                                      |  UAS |  LAS |
 | ------------------------------------------------------------------------------ | ---: | ---: |
-| spaCy RoBERTa (2020)<sup>1</sup>                                               | 96.8 | 95.0 |
-| spaCy CNN (2020)<sup>1</sup>                                                   | 93.7 | 91.8 |
+| spaCy RoBERTa (2020)<sup>1</sup>                                               | 95.5 | 94.3 |
+| spaCy CNN (2020)<sup>1</sup>                                                   |      |      |
 | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 |
 | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019)             | 97.2 | 95.7 |
 

From 2e8dcba37947b5fc99ba5d9d581b549da0698a1a Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 14 Oct 2020 14:59:09 +0200
Subject: [PATCH 513/516] Update version pins

---
 pyproject.toml   | 2 +-
 requirements.txt | 2 +-
 setup.cfg        | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c175ded66..14a2d7690 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a44,<8.0.0a50",
+    "thinc>=8.0.0rc0,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index d6b6267a9..36f0d1e92 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a44,<8.0.0a50
+thinc>=8.0.0rc0,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index d9414a4f4..adf0c0e20 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a44,<8.0.0a50
+    thinc>=8.0.0rc0,<8.1.0
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a44,<8.0.0a50
+    thinc>=8.0.0rc0,<8.1.0
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.3.0,<3.0.0
@@ -65,9 +65,9 @@ console_scripts =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=1.0.0rc0,<1.0.0
+    spacy_lookups_data>=1.0.0rc0,<1.1.0
 transformers =
-    spacy_transformers>=1.0.0a22,<1.0.0
+    spacy_transformers>=1.0.0rc0,<1.1.0
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =

From 0aa88518786ca95f5750e3a79a87967bd3558a94 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 14 Oct 2020 15:00:49 +0200
Subject: [PATCH 514/516] always return losses

---
 spacy/pipeline/tagger.pyx         | 5 +++--
 spacy/pipeline/trainable_pipe.pyx | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 1b0f79cea..3be93c32c 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -195,7 +195,7 @@ class Tagger(TrainablePipe):
         validate_examples(examples, "Tagger.update")
         if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
             # Handle cases where there are no tokens in any docs.
-            return
+            return losses
         set_dropout_rate(self.model, drop)
         tag_scores, bp_tag_scores = self.model.begin_update([eg.predicted for eg in examples])
         for sc in tag_scores:
@@ -233,7 +233,7 @@ class Tagger(TrainablePipe):
             return
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
-            return
+            return losses
         set_dropout_rate(self.model, drop)
         guesses, backprop = self.model.begin_update(docs)
         target = self._rehearsal_model(examples)
@@ -243,6 +243,7 @@ class Tagger(TrainablePipe):
         if losses is not None:
             losses.setdefault(self.name, 0.0)
             losses[self.name] += (gradient**2).sum()
+        return losses
 
     def get_loss(self, examples, scores):
         """Find the loss and gradient of loss for the batch of documents and
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 07cb01059..6cd73d256 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -116,7 +116,7 @@ cdef class TrainablePipe(Pipe):
         validate_examples(examples, "TrainablePipe.update")
         if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
             # Handle cases where there are no tokens in any docs.
-            return
+            return losses
         set_dropout_rate(self.model, drop)
         scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
         loss, d_scores = self.get_loss(examples, scores)

From 478a14a61934e617988f70aaf692fcd6d7b1e226 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 14 Oct 2020 15:01:19 +0200
Subject: [PATCH 515/516] fix few typos

---
 website/docs/usage/layers-architectures.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index e348c4389..9677398cf 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -503,7 +503,7 @@ overview of the `TrainablePipe` methods used by
 
 </Infobox>
 
-### Example: Entity elation extraction component {#component-rel}
+### Example: Entity relation extraction component {#component-rel}
 
 This section outlines an example use-case of implementing a **novel relation
 extraction component** from scratch. We'll implement a binary relation
@@ -618,7 +618,7 @@ we can define our relation model in a config file as such:
 # ...
 
 [model.get_candidates]
-@misc = "rel_cand_generator.v2"
+@misc = "rel_cand_generator.v1"
 max_length = 20
 
 [model.create_candidate_tensor]
@@ -687,8 +687,8 @@ Before the model can be used, it needs to be
 [initialized](/usage/training#initialization). This function receives a callback
 to access the full **training data set**, or a representative sample. This data
 set can be used to deduce all **relevant labels**. Alternatively, a list of
-labels can be provided to `initialize`, or you can call the
-`RelationExtractoradd_label` directly. The number of labels defines the output
+labels can be provided to `initialize`, or you can call
+`RelationExtractor.add_label` directly. The number of labels defines the output
 dimensionality of the network, and will be used to do
 [shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
 layers of the neural network. This is triggered by calling
@@ -729,7 +729,7 @@ and its internal model can be trained and used to make predictions.
 During training, the function [`update`](/api/pipe#update) is invoked which
 delegates to
 [`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
-[`get_loss`](/api/pipe#get_loss) function that **calculate the loss** for a
+[`get_loss`](/api/pipe#get_loss) function that **calculates the loss** for a
 batch of examples, as well as the **gradient** of loss that will be used to
 update the weights of the model layers. Thinc provides several
 [loss functions](https://thinc.ai/docs/api-loss) that can be used for the

From 44e14ccae87d4077cfc3b730e76ab32bbb15cafb Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 14 Oct 2020 15:11:34 +0200
Subject: [PATCH 516/516] one more losses fix

---
 spacy/pipeline/tagger.pyx | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 3be93c32c..16633a7b8 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -227,10 +227,13 @@ class Tagger(TrainablePipe):
 
         DOCS: https://nightly.spacy.io/api/tagger#rehearse
         """
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
         validate_examples(examples, "Tagger.rehearse")
         docs = [eg.predicted for eg in examples]
         if self._rehearsal_model is None:
-            return
+            return losses
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             return losses
@@ -240,9 +243,7 @@ class Tagger(TrainablePipe):
         gradient = guesses - target
         backprop(gradient)
         self.finish_update(sgd)
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
-            losses[self.name] += (gradient**2).sum()
+        losses[self.name] += (gradient**2).sum()
         return losses
 
     def get_loss(self, examples, scores):