From ba6cf9821f0ba4174fe91a840688785fbaa5ed98 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 4 Sep 2020 14:28:28 +0200 Subject: [PATCH 001/133] Replace docs analytics [ci skip] --- website/gatsby-config.js | 9 --------- website/meta/site.json | 1 - website/package.json | 1 - 3 files changed, 11 deletions(-) diff --git a/website/gatsby-config.js b/website/gatsby-config.js index 2a5f957f4..144b8e93e 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -131,15 +131,6 @@ module.exports = { icon: `src/images/icon.png`, }, }, - { - resolve: `gatsby-plugin-google-analytics`, - options: { - trackingId: site.analytics, - head: false, - anonymize: true, - respectDNT: true, - }, - }, { resolve: `gatsby-plugin-plausible`, options: { diff --git a/website/meta/site.json b/website/meta/site.json index 4d12a4c46..31f2f2f68 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -14,7 +14,6 @@ "github": "explosion" }, "theme": "#09a3d5", - "analytics": "UA-58931649-1", "newsletter": { "user": "spacy.us12", "id": "83b0498b1e7fa3c91ce68c3f1", diff --git a/website/package.json b/website/package.json index a59bc9bdc..8d8ba6408 100644 --- a/website/package.json +++ b/website/package.json @@ -20,7 +20,6 @@ "gatsby-image": "^2.0.29", "gatsby-mdx": "^0.3.6", "gatsby-plugin-catch-links": "^2.0.11", - "gatsby-plugin-google-analytics": "^2.0.14", "gatsby-plugin-manifest": "^2.0.17", "gatsby-plugin-offline": "^2.0.24", "gatsby-plugin-plausible": "0.0.6", From 33d9c649771cf03122ccb9fe7544e8c14ed788fa Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 4 Sep 2020 14:44:38 +0200 Subject: [PATCH 002/133] Fix outbound link and update package lock [ci skip] --- website/package-lock.json | 8 -------- website/src/components/link.js | 11 ++--------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/website/package-lock.json b/website/package-lock.json index dded33fb0..63e67ebd2 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -7441,14 +7441,6 @@ "escape-string-regexp": "^1.0.5" } }, - "gatsby-plugin-google-analytics": { - "version": "2.0.14", - "resolved": "https://registry.npmjs.org/gatsby-plugin-google-analytics/-/gatsby-plugin-google-analytics-2.0.14.tgz", - "integrity": "sha512-sFD73d9isJQknnDAAkDidaybHJx6VIaLfy3nO3DwbFaitvZ08RimbynYOkcWAeA0zwwix2RgAvbq/9pAmtTb/A==", - "requires": { - "@babel/runtime": "^7.0.0" - } - }, "gatsby-plugin-manifest": { "version": "2.0.17", "resolved": "https://registry.npmjs.org/gatsby-plugin-manifest/-/gatsby-plugin-manifest-2.0.17.tgz", diff --git a/website/src/components/link.js b/website/src/components/link.js index 4c4aa9492..dc0cfda8e 100644 --- a/website/src/components/link.js +++ b/website/src/components/link.js @@ -1,7 +1,6 @@ import React, { Fragment } from 'react' import PropTypes from 'prop-types' import { Link as GatsbyLink } from 'gatsby' -import { OutboundLink } from 'gatsby-plugin-google-analytics' import classNames from 'classnames' import Icon from './icon' @@ -74,15 +73,9 @@ const Link = ({ const rel = isInternal ? 
null : 'noopener nofollow noreferrer' return ( - + {content} - + ) } From a26f864ed3c227fab1d2a506e27cb4b5b5d831d2 Mon Sep 17 00:00:00 2001 From: Marek Grzenkowicz Date: Tue, 8 Sep 2020 21:13:50 +0200 Subject: [PATCH 003/133] Clarify how to choose pretrained weights files (closes #6027) [ci skip] (#6039) --- website/docs/api/cli.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 779fa7695..b97308aab 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -445,7 +445,8 @@ an approximate language-modeling objective. Specifically, we load pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which match the pretrained ones. The weights are saved to a directory after each epoch. You can then pass a path to one of these pretrained weights files to the -`spacy train` command. +`spacy train` command. You can try to use a few with low `Loss` values reported +in the output. This technique may be especially helpful if you have little labelled data. However, it's still quite experimental, so your mileage may vary. To load the From bd87e8686e05487116c3a0c631bcb789059b2636 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 21:40:38 +0200 Subject: [PATCH 004/133] move tests to correct subdir --- spacy/tests/{ => pipeline}/test_tok2vec.py | 2 +- spacy/tests/training/__init__.py | 0 spacy/tests/{ => training}/test_training.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename spacy/tests/{ => pipeline}/test_tok2vec.py (99%) create mode 100644 spacy/tests/training/__init__.py rename spacy/tests/{ => training}/test_training.py (99%) diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py similarity index 99% rename from spacy/tests/test_tok2vec.py rename to spacy/tests/pipeline/test_tok2vec.py index fb30c6ae5..0365554bc 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -9,7 +9,7 @@ from spacy.tokens import Doc from spacy.training import Example from spacy import util from spacy.lang.en import English -from .util import get_batch +from ..util import get_batch from thinc.api import Config diff --git a/spacy/tests/training/__init__.py b/spacy/tests/training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/test_training.py b/spacy/tests/training/test_training.py similarity index 99% rename from spacy/tests/test_training.py rename to spacy/tests/training/test_training.py index 1926aca1f..67cc37b1c 100644 --- a/spacy/tests/test_training.py +++ b/spacy/tests/training/test_training.py @@ -12,7 +12,7 @@ from thinc.api import compounding import pytest import srsly -from .util import make_tempdir +from ..util import make_tempdir @pytest.fixture From 51fa929f47120272bd6b8dfbba1f000833446f0f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 21:58:04 +0200 Subject: [PATCH 005/133] rewrite train_corpus to corpus.train in config --- extra/experiments/onto-joint/defaults.cfg | 6 ++-- .../ptb-joint-pos-dep/defaults.cfg | 6 ++-- spacy/cli/templates/quickstart_training.jinja | 6 ++-- spacy/cli/train.py | 4 +-- spacy/default_config.cfg | 6 ++-- spacy/schemas.py | 3 +- .../tests/serialize/test_serialize_config.py | 16 +++++---- website/docs/api/corpus.md | 2 +- website/docs/api/data-formats.md | 35 +++++++++---------- website/docs/api/top-level.md | 4 +-- website/docs/usage/projects.md | 2 +- website/docs/usage/training.md | 2 +- 12 files changed, 50 insertions(+), 42 
deletions(-) diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg index 7954b57b5..97eebe6b4 100644 --- a/extra/experiments/onto-joint/defaults.cfg +++ b/extra/experiments/onto-joint/defaults.cfg @@ -21,14 +21,16 @@ eval_frequency = 200 score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2} frozen_components = [] -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths:train} gold_preproc = true max_length = 0 limit = 0 -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths:dev} gold_preproc = ${training.read_train:gold_preproc} diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg index 8f9c5666e..03e2f5bd7 100644 --- a/extra/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/extra/experiments/ptb-joint-pos-dep/defaults.cfg @@ -20,14 +20,16 @@ patience = 10000 eval_frequency = 200 score_weights = {"dep_las": 0.8, "tag_acc": 0.2} -[training.read_train] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths:train} gold_preproc = true max_length = 0 limit = 0 -[training.read_dev] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths:dev} gold_preproc = ${training.read_train:gold_preproc} diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 199aae217..39d4d875d 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -195,12 +195,14 @@ total_steps = 20000 initial_rate = 5e-5 {% endif %} -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} max_length = {{ 500 if hardware == "gpu" else 2000 }} -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} max_length = 0 diff --git a/spacy/cli/train.py b/spacy/cli/train.py index ae4a8455e..2c2eeb88b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -92,8 +92,8 @@ def train( raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) T_cfg = config["training"] optimizer = T_cfg["optimizer"] - train_corpus = T_cfg["train_corpus"] - dev_corpus = T_cfg["dev_corpus"] + train_corpus = T_cfg["corpus"]["train"] + dev_corpus = T_cfg["corpus"]["dev"] batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] # Components that shouldn't be updated during training diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 7cd71453f..61f3dfe25 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -44,7 +44,9 @@ frozen_components = [] [training.logger] @loggers = "spacy.ConsoleLogger.v1" -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} # Whether to train on sequences with 'gold standard' sentence boundaries @@ -56,7 +58,7 @@ max_length = 0 # Limitation on number of training examples limit = 0 -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} # Whether to train on sequences with 'gold standard' sentence boundaries diff --git a/spacy/schemas.py b/spacy/schemas.py index 0dd2b9204..d8bcf3c1d 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -198,8 +198,7 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - train_corpus: Reader = 
Field(..., title="Reader for the training data") - dev_corpus: Reader = Field(..., title="Reader for the dev data") + corpus: Reader = Field(..., title="Reader for the training and dev data") batcher: Batcher = Field(..., title="Batcher for the training data") dropout: StrictFloat = Field(..., title="Dropout rate") patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score") diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 0ab212fda..d113ac2a5 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -19,11 +19,13 @@ dev = "" [training] -[training.train_corpus] +[training.corpus] + +[training.corpus.train] @readers = "spacy.Corpus.v1" path = ${paths.train} -[training.dev_corpus] +[training.corpus.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} @@ -300,20 +302,20 @@ def test_config_overrides(): def test_config_interpolation(): config = Config().from_str(nlp_config_string, interpolate=False) - assert config["training"]["train_corpus"]["path"] == "${paths.train}" + assert config["training"]["corpus"]["train"]["path"] == "${paths.train}" interpolated = config.interpolate() - assert interpolated["training"]["train_corpus"]["path"] == "" + assert interpolated["training"]["corpus"]["train"]["path"] == "" nlp = English.from_config(config) - assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}" + assert nlp.config["training"]["corpus"]["train"]["path"] == "${paths.train}" # Ensure that variables are preserved in nlp config width = "${components.tok2vec.model.width}" assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width interpolated2 = nlp.config.interpolate() - assert interpolated2["training"]["train_corpus"]["path"] == "" + assert interpolated2["training"]["corpus"]["train"]["path"] == "" assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 nlp2 = English.from_config(interpolated) - assert nlp2.config["training"]["train_corpus"]["path"] == "" + assert nlp2.config["training"]["corpus"]["train"]["path"] == "" assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 0f49b02e3..c25ce1651 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -26,7 +26,7 @@ streaming. > [paths] > train = "corpus/train.spacy" > -> [training.train_corpus] +> [training.corpus.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 79ecb08b3..74d612862 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -126,24 +126,23 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. 
~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | +| Name | Description | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `corpus` | Dictionary with `train` and `develop` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. 
~~List[str]~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | +| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index f52c63f18..be7994d5d 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -448,7 +448,7 @@ remain in the config file stored on your local system. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"] +> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] > ``` | Name | Description | @@ -478,7 +478,7 @@ the [`Corpus`](/api/corpus) class. > [paths] > train = "corpus/train.spacy" > -> [training.train_corpus] +> [training.corpus.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 9776dab1b..3a6bd4551 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -969,7 +969,7 @@ your results. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"] +> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] > ``` ![Screenshot: Visualized training results](../images/wandb1.jpg) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 65cfb563b..bba2e2853 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -746,7 +746,7 @@ as **config settings** – in this case, `source`. 
> #### config.cfg > > ```ini -> [training.train_corpus] +> [training.corpus.train] > @readers = "corpus_variants.v1" > source = "s3://your_bucket/path/data.csv" > ``` From 733665766205f350398d3216e94ab8a5ac6c3751 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 22:07:16 +0200 Subject: [PATCH 006/133] corpus is a Dict --- spacy/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/schemas.py b/spacy/schemas.py index d8bcf3c1d..2030048d8 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -198,7 +198,7 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - corpus: Reader = Field(..., title="Reader for the training and dev data") + corpus: Dict[str, Reader] = Field(..., title="Reader for the training and dev data") batcher: Batcher = Field(..., title="Batcher for the training data") dropout: StrictFloat = Field(..., title="Dropout rate") patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score") From 55f8d5478ecb5fd913a3a5fe7c469e8bc8a4f038 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 22:09:30 +0200 Subject: [PATCH 007/133] fix example output --- website/docs/api/cli.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 8449d23e1..7dd6e6184 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -272,7 +272,7 @@ training -> dropout field required training -> optimizer field required training -> optimize extra fields not permitted -{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'dev_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}, 'train_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}} +{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'corpus': {'train': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'dev': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}} 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}} If your config contains missing values, you can run the 'init fill-config' command to fill in all the defaults, if possible: @@ -370,7 +370,12 @@ Registry @schedules Name compounding.v1 Module thinc.schedules File /path/to/thinc/thinc/schedules.py (line 43) -ℹ [training.dev_corpus] +ℹ [training.corpus.dev] +Registry @readers +Name spacy.Corpus.v1 +Module 
spacy.training.corpus +File /path/to/spacy/training/corpus.py (line 18) +ℹ [training.corpus.train] Registry @readers Name spacy.Corpus.v1 Module spacy.training.corpus @@ -385,11 +390,6 @@ Registry @schedules Name warmup_linear.v1 Module thinc.schedules File /path/to/thinc/thinc/schedules.py (line 91) -ℹ [training.train_corpus] -Registry @readers -Name spacy.Corpus.v1 -Module spacy.training.corpus -File /path/to/spacy/training/corpus.py (line 18) ``` From f420aa1138f52c732102b6ad00825bab797792ec Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 15 Sep 2020 22:30:09 +0200 Subject: [PATCH 008/133] use e.value to get to the ExceptionInfo value --- spacy/tests/test_language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index fba362b76..2a24d368a 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -298,4 +298,4 @@ def test_language_init_invalid_vocab(value): err_fragment = "invalid value" with pytest.raises(ValueError) as e: Language(value) - assert err_fragment in str(e) + assert err_fragment in str(e.value) From 714a5a05c65e28b5264d16e7dba202126de2cbfb Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 16 Sep 2020 16:39:55 +0200 Subject: [PATCH 009/133] test for custom readers with ml_datasets >= 0.2 --- spacy/pipeline/textcat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 3f6250680..e7cb62a0d 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -181,9 +181,9 @@ class TextCategorizer(Pipe): DOCS: https://nightly.spacy.io/api/textcategorizer#predict """ - tensors = [doc.tensor for doc in docs] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. 
+ tensors = [doc.tensor for doc in docs] xp = get_array_module(tensors) scores = xp.zeros((len(docs), len(self.labels))) return scores From 1040e250d8f740db7d0a6b012962b25ce7f95ffb Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 16 Sep 2020 16:41:28 +0200 Subject: [PATCH 010/133] actual commit with test for custom readers with ml_datasets >= 0.2 --- requirements.txt | 2 +- spacy/tests/training/test_readers.py | 58 ++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/training/test_readers.py diff --git a/requirements.txt b/requirements.txt index db6eae2ef..a67ade640 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.0a31,<8.0.0a40 blis>=0.4.0,<0.5.0 -ml_datasets>=0.1.1 +ml_datasets>=0.2.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py new file mode 100644 index 000000000..c81ec0897 --- /dev/null +++ b/spacy/tests/training/test_readers.py @@ -0,0 +1,58 @@ +import pytest +from thinc.api import Config +from spacy.util import load_model_from_config + + +@pytest.mark.slow +@pytest.mark.parametrize( + "reader,additional_config", + [ + ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}), + ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}), + ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}), + ], +) +def test_cat_readers(reader, additional_config): + nlp_config_string = """ + [training] + + [training.corpus] + @readers = "PLACEHOLDER" + + [nlp] + lang = "en" + pipeline = ["tok2vec", "textcat"] + + [components] + + [components.tok2vec] + factory = "tok2vec" + + [components.textcat] + factory = "textcat" + """ + config = Config().from_str(nlp_config_string) + config["training"]["corpus"]["@readers"] = reader + config["training"]["corpus"].update(additional_config) + nlp, resolved = load_model_from_config(config, auto_fill=True) + + train_corpus = resolved["training"]["corpus"]["train"] + optimizer = resolved["training"]["optimizer"] + # simulate a training loop + nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + for example in train_corpus(nlp): + assert example.y.cats + # this shouldn't fail if each training example has at least one positive label + assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0] + nlp.update([example], sgd=optimizer) + # simulate performance benchmark on dev corpus + dev_corpus = resolved["training"]["corpus"]["dev"] + dev_examples = list(dev_corpus(nlp)) + for example in dev_examples: + # this shouldn't fail if each dev example has at least one positive label + assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0] + scores = nlp.evaluate(dev_examples) + assert scores["cats_score"] + # ensure the pipeline runs + doc = nlp("Quick test") + assert doc.cats From 0dc914b667706b4e598b61e3cfff0a85e820118f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 16 Sep 2020 16:42:58 +0200 Subject: [PATCH 011/133] bump thinc to 8.0.0a33 --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e610e603e..a413a099c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a31,<8.0.0a40", + "thinc>=8.0.0a33,<8.0.0a40", "blis>=0.4.0,<0.5.0", 
"pytokenizations", "pathy" diff --git a/requirements.txt b/requirements.txt index a67ade640..69477c2d3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a31,<8.0.0a40 +thinc>=8.0.0a33,<8.0.0a40 blis>=0.4.0,<0.5.0 ml_datasets>=0.2.0 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 10a8972b0..359e63172 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a31,<8.0.0a40 + thinc>=8.0.0a33,<8.0.0a40 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a31,<8.0.0a40 + thinc>=8.0.0a33,<8.0.0a40 blis>=0.4.0,<0.5.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 From 87c329c7114767d8788090a3838fce0bf36822b7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 16 Sep 2020 17:37:29 +0200 Subject: [PATCH 012/133] Set rule-based lemmatizers as default (#6076) For languages without provided models and with lemmatizer rules in `spacy-lookups-data`, make the rule-based lemmatizer the default: Bengali, Persian, Norwegian, Swedish --- spacy/lang/bn/__init__.py | 22 ++++++++++++++++++++++ spacy/lang/fa/__init__.py | 22 ++++++++++++++++++++++ spacy/lang/nb/__init__.py | 22 ++++++++++++++++++++++ spacy/lang/sv/__init__.py | 23 +++++++++++++++++++++++ spacy/tests/lang/test_lemmatizers.py | 2 +- 5 files changed, 90 insertions(+), 1 deletion(-) diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index 6c1d66cba..270185a4b 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -1,7 +1,11 @@ +from typing import Optional +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS from ...language import Language +from ...lookups import Lookups +from ...pipeline import Lemmatizer class BengaliDefaults(Language.Defaults): @@ -17,4 +21,22 @@ class Bengali(Language): Defaults = BengaliDefaults +@Bengali.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Bengali"] diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 7fdb9d065..244534120 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -1,9 +1,13 @@ +from typing import Optional +from thinc.api import Model from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_SUFFIXES from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from ...lookups import Lookups +from ...pipeline import Lemmatizer class PersianDefaults(Language.Defaults): @@ -20,4 +24,22 @@ class Persian(Language): Defaults = PersianDefaults +@Persian.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: 
Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Persian"] diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index d2bb92072..28a2f0bf2 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -1,9 +1,13 @@ +from typing import Optional +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from ...lookups import Lookups +from ...pipeline import Lemmatizer class NorwegianDefaults(Language.Defaults): @@ -20,4 +24,22 @@ class Norwegian(Language): Defaults = NorwegianDefaults +@Norwegian.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Norwegian"] diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 0c6a1b9f4..6db74cd39 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -1,8 +1,13 @@ +from typing import Optional +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from ...lookups import Lookups +from ...pipeline import Lemmatizer + # Punctuation stolen from Danish from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES @@ -22,4 +27,22 @@ class Swedish(Language): Defaults = SwedishDefaults +@Swedish.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Swedish"] diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py index 14c59659a..6e7f82341 100644 --- a/spacy/tests/lang/test_lemmatizers.py +++ b/spacy/tests/lang/test_lemmatizers.py @@ -8,7 +8,7 @@ from spacy.util import get_lang_class # Only include languages with no external dependencies # excluded: ru, uk # excluded for custom tables: pl -LANGUAGES = ["el", "en", "fr", "nl"] +LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"] # fmt: on From d722a439aa3bef5d4b4fa677aa6b427f7186a673 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 16 Sep 2020 17:39:41 +0200 Subject: [PATCH 013/133] Remove unneeded methods in senter and morphologizer (#6074) Now that the tagger doesn't manage the tag map, the child classes senter and morphologizer don't need to override the serialization methods. 
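For illustration only (a sketch, not part of this diff): after the removal, serialization goes through the methods inherited from Tagger, so round-tripping a pipe still works without the overrides. Here `nlp` is assumed to be a pipeline that already contains a morphologizer:

    morphologizer = nlp.get_pipe("morphologizer")
    data = morphologizer.to_bytes()   # inherited Tagger.to_bytes
    morphologizer.from_bytes(data)    # inherited Tagger.from_bytes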
--- spacy/pipeline/morphologizer.pyx | 76 -------------------------------- spacy/pipeline/senter.pyx | 76 -------------------------------- 2 files changed, 152 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 57bdb28d7..0e0791004 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -259,79 +259,3 @@ class Morphologizer(Tagger): results.update(Scorer.score_token_attr_per_feat(examples, "morph", **kwargs)) return results - - def to_bytes(self, *, exclude=tuple()): - """Serialize the pipe to a bytestring. - - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (bytes): The serialized object. - - DOCS: https://nightly.spacy.io/api/morphologizer#to_bytes - """ - serialize = {} - serialize["model"] = self.model.to_bytes - serialize["vocab"] = self.vocab.to_bytes - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, *, exclude=tuple()): - """Load the pipe from a bytestring. - - bytes_data (bytes): The serialized pipe. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Morphologizer): The loaded Morphologizer. - - DOCS: https://nightly.spacy.io/api/morphologizer#from_bytes - """ - def load_model(b): - try: - self.model.from_bytes(b) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda b: self.vocab.from_bytes(b), - "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), - "model": lambda b: load_model(b), - } - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk(self, path, *, exclude=tuple()): - """Serialize the pipe to disk. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - - DOCS: https://nightly.spacy.io/api/morphologizer#to_disk - """ - serialize = { - "vocab": lambda p: self.vocab.to_disk(p), - "model": lambda p: p.open("wb").write(self.model.to_bytes()), - "cfg": lambda p: srsly.write_json(p, self.cfg), - } - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, *, exclude=tuple()): - """Load the pipe from disk. Modifies the object in place and returns it. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Morphologizer): The modified Morphologizer object. - - DOCS: https://nightly.spacy.io/api/morphologizer#from_disk - """ - def load_model(p): - with p.open("rb") as file_: - try: - self.model.from_bytes(file_.read()) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda p: self.vocab.from_disk(p), - "cfg": lambda p: self.cfg.update(deserialize_config(p)), - "model": load_model, - } - util.from_disk(path, deserialize, exclude) - return self diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 00664131b..a7eb721fd 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -170,79 +170,3 @@ class SentenceRecognizer(Tagger): results = Scorer.score_spans(examples, "sents", **kwargs) del results["sents_per_type"] return results - - def to_bytes(self, *, exclude=tuple()): - """Serialize the pipe to a bytestring. - - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (bytes): The serialized object. 
- - DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_bytes - """ - serialize = {} - serialize["model"] = self.model.to_bytes - serialize["vocab"] = self.vocab.to_bytes - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, *, exclude=tuple()): - """Load the pipe from a bytestring. - - bytes_data (bytes): The serialized pipe. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Tagger): The loaded SentenceRecognizer. - - DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_bytes - """ - def load_model(b): - try: - self.model.from_bytes(b) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda b: self.vocab.from_bytes(b), - "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), - "model": lambda b: load_model(b), - } - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk(self, path, *, exclude=tuple()): - """Serialize the pipe to disk. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - - DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_disk - """ - serialize = { - "vocab": lambda p: self.vocab.to_disk(p), - "model": lambda p: p.open("wb").write(self.model.to_bytes()), - "cfg": lambda p: srsly.write_json(p, self.cfg), - } - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, *, exclude=tuple()): - """Load the pipe from disk. Modifies the object in place and returns it. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Tagger): The modified SentenceRecognizer object. - - DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_disk - """ - def load_model(p): - with p.open("rb") as file_: - try: - self.model.from_bytes(file_.read()) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda p: self.vocab.from_disk(p), - "cfg": lambda p: self.cfg.update(deserialize_config(p)), - "model": load_model, - } - util.from_disk(path, deserialize, exclude) - return self From f3db3f6fe00455f69bf05135f941ba88d307738b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 16 Sep 2020 17:45:04 +0200 Subject: [PATCH 014/133] Add vectors option to CharacterEmbed (#6069) * Add vectors option to CharacterEmbed * Update spacy/pipeline/morphologizer.pyx * Adjust default morphologizer config Co-authored-by: Matthew Honnibal --- spacy/ml/models/tok2vec.py | 39 +++++++++++++++++++++++--------- spacy/pipeline/morphologizer.pyx | 1 + spacy/tests/test_tok2vec.py | 4 ++-- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 2e5f8a802..7ced4bd04 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -164,7 +164,7 @@ def MultiHashEmbed( @registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(width: int, rows: int, nM: int, nC: int): +def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool): """Construct an embedded representation based on character embeddings, using a feed-forward network. A fixed number of UTF-8 byte characters are used for each word, taken from the beginning and end of the word equally. Padding is @@ -188,18 +188,35 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int): nC (int): The number of UTF-8 bytes to embed per word. 
Recommended values are between 3 and 8, although it may depend on the length of words in the language. + also_use_static_vectors (bool): Whether to also use static word vectors. + Requires a vectors table to be loaded in the Doc objects' vocab. """ - model = chain( - concatenate( - chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), - chain( - FeatureExtractor([NORM]), - list2ragged(), - with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), + if also_use_static_vectors: + model = chain( + concatenate( + chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), + chain( + FeatureExtractor([NORM]), + list2ragged(), + with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), + ), + StaticVectors(width, dropout=0.0), ), - ), - with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), - ragged2list(), + with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)), + ragged2list(), + ) + else: + model = chain( + concatenate( + chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), + chain( + FeatureExtractor([NORM]), + list2ragged(), + with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), + ), + ), + with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), + ragged2list(), ) return model diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 0e0791004..bb68a358c 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -32,6 +32,7 @@ width = 128 rows = 7000 nM = 64 nC = 8 +also_use_static_vectors = false [model.tok2vec.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index fb30c6ae5..f3f35e4a7 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): [ (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), - (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), - (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2}, MishWindowEncoder, {"window_size": 1, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], ) # fmt: on From d31afc833485fb6fd347fd41d94a4050a69dfa96 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 16 Sep 2020 17:49:48 +0200 Subject: [PATCH 015/133] Fix Language.link_components when model is None --- spacy/language.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 8f7cb1973..4c0a6d7e6 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1450,8 +1450,8 @@ class Language: """ for i, (name1, proc1) in enumerate(self.pipeline): if hasattr(proc1, "find_listeners"): - for name2, proc2 in self.pipeline[i:]: - if hasattr(proc2, "model"): + for name2, proc2 in self.pipeline[i+1:]: + if isinstance(getattr(proc2, "model", None), Model): 
proc1.find_listeners(proc2.model) @classmethod From 4a573d18b3a818d3f9de3115d5376bf564337ba5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 16 Sep 2020 17:51:29 +0200 Subject: [PATCH 016/133] Add comment --- spacy/language.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index 4c0a6d7e6..3f0f850c2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1448,6 +1448,11 @@ class Language: """Register 'listeners' within pipeline components, to allow them to effectively share weights. """ + # I had though, "Why do we do this inside the Language object? Shouldn't + # it be the tok2vec/transformer/etc's job? + # The problem is we need to do it during deserialization...And the + # components don't receive the pipeline then. So this does have to be + # here :( for i, (name1, proc1) in enumerate(self.pipeline): if hasattr(proc1, "find_listeners"): for name2, proc2 in self.pipeline[i+1:]: From c776594ab1a27f51ddb6e5ea1ea815f515ad5213 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 16 Sep 2020 18:15:14 +0200 Subject: [PATCH 017/133] Fix --- spacy/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 3f0f850c2..d530e6b92 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -8,7 +8,7 @@ from contextlib import contextmanager from copy import deepcopy from pathlib import Path import warnings -from thinc.api import get_current_ops, Config, require_gpu, Optimizer +from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer import srsly import multiprocessing as mp from itertools import chain, cycle From a119667a36cced2ae5db6333e1539eb407fff70d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 16 Sep 2020 20:32:38 +0200 Subject: [PATCH 018/133] Clean up spacy.tokens (#6046) * Clean up spacy.tokens * Update `set_children_from_heads`: * Don't check `dep` when setting lr_* or sentence starts * Set all non-sentence starts to `False` * Use `set_children_from_heads` in `Token.head` setter * Reduce similar/duplicate code (admittedly adds a bit of overhead) * Update sentence starts consistently * Remove unused `Doc.set_parse` * Minor changes: * Declare cython variables (to avoid cython warnings) * Clean up imports * Modify set_children_from_heads to set token range Modify `set_children_from_heads` so that it adjust tokens within a specified range rather then the whole document. Modify the `Token.head` setter to adjust only the tokens affected by the new head assignment. 
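A small usage sketch of the intended behavior, mirroring the new head-setter test in this patch (`doc` is assumed to be a parsed two-sentence Doc of ten tokens):

    # attaching the second sentence's root to the first merges the sentences;
    # only the affected token range is recomputed
    doc[5].head = doc[0]
    assert not doc[5].is_sent_start
    assert doc[0].right_edge == doc[9]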
--- .../pipeline/_parser_internals/arc_eager.pyx | 2 +- spacy/pipeline/_parser_internals/nonproj.pyx | 2 +- spacy/tests/doc/test_doc_api.py | 12 +-- spacy/tests/doc/test_token_api.py | 35 ++++++- spacy/tests/parser/test_parse.py | 2 +- spacy/tests/regression/test_issue2501-3000.py | 2 +- spacy/tokens/_retokenize.pyx | 5 +- spacy/tokens/doc.pxd | 9 +- spacy/tokens/doc.pyx | 63 +++++-------- spacy/tokens/span.pyx | 3 - spacy/tokens/token.pyx | 92 +++---------------- 11 files changed, 85 insertions(+), 142 deletions(-) diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index bb0bf35b8..a5fc2ea0e 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -680,7 +680,7 @@ cdef class ArcEager(TransitionSystem): def finalize_doc(self, Doc doc): doc.is_parsed = True - set_children_from_heads(doc.c, doc.length) + set_children_from_heads(doc.c, 0, doc.length) def has_gold(self, Example eg, start=0, end=None): for word in eg.y[start:end]: diff --git a/spacy/pipeline/_parser_internals/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx index 8f5fdaa71..82070cd27 100644 --- a/spacy/pipeline/_parser_internals/nonproj.pyx +++ b/spacy/pipeline/_parser_internals/nonproj.pyx @@ -119,7 +119,7 @@ cpdef deprojectivize(Doc doc): new_head = _find_new_head(doc[i], head_label) doc.c[i].head = new_head.i - i doc.c[i].dep = doc.vocab.strings.add(new_label) - set_children_from_heads(doc.c, doc.length) + set_children_from_heads(doc.c, 0, doc.length) return doc diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index b37a31e43..31dbad9ca 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -265,17 +265,11 @@ def test_doc_is_nered(en_vocab): def test_doc_from_array_sent_starts(en_vocab): words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] - heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] + heads = [0, -1, -2, -3, -4, -5, 0, -1, -2, -3] # fmt: off - deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"] + deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"] # fmt: on - doc = Doc(en_vocab, words=words) - for i, (dep, head) in enumerate(zip(deps, heads)): - doc[i].dep_ = dep - doc[i].head = doc[head] - if head == i: - doc[i].is_sent_start = True - doc.is_parsed + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) attrs = [SENT_START, HEAD] arr = doc.to_array(attrs) diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index be56c9b71..28ef0dd7f 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -112,7 +112,6 @@ def test_doc_token_api_ancestors(en_tokenizer): def test_doc_token_api_head_setter(en_tokenizer): - # the structure of this sentence depends on the English annotation scheme text = "Yesterday I saw a dog that barked loudly." heads = [2, 1, 0, 1, -2, 1, -2, -1, -6] tokens = en_tokenizer(text) @@ -169,6 +168,40 @@ def test_doc_token_api_head_setter(en_tokenizer): with pytest.raises(ValueError): doc[0].head = doc2[0] + # test sentence starts when two sentences are joined + text = "This is one sentence. This is another sentence." 
+ heads = [0, -1, -2, -3, -4, 0, -1, -2, -3, -4] + tokens = en_tokenizer(text) + doc = get_doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=heads, + deps=["dep"] * len(heads), + ) + # initially two sentences + assert doc[0].is_sent_start + assert doc[5].is_sent_start + assert doc[0].left_edge == doc[0] + assert doc[0].right_edge == doc[4] + assert doc[5].left_edge == doc[5] + assert doc[5].right_edge == doc[9] + + # modifying with a sentence doesn't change sent starts + doc[2].head = doc[3] + assert doc[0].is_sent_start + assert doc[5].is_sent_start + assert doc[0].left_edge == doc[0] + assert doc[0].right_edge == doc[4] + assert doc[5].left_edge == doc[5] + assert doc[5].right_edge == doc[9] + + # attach the second sentence to the first, resulting in one sentence + doc[5].head = doc[0] + assert doc[0].is_sent_start + assert not doc[5].is_sent_start + assert doc[0].left_edge == doc[0] + assert doc[0].right_edge == doc[9] + def test_is_sent_start(en_tokenizer): doc = en_tokenizer("This is a sentence. This is another.") diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 8d45e2132..691a7c3aa 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -184,7 +184,7 @@ def test_parser_set_sent_starts(en_vocab): if i == 0 or i == 3: assert doc[i].is_sent_start is True else: - assert doc[i].is_sent_start is None + assert not doc[i].is_sent_start for sent in doc.sents: for token in sent: assert token.head in sent diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index beb8faca1..859e4d80e 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -123,7 +123,7 @@ def test_issue2772(en_vocab): heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4] deps = ["dep"] * len(heads) doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - assert doc[1].is_sent_start is None + assert not doc[1].is_sent_start @pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 9323bb579..cd1e73a2b 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -274,7 +274,7 @@ def _merge(Doc doc, merges): for i in range(doc.length): doc.c[i].head -= i # Set the left/right children, left/right edges - set_children_from_heads(doc.c, doc.length) + set_children_from_heads(doc.c, 0, doc.length) # Make sure ent_iob remains consistent make_iob_consistent(doc.c, doc.length) # Return the merged Python object @@ -381,7 +381,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs): for i in range(doc.length): doc.c[i].head -= i # set children from head - set_children_from_heads(doc.c, doc.length) + set_children_from_heads(doc.c, 0, doc.length) def _validate_extensions(extensions): @@ -408,7 +408,6 @@ cdef make_iob_consistent(TokenC* tokens, int length): def normalize_token_attrs(Vocab vocab, attrs): if "_" in attrs: # Extension attributes extensions = attrs["_"] - print("EXTENSIONS", extensions) _validate_extensions(extensions) attrs = {key: value for key, value in attrs.items() if key != "_"} attrs = intify_attrs(attrs, strings_map=vocab.strings) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 2775aa97e..9b382d687 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -19,10 +19,10 @@ ctypedef fused LexemeOrToken: const_TokenC_ptr -cdef int set_children_from_heads(TokenC* tokens, int length) except -1 +cdef 
int set_children_from_heads(TokenC* tokens, int start, int end) except -1 -cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1 +cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2 @@ -31,9 +31,6 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2 -cdef int set_children_from_heads(TokenC* tokens, int length) except -1 - - cdef int [:,:] _get_lca_matrix(Doc, int start, int end) cdef class Doc: @@ -74,5 +71,3 @@ cdef class Doc: cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1 cpdef np.ndarray to_array(self, object features) - - cdef void set_parse(self, const TokenC* parsed) nogil diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 93520aeda..62a6dd6db 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1,32 +1,27 @@ # cython: infer_types=True, bounds_check=False, profile=True cimport cython cimport numpy as np -from libc.string cimport memcpy, memset +from libc.string cimport memcpy from libc.math cimport sqrt from libc.stdint cimport int32_t, uint64_t import copy from collections import Counter import numpy -import numpy.linalg -import struct import srsly from thinc.api import get_array_module from thinc.util import copy_array import warnings -import copy from .span cimport Span from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t -from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER +from ..attrs cimport attr_id_t from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB -from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t -from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t +from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM -from ..attrs import intify_attr, intify_attrs, IDS -from ..util import normalize_slice +from ..attrs import intify_attr, IDS from ..compat import copy_reg, pickle from ..errors import Errors, Warnings from .. import util @@ -291,7 +286,7 @@ cdef class Doc: DOCS: https://nightly.spacy.io/api/doc#getitem """ if isinstance(i, slice): - start, stop = normalize_slice(len(self), i.start, i.stop, i.step) + start, stop = util.normalize_slice(len(self), i.start, i.stop, i.step) return Span(self, start, stop, label=0) if i < 0: i = self.length + i @@ -627,10 +622,7 @@ cdef class Doc: @property def sents(self): """Iterate over the sentences in the document. Yields sentence `Span` - objects. Sentence spans have no label. To improve accuracy on informal - texts, spaCy calculates sentence boundaries from the syntactic - dependency parse. If the parser is disabled, the `sents` iterator will - be unavailable. + objects. Sentence spans have no label. YIELDS (Span): Sentences in the document. @@ -786,14 +778,6 @@ cdef class Doc: for i in range(self.length, self.max_length + PADDING): self.c[i].lex = &EMPTY_LEXEME - cdef void set_parse(self, const TokenC* parsed) nogil: - # TODO: This method is fairly misleading atm. It's used by Parser - # to actually apply the parse calculated. Need to rethink this. - # Probably we should use from_array? - self.is_parsed = True - for i in range(self.length): - self.c[i] = parsed[i] - def from_array(self, attrs, array): """Load attributes from a numpy array. 
Write to a `Doc` object, from an `(M, N)` array of attributes. @@ -884,7 +868,7 @@ cdef class Doc: self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) # If document is parsed, set children if self.is_parsed: - set_children_from_heads(self.c, length) + set_children_from_heads(self.c, 0, length) return self @staticmethod @@ -1321,13 +1305,13 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2 return mid return -1 - -cdef int set_children_from_heads(TokenC* tokens, int length) except -1: +cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1: + # note: end is exclusive cdef TokenC* head cdef TokenC* child cdef int i # Set number of left/right children to 0. We'll increment it in the loops. - for i in range(length): + for i in range(start, end): tokens[i].l_kids = 0 tokens[i].r_kids = 0 tokens[i].l_edge = i @@ -1341,38 +1325,40 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: # without risking getting stuck in an infinite loop if something is # terribly malformed. while not heads_within_sents: - heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count) + heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count) if loop_count > 10: warnings.warn(Warnings.W026) break loop_count += 1 # Set sentence starts - for i in range(length): - if tokens[i].head == 0 and tokens[i].dep != 0: + for i in range(start, end): + tokens[i].sent_start = -1 + for i in range(start, end): + if tokens[i].head == 0: tokens[tokens[i].l_edge].sent_start = True -cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1: +cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1: # May be called multiple times due to non-projectivity. See issues #3170 # and #4688. # Set left edges cdef TokenC* head cdef TokenC* child cdef int i, j - for i in range(length): + for i in range(start, end): child = &tokens[i] head = &tokens[i + child.head] - if child < head and loop_count == 0: + if loop_count == 0 and child < head: head.l_kids += 1 if child.l_edge < head.l_edge: head.l_edge = child.l_edge if child.r_edge > head.r_edge: head.r_edge = child.r_edge # Set right edges - same as above, but iterate in reverse - for i in range(length-1, -1, -1): + for i in range(end-1, start-1, -1): child = &tokens[i] head = &tokens[i + child.head] - if child > head and loop_count == 0: + if loop_count == 0 and child > head: head.r_kids += 1 if child.r_edge > head.r_edge: head.r_edge = child.r_edge @@ -1380,14 +1366,14 @@ cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) exce head.l_edge = child.l_edge # Get sentence start positions according to current state sent_starts = set() - for i in range(length): - if tokens[i].head == 0 and tokens[i].dep != 0: + for i in range(start, end): + if tokens[i].head == 0: sent_starts.add(tokens[i].l_edge) cdef int curr_sent_start = 0 cdef int curr_sent_end = 0 # Check whether any heads are not within the current sentence - for i in range(length): - if (i > 0 and i in sent_starts) or i == length - 1: + for i in range(start, end): + if (i > 0 and i in sent_starts) or i == end - 1: curr_sent_end = i for j in range(curr_sent_start, curr_sent_end): if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1: @@ -1436,6 +1422,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): with shape (n, n), where n = len(doc). 
""" cdef int [:,:] lca_matrix + cdef int j, k n_tokens= end - start lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32) lca_mat.fill(-1) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index f06f3307d..1f42c84ee 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -4,13 +4,10 @@ cimport numpy as np from libc.math cimport sqrt import numpy -import numpy.linalg from thinc.api import get_array_module -from collections import defaultdict import warnings from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix -from .token cimport TokenC from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t, hash_t from ..attrs cimport attr_id_t diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 2474f0637..35142c35e 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,6 +1,4 @@ # cython: infer_types=True -from libc.string cimport memcpy -from cpython.mem cimport PyMem_Malloc, PyMem_Free # Compiler crashes on memory view coercion without this. Should report bug. from cython.view cimport array as cvarray cimport numpy as np @@ -14,14 +12,13 @@ from ..typedefs cimport hash_t from ..lexeme cimport Lexeme from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT -from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL -from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX -from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP +from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, IS_STOP +from ..attrs cimport LIKE_URL, LIKE_NUM, LIKE_EMAIL from ..symbols cimport conj from .morphanalysis cimport MorphAnalysis +from .doc cimport set_children_from_heads from .. import parts_of_speech -from .. import util from ..errors import Errors, Warnings from .underscore import Underscore, get_ext_args @@ -658,78 +655,19 @@ cdef class Token: # Do nothing if old head is new head if self.i + self.c.head == new_head.i: return - cdef Token old_head = self.head - cdef int rel_newhead_i = new_head.i - self.i - # Is the new head a descendant of the old head - cdef bint is_desc = old_head.is_ancestor(new_head) - cdef int new_edge - cdef Token anc, child - # Update number of deps of old head - if self.c.head > 0: # left dependent - old_head.c.l_kids -= 1 - if self.c.l_edge == old_head.c.l_edge: - # The token dominates the left edge so the left edge of - # the head may change when the token is reattached, it may - # not change if the new head is a descendant of the current - # head. 
- new_edge = self.c.l_edge - # The new l_edge is the left-most l_edge on any of the - # other dependents where the l_edge is left of the head, - # otherwise it is the head - if not is_desc: - new_edge = old_head.i - for child in old_head.children: - if child == self: - continue - if child.c.l_edge < new_edge: - new_edge = child.c.l_edge - old_head.c.l_edge = new_edge - # Walk up the tree from old_head and assign new l_edge to - # ancestors until an ancestor already has an l_edge that's - # further left - for anc in old_head.ancestors: - if anc.c.l_edge <= new_edge: - break - anc.c.l_edge = new_edge - elif self.c.head < 0: # right dependent - old_head.c.r_kids -= 1 - # Do the same thing as for l_edge - if self.c.r_edge == old_head.c.r_edge: - new_edge = self.c.r_edge - if not is_desc: - new_edge = old_head.i - for child in old_head.children: - if child == self: - continue - if child.c.r_edge > new_edge: - new_edge = child.c.r_edge - old_head.c.r_edge = new_edge - for anc in old_head.ancestors: - if anc.c.r_edge >= new_edge: - break - anc.c.r_edge = new_edge - # Update number of deps of new head - if rel_newhead_i > 0: # left dependent - new_head.c.l_kids += 1 - # Walk up the tree from new head and set l_edge to self.l_edge - # until you hit a token with an l_edge further to the left - if self.c.l_edge < new_head.c.l_edge: - new_head.c.l_edge = self.c.l_edge - for anc in new_head.ancestors: - if anc.c.l_edge <= self.c.l_edge: - break - anc.c.l_edge = self.c.l_edge - elif rel_newhead_i < 0: # right dependent - new_head.c.r_kids += 1 - # Do the same as for l_edge - if self.c.r_edge > new_head.c.r_edge: - new_head.c.r_edge = self.c.r_edge - for anc in new_head.ancestors: - if anc.c.r_edge >= self.c.r_edge: - break - anc.c.r_edge = self.c.r_edge + # Find the widest l/r_edges of the roots of the two tokens involved + # to limit the number of tokens for set_children_from_heads + cdef Token self_root, new_head_root + self_ancestors = list(self.ancestors) + new_head_ancestors = list(new_head.ancestors) + self_root = self_ancestors[-1] if self_ancestors else self + new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head + start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge + end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge # Set new head - self.c.head = rel_newhead_i + self.c.head = new_head.i - self.i + # Adjust parse properties and sentence starts + set_children_from_heads(self.doc.c, start, end + 1) @property def conjuncts(self): From 7e4cd7575c33929bca0d3f7d932b0968803e4a71 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 17 Sep 2020 00:14:01 +0200 Subject: [PATCH 019/133] Refactor Docs.is_ flags (#6044) * Refactor Docs.is_ flags * Add derived `Doc.has_annotation` method * `Doc.has_annotation(attr)` returns `True` for partial annotation * `Doc.has_annotation(attr, require_complete=True)` returns `True` for complete annotation * Add deprecation warnings to `is_tagged`, `is_parsed`, `is_sentenced` and `is_nered` * Add `Doc._get_array_attrs()`, which returns a full list of `Doc` attrs for use with `Doc.to_array`, `Doc.to_bytes` and `Doc.from_docs`. The list is the `DocBin` attributes list plus `SPACY` and `LENGTH`. 
Notes on `Doc.has_annotation`: * `HEAD` is converted to `DEP` because heads don't have an unset state * Accept `IS_SENT_START` as a synonym of `SENT_START` Additional changes: * Add `NORM`, `ENT_ID` and `SENT_START` to default attributes for `DocBin` * In `Doc.from_array()` the presence of `DEP` causes `HEAD` to override `SENT_START` * In `Doc.from_array()` using `attrs` other than `Doc._get_array_attrs()` (i.e., a user's custom list rather than our default internal list) with both `HEAD` and `SENT_START` shows a warning that `HEAD` will override `SENT_START` * `set_children_from_heads` does not require dependency labels to set sentence boundaries and sets `sent_start` for all non-sentence starts to `-1` * Fix call to set_children_form_heads Co-authored-by: Matthew Honnibal --- spacy/displacy/__init__.py | 2 +- spacy/errors.py | 14 +- spacy/lang/de/syntax_iterators.py | 2 +- spacy/lang/el/syntax_iterators.py | 2 +- spacy/lang/en/syntax_iterators.py | 2 +- spacy/lang/es/syntax_iterators.py | 2 +- spacy/lang/fa/syntax_iterators.py | 2 +- spacy/lang/fr/syntax_iterators.py | 2 +- spacy/lang/id/syntax_iterators.py | 2 +- spacy/lang/nb/syntax_iterators.py | 2 +- spacy/lang/sv/syntax_iterators.py | 2 +- spacy/matcher/matcher.pyx | 15 +- spacy/matcher/phrasematcher.pyx | 20 ++- .../pipeline/_parser_internals/arc_eager.pyx | 1 - spacy/pipeline/functions.py | 2 +- spacy/pipeline/morphologizer.pyx | 2 - spacy/pipeline/tagger.pyx | 1 - spacy/tests/doc/test_doc_api.py | 89 ++++++++-- spacy/tests/doc/test_span.py | 6 +- spacy/tests/doc/test_token_api.py | 9 +- spacy/tests/lang/de/test_noun_chunks.py | 4 - spacy/tests/lang/el/test_noun_chunks.py | 4 - spacy/tests/lang/en/test_noun_chunks.py | 4 - spacy/tests/lang/en/test_sbd.py | 3 +- spacy/tests/lang/es/test_noun_chunks.py | 4 - spacy/tests/lang/fa/test_noun_chunks.py | 4 - spacy/tests/lang/fr/test_noun_chunks.py | 4 - spacy/tests/lang/id/test_noun_chunks.py | 4 - spacy/tests/lang/nb/test_noun_chunks.py | 4 - spacy/tests/lang/sv/test_noun_chunks.py | 4 - spacy/tests/matcher/test_matcher_api.py | 11 +- spacy/tests/matcher/test_phrase_matcher.py | 17 +- spacy/tests/parser/test_parse.py | 5 +- spacy/tests/parser/test_parse_navigate.py | 2 +- spacy/tests/parser/test_space_attachment.py | 3 +- spacy/tests/pipeline/test_attributeruler.py | 6 + spacy/tests/pipeline/test_functions.py | 2 - spacy/tests/pipeline/test_sentencizer.py | 12 +- spacy/tests/regression/test_issue1-1000.py | 5 +- spacy/tests/regression/test_issue1501-2000.py | 27 ++- spacy/tests/regression/test_issue2001-2500.py | 5 +- spacy/tests/regression/test_issue2501-3000.py | 8 +- spacy/tests/regression/test_issue3001-3500.py | 18 +- spacy/tests/regression/test_issue3501-4000.py | 2 - spacy/tests/regression/test_issue4001-4500.py | 5 +- spacy/tests/test_scorer.py | 1 - spacy/tests/test_training.py | 20 +-- spacy/tokens/_serialize.py | 2 +- spacy/tokens/doc.pxd | 4 - spacy/tokens/doc.pyx | 157 +++++++++--------- spacy/tokens/span.pyx | 17 +- spacy/tokens/token.pyx | 2 +- spacy/training/converters/conllu2docs.py | 4 - spacy/training/gold_io.pyx | 12 +- website/docs/api/doc.md | 47 +++--- website/docs/usage/v3.md | 20 +++ 56 files changed, 350 insertions(+), 282 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 0e80c3b5f..48229572b 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -121,7 +121,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: RETURNS (dict): Generated dependency parse keyed by words and 
arcs. """ doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"])) - if not doc.is_parsed: + if not doc.has_annotation("DEP"): warnings.warn(Warnings.W005) if options.get("collapse_phrases", False): with doc.retokenize() as retokenizer: diff --git a/spacy/errors.py b/spacy/errors.py index 3bdeeccbe..173aedab9 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -119,6 +119,11 @@ class Warnings: W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you " "need to match on a stream of documents, you can use nlp.pipe and " "call the {matcher} on each Doc object.") + W106 = ("Both HEAD and SENT_START are included as attributes in " + "doc.from_array(). The parse trees based on the HEAD attribute " + "will override the values in SENT_START.") + W107 = ("The property Doc.{prop} is deprecated. Use " + "Doc.has_annotation(\"{attr}\") instead.") @add_codes @@ -192,11 +197,6 @@ class Errors: "Alternatively, add the dependency parser, or set sentence " "boundaries by setting doc[i].is_sent_start.") E031 = ("Invalid token: empty string ('') at position {i}.") - E032 = ("Conflicting attributes specified in doc.from_array(): " - "(HEAD, SENT_START). The HEAD attribute currently sets sentence " - "boundaries implicitly, based on the tree structure. This means " - "the HEAD attribute would potentially override the sentence " - "boundaries set by SENT_START.") E033 = ("Cannot load into non-empty Doc of length {length}.") E035 = ("Error creating span with start {start} and end {end} for Doc of " "length {length}.") @@ -397,8 +397,8 @@ class Errors: E154 = ("One of the attributes or values is not supported for token " "patterns. Please use the option validate=True with Matcher, " "PhraseMatcher, or EntityRuler for more details.") - E155 = ("The pipeline needs to include a tagger in order to use " - "Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. " + E155 = ("The pipeline needs to include a {pipe} in order to use " + "Matcher or PhraseMatcher with the attribute {attr}. " "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) " "instead of list(nlp.tokenizer.pipe()).") E156 = ("The pipeline needs to include a parser in order to use " diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py index bd495f792..bd75a61eb 100644 --- a/spacy/lang/de/syntax_iterators.py +++ b/spacy/lang/de/syntax_iterators.py @@ -16,7 +16,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_label = doc.vocab.strings.add("NP") np_deps = set(doc.vocab.strings.add(label) for label in labels) diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 0a13edcc0..89cfd8b72 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: # Further improvement of the models will eliminate the need for this tag. labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"] doc = doclike.doc # Ensure works on both Doc and Span. 
- if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 59ae733bd..2a1b0867e 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index 427f1f203..ad0a1b838 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -8,7 +8,7 @@ from ...tokens import Doc, Span, Token def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" doc = doclike.doc - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) if not len(doc): return diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index b63db3539..0be06e73c 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -19,7 +19,7 @@ def noun_chunks(doclike): ] doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings.add(label) for label in labels] diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index d297203e3..68117a54d 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index f6d261643..0f29bfe16 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index d297203e3..68117a54d 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. 
- if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 662b508ed..d5ae47853 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 079cac788..d83f58181 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -17,7 +17,7 @@ from ..vocab cimport Vocab from ..tokens.doc cimport Doc, get_token_attr_for_matcher from ..tokens.span cimport Span from ..tokens.token cimport Token -from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA +from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings @@ -215,10 +215,15 @@ cdef class Matcher: else: raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) cdef Pool tmp_pool = Pool() - if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \ - and not doc.is_tagged: - raise ValueError(Errors.E155.format()) - if DEP in self._seen_attrs and not doc.is_parsed: + if TAG in self._seen_attrs and not doc.has_annotation("TAG"): + raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) + if POS in self._seen_attrs and not doc.has_annotation("POS"): + raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) + if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"): + raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) + if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"): + raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) + if DEP in self._seen_attrs and not doc.has_annotation("DEP"): raise ValueError(Errors.E156.format()) matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, extensions=self._extensions, predicates=self._extra_predicates) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index fae513367..b00ba157f 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -4,7 +4,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter import warnings -from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA +from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH from ..structs cimport TokenC from ..tokens.token cimport Token from ..tokens.span cimport Span @@ -184,12 +184,20 @@ cdef class PhraseMatcher: if len(doc) == 0: continue if isinstance(doc, Doc): - if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged: - raise ValueError(Errors.E155.format()) - if self.attr == DEP and not doc.is_parsed: + attrs = (TAG, POS, MORPH, LEMMA, DEP) + has_annotation = {attr: doc.has_annotation(attr) for attr in attrs} + if self.attr == TAG and not has_annotation[TAG]: + raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) + if self.attr == POS and not has_annotation[POS]: + 
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) + if self.attr == MORPH and not has_annotation[MORPH]: + raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) + if self.attr == LEMMA and not has_annotation[LEMMA]: + raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) + if self.attr == DEP and not has_annotation[DEP]: raise ValueError(Errors.E156.format()) - if self._validate and (doc.is_tagged or doc.is_parsed) \ - and self.attr not in (DEP, POS, TAG, LEMMA): + if self._validate and any(has_annotation.values()) \ + and self.attr not in attrs: string_attr = self.vocab.strings[self.attr] warnings.warn(Warnings.W012.format(key=key, attr=string_attr)) keyword = self._convert_to_array(doc) diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index a5fc2ea0e..dafa99bdd 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -679,7 +679,6 @@ cdef class ArcEager(TransitionSystem): st._sent[i].dep = self.root_label def finalize_doc(self, Doc doc): - doc.is_parsed = True set_children_from_heads(doc.c, 0, doc.length) def has_gold(self, Example eg, start=0, end=None): diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 7e68ea369..614608b25 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -17,7 +17,7 @@ def merge_noun_chunks(doc: Doc) -> Doc: DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_noun_chunks """ - if not doc.is_parsed: + if not doc.has_annotation("DEP"): return doc with doc.retokenize() as retokenizer: for np in doc.noun_chunks: diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index bb68a358c..62ad9e0eb 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -204,8 +204,6 @@ class Morphologizer(Tagger): doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph]) doc.c[j].pos = self.cfg["labels_pos"][morph] - doc.is_morphed = True - def get_loss(self, examples, scores): """Find the loss and gradient of loss for the batch of documents and their predicted scores. 
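A minimal sketch of how the refactored checks behave, assuming the v3 nightly API as changed in this patch (the blank pipeline, the pattern name and the example text below are illustrative only, not part of the diff):

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    doc = nlp.make_doc("This is a test")  # tokenizer only, no tags/parse set

    # The deprecated is_tagged/is_parsed flags are replaced by one derived check.
    assert not doc.has_annotation("POS")
    assert not doc.has_annotation("DEP")

    # Matching on POS without a component that sets it now raises E155,
    # naming the missing pipe ("morphologizer") and the missing attribute.
    matcher = Matcher(nlp.vocab)
    matcher.add("NOUN_PATTERN", [[{"POS": "NOUN"}]])
    try:
        matcher(doc)
    except ValueError as err:
        print(err)  # E155: the pipeline needs to include a morphologizer ...

Running the same text through a pipeline that includes a tagger or morphologizer sets the attribute, so doc.has_annotation("POS") returns True and the pattern matches without error.
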
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 1f8b4eb7a..0d78047ae 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -168,7 +168,6 @@ class Tagger(Pipe): # Don't clobber preset POS tags if doc.c[j].tag == 0: doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] - doc.is_tagged = True def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False): """Learn from a batch of documents and gold-standard information, diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 31dbad9ca..ce979d3d1 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -106,6 +106,7 @@ def test_doc_api_serialize(en_tokenizer, text): tokens = en_tokenizer(text) tokens[0].lemma_ = "lemma" tokens[0].norm_ = "norm" + tokens.ents = [(tokens.vocab.strings["PRODUCT"], 0, 1)] tokens[0].ent_kb_id_ = "ent_kb_id" new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes()) assert tokens.text == new_tokens.text @@ -144,7 +145,6 @@ def test_doc_api_set_ents(en_tokenizer): def test_doc_api_sents_empty_string(en_tokenizer): doc = en_tokenizer("") - doc.is_parsed = True sents = list(doc.sents) assert len(sents) == 0 @@ -181,10 +181,11 @@ def test_doc_api_right_edge(en_tokenizer): text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue." heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1, -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26] + deps = ["dep"] * len(heads) # fmt: on tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert doc[6].text == "for" subtree = [w.text for w in doc[6].subtree] # fmt: off @@ -240,7 +241,9 @@ def test_doc_api_similarity_match(): ) def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix): tokens = en_tokenizer(sentence) - doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads) + doc = get_doc( + tokens.vocab, [t.text for t in tokens], heads=heads, deps=["dep"] * len(heads) + ) lca = doc.get_lca_matrix() assert (lca == lca_matrix).all() assert lca[1, 1] == 1 @@ -251,16 +254,16 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix): def test_doc_is_nered(en_vocab): words = ["I", "live", "in", "New", "York"] doc = Doc(en_vocab, words=words) - assert not doc.is_nered + assert not doc.has_annotation("ENT_IOB") doc.ents = [Span(doc, 3, 5, label="GPE")] - assert doc.is_nered + assert doc.has_annotation("ENT_IOB") # Test creating doc from array with unknown values arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64") doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr) - assert doc.is_nered + assert doc.has_annotation("ENT_IOB") # Test serialization new_doc = Doc(en_vocab).from_bytes(doc.to_bytes()) - assert new_doc.is_nered + assert new_doc.has_annotation("ENT_IOB") def test_doc_from_array_sent_starts(en_vocab): @@ -271,25 +274,35 @@ def test_doc_from_array_sent_starts(en_vocab): # fmt: on doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + # HEAD overrides SENT_START with warning attrs = [SENT_START, HEAD] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) - with pytest.raises(ValueError): + with pytest.warns(UserWarning): new_doc.from_array(attrs, arr) - attrs = [SENT_START, DEP] + # no warning using default attrs + 
attrs = doc._get_array_attrs() + arr = doc.to_array(attrs) + with pytest.warns(None) as record: + new_doc.from_array(attrs, arr) + assert len(record) == 0 + + # only SENT_START uses SENT_START + attrs = [SENT_START] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) new_doc.from_array(attrs, arr) assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc] - assert not new_doc.is_parsed + assert not new_doc.has_annotation("DEP") + # only HEAD uses HEAD attrs = [HEAD, DEP] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) new_doc.from_array(attrs, arr) assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc] - assert new_doc.is_parsed + assert new_doc.has_annotation("DEP") def test_doc_from_array_morph(en_vocab): @@ -359,9 +372,6 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert m_doc[9].idx == think_idx m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"]) - with pytest.raises(ValueError): - # important attributes from sentenziser or parser are missing - assert list(m_doc.sents) assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) # space delimiter considered, although spacy attribute was missing assert str(m_doc) == " ".join(en_texts_without_empty) @@ -373,6 +383,15 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert m_doc[9].idx == think_idx +def test_doc_api_from_docs_ents(en_tokenizer): + texts = ["Merging the docs is fun.", "They don't think alike."] + docs = [en_tokenizer(t) for t in texts] + docs[0].ents = () + docs[1].ents = (Span(docs[1], 0, 1, label="foo"),) + doc = Doc.from_docs(docs) + assert len(doc.ents) == 1 + + def test_doc_lang(en_vocab): doc = Doc(en_vocab, words=["Hello", "world"]) assert doc.lang_ == "en" @@ -393,3 +412,45 @@ def test_token_lexeme(en_vocab): assert isinstance(token.lex, Lexeme) assert token.lex.text == token.text assert en_vocab[token.orth] == token.lex + + +def test_has_annotation(en_vocab): + doc = Doc(en_vocab, words=["Hello", "world"]) + attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE") + for attr in attrs: + assert not doc.has_annotation(attr) + + doc[0].tag_ = "A" + doc[0].pos_ = "X" + doc[0].morph_ = "Feat=Val" + doc[0].lemma_ = "a" + doc[0].dep_ = "dep" + doc[0].head = doc[1] + doc.ents = [Span(doc, 0, 1, label="HELLO")] + + for attr in attrs: + assert doc.has_annotation(attr) + assert not doc.has_annotation(attr, require_complete=True) + + doc[1].tag_ = "A" + doc[1].pos_ = "X" + doc[1].morph_ = "" + doc[1].lemma_ = "a" + doc[1].dep_ = "dep" + doc.ents = [Span(doc, 0, 2, label="HELLO")] + + for attr in attrs: + assert doc.has_annotation(attr) + assert doc.has_annotation(attr, require_complete=True) + + +def test_is_flags_deprecated(en_tokenizer): + doc = en_tokenizer("test") + with pytest.deprecated_call(): + doc.is_tagged + with pytest.deprecated_call(): + doc.is_parsed + with pytest.deprecated_call(): + doc.is_nered + with pytest.deprecated_call(): + doc.is_sentenced diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 1e9623484..ad4f49042 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -24,7 +24,6 @@ def doc_not_parsed(en_tokenizer): text = "This is a sentence. This is another sentence. And a third." 
tokens = en_tokenizer(text) doc = Doc(tokens.vocab, words=[t.text for t in tokens]) - doc.is_parsed = False return doc @@ -71,8 +70,9 @@ def test_spans_string_fn(doc): def test_spans_root2(en_tokenizer): text = "through North and South Carolina" heads = [0, 3, -1, -2, -4] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert doc[-2:].root.text == "Carolina" @@ -92,7 +92,7 @@ def test_spans_span_sent(doc, doc_not_parsed): def test_spans_lca_matrix(en_tokenizer): """Test span's lca matrix generation""" tokens = en_tokenizer("the lazy dog slept") - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0]) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4) lca = doc[:2].get_lca_matrix() assert lca.shape == (2, 2) assert lca[0, 0] == 0 # the & the -> the diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 28ef0dd7f..1308df67b 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -114,8 +114,9 @@ def test_doc_token_api_ancestors(en_tokenizer): def test_doc_token_api_head_setter(en_tokenizer): text = "Yesterday I saw a dog that barked loudly." heads = [2, 1, 0, 1, -2, 1, -2, -1, -6] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert doc[6].n_lefts == 1 assert doc[6].n_rights == 1 @@ -208,7 +209,6 @@ def test_is_sent_start(en_tokenizer): assert doc[5].is_sent_start is None doc[5].is_sent_start = True assert doc[5].is_sent_start is True - doc.is_parsed = True assert len(list(doc.sents)) == 2 @@ -217,7 +217,6 @@ def test_is_sent_end(en_tokenizer): assert doc[4].is_sent_end is None doc[5].is_sent_start = True assert doc[4].is_sent_end is True - doc.is_parsed = True assert len(list(doc.sents)) == 2 @@ -242,14 +241,14 @@ def test_token0_has_sent_start_true(): doc = Doc(Vocab(), words=["hello", "world"]) assert doc[0].is_sent_start is True assert doc[1].is_sent_start is None - assert not doc.is_sentenced + assert not doc.has_annotation("SENT_START") def test_tokenlast_has_sent_end_true(): doc = Doc(Vocab(), words=["hello", "world"]) assert doc[0].is_sent_end is None assert doc[1].is_sent_end is True - assert not doc.is_sentenced + assert not doc.has_annotation("SENT_START") def test_token_api_conjuncts_chain(en_vocab): diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py index ff9f8d5e5..0ed12d208 100644 --- a/spacy/tests/lang/de/test_noun_chunks.py +++ b/spacy/tests/lang/de/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_de(de_tokenizer): """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. 
""" doc = de_tokenizer("Er lag auf seinem") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py index 38e72b0b2..2d376c612 100644 --- a/spacy/tests/lang/el/test_noun_chunks.py +++ b/spacy/tests/lang/el/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_el(el_tokenizer): """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = el_tokenizer("είναι χώρα της νοτιοανατολικής") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index 5395dbabe..fa3a134bd 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -11,12 +11,8 @@ from ...util import get_doc def test_noun_chunks_is_parsed(en_tokenizer): """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = en_tokenizer("This is a sentence") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py index 38c8d94d8..ee1e6be17 100644 --- a/spacy/tests/lang/en/test_sbd.py +++ b/spacy/tests/lang/en/test_sbd.py @@ -7,8 +7,9 @@ from ...util import get_doc, apply_transition_sequence @pytest.mark.parametrize("punct", [".", "!", "?", ""]) def test_en_sbd_single_punct(en_tokenizer, text, punct): heads = [2, 1, 0, -1] if punct else [2, 1, 0] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text + punct) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert len(doc) == 4 if punct else 3 assert len(list(doc.sents)) == 1 assert sum(len(sent) for sent in doc.sents) == len(doc) diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py index a7ec4e562..db89fd903 100644 --- a/spacy/tests/lang/es/test_noun_chunks.py +++ b/spacy/tests/lang/es/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_es(es_tokenizer): """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = es_tokenizer("en Oxford este verano") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/fa/test_noun_chunks.py b/spacy/tests/lang/fa/test_noun_chunks.py index 767e91f6b..53b39d9a1 100644 --- a/spacy/tests/lang/fa/test_noun_chunks.py +++ b/spacy/tests/lang/fa/test_noun_chunks.py @@ -3,12 +3,8 @@ import pytest def test_noun_chunks_is_parsed_fa(fa_tokenizer): """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. 
""" doc = fa_tokenizer("این یک جمله نمونه می باشد.") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 5fd6897f7..d81199a3e 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_fr(fr_tokenizer): """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = fr_tokenizer("trouver des travaux antérieurs") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py index 445643933..fef1524f1 100644 --- a/spacy/tests/lang/id/test_noun_chunks.py +++ b/spacy/tests/lang/id/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_id(id_tokenizer): """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = id_tokenizer("sebelas") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py index c6a00354b..9965fcd14 100644 --- a/spacy/tests/lang/nb/test_noun_chunks.py +++ b/spacy/tests/lang/nb/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_nb(nb_tokenizer): """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = nb_tokenizer("Smørsausen brukes bl.a. til") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py index f352ca648..458cdadd5 100644 --- a/spacy/tests/lang/sv/test_noun_chunks.py +++ b/spacy/tests/lang/sv/test_noun_chunks.py @@ -5,12 +5,8 @@ from ...util import get_doc def test_noun_chunks_is_parsed_sv(sv_tokenizer): """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. 
""" doc = sv_tokenizer("Studenten läste den bästa boken") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index e0f335a19..04f9585f1 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -301,11 +301,14 @@ def test_matcher_basic_check(en_vocab): def test_attr_pipeline_checks(en_vocab): doc1 = Doc(en_vocab, words=["Test"]) - doc1.is_parsed = True + doc1[0].dep_ = "ROOT" doc2 = Doc(en_vocab, words=["Test"]) - doc2.is_tagged = True + doc2[0].tag_ = "TAG" + doc2[0].pos_ = "X" + doc2[0].morph_ = "Feat=Val" + doc2[0].lemma_ = "LEMMA" doc3 = Doc(en_vocab, words=["Test"]) - # DEP requires is_parsed + # DEP requires DEP matcher = Matcher(en_vocab) matcher.add("TEST", [[{"DEP": "a"}]]) matcher(doc1) @@ -313,7 +316,7 @@ def test_attr_pipeline_checks(en_vocab): matcher(doc2) with pytest.raises(ValueError): matcher(doc3) - # TAG, POS, LEMMA require is_tagged + # TAG, POS, LEMMA require those values for attr in ("TAG", "POS", "LEMMA"): matcher = Matcher(en_vocab) matcher.add("TEST", [[{attr: "a"}]]) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 4b7027f87..9caf284a3 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -187,9 +187,11 @@ def test_phrase_matcher_bool_attrs(en_vocab): def test_phrase_matcher_validation(en_vocab): doc1 = Doc(en_vocab, words=["Test"]) - doc1.is_parsed = True + doc1[0].dep_ = "ROOT" doc2 = Doc(en_vocab, words=["Test"]) - doc2.is_tagged = True + doc2[0].tag_ = "TAG" + doc2[0].pos_ = "X" + doc2[0].morph_ = "Feat=Val" doc3 = Doc(en_vocab, words=["Test"]) matcher = PhraseMatcher(en_vocab, validate=True) with pytest.warns(UserWarning): @@ -212,18 +214,21 @@ def test_attr_validation(en_vocab): def test_attr_pipeline_checks(en_vocab): doc1 = Doc(en_vocab, words=["Test"]) - doc1.is_parsed = True + doc1[0].dep_ = "ROOT" doc2 = Doc(en_vocab, words=["Test"]) - doc2.is_tagged = True + doc2[0].tag_ = "TAG" + doc2[0].pos_ = "X" + doc2[0].morph_ = "Feat=Val" + doc2[0].lemma_ = "LEMMA" doc3 = Doc(en_vocab, words=["Test"]) - # DEP requires is_parsed + # DEP requires DEP matcher = PhraseMatcher(en_vocab, attr="DEP") matcher.add("TEST1", [doc1]) with pytest.raises(ValueError): matcher.add("TEST2", [doc2]) with pytest.raises(ValueError): matcher.add("TEST3", [doc3]) - # TAG, POS, LEMMA require is_tagged + # TAG, POS, LEMMA require those values for attr in ("TAG", "POS", "LEMMA"): matcher = PhraseMatcher(en_vocab, attr=attr) matcher.add("TEST2", [doc2]) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 691a7c3aa..9e760c1e7 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -67,8 +67,9 @@ def test_parser_initial(en_tokenizer, en_parser): def test_parser_parse_subtrees(en_tokenizer, en_parser): text = "The four wheels on the bus turned quickly" heads = [2, 1, 4, -1, 1, -2, 0, -1] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert len(list(doc[2].lefts)) == 2 assert len(list(doc[2].rights)) == 1 assert len(list(doc[2].children)) == 3 @@ -184,7 +185,7 @@ def test_parser_set_sent_starts(en_vocab): if i == 0 or i == 3: assert doc[i].is_sent_start is True else: - assert not 
doc[i].is_sent_start + assert doc[i].is_sent_start is False for sent in doc.sents: for token in sent: assert token.head in sent diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index f42601a85..db1e98ba0 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -63,7 +63,7 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads): def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads): tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads)) lefts = {} rights = {} diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py index 3a0a6b943..3672dabea 100644 --- a/spacy/tests/parser/test_space_attachment.py +++ b/spacy/tests/parser/test_space_attachment.py @@ -8,8 +8,9 @@ from ..util import get_doc, apply_transition_sequence def test_parser_space_attachment(en_tokenizer): text = "This is a test.\nTo ensure spaces are attached well." heads = [1, 0, 1, -2, -3, -1, 1, 4, -1, 2, 1, 0, -1, -2] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) for sent in doc.sents: if len(sent) == 1: assert not sent[-1].is_space diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index 9254688cc..a66b34bc0 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -72,6 +72,8 @@ def test_attributeruler_init(nlp, pattern_dicts): assert doc[2].morph_ == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert doc.has_annotation("LEMMA") + assert doc.has_annotation("MORPH") def test_attributeruler_init_patterns(nlp, pattern_dicts): @@ -82,6 +84,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): assert doc[2].morph_ == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert doc.has_annotation("LEMMA") + assert doc.has_annotation("MORPH") nlp.remove_pipe("attribute_ruler") # initialize with patterns from asset nlp.add_pipe( @@ -93,6 +97,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): assert doc[2].morph_ == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert doc.has_annotation("LEMMA") + assert doc.has_annotation("MORPH") def test_attributeruler_score(nlp, pattern_dicts): diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py index 0ec8a5ec2..ee9e34df3 100644 --- a/spacy/tests/pipeline/test_functions.py +++ b/spacy/tests/pipeline/test_functions.py @@ -35,8 +35,6 @@ def doc2(en_tokenizer): deps=deps, ) doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])] - doc.is_parsed = True - doc.is_tagged = True return doc diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index 1b1c51f34..5dd0fef43 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -9,7 +9,7 @@ def test_sentencizer(en_vocab): doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."]) sentencizer = Sentencizer(punct_chars=None) doc = 
sentencizer(doc) - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") sent_starts = [t.is_sent_start for t in doc] sent_ends = [t.is_sent_end for t in doc] assert sent_starts == [True, False, True, False, False, False, False] @@ -22,13 +22,13 @@ def test_sentencizer_pipe(): nlp = English() nlp.add_pipe("sentencizer") for doc in nlp.pipe(texts): - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") sent_starts = [t.is_sent_start for t in doc] assert sent_starts == [True, False, True, False, False, False, False] assert len(list(doc.sents)) == 2 for ex in nlp.pipe(texts): doc = ex.doc - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") sent_starts = [t.is_sent_start for t in doc] assert sent_starts == [True, False, True, False, False, False, False] assert len(list(doc.sents)) == 2 @@ -42,7 +42,7 @@ def test_sentencizer_empty_docs(): nlp.add_pipe("sentencizer") for texts in [one_empty_text, many_empty_texts, some_empty_texts]: for doc in nlp.pipe(texts): - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") sent_starts = [t.is_sent_start for t in doc] if len(doc) == 0: assert sent_starts == [] @@ -82,7 +82,7 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents): doc = Doc(en_vocab, words=words) sentencizer = Sentencizer(punct_chars=None) doc = sentencizer(doc) - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") assert [t.is_sent_start for t in doc] == sent_starts assert [t.is_sent_end for t in doc] == sent_ends assert len(list(doc.sents)) == n_sents @@ -115,7 +115,7 @@ def test_sentencizer_custom_punct( doc = Doc(en_vocab, words=words) sentencizer = Sentencizer(punct_chars=punct_chars) doc = sentencizer(doc) - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") assert [t.is_sent_start for t in doc] == sent_starts assert [t.is_sent_end for t in doc] == sent_ends assert len(list(doc.sents)) == n_sents diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index ed5bcc1a5..30f66fb1d 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -94,7 +94,6 @@ def test_issue309(en_tokenizer): doc = get_doc( tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"] ) - doc.is_parsed = True assert len(doc) == 1 sents = list(doc.sents) assert len(sents) == 1 @@ -170,11 +169,9 @@ def test_issue595(): def test_issue599(en_vocab): doc = Doc(en_vocab) - doc.is_tagged = True - doc.is_parsed = True doc2 = Doc(doc.vocab) doc2.from_bytes(doc.to_bytes()) - assert doc2.is_parsed + assert doc2.has_annotation("DEP") def test_issue600(): diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index c1d726db6..e226c8524 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -14,7 +14,7 @@ from spacy.tokens import Doc, Span, Token from spacy.attrs import HEAD, DEP from spacy.matcher import Matcher -from ..util import make_tempdir +from ..util import make_tempdir, get_doc def test_issue1506(): @@ -198,17 +198,26 @@ def test_issue1834(): """Test that sentence boundaries & parse/tag flags are not lost during serialization.""" string = "This is a first sentence . 
And another one" - doc = Doc(Vocab(), words=string.split()) - doc[6].sent_start = True + words = string.split() + doc = get_doc(Vocab(), words=words) + doc[6].is_sent_start = True new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) assert new_doc[6].sent_start - assert not new_doc.is_parsed - assert not new_doc.is_tagged - doc.is_parsed = True - doc.is_tagged = True + assert not new_doc.has_annotation("DEP") + assert not new_doc.has_annotation("TAG") + doc = get_doc( + Vocab(), + words=words, + tags=["TAG"] * len(words), + heads=[0, -1, -2, -3, -4, -5, 0, -1, -2], + deps=["dep"] * len(words), + ) + print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc]) new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) - assert new_doc.is_parsed - assert new_doc.is_tagged + print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc]) + assert new_doc[6].sent_start + assert new_doc.has_annotation("DEP") + assert new_doc.has_annotation("TAG") def test_issue1868(): diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 357fbb84e..3bea5d3f6 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -72,8 +72,6 @@ def test_issue2219(en_vocab): def test_issue2361(de_tokenizer): chars = ("<", ">", "&", """) doc = de_tokenizer('< > & " ') - doc.is_parsed = True - doc.is_tagged = True html = render(doc) for char in chars: assert char in html @@ -108,6 +106,7 @@ def test_issue2385_biluo(tags): def test_issue2396(en_vocab): words = ["She", "created", "a", "test", "for", "spacy"] heads = [1, 0, 1, -2, -1, -1] + deps = ["dep"] * len(heads) matrix = numpy.array( [ [0, 1, 1, 1, 1, 1], @@ -119,7 +118,7 @@ def test_issue2396(en_vocab): ], dtype=numpy.int32, ) - doc = get_doc(en_vocab, words=words, heads=heads) + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) span = doc[:] assert (doc.get_lca_matrix() == matrix).all() assert (span.get_lca_matrix() == matrix).all() diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 859e4d80e..9267a7346 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -16,16 +16,16 @@ from ..util import get_doc def test_issue2564(): - """Test the tagger sets is_tagged correctly when used via Language.pipe.""" + """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe.""" nlp = Language() tagger = nlp.add_pipe("tagger") tagger.add_label("A") nlp.begin_training() doc = nlp("hello world") - assert doc.is_tagged + assert doc.has_annotation("TAG") docs = nlp.pipe(["hello", "world"]) piped_doc = next(docs) - assert piped_doc.is_tagged + assert piped_doc.has_annotation("TAG") def test_issue2569(en_tokenizer): @@ -123,7 +123,7 @@ def test_issue2772(en_vocab): heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4] deps = ["dep"] * len(heads) doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - assert not doc[1].is_sent_start + assert doc[1].is_sent_start is False @pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 3059eb5ab..d848467dd 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -63,7 +63,7 @@ def test_issue3012(en_vocab): pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] 
ents = [(2, 4, "PERCENT")] doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) - assert doc.is_tagged + assert doc.has_annotation("TAG") expected = ("10", "NUM", "CD", "PERCENT") assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected @@ -83,10 +83,14 @@ def test_issue3012(en_vocab): def test_issue3199(): """Test that Span.noun_chunks works correctly if no noun chunks iterator is available. To make this test future-proof, we're constructing a Doc - with a new Vocab here and setting is_parsed to make sure the noun chunks run. + with a new Vocab here and a parse tree to make sure the noun chunks run. """ - doc = Doc(Vocab(), words=["This", "is", "a", "sentence"]) - doc.is_parsed = True + doc = get_doc( + Vocab(), + words=["This", "is", "a", "sentence"], + heads=[0, -1, -2, -3], + deps=["dep"] * 4, + ) assert list(doc[0:3].noun_chunks) == [] @@ -250,16 +254,16 @@ def test_issue3456(): def test_issue3468(): - """Test that sentence boundaries are set correctly so Doc.is_sentenced can + """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can be restored after serialization.""" nlp = English() nlp.add_pipe("sentencizer") doc = nlp("Hello world") assert doc[0].is_sent_start - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") assert len(list(doc.sents)) == 1 doc_bytes = doc.to_bytes() new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) assert new_doc[0].is_sent_start - assert new_doc.is_sentenced + assert new_doc.has_annotation("SENT_START") assert len(list(new_doc.sents)) == 1 diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index d36e693c7..8c483d877 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -356,7 +356,6 @@ def test_issue3882(en_vocab): copy of the Doc. 
""" doc = Doc(en_vocab, words=["Hello", "world"]) - doc.is_parsed = True doc.user_data["test"] = set() parse_deps(doc) @@ -386,7 +385,6 @@ def test_issue3959(): doc[0].pos_ = "NOUN" assert doc[0].pos_ == "NOUN" # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True with make_tempdir() as tmp_dir: file_path = tmp_dir / "my_doc" doc.to_disk(file_path) diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 2beccedcf..4e58c347e 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -189,7 +189,6 @@ def test_issue4133(en_vocab): for i, token in enumerate(doc): token.pos_ = pos[i] # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True doc_bytes = doc.to_bytes() vocab = Vocab() vocab = vocab.from_bytes(vocab_bytes) @@ -249,7 +248,7 @@ def test_issue4267(): assert "ner" in nlp.pipe_names # assert that we have correct IOB annotations doc1 = nlp("hi") - assert doc1.is_nered + assert doc1.has_annotation("ENT_IOB") for token in doc1: assert token.ent_iob == 2 # add entity ruler and run again @@ -260,7 +259,7 @@ def test_issue4267(): assert "ner" in nlp.pipe_names # assert that we still have correct IOB annotations doc2 = nlp("hi") - assert doc2.is_nered + assert doc2.has_annotation("ENT_IOB") for token in doc2: assert token.ent_iob == 2 diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index fb96c0361..6e3604ce8 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -80,7 +80,6 @@ def tagged_doc(): doc[i].morph_ = morphs[i] if i > 0: doc[i].is_sent_start = False - doc.is_tagged = True return doc diff --git a/spacy/tests/test_training.py b/spacy/tests/test_training.py index 1926aca1f..5fd40a0dc 100644 --- a/spacy/tests/test_training.py +++ b/spacy/tests/test_training.py @@ -12,7 +12,7 @@ from thinc.api import compounding import pytest import srsly -from .util import make_tempdir +from .util import make_tempdir, get_doc @pytest.fixture @@ -26,24 +26,16 @@ def doc(): "NounType=prop|Number=sing", "PunctType=peri"] # head of '.' 
is intentionally nonprojective for testing heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5] + heads = [head - i for i, head in enumerate(heads)] deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"] lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."] - biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] + ents = ((0, 2, "PERSON"), (5, 7, "LOC"), (8, 9, "GPE")) cats = {"TRAVEL": 1.0, "BAKING": 0.0} # fmt: on nlp = English() - doc = nlp(text) - for i in range(len(tags)): - doc[i].tag_ = tags[i] - doc[i].pos_ = pos[i] - doc[i].morph_ = morphs[i] - doc[i].lemma_ = lemmas[i] - doc[i].dep_ = deps[i] - doc[i].head = doc[heads[i]] - doc.ents = spans_from_biluo_tags(doc, biluo_tags) + words = [t.text for t in nlp.make_doc(text)] + doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents) doc.cats = cats - doc.is_tagged = True - doc.is_parsed = True return doc @@ -194,7 +186,7 @@ def test_json2docs_no_ner(en_vocab): docs = json2docs(data) assert len(docs) == 1 for doc in docs: - assert not doc.is_nered + assert not doc.has_annotation("ENT_IOB") for token in doc: assert token.ent_iob == 0 eg = Example( diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index cd8c81939..c9a20f6c0 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -13,7 +13,7 @@ from ..errors import Errors from ..util import ensure_path, SimpleFrozenList # fmt: off -ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS") +ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START") # fmt: on diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 9b382d687..08f795b1a 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -46,10 +46,6 @@ cdef class Doc: cdef TokenC* c - cdef public bint is_tagged - cdef public bint is_parsed - cdef public bint is_morphed - cdef public float sentiment cdef public dict user_hooks diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 62a6dd6db..5c5443258 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -24,9 +24,11 @@ from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM from ..attrs import intify_attr, IDS from ..compat import copy_reg, pickle from ..errors import Errors, Warnings +from ..morphology import Morphology from .. import util from .underscore import Underscore, get_ext_args from ._retokenize import Retokenizer +from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS DEF PADDING = 5 @@ -185,8 +187,6 @@ cdef class Doc: self.c = data_start + PADDING self.max_length = size self.length = 0 - self.is_tagged = False - self.is_parsed = False self.sentiment = 0.0 self.cats = {} self.user_hooks = {} @@ -216,11 +216,6 @@ cdef class Doc: else: lexeme = self.vocab.get_by_orth(self.mem, word) self.push_back(lexeme, has_space) - # Tough to decide on policy for this. Is an empty doc tagged and parsed? - # There's no information we'd like to add to it, so I guess so? - if self.length == 0: - self.is_tagged = True - self.is_parsed = True @property def _(self): @@ -228,37 +223,61 @@ cdef class Doc: return Underscore(Underscore.doc_extensions, self) @property - def is_sentenced(self): - """Check if the document has sentence boundaries assigned. 
This is - defined as having at least one of the following: + def is_tagged(self): + warnings.warn(Warnings.W107.format(prop="is_tagged", attr="TAG"), DeprecationWarning) + return self.has_annotation("TAG") - a) An entry "sents" in doc.user_hooks"; - b) Doc.is_parsed is set to True; - c) At least one token other than the first where sent_start is not None. - """ - if "sents" in self.user_hooks: - return True - if self.is_parsed: - return True - if len(self) < 2: - return True - for i in range(1, self.length): - if self.c[i].sent_start == -1 or self.c[i].sent_start == 1: - return True - return False + @property + def is_parsed(self): + warnings.warn(Warnings.W107.format(prop="is_parsed", attr="DEP"), DeprecationWarning) + return self.has_annotation("DEP") @property def is_nered(self): - """Check if the document has named entities set. Will return True if - *any* of the tokens has a named entity tag set (even if the others are - unknown values), or if the document is empty. + warnings.warn(Warnings.W107.format(prop="is_nered", attr="ENT_IOB"), DeprecationWarning) + return self.has_annotation("ENT_IOB") + + @property + def is_sentenced(self): + warnings.warn(Warnings.W107.format(prop="is_sentenced", attr="SENT_START"), DeprecationWarning) + return self.has_annotation("SENT_START") + + def has_annotation(self, attr, *, require_complete=False): + """Check whether the doc contains annotation on a token attribute. + + attr (Union[int, str]): The attribute string name or int ID. + require_complete (bool): Whether to check that the attribute is set on + every token in the doc. + RETURNS (bool): Whether annotation is present. + + DOCS: https://nightly.spacy.io/api/doc#has_annotation """ - if len(self) == 0: + + # empty docs are always annotated + if self.length == 0: return True - for i in range(self.length): - if self.c[i].ent_iob != 0: + cdef int i + cdef int range_start = 0 + attr = intify_attr(attr) + # adjust attributes + if attr == HEAD: + # HEAD does not have an unset state, so rely on DEP + attr = DEP + elif attr == self.vocab.strings["IS_SENT_START"]: + # as in Matcher, allow IS_SENT_START as an alias of SENT_START + attr = SENT_START + # special cases for sentence boundaries + if attr == SENT_START: + if "sents" in self.user_hooks: return True - return False + # docs of length 1 always have sentence boundaries + if self.length == 1: + return True + range_start = 1 + if require_complete: + return all(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length)) + else: + return any(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length)) def __getitem__(self, object i): """Get a `Token` or `Span` object. @@ -628,7 +647,7 @@ cdef class Doc: DOCS: https://nightly.spacy.io/api/doc#sents """ - if not self.is_sentenced: + if not self.has_annotation("SENT_START"): raise ValueError(Errors.E030) if "sents" in self.user_hooks: yield from self.user_hooks["sents"](self) @@ -652,10 +671,6 @@ cdef class Doc: return self.vocab.lang cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: - if self.length == 0: - # Flip these to false when we see the first token. 
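As a rough sketch of how the shims above behave at runtime: the old flags still answer, but they emit a `DeprecationWarning` and simply defer to `has_annotation`. The blank pipeline and sample text here are only for illustration.

```python
import warnings
import spacy

nlp = spacy.blank("en")
doc = nlp.make_doc("This is a sentence")

# The old flag is now a thin wrapper that warns (W107) and delegates:
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert doc.is_tagged == doc.has_annotation("TAG")
assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# The replacement takes an attribute name (the same symbols used in Matcher patterns) ...
assert not doc.has_annotation("DEP")
# ... and can require the annotation on every token instead of any token.
assert not doc.has_annotation("TAG", require_complete=True)
```

The attribute names accepted here are the same token attribute symbols used in `Matcher` patterns.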
- self.is_tagged = False - self.is_parsed = False if self.length == self.max_length: self._realloc(self.length * 2) cdef TokenC* t = &self.c[self.length] @@ -802,8 +817,8 @@ cdef class Doc: if array.dtype != numpy.uint64: warnings.warn(Warnings.W028.format(type=array.dtype)) - if SENT_START in attrs and HEAD in attrs: - raise ValueError(Errors.E032) + if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs: + warnings.warn(Warnings.W106) cdef int i, col cdef int32_t abs_head_index cdef attr_id_t attr_id @@ -863,18 +878,17 @@ cdef class Doc: # add morph to morphology table self.vocab.morphology.add(self.vocab.strings[value]) Token.set_struct_attr(token, attr_ids[j], value) - # Set flags - self.is_parsed = bool(self.is_parsed or HEAD in attrs) - self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) - # If document is parsed, set children - if self.is_parsed: - set_children_from_heads(self.c, 0, length) + # If document is parsed, set children and sentence boundaries + if HEAD in attrs and DEP in attrs: + col = attrs.index(DEP) + if array[:, col].any(): + set_children_from_heads(self.c, 0, length) return self @staticmethod def from_docs(docs, ensure_whitespace=True, attrs=None): - """Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share - the same `Vocab`. + """Concatenate multiple Doc objects to form a new one. Raises an error + if the `Doc` objects do not all share the same `Vocab`. docs (list): A list of Doc objects. ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace. @@ -892,16 +906,7 @@ cdef class Doc: (vocab,) = vocab if attrs is None: - attrs = [LEMMA, NORM] - if all(doc.is_nered for doc in docs): - attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE]) - # TODO: separate for is_morphed? - if all(doc.is_tagged for doc in docs): - attrs.extend([TAG, POS, MORPH]) - if all(doc.is_parsed for doc in docs): - attrs.extend([HEAD, DEP]) - else: - attrs.append(SENT_START) + attrs = Doc._get_array_attrs() else: if any(isinstance(attr, str) for attr in attrs): # resolve attribute names attrs = [intify_attr(attr) for attr in attrs] # intify_attr returns None for invalid attrs @@ -973,9 +978,6 @@ cdef class Doc: other.tensor = copy.deepcopy(self.tensor) other.cats = copy.deepcopy(self.cats) other.user_data = copy.deepcopy(self.user_data) - other.is_tagged = self.is_tagged - other.is_parsed = self.is_parsed - other.is_morphed = self.is_morphed other.sentiment = self.sentiment other.has_unknown_spaces = self.has_unknown_spaces other.user_hooks = dict(self.user_hooks) @@ -1049,22 +1051,16 @@ cdef class Doc: DOCS: https://nightly.spacy.io/api/doc#to_bytes """ - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID] - if self.is_tagged: - array_head.extend([TAG, POS]) - # If doc parsed add head and dep attribute - if self.is_parsed: - array_head.extend([HEAD, DEP]) - # Otherwise add sent_start - else: - array_head.append(SENT_START) + array_head = Doc._get_array_attrs() strings = set() for token in self: strings.add(token.tag_) strings.add(token.lemma_) + strings.add(token.morph_) strings.add(token.dep_) strings.add(token.ent_type_) strings.add(token.ent_kb_id_) + strings.add(token.ent_id_) strings.add(token.norm_) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. 
As a best guess, we *know* that within @@ -1214,22 +1210,29 @@ cdef class Doc: DOCS: https://nightly.spacy.io/api/doc#to_json """ data = {"text": self.text} - if self.is_nered: + if self.has_annotation("ENT_IOB"): data["ents"] = [{"start": ent.start_char, "end": ent.end_char, "label": ent.label_} for ent in self.ents] - if self.is_sentenced: + if self.has_annotation("SENT_START"): sents = list(self.sents) data["sents"] = [{"start": sent.start_char, "end": sent.end_char} for sent in sents] if self.cats: data["cats"] = self.cats data["tokens"] = [] + attrs = ["TAG", "MORPH", "POS", "LEMMA", "DEP"] + include_annotation = {attr: self.has_annotation(attr) for attr in attrs} for token in self: token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)} - if self.is_tagged: - token_data["pos"] = token.pos_ + if include_annotation["TAG"]: token_data["tag"] = token.tag_ - if self.is_parsed: + if include_annotation["POS"]: + token_data["pos"] = token.pos_ + if include_annotation["MORPH"]: + token_data["morph"] = token.morph_ + if include_annotation["LEMMA"]: + token_data["lemma"] = token.lemma_ + if include_annotation["DEP"]: token_data["dep"] = token.dep_ token_data["head"] = token.head.i data["tokens"].append(token_data) @@ -1275,6 +1278,12 @@ cdef class Doc: j += 1 return output + @staticmethod + def _get_array_attrs(): + attrs = [LENGTH, SPACY] + attrs.extend(intify_attr(x) for x in DOCBIN_ALL_ATTRS) + return tuple(attrs) + cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2: cdef int i = token_by_char(tokens, length, start_char) @@ -1335,7 +1344,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1: tokens[i].sent_start = -1 for i in range(start, end): if tokens[i].head == 0: - tokens[tokens[i].l_edge].sent_start = True + tokens[tokens[i].l_edge].sent_start = 1 cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 1f42c84ee..781474d3a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -201,7 +201,7 @@ cdef class Span: return Underscore(Underscore.span_extensions, self, start=self.start_char, end=self.end_char) - def as_doc(self, bint copy_user_data=False): + def as_doc(self, *, bint copy_user_data=False): """Create a `Doc` object with a copy of the `Span`'s data. copy_user_data (bool): Whether or not to copy the original doc's user data. 
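Because `copy_user_data` is keyword-only after this change, positional call sites need a small update. A minimal sketch, assuming a blank pipeline and throwaway text:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp.make_doc("New York is busy in winter")
span = doc[0:2]

# The flag must now be passed by name; span.as_doc(True) raises a TypeError.
span_doc = span.as_doc(copy_user_data=True)
assert len(span_doc) == 2
```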
@@ -209,19 +209,10 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#as_doc """ - # TODO: make copy_user_data a keyword-only argument (Python 3 only) words = [t.text for t in self] spaces = [bool(t.whitespace_) for t in self] cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces) - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID] - if self.doc.is_tagged: - array_head.append(TAG) - # If doc parsed add head and dep attribute - if self.doc.is_parsed: - array_head.extend([HEAD, DEP]) - # Otherwise add sent_start - else: - array_head.append(SENT_START) + array_head = self.doc._get_array_attrs() array = self.doc.to_array(array_head) array = array[self.start : self.end] self._fix_dep_copy(array_head, array) @@ -375,7 +366,7 @@ cdef class Span: self.doc.sents # Use `sent_start` token attribute to find sentence boundaries cdef int n = 0 - if self.doc.is_sentenced: + if self.doc.has_annotation("SENT_START"): # Find start of the sentence start = self.start while self.doc.c[start].sent_start != 1 and start > 0: @@ -507,8 +498,6 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#noun_chunks """ - if not self.doc.is_parsed: - raise ValueError(Errors.E029) # Accumulate the result before beginning to iterate over it. This # prevents the tokenisation from being changed out from under us # during the iteration. The tricky thing here is that Span accepts diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 35142c35e..239de4559 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -486,7 +486,7 @@ cdef class Token: return True def __set__(self, value): - if self.doc.is_parsed: + if self.doc.has_annotation("DEP"): raise ValueError(Errors.E043) if value is None: self.c.sent_start = 0 diff --git a/spacy/training/converters/conllu2docs.py b/spacy/training/converters/conllu2docs.py index 85afdeef3..ebd123375 100644 --- a/spacy/training/converters/conllu2docs.py +++ b/spacy/training/converters/conllu2docs.py @@ -212,8 +212,6 @@ def doc_from_conllu_sentence( doc[i]._.merged_spaceafter = spaces[i] ents = get_entities(lines, ner_tag_pattern, ner_map) doc.ents = spans_from_biluo_tags(doc, ents) - doc.is_parsed = True - doc.is_tagged = True if merge_subtokens: doc = merge_conllu_subtokens(lines, doc) @@ -243,8 +241,6 @@ def doc_from_conllu_sentence( doc_x[i].dep_ = deps[i] doc_x[i].head = doc_x[heads[i]] doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] - doc_x.is_parsed = True - doc_x.is_tagged = True return doc_x diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx index 5dc39eb31..b58df0d71 100644 --- a/spacy/training/gold_io.pyx +++ b/spacy/training/gold_io.pyx @@ -33,19 +33,25 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}} json_para["links"].append(link_dict) biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag) + attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB") + include_annotation = {attr: doc.has_annotation(attr) for attr in attrs} for j, sent in enumerate(doc.sents): json_sent = {"tokens": [], "brackets": []} for token in sent: json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_} - if doc.is_tagged: + if include_annotation["TAG"]: json_token["tag"] = token.tag_ + if include_annotation["POS"]: json_token["pos"] = token.pos_ + if include_annotation["MORPH"]: json_token["morph"] = token.morph_ + if include_annotation["LEMMA"]: 
json_token["lemma"] = token.lemma_ - if doc.is_parsed: + if include_annotation["DEP"]: json_token["head"] = token.head.i-token.i json_token["dep"] = token.dep_ - json_token["ner"] = biluo_tags[token.i] + if include_annotation["ENT_IOB"]: + json_token["ner"] = biluo_tags[token.i] json_sent["tokens"].append(json_token) json_para["sentences"].append(json_sent) json_doc["paragraphs"].append(json_para) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 88dc62c2a..380f6a172 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -267,6 +267,17 @@ ancestor is found, e.g. if span excludes a necessary ancestor. | ----------- | -------------------------------------------------------------------------------------- | | **RETURNS** | The lowest common ancestor matrix of the `Doc`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ | +## Doc.has_annotation {#has_annotation tag="method"} + +Check whether the doc contains annotation on a token attribute. + +| Name | Description | +| ------------------ | --------------------------------------------------------------------------------------------------- | +| `attr` | The attribute string name or int ID. ~~Union[int, str]~~ | +| _keyword-only_ | | +| `require_complete` | Whether to check that the attribute is set on every token in the doc. Defaults to `False`. ~~bool~~ | +| **RETURNS** | Whether specified annotation is present in the doc. ~~bool~~ | + ## Doc.to_array {#to_array tag="method"} Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence @@ -609,26 +620,22 @@ The L2 norm of the document's vector representation. ## Attributes {#attributes} -| Name | Description | -| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `text` | A string representation of the document text. ~~str~~ | -| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | -| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | -| `vocab` | The store of lexical types. ~~Vocab~~ | -| `tensor` 2 | Container for dense vector representations. ~~numpy.ndarray~~ | -| `cats` 2 | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ | -| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | -| `lang` 2.1 | Language of the document's vocabulary. ~~int~~ | -| `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ | -| `is_tagged` | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. ~~bool~~ | -| `is_parsed` | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. ~~bool~~ | -| `is_sentenced` | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. ~~bool~~ | -| `is_nered` 2.1 | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. ~~bool~~ | -| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | -| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. 
~~Dict[str, Callable]~~ | -| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | -| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | -| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | +| Name | Description | +| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `text` | A string representation of the document text. ~~str~~ | +| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | +| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | +| `vocab` | The store of lexical types. ~~Vocab~~ | +| `tensor` 2 | Container for dense vector representations. ~~numpy.ndarray~~ | +| `cats` 2 | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ | +| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | +| `lang` 2.1 | Language of the document's vocabulary. ~~int~~ | +| `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ | +| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | +| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | +| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | +| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | +| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 44810da58..346b44600 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -410,6 +410,7 @@ The following methods, attributes and commands are new in spaCy v3.0. | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). | | [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. | +| [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. | | [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. | | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). | | [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. 
| @@ -763,6 +764,25 @@ nlp = spacy.blank("en") + ruler.load_from_tag_map(YOUR_TAG_MAP) ``` +### Migrating Doc flags {#migrating-doc-flags} + +The `Doc` flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and +`Doc.is_sentenced` are deprecated in v3 and replaced by +[`Doc.has_annotation`](/api/doc#has_annotation) method, which refers to the +token attribute symbols (the same symbols used in `Matcher` patterns): + +```diff +doc = nlp(text) +- doc.is_parsed ++ doc.has_annotation("DEP") +- doc.is_tagged ++ doc.has_annotation("TAG") +- doc.is_sentenced ++ doc.has_annotation("SENT_START") +- doc.is_nered ++ doc.has_annotation("ENT_IOB") +``` + ### Training pipelines and models {#migrating-training} To train your pipelines, you should now pretty much always use the From 8303d101a5327e96ecddb28d7dc668d75db56b50 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 17 Sep 2020 00:18:49 +0200 Subject: [PATCH 020/133] Set version to v3.0.0a19 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 4ed3dd327..4fb6dfff1 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a18" +__version__ = "3.0.0a19" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 21dcf92964c6a2c4218d5ffc44a164dead641c44 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 17 Sep 2020 09:21:36 +0200 Subject: [PATCH 021/133] Update website/docs/api/data-formats.md Co-authored-by: Matthew Honnibal --- website/docs/api/data-formats.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 74d612862..cf091e16c 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -130,7 +130,7 @@ process that are used when you run [`spacy train`](/api/cli#train). | --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `corpus` | Dictionary with `train` and `develop` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | +| `corpus` | Dictionary with `train` and `dev` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | | `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. 
~~List[str]~~ | From 0c35885751f2ad83098f54103de33b987b4a199e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 11:38:59 +0200 Subject: [PATCH 022/133] generalize corpora, dot notation for dev and train corpus --- extra/experiments/onto-joint/defaults.cfg | 34 +++--- .../ptb-joint-pos-dep/defaults.cfg | 32 +++--- spacy/cli/pretrain.py | 3 +- spacy/cli/templates/quickstart_training.jinja | 27 ++--- spacy/cli/train.py | 5 +- spacy/default_config.cfg | 56 +++++---- spacy/default_config_pretraining.cfg | 17 +-- spacy/schemas.py | 6 +- .../tests/serialize/test_serialize_config.py | 20 ++-- spacy/tests/training/test_readers.py | 63 ++++++++++- website/docs/api/cli.md | 20 ++-- website/docs/api/corpus.md | 4 +- website/docs/api/data-formats.md | 107 +++++++++++++----- website/docs/api/top-level.md | 6 +- website/docs/usage/projects.md | 2 +- website/docs/usage/training.md | 2 +- 16 files changed, 261 insertions(+), 143 deletions(-) diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg index 97eebe6b4..90101281c 100644 --- a/extra/experiments/onto-joint/defaults.cfg +++ b/extra/experiments/onto-joint/defaults.cfg @@ -8,6 +8,22 @@ init_tok2vec = null seed = 0 use_pytorch_for_gpu_memory = false +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths:train} +gold_preproc = true +max_length = 0 +limit = 0 + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths:dev} +gold_preproc = ${corpora.train.gold_preproc} +max_length = 0 +limit = 0 + [training] seed = ${system:seed} dropout = 0.1 @@ -20,22 +36,8 @@ patience = 10000 eval_frequency = 200 score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2} frozen_components = [] - -[training.corpus] - -[training.corpus.train] -@readers = "spacy.Corpus.v1" -path = ${paths:train} -gold_preproc = true -max_length = 0 -limit = 0 - -[training.corpus.dev] -@readers = "spacy.Corpus.v1" -path = ${paths:dev} -gold_preproc = ${training.read_train:gold_preproc} -max_length = 0 -limit = 0 +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" [training.batcher] @batchers = "spacy.batch_by_words.v1" diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg index 03e2f5bd7..55fb52b99 100644 --- a/extra/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/extra/experiments/ptb-joint-pos-dep/defaults.cfg @@ -8,6 +8,22 @@ init_tok2vec = null seed = 0 use_pytorch_for_gpu_memory = false +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths:train} +gold_preproc = true +max_length = 0 +limit = 0 + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths:dev} +gold_preproc = ${corpora.train.gold_preproc} +max_length = 0 +limit = 0 + [training] seed = ${system:seed} dropout = 0.2 @@ -20,22 +36,6 @@ patience = 10000 eval_frequency = 200 score_weights = {"dep_las": 0.8, "tag_acc": 0.2} -[training.corpus] - -[training.corpus.train] -@readers = "spacy.Corpus.v1" -path = ${paths:train} -gold_preproc = true -max_length = 0 -limit = 0 - -[training.corpus.dev] -@readers = "spacy.Corpus.v1" -path = ${paths:dev} -gold_preproc = ${training.read_train:gold_preproc} -max_length = 0 -limit = 0 - [training.batcher] @batchers = "spacy.batch_by_words.v1" discard_oversize = false diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 70858123d..3567e7339 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -20,6 +20,7 @@ from ..ml.models.multi_task import build_cloze_characters_multi_task_model from ..tokens 
import Doc from ..attrs import ID from .. import util +from ..util import dot_to_object @app.command( @@ -106,7 +107,7 @@ def pretrain( use_pytorch_for_gpu_memory() nlp, config = util.load_model_from_config(config) P_cfg = config["pretraining"] - corpus = P_cfg["corpus"] + corpus = dot_to_object(config, config["pretraining"]["corpus"]) batcher = P_cfg["batcher"] model = create_pretraining_model(nlp, config["pretraining"]) optimizer = config["pretraining"]["optimizer"] diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 39d4d875d..00b77af4d 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -173,6 +173,18 @@ factory = "{{ pipe }}" {% endif %} {% endfor %} +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = {{ 500 if hardware == "gpu" else 2000 }} + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 + [training] {% if use_transformer or optimize == "efficiency" or not word_vectors -%} vectors = null @@ -182,11 +194,12 @@ vectors = "{{ word_vectors }}" {% if use_transformer -%} accumulate_gradient = {{ transformer["size_factor"] }} {% endif %} +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" [training.optimizer] @optimizers = "Adam.v1" - {% if use_transformer -%} [training.optimizer.learn_rate] @schedules = "warmup_linear.v1" @@ -195,18 +208,6 @@ total_steps = 20000 initial_rate = 5e-5 {% endif %} -[training.corpus] - -[training.corpus.train] -@readers = "spacy.Corpus.v1" -path = ${paths.train} -max_length = {{ 500 if hardware == "gpu" else 2000 }} - -[training.corpus.dev] -@readers = "spacy.Corpus.v1" -path = ${paths.dev} -max_length = 0 - {% if use_transformer %} [training.batcher] @batchers = "spacy.batch_by_padded.v1" diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 2c2eeb88b..15c745b69 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -18,6 +18,7 @@ from ..language import Language from .. import util from ..training.example import Example from ..errors import Errors +from ..util import dot_to_object @app.command( @@ -92,8 +93,8 @@ def train( raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) T_cfg = config["training"] optimizer = T_cfg["optimizer"] - train_corpus = T_cfg["corpus"]["train"] - dev_corpus = T_cfg["corpus"]["dev"] + train_corpus = dot_to_object(config, config["training"]["train_corpus"]) + dev_corpus = dot_to_object(config, config["training"]["dev_corpus"]) batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] # Components that shouldn't be updated during training diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 61f3dfe25..c7c9593d7 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -22,6 +22,33 @@ after_pipeline_creation = null [components] +# Readers for corpora like dev and train. +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. +gold_preproc = false +# Limitations on training document length +max_length = 0 +# Limitation on number of training examples +limit = 0 + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. 
If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. +gold_preproc = false +# Limitations on training document length +max_length = 0 +# Limitation on number of training examples +limit = 0 + # Training hyper-parameters and additional features. [training] seed = ${system.seed} @@ -40,35 +67,14 @@ eval_frequency = 200 score_weights = {} # Names of pipeline components that shouldn't be updated during training frozen_components = [] +# Location in the config where the dev corpus is defined +dev_corpus = "corpora.dev" +# Location in the config where the train corpus is defined +train_corpus = "corpora.train" [training.logger] @loggers = "spacy.ConsoleLogger.v1" -[training.corpus] - -[training.corpus.train] -@readers = "spacy.Corpus.v1" -path = ${paths.train} -# Whether to train on sequences with 'gold standard' sentence boundaries -# and tokens. If you set this to true, take care to ensure your run-time -# data is passed in sentence-by-sentence via some prior preprocessing. -gold_preproc = false -# Limitations on training document length -max_length = 0 -# Limitation on number of training examples -limit = 0 - -[training.corpus.dev] -@readers = "spacy.Corpus.v1" -path = ${paths.dev} -# Whether to train on sequences with 'gold standard' sentence boundaries -# and tokens. If you set this to true, take care to ensure your run-time -# data is passed in sentence-by-sentence via some prior preprocessing. -gold_preproc = false -# Limitations on training document length -max_length = 0 -# Limitation on number of training examples -limit = 0 [training.batcher] @batchers = "spacy.batch_by_words.v1" diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg index 9120db338..bbd595308 100644 --- a/spacy/default_config_pretraining.cfg +++ b/spacy/default_config_pretraining.cfg @@ -4,6 +4,7 @@ dropout = 0.2 n_save_every = null component = "tok2vec" layer = "" +corpus = "corpora.pretrain" [pretraining.batcher] @batchers = "spacy.batch_by_words.v1" @@ -12,13 +13,6 @@ discard_oversize = false tolerance = 0.2 get_length = null -[pretraining.corpus] -@readers = "spacy.JsonlReader.v1" -path = ${paths.raw} -min_length = 5 -max_length = 500 -limit = 0 - [pretraining.objective] type = "characters" n_characters = 4 @@ -33,3 +27,12 @@ grad_clip = 1.0 use_averages = true eps = 1e-8 learn_rate = 0.001 + +[corpora] + +[corpora.pretrain] +@readers = "spacy.JsonlReader.v1" +path = ${paths.raw} +min_length = 5 +max_length = 500 +limit = 0 diff --git a/spacy/schemas.py b/spacy/schemas.py index 2030048d8..a530db3d0 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -198,7 +198,8 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off vectors: Optional[StrictStr] = Field(..., title="Path to vectors") - corpus: Dict[str, Reader] = Field(..., title="Reader for the training and dev data") + dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data") + train_corpus: StrictStr = Field(..., title="Path in the config to the training data") batcher: Batcher = Field(..., title="Batcher for the training data") dropout: StrictFloat = Field(..., title="Dropout rate") patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score") @@ -248,7 +249,7 @@ class ConfigSchemaPretrain(BaseModel): dropout: StrictFloat = Field(..., title="Dropout rate") n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency") optimizer: Optimizer 
= Field(..., title="The optimizer to use") - corpus: Reader = Field(..., title="Reader for the training data") + corpus: StrictStr = Field(..., title="Path in the config to the training data") batcher: Batcher = Field(..., title="Batcher for the training data") component: str = Field(..., title="Component to find the layer to pretrain") layer: str = Field(..., title="Layer to pretrain. Whole model if empty.") @@ -267,6 +268,7 @@ class ConfigSchema(BaseModel): nlp: ConfigSchemaNlp pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} components: Dict[str, Dict[str, Any]] + corpora: Dict[str, Reader] @root_validator(allow_reuse=True) def validate_config(cls, values): diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index d113ac2a5..1e17b3212 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -17,18 +17,18 @@ nlp_config_string = """ train = "" dev = "" -[training] +[corpora] -[training.corpus] - -[training.corpus.train] +[corpora.train] @readers = "spacy.Corpus.v1" path = ${paths.train} -[training.corpus.dev] +[corpora.dev] @readers = "spacy.Corpus.v1" path = ${paths.dev} +[training] + [training.batcher] @batchers = "spacy.batch_by_words.v1" size = 666 @@ -302,20 +302,20 @@ def test_config_overrides(): def test_config_interpolation(): config = Config().from_str(nlp_config_string, interpolate=False) - assert config["training"]["corpus"]["train"]["path"] == "${paths.train}" + assert config["corpora"]["train"]["path"] == "${paths.train}" interpolated = config.interpolate() - assert interpolated["training"]["corpus"]["train"]["path"] == "" + assert interpolated["corpora"]["train"]["path"] == "" nlp = English.from_config(config) - assert nlp.config["training"]["corpus"]["train"]["path"] == "${paths.train}" + assert nlp.config["corpora"]["train"]["path"] == "${paths.train}" # Ensure that variables are preserved in nlp config width = "${components.tok2vec.model.width}" assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width interpolated2 = nlp.config.interpolate() - assert interpolated2["training"]["corpus"]["train"]["path"] == "" + assert interpolated2["corpora"]["train"]["path"] == "" assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 nlp2 = English.from_config(interpolated) - assert nlp2.config["training"]["corpus"]["train"]["path"] == "" + assert nlp2.config["corpora"]["train"]["path"] == "" assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index c81ec0897..52a4abecc 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -1,6 +1,57 @@ +from typing import Dict, Iterable, Callable import pytest from thinc.api import Config -from spacy.util import load_model_from_config + +from spacy import Language +from spacy.util import load_model_from_config, registry, dot_to_object +from spacy.training import Example + + +def test_readers(): + config_string = """ + [training] + + [corpora] + @readers = "myreader.v1" + + [nlp] + lang = "en" + pipeline = ["tok2vec", "textcat"] + + [components] + + [components.tok2vec] + factory = "tok2vec" + + [components.textcat] + factory = "textcat" + """ + @registry.readers.register("myreader.v1") + def myreader() -> Dict[str, Callable[[Language, 
str], Iterable[Example]]]: + annots = {"cats": {"POS": 1.0, "NEG": 0.0}} + def reader(nlp: Language): + doc = nlp.make_doc(f"This is an example") + return [Example.from_dict(doc, annots)] + return {"train": reader, "dev": reader, "extra": reader, "something": reader} + + config = Config().from_str(config_string) + nlp, resolved = load_model_from_config(config, auto_fill=True) + + train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"]) + assert isinstance(train_corpus, Callable) + optimizer = resolved["training"]["optimizer"] + # simulate a training loop + nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) + for example in train_corpus(nlp): + nlp.update([example], sgd=optimizer) + dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"]) + scores = nlp.evaluate(list(dev_corpus(nlp))) + assert scores["cats_score"] + # ensure the pipeline runs + doc = nlp("Quick test") + assert doc.cats + extra_corpus = resolved["corpora"]["extra"] + assert isinstance(extra_corpus, Callable) @pytest.mark.slow @@ -16,7 +67,7 @@ def test_cat_readers(reader, additional_config): nlp_config_string = """ [training] - [training.corpus] + [corpora] @readers = "PLACEHOLDER" [nlp] @@ -32,11 +83,11 @@ def test_cat_readers(reader, additional_config): factory = "textcat" """ config = Config().from_str(nlp_config_string) - config["training"]["corpus"]["@readers"] = reader - config["training"]["corpus"].update(additional_config) + config["corpora"]["@readers"] = reader + config["corpora"].update(additional_config) nlp, resolved = load_model_from_config(config, auto_fill=True) - train_corpus = resolved["training"]["corpus"]["train"] + train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"]) optimizer = resolved["training"]["optimizer"] # simulate a training loop nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer) @@ -46,7 +97,7 @@ def test_cat_readers(reader, additional_config): assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0] nlp.update([example], sgd=optimizer) # simulate performance benchmark on dev corpus - dev_corpus = resolved["training"]["corpus"]["dev"] + dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"]) dev_examples = list(dev_corpus(nlp)) for example in dev_examples: # this shouldn't fail if each dev example has at least one positive label diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 7dd6e6184..5c5eb6486 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -355,6 +355,16 @@ Registry @architectures Name spacy.MaxoutWindowEncoder.v1 Module spacy.ml.models.tok2vec File /path/to/spacy/ml/models/tok2vec.py (line 207) +ℹ [corpora.dev] +Registry @readers +Name spacy.Corpus.v1 +Module spacy.training.corpus +File /path/to/spacy/training/corpus.py (line 18) +ℹ [corpora.train] +Registry @readers +Name spacy.Corpus.v1 +Module spacy.training.corpus +File /path/to/spacy/training/corpus.py (line 18) ℹ [training.logger] Registry @loggers Name spacy.ConsoleLogger.v1 @@ -370,16 +380,6 @@ Registry @schedules Name compounding.v1 Module thinc.schedules File /path/to/thinc/thinc/schedules.py (line 43) -ℹ [training.corpus.dev] -Registry @readers -Name spacy.Corpus.v1 -Module spacy.training.corpus -File /path/to/spacy/training/corpus.py (line 18) -ℹ [training.corpus.train] -Registry @readers -Name spacy.Corpus.v1 -Module spacy.training.corpus -File /path/to/spacy/training/corpus.py (line 18) ℹ [training.optimizer] Registry @optimizers Name Adam.v1 diff --git a/website/docs/api/corpus.md 
b/website/docs/api/corpus.md index c25ce1651..2b308d618 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -26,7 +26,7 @@ streaming. > [paths] > train = "corpus/train.spacy" > -> [training.corpus.train] +> [corpora.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false @@ -135,7 +135,7 @@ Initialize the reader. > > ```ini > ### Example config -> [pretraining.corpus] +> [corpora.pretrain] > @readers = "spacy.JsonlReader.v1" > path = "corpus/raw_text.jsonl" > min_length = 0 diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index cf091e16c..f868233c7 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -121,28 +121,78 @@ that you don't want to hard-code in your config file. $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy ``` +### corpora {#config-corpora tag="section"} + +This section defines a dictionary mapping of string keys to `Callable` +functions. Each callable takes an `nlp` object and yields +[`Example`](/api/example) objects. By default, the two keys `train` and `dev` +are specified and each refer to a [`Corpus`](/api/top-level#Corpus). When +pretraining, an additional pretrain section is added that defaults to a +[`JsonlReader`](/api/top-level#JsonlReader). + +These subsections can be expanded with additional subsections, each referring to +a callback of type `Callable[[Language], Iterator[Example]]`: + +> #### Example +> +> ```ini +> [corpora] +> [corpora.train] +> @readers = "spacy.Corpus.v1" +> path = ${paths:train} +> +> [corpora.dev] +> @readers = "spacy.Corpus.v1" +> path = ${paths:dev} +> +> [corpora.pretrain] +> @readers = "spacy.JsonlReader.v1" +> path = ${paths.raw} +> min_length = 5 +> max_length = 500 +> +> [corpora.mydata] +> @readers = "my_reader.v1" +> shuffle = true +> ``` + +Alternatively, the `corpora` block could refer to one function with return type +`Dict[str, Callable[[Language], Iterator[Example]]]`: + +> #### Example +> +> ```ini +> [corpora] +> @readers = "my_dict_reader.v1" +> train_path = ${paths:train} +> dev_path = ${paths:dev} +> shuffle = true +> +> ``` + ### training {#config-training tag="section"} This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `corpus` | Dictionary with `train` and `dev` keys, each referring to a callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. 
See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | +| Name | Description | +| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | +| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. 
Defaults to variable `${system.seed}`. ~~int~~ | +| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -150,17 +200,18 @@ This section is optional and defines settings and controls for [language model pretraining](/usage/embeddings-transformers#pretraining). It's used when you run [`spacy pretrain`](/api/cli#pretrain). -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ | -| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | -| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | -| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | -| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `corpus` | Callable that takes the current `nlp` object and yields [`Doc`](/api/doc) objects. Defaults to [`JsonlReader`](/api/top-level#JsonlReader). ~~Callable[[Language, str], Iterable[Example]]~~ | -| `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ | -| `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------ | +| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ | +| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | +| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | +| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | +| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ | +| `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ | +| | ## Training data {#training} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index be7994d5d..72b79de48 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -448,7 +448,7 @@ remain in the config file stored on your local system. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] +> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] > ``` | Name | Description | @@ -478,7 +478,7 @@ the [`Corpus`](/api/corpus) class. 
> [paths] > train = "corpus/train.spacy" > -> [training.corpus.train] +> [corpora.train] > @readers = "spacy.Corpus.v1" > path = ${paths.train} > gold_preproc = false @@ -506,7 +506,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class. > [paths] > pretrain = "corpus/raw_text.jsonl" > -> [pretraining.corpus] +> [corpora.pretrain] > @readers = "spacy.JsonlReader.v1" > path = ${paths.pretrain} > min_length = 0 diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 3a6bd4551..665caa15b 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -969,7 +969,7 @@ your results. > [training.logger] > @loggers = "spacy.WandbLogger.v1" > project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "training.corpus.train.path", "training.corpus.dev.path"] +> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] > ``` ![Screenshot: Visualized training results](../images/wandb1.jpg) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index bba2e2853..c0f4caad7 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -746,7 +746,7 @@ as **config settings** – in this case, `source`. > #### config.cfg > > ```ini -> [training.corpus.train] +> [corpora.train] > @readers = "corpus_variants.v1" > source = "s3://your_bucket/path/data.csv" > ``` From 427dbecdd63706f9c6c55875d46ed570f5a6a48b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 11:48:04 +0200 Subject: [PATCH 023/133] cleanup and formatting --- spacy/cli/pretrain.py | 14 +++++--------- spacy/cli/train.py | 4 ++-- spacy/schemas.py | 2 +- spacy/tests/training/test_readers.py | 3 +++ 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 3567e7339..aec077eb7 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -71,9 +71,7 @@ def pretrain_cli( with show_validation_error(config_path): config = util.load_config( - config_path, - overrides=config_overrides, - interpolate=True + config_path, overrides=config_overrides, interpolate=True ) if not config.get("pretraining"): # TODO: What's the solution here? How do we handle optional blocks? 
@@ -84,7 +82,7 @@ def pretrain_cli( config.to_disk(output_dir / "config.cfg") msg.good("Saved config file in the output directory") - + pretrain( config, output_dir, @@ -99,7 +97,7 @@ def pretrain( output_dir: Path, resume_path: Optional[Path] = None, epoch_resume: Optional[int] = None, - use_gpu: int=-1 + use_gpu: int = -1, ): if config["system"].get("seed") is not None: fix_random_seed(config["system"]["seed"]) @@ -107,7 +105,7 @@ def pretrain( use_pytorch_for_gpu_memory() nlp, config = util.load_model_from_config(config) P_cfg = config["pretraining"] - corpus = dot_to_object(config, config["pretraining"]["corpus"]) + corpus = dot_to_object(config, P_cfg["corpus"]) batcher = P_cfg["batcher"] model = create_pretraining_model(nlp, config["pretraining"]) optimizer = config["pretraining"]["optimizer"] @@ -148,9 +146,7 @@ def pretrain( progress = tracker.update(epoch, loss, docs) if progress: msg.row(progress, **row_settings) - if P_cfg["n_save_every"] and ( - batch_id % P_cfg["n_save_every"] == 0 - ): + if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 15c745b69..50306b350 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -93,8 +93,8 @@ def train( raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) T_cfg = config["training"] optimizer = T_cfg["optimizer"] - train_corpus = dot_to_object(config, config["training"]["train_corpus"]) - dev_corpus = dot_to_object(config, config["training"]["dev_corpus"]) + train_corpus = dot_to_object(config, T_cfg["train_corpus"]) + dev_corpus = dot_to_object(config, T_cfg["dev_corpus"]) batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] # Components that shouldn't be updated during training diff --git a/spacy/schemas.py b/spacy/schemas.py index a530db3d0..06bc4beed 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -104,7 +104,7 @@ class TokenPatternOperator(str, Enum): StringValue = Union[TokenPatternString, StrictStr] NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] UnderscoreValue = Union[ - TokenPatternString, TokenPatternNumber, str, int, float, list, bool, + TokenPatternString, TokenPatternNumber, str, int, float, list, bool ] diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 52a4abecc..898746c2a 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -26,12 +26,15 @@ def test_readers(): [components.textcat] factory = "textcat" """ + @registry.readers.register("myreader.v1") def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]: annots = {"cats": {"POS": 1.0, "NEG": 0.0}} + def reader(nlp: Language): doc = nlp.make_doc(f"This is an example") return [Example.from_dict(doc, annots)] + return {"train": reader, "dev": reader, "extra": reader, "something": reader} config = Config().from_str(config_string) From 6761028c6f5b033109e3eed4a4b1b19218f55e40 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 17 Sep 2020 12:34:11 +0200 Subject: [PATCH 024/133] Update docs [ci skip] --- website/docs/usage/v3.md | 42 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 44810da58..72971dce2 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -709,6 +709,48 @@ nlp = spacy.blank("en") + nlp.add_pipe("ner", source=source_nlp) ``` +#### Configuring 
pipeline components with settings {#migrating-configure-pipe} + +Because pipeline components are now added using their string names, you won't +have to instantiate the [component classes](/api/#architecture-pipeline) +directly anymore. To configure the component, you can now use the `config` +argument on [`nlp.add_pipe`](/api/language#add_pipe). + +> #### config.cfg (excerpt) +> +> ```ini +> [components.sentencizer] +> factory = "sentencizer" +> punct_chars = ["!", ".", "?"] +> ``` + +```diff +punct_chars = ["!", ".", "?"] +- sentencizer = Sentencizer(punct_chars=punct_chars) ++ sentencizer = nlp.add_pipe("sentencizer", config={"punct_chars": punct_chars}) +``` + +The `config` corresponds to the component settings in the +[`config.cfg`](/usage/training#config-components) and will overwrite the default +config defined by the components. + + + +Config values you pass to components **need to be JSON-serializable** and can't +be arbitrary Python objects. Otherwise, the settings you provide can't be +represented in the `config.cfg` and spaCy has no way of knowing how to re-create +your component with the same settings when you load the pipeline back in. If you +need to pass arbitrary objects to a component, use a +[registered function](/usage/processing-pipelines#example-stateful-components): + +```diff +- config = {"model": MyTaggerModel()} ++ config = {"model": {"@architectures": "MyTaggerModel"}} +tagger = nlp.add_pipe("tagger", config=config) +``` + + + +### Adding match patterns {#migrating-matcher} + +The [`Matcher.add`](/api/matcher#add), From 30e85b2a42cdd827bc48411371ebe79b092009a2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 17 Sep 2020 13:59:12 +0200 Subject: [PATCH 025/133] Remove outdated configs --- extra/experiments/onto-joint/defaults.cfg | 133 --------------- extra/experiments/onto-joint/pretrain.cfg | 152 ------------------ extra/experiments/onto-ner.cfg | 73 --------- .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 73 --------- .../ptb-joint-pos-dep/defaults.cfg | 110 ------------- .../tok2vec-ner/charembed_tok2vec.cfg | 69 -------- .../tok2vec-ner/multihashembed_tok2vec.cfg | 51 ------ 7 files changed, 661 deletions(-) delete mode 100644 extra/experiments/onto-joint/defaults.cfg delete mode 100644 extra/experiments/onto-joint/pretrain.cfg delete mode 100644 extra/experiments/onto-ner.cfg delete mode 100644 extra/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg delete mode 100644 extra/experiments/ptb-joint-pos-dep/defaults.cfg delete mode 100644 extra/experiments/tok2vec-ner/charembed_tok2vec.cfg delete mode 100644 extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg deleted file mode 100644 index 7954b57b5..000000000 --- a/extra/experiments/onto-joint/defaults.cfg +++ /dev/null @@ -1,133 +0,0 @@ -[paths] -train = "" -dev = "" -raw = null -init_tok2vec = null - -[system] -seed = 0 -use_pytorch_for_gpu_memory = false - -[training] -seed = ${system:seed} -dropout = 0.1 -init_tok2vec = ${paths:init_tok2vec} -vectors = null -accumulate_gradient = 1 -max_steps = 0 -max_epochs = 0 -patience = 10000 -eval_frequency = 200 -score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2} -frozen_components = [] - -[training.train_corpus] -@readers = "spacy.Corpus.v1" -path = ${paths:train} -gold_preproc = true -max_length = 0 -limit = 0 - -[training.dev_corpus] -@readers = "spacy.Corpus.v1" -path = ${paths:dev} -gold_preproc = ${training.read_train:gold_preproc} -max_length = 0 
-limit = 0 - -[training.batcher] -@batchers = "spacy.batch_by_words.v1" -discard_oversize = false -tolerance = 0.2 - -[training.batcher.size] -@schedules = "compounding.v1" -start = 100 -stop = 1000 -compound = 1.001 - -[training.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = false -eps = 1e-8 -learn_rate = 0.001 - -[nlp] -lang = "en" -load_vocab_data = false -pipeline = ["tok2vec", "ner", "tagger", "parser"] - -[nlp.tokenizer] -@tokenizers = "spacy.Tokenizer.v1" - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[components] - -[components.tok2vec] -factory = "tok2vec" - -[components.ner] -factory = "ner" -learn_tokens = false -min_action_freq = 1 - -[components.tagger] -factory = "tagger" - -[components.parser] -factory = "parser" -learn_tokens = false -min_action_freq = 30 - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} - -[components.parser.model] -@architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 8 -hidden_width = 128 -maxout_pieces = 2 -use_upper = true - -[components.parser.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} - -[components.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 3 -hidden_width = 128 -maxout_pieces = 2 -use_upper = true - -[components.ner.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} - -[components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" - -[components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode:width} -rows = 2000 -also_embed_subwords = true -also_use_static_vectors = false - -[components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 diff --git a/extra/experiments/onto-joint/pretrain.cfg b/extra/experiments/onto-joint/pretrain.cfg deleted file mode 100644 index 211339603..000000000 --- a/extra/experiments/onto-joint/pretrain.cfg +++ /dev/null @@ -1,152 +0,0 @@ -# Training hyper-parameters and additional features. -[training] -# Whether to train on sequences with 'gold standard' sentence boundaries -# and tokens. If you set this to true, take care to ensure your run-time -# data is passed in sentence-by-sentence via some prior preprocessing. -gold_preproc = false -# Limitations on training document length or number of examples. -max_length = 0 -limit = 0 -# Data augmentation -orth_variant_level = 0.0 -dropout = 0.1 -# Controls early-stopping. 0 or -1 mean unlimited. -patience = 1600 -max_epochs = 0 -max_steps = 20000 -eval_frequency = 400 -# Other settings -seed = 0 -accumulate_gradient = 1 -use_pytorch_for_gpu_memory = false -# Control how scores are printed and checkpoints are evaluated. -scores = ["speed", "tags_acc", "uas", "las", "ents_f"] -score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} -# These settings are invalid for the transformer models. 
-init_tok2vec = null -discard_oversize = false -omit_extra_lookups = false -batch_by = "words" -use_gpu = -1 -raw_text = null -tag_map = null - -[training.batch_size] -@schedules = "compounding.v1" -start = 1000 -stop = 1000 -compound = 1.001 - -[training.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = true -eps = 1e-8 -learn_rate = 0.001 - -[pretraining] -max_epochs = 1000 -min_length = 5 -max_length = 500 -dropout = 0.2 -n_save_every = null -batch_size = 3000 -seed = ${training:seed} -use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory} -tok2vec_model = "nlp.pipeline.tok2vec.model" - -[pretraining.objective] -type = "characters" -n_characters = 4 - -[pretraining.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = true -eps = 1e-8 -learn_rate = 0.001 - -[nlp] -lang = "en" -vectors = null -base_model = null - -[nlp.pipeline] - -[nlp.pipeline.tok2vec] -factory = "tok2vec" - -[nlp.pipeline.senter] -factory = "senter" - -[nlp.pipeline.ner] -factory = "ner" -learn_tokens = false -min_action_freq = 1 -beam_width = 1 -beam_update_prob = 1.0 - -[nlp.pipeline.tagger] -factory = "tagger" - -[nlp.pipeline.parser] -factory = "parser" -learn_tokens = false -min_action_freq = 1 -beam_width = 1 -beam_update_prob = 1.0 - -[nlp.pipeline.senter.model] -@architectures = "spacy.Tagger.v1" - -[nlp.pipeline.senter.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} - -[nlp.pipeline.tagger.model] -@architectures = "spacy.Tagger.v1" - -[nlp.pipeline.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} - -[nlp.pipeline.parser.model] -@architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 8 -hidden_width = 128 -maxout_pieces = 3 -use_upper = false - -[nlp.pipeline.parser.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} - -[nlp.pipeline.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 3 -hidden_width = 128 -maxout_pieces = 3 -use_upper = false - -[nlp.pipeline.ner.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} - -[nlp.pipeline.tok2vec.model] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = ${nlp:vectors} -width = 256 -depth = 6 -window_size = 1 -embed_size = 10000 -maxout_pieces = 3 -subword_features = true -dropout = null diff --git a/extra/experiments/onto-ner.cfg b/extra/experiments/onto-ner.cfg deleted file mode 100644 index eab68a27f..000000000 --- a/extra/experiments/onto-ner.cfg +++ /dev/null @@ -1,73 +0,0 @@ -# Training hyper-parameters and additional features. -[training] -# Whether to train on sequences with 'gold standard' sentence boundaries -# and tokens. If you set this to true, take care to ensure your run-time -# data is passed in sentence-by-sentence via some prior preprocessing. -gold_preproc = false -# Limitations on training document length or number of examples. -max_length = 3000 -limit = 0 -# Data augmentation -orth_variant_level = 0.0 -dropout = 0.1 -# Controls early-stopping. 0 or -1 mean unlimited. -patience = 100000 -max_epochs = 0 -max_steps = 0 -eval_frequency = 1000 -# Other settings -seed = 0 -accumulate_gradient = 1 -use_pytorch_for_gpu_memory = false -# Control how scores are printed and checkpoints are evaluated. 
-scores = ["speed", "ents_p", "ents_r", "ents_f"] -score_weights = {"ents_f": 1.0} -# These settings are invalid for the transformer models. -init_tok2vec = null -discard_oversize = false -omit_extra_lookups = false -batch_by = "words" - -[training.batch_size] -@schedules = "compounding.v1" -start = 100 -stop = 1000 -compound = 1.001 - -[training.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = true -eps = 1e-8 -learn_rate = 0.001 - -[nlp] -lang = "en" -vectors = null - -[nlp.pipeline.ner] -factory = "ner" -learn_tokens = false -min_action_freq = 1 - -[nlp.pipeline.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 3 -hidden_width = 64 -maxout_pieces = 2 -use_upper = true - -[nlp.pipeline.ner.model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = ${nlp:vectors} -width = 96 -depth = 4 -window_size = 1 -embed_size = 2000 -maxout_pieces = 3 -subword_features = true -dropout = ${training:dropout} diff --git a/extra/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/extra/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg deleted file mode 100644 index f1b702a4e..000000000 --- a/extra/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ /dev/null @@ -1,73 +0,0 @@ -[training] -patience = 10000 -eval_frequency = 200 -dropout = 0.2 -init_tok2vec = null -vectors = null -max_epochs = 100 -orth_variant_level = 0.0 -gold_preproc = true -max_length = 0 -use_gpu = 0 -scores = ["tags_acc", "uas", "las"] -score_weights = {"las": 0.8, "tags_acc": 0.2} -limit = 0 -seed = 0 -accumulate_gradient = 2 -discard_oversize = false - -[training.batch_size] -@schedules = "compounding.v1" -start = 100 -stop = 1000 -compound = 1.001 - -[training.optimizer] -@optimizers = "Adam.v1" -learn_rate = 0.001 -beta1 = 0.9 -beta2 = 0.999 - -[nlp] -lang = "en" -vectors = ${training:vectors} - -[nlp.pipeline.tok2vec] -factory = "tok2vec" - -[nlp.pipeline.tagger] -factory = "tagger" - -[nlp.pipeline.parser] -factory = "parser" -learn_tokens = false -min_action_freq = 1 -beam_width = 1 -beam_update_prob = 1.0 - -[nlp.pipeline.tagger.model] -@architectures = "spacy.Tagger.v1" - -[nlp.pipeline.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} - -[nlp.pipeline.parser.model] -@architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 8 -hidden_width = 64 -maxout_pieces = 3 - -[nlp.pipeline.parser.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} - -[nlp.pipeline.tok2vec.model] -@architectures = "spacy.HashEmbedBiLSTM.v1" -pretrained_vectors = ${nlp:vectors} -width = 96 -depth = 4 -embed_size = 2000 -subword_features = true -maxout_pieces = 3 -dropout = null diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg deleted file mode 100644 index 8f9c5666e..000000000 --- a/extra/experiments/ptb-joint-pos-dep/defaults.cfg +++ /dev/null @@ -1,110 +0,0 @@ -[paths] -train = "" -dev = "" -raw = null -init_tok2vec = null - -[system] -seed = 0 -use_pytorch_for_gpu_memory = false - -[training] -seed = ${system:seed} -dropout = 0.2 -init_tok2vec = ${paths:init_tok2vec} -vectors = null -accumulate_gradient = 1 -max_steps = 0 -max_epochs = 0 -patience = 10000 -eval_frequency = 200 -score_weights = {"dep_las": 0.8, "tag_acc": 0.2} - -[training.read_train] -@readers = "spacy.Corpus.v1" -path = ${paths:train} -gold_preproc = true 
-max_length = 0 -limit = 0 - -[training.read_dev] -@readers = "spacy.Corpus.v1" -path = ${paths:dev} -gold_preproc = ${training.read_train:gold_preproc} -max_length = 0 -limit = 0 - -[training.batcher] -@batchers = "spacy.batch_by_words.v1" -discard_oversize = false -tolerance = 0.2 - -[training.batcher.size] -@schedules = "compounding.v1" -start = 100 -stop = 1000 -compound = 1.001 - -[training.optimizer] -@optimizers = "Adam.v1" -learn_rate = 0.001 -beta1 = 0.9 -beta2 = 0.999 - -[nlp] -lang = "en" -pipeline = ["tok2vec", "tagger", "parser"] -load_vocab_data = false - -[nlp.tokenizer] -@tokenizers = "spacy.Tokenizer.v1" - -[nlp.lemmatizer] -@lemmatizers = "spacy.Lemmatizer.v1" - -[components] - -[components.tok2vec] -factory = "tok2vec" - -[components.tagger] -factory = "tagger" - -[components.parser] -factory = "parser" -learn_tokens = false -min_action_freq = 1 - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} - -[components.parser.model] -@architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 8 -hidden_width = 64 -maxout_pieces = 3 - -[components.parser.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} - -[components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" - -[components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode:width} -rows = 2000 -also_embed_subwords = true -also_use_static_vectors = false - -[components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 diff --git a/extra/experiments/tok2vec-ner/charembed_tok2vec.cfg b/extra/experiments/tok2vec-ner/charembed_tok2vec.cfg deleted file mode 100644 index eca6a22fa..000000000 --- a/extra/experiments/tok2vec-ner/charembed_tok2vec.cfg +++ /dev/null @@ -1,69 +0,0 @@ -[training] -use_gpu = -1 -limit = 0 -dropout = 0.2 -patience = 10000 -eval_frequency = 200 -scores = ["ents_f"] -score_weights = {"ents_f": 1} -orth_variant_level = 0.0 -gold_preproc = true -max_length = 0 -batch_size = 25 -seed = 0 -accumulate_gradient = 2 -discard_oversize = false - -[training.optimizer] -@optimizers = "Adam.v1" -learn_rate = 0.001 -beta1 = 0.9 -beta2 = 0.999 - -[nlp] -lang = "en" -vectors = null - -[nlp.pipeline.tok2vec] -factory = "tok2vec" - -[nlp.pipeline.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" - -[nlp.pipeline.tok2vec.model.extract] -@architectures = "spacy.CharacterEmbed.v1" -width = 96 -nM = 64 -nC = 8 -rows = 2000 -columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] -dropout = null - -[nlp.pipeline.tok2vec.model.extract.features] -@architectures = "spacy.Doc2Feats.v1" -columns = ${nlp.pipeline.tok2vec.model.extract:columns} - -[nlp.pipeline.tok2vec.model.embed] -@architectures = "spacy.LayerNormalizedMaxout.v1" -width = ${nlp.pipeline.tok2vec.model.extract:width} -maxout_pieces = 4 - -[nlp.pipeline.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" -width = ${nlp.pipeline.tok2vec.model.extract:width} -window_size = 1 -maxout_pieces = 2 -depth = 2 - -[nlp.pipeline.ner] -factory = "ner" - -[nlp.pipeline.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 6 -hidden_width = 64 -maxout_pieces = 2 - -[nlp.pipeline.ner.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = 
${nlp.pipeline.tok2vec.model.extract:width} diff --git a/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg deleted file mode 100644 index e2ab148c6..000000000 --- a/extra/experiments/tok2vec-ner/multihashembed_tok2vec.cfg +++ /dev/null @@ -1,51 +0,0 @@ -[training] -use_gpu = -1 -limit = 0 -dropout = 0.2 -patience = 10000 -eval_frequency = 200 -scores = ["ents_p", "ents_r", "ents_f"] -score_weights = {"ents_f": 1} -orth_variant_level = 0.0 -gold_preproc = true -max_length = 0 -seed = 0 -accumulate_gradient = 2 -discard_oversize = false - -[training.batch_size] -@schedules = "compounding.v1" -start = 3000 -stop = 3000 -compound = 1.001 - - -[training.optimizer] -@optimizers = "Adam.v1" -learn_rate = 0.001 -beta1 = 0.9 -beta2 = 0.999 - -[nlp] -lang = "en" -vectors = null - -[nlp.pipeline.ner] -factory = "ner" - -[nlp.pipeline.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 6 -hidden_width = 64 -maxout_pieces = 2 - -[nlp.pipeline.ner.model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" -width = 128 -depth = 4 -embed_size = 7000 -maxout_pieces = 3 -window_size = 1 -subword_features = true -pretrained_vectors = null -dropout = null From 130ffa5fbf8751de4eeb4bfd2463f46242ecc50d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 14:59:41 +0200 Subject: [PATCH 026/133] fix typos in docs --- website/docs/api/data-formats.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index f868233c7..b9e185d9c 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -191,7 +191,7 @@ process that are used when you run [`spacy train`](/api/cli#train). | `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | | `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | | `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | | `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -207,7 +207,7 @@ used when you run [`spacy pretrain`](/api/cli#pretrain). | `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | | `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | | `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.pretrain`. ~~str~~ | | `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | | `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. 
~~str~~ | | `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ | From 3a3110ef6040e6cd9a745676586954f7508c6a6c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 15:44:11 +0200 Subject: [PATCH 027/133] remove empty files --- extra/experiments/onto-joint/defaults.cfg | 0 extra/experiments/ptb-joint-pos-dep/defaults.cfg | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 extra/experiments/onto-joint/defaults.cfg delete mode 100644 extra/experiments/ptb-joint-pos-dep/defaults.cfg diff --git a/extra/experiments/onto-joint/defaults.cfg b/extra/experiments/onto-joint/defaults.cfg deleted file mode 100644 index e69de29bb..000000000 diff --git a/extra/experiments/ptb-joint-pos-dep/defaults.cfg b/extra/experiments/ptb-joint-pos-dep/defaults.cfg deleted file mode 100644 index e69de29bb..000000000 From ddfc1fc146ec35dab19f835602345de91342eeee Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 16:05:40 +0200 Subject: [PATCH 028/133] add pretraining option to init config --- spacy/cli/init_config.py | 12 +++++++++--- website/docs/api/cli.md | 34 ++++++++++++++++++---------------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index ec65b0e0a..60ea1b640 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -30,6 +30,7 @@ def init_config_cli( pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"), optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."), cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. 
This will impact the choice of architecture, pretrained weights and related hyperparameters."), + pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), # fmt: on ): """ @@ -43,7 +44,7 @@ def init_config_cli( if isinstance(optimize, Optimizations): # instance of enum from the CLI optimize = optimize.value pipeline = string_to_list(pipeline) - init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu) + init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu, pretraining=pretraining) @init_cli.command("fill-config") @@ -109,7 +110,7 @@ def fill_config( def init_config( - output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool + output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool, pretraining: bool = False, ) -> None: is_stdout = str(output_file) == "-" msg = Printer(no_print=is_stdout) @@ -156,8 +157,13 @@ def init_config( with show_validation_error(hint_fill=False): config = util.load_config_from_str(base_template) nlp, _ = util.load_model_from_config(config, auto_fill=True) + config = nlp.config + if pretraining: + validate_config_for_pretrain(config, msg) + pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) + config = pretrain_config.merge(config) msg.good("Auto-filled config with all values") - save_config(nlp.config, output_file, is_stdout=is_stdout) + save_config(config, output_file, is_stdout=is_stdout) def save_config( diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 8449d23e1..7ba451c2f 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -124,15 +124,16 @@ customize those settings in your config file later. $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu] ``` -| Name | Description | -| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | -| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ | -| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ | -| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ | -| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | The config file for training. 
| +| Name | Description | +| --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | +| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ | +| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ | +| `--pretraining`, `-p` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~ | +| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ | +| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | The config file for training. | ### init fill-config {#init-fill-config new="3"} @@ -160,13 +161,14 @@ validation error with more details. $ python -m spacy init fill-config [base_path] [output_file] [--diff] ``` -| Name | Description | -| -------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | -| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ | -| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Complete and auto-filled config file for training. | +| Name | Description | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | +| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ | +| `--pretraining`, `-p` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~ | +| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Complete and auto-filled config file for training. 
| ### init vocab {#init-vocab new="3" tag="command"} From 5fade4feb7fbd3d579a6b9a2d696a470456a997f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 16:15:20 +0200 Subject: [PATCH 029/133] fix cli abbrev --- website/docs/api/cli.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 7ba451c2f..8edee6b29 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -124,16 +124,16 @@ customize those settings in your config file later. $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu] ``` -| Name | Description | -| --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | -| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ | -| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ | -| `--pretraining`, `-p` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~ | -| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ | -| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | The config file for training. | +| Name | Description | +| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | +| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ | +| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ | +| `--pretraining`, `-pt` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~ | +| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). 
This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ | +| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | The config file for training. | ### init fill-config {#init-fill-config new="3"} From 35a393106404d8f69d69e6c12d62e21a7d517065 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 16:36:27 +0200 Subject: [PATCH 030/133] fix typo --- spacy/cli/debug_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index a4899a458..58908c5e8 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -54,7 +54,7 @@ def debug_model_cli( config_overrides = parse_config_overrides(ctx.args) with show_validation_error(config_path): config = util.load_config(config_path, overrides=config_overrides) - nlp, config = util.load_model_from_config(config_path) + nlp, config = util.load_model_from_config(config) seed = config["training"]["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") From ec751068f328e47ae7fa8ca1745a1dd8ac00529d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 17 Sep 2020 16:42:53 +0200 Subject: [PATCH 031/133] Draft text for static vectors intro --- website/docs/usage/embeddings-transformers.md | 45 +++++++++++++++---- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 8dd104ead..6a239cb1e 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -30,14 +30,20 @@ to predict. Otherwise, you could try using a "one-shot learning" approach using -The key difference between [word vectors](#word-vectors) and contextual language -models such as [transformers](#transformers) is that word vectors model -**lexical types**, rather than _tokens_. If you have a list of terms with no -context around them, a transformer model like BERT can't really help you. BERT -is designed to understand language **in context**, which isn't what you have. A -word vectors table will be a much better fit for your task. However, if you do -have words in context — whole sentences or paragraphs of running text — word -vectors will only provide a very rough approximation of what the text is about. +[Transformers](#transformers) are large and powerful neural networks that give +you better accuracy, but are harder to deploy in production, as they require a GPU to run +effectively. [Word vectors](#word-vectors) are a slightly older technique that +can give your models a smaller improvement in accuracy, and can also provide +some additional capabilities. + +The key difference between word-vectors and contextual language +models such as transformers is that word vectors model **lexical types**, rather +than _tokens_. If you have a list of terms with no context around them, a transformer +model like BERT can't really help you. BERT is designed to understand language +**in context**, which isn't what you have. A word vectors table will be a much +better fit for your task. However, if you do have words in context — whole sentences +or paragraphs of running text — word vectors will only provide a very rough +approximation of what the text is about. 
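For example, here is a minimal sketch of the "list of terms with no context" case — it assumes a pipeline with a static word vectors table, such as `en_core_web_md`, is installed, and the terms and similarity calls are purely illustrative:

```python
import spacy

# Assumes a pipeline with static word vectors is installed, e.g.:
# python -m spacy download en_core_web_md
nlp = spacy.load("en_core_web_md")

# Out-of-context terms are just lexical types, so the static vectors
# table is all that's needed to compare them
terms = ["coffee", "tea", "laptop"]
lexemes = [nlp.vocab[term] for term in terms]
print(lexemes[0].similarity(lexemes[1]))  # coffee vs. tea: relatively high
print(lexemes[0].similarity(lexemes[2]))  # coffee vs. laptop: lower
```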
Word vectors are also very computationally efficient, as they map a word to a vector with a single indexing operation. Word vectors are therefore useful as a @@ -478,7 +484,28 @@ training. ## Static vectors {#static-vectors} - +If your pipeline includes a word vectors table, you'll be able to use the +`.similarity()` method on the `Doc`, `Span`, `Token` and `Lexeme` objects. +You'll also be able to access the vectors using the `.vector` attribute, or you +can look up one or more vectors directly using the `Vocab` object. Pipelines +with word vectors can also use the vectors as features for the statistical +models, which can improve the accuracy of your components. + +Word vectors in spaCy are "static" in the sense that they are not learned +parameters of the statistical models, and spaCy itself does not feature any +algorithms for learning word vector tables. You can train a word vectors table +using tools such as Gensim, word2vec, FastText or GloVe. There are also many +word vector tables available for download. Once you have a word vectors table +you want to use, you can convert it for use with spaCy using the `spacy init vocab` +command, which will give you a directory you can load or refer to in your training +configs. + +When converting the vectors, there are two ways you can trim them down to make +your package smaller. You can _truncate_ the vectors with the `--truncate-vectors` +option, which will remove entries for rarer words from the table. Alternatively, +you can use the `--prune-vectors` option to remap rarer words to the closest vector +that remains in the table. This allows the vectors table to return meaningful +(albeit imperfect) results for more words than you have rows in the table. ### Using word vectors in your models {#word-vectors-models} From 127ce0c574da23f2e17c824dcebec6f229d4561f Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 17 Sep 2020 16:55:53 +0200 Subject: [PATCH 032/133] Update website/docs/api/cli.md Co-authored-by: Ines Montani --- website/docs/api/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 8edee6b29..5f3a06c36 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -121,7 +121,7 @@ customize those settings in your config file later. > ``` ```cli -$ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu] +$ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu] [--pretraining] ``` | Name | Description | From e5ceec5df0cf7d279d6f2bac716a30f4edb71fc8 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 17 Sep 2020 16:56:20 +0200 Subject: [PATCH 033/133] Update website/docs/api/cli.md Co-authored-by: Ines Montani --- website/docs/api/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 5f3a06c36..f5ac943e2 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -129,7 +129,7 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [ | `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | | `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. 
~~str (option)~~ | | `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ | -| `--pretraining`, `-pt` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~ | +| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | | `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ | | `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | From 38652143434207531c2779ab6905331269f072ca Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 17 Sep 2020 16:57:02 +0200 Subject: [PATCH 034/133] Use consistent shortcut --- spacy/cli/init_config.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 60ea1b640..e70195e15 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -44,7 +44,14 @@ def init_config_cli( if isinstance(optimize, Optimizations): # instance of enum from the CLI optimize = optimize.value pipeline = string_to_list(pipeline) - init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu, pretraining=pretraining) + init_config( + output_file, + lang=lang, + pipeline=pipeline, + optimize=optimize, + cpu=cpu, + pretraining=pretraining, + ) @init_cli.command("fill-config") @@ -52,7 +59,7 @@ def init_fill_config_cli( # fmt: off base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False), output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True), - pretraining: bool = Opt(False, "--pretraining", "-p", help="Include config for pretraining (with 'spacy pretrain')"), + pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes") # fmt: on ): @@ -110,7 +117,13 @@ def fill_config( def init_config( - output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool, pretraining: bool = False, + output_file: Path, + *, + lang: str, + pipeline: List[str], + optimize: str, + cpu: bool, + pretraining: bool = False, ) -> None: is_stdout = str(output_file) == "-" msg = Printer(no_print=is_stdout) From c4b414b2825021410c8f8e80304b83eac3847bf1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 17 Sep 2020 16:58:09 +0200 Subject: [PATCH 035/133] Update website/docs/api/cli.md --- website/docs/api/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index f5ac943e2..9d0b872c3 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -165,7 +165,7 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff] | --------------------- | 
----------------------------------------------------------------------------------------------------------------------------------- | | `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | | `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ | -| `--pretraining`, `-p` | Include config for pretraining (with 'spacy pretrain'). Default False. ~~bool~~ | +| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | | `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | Complete and auto-filled config file for training. | From 3d8e010655e7180eb875fe784f2c8f098a332388 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 17 Sep 2020 16:58:46 +0200 Subject: [PATCH 036/133] Change order --- website/docs/api/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index f5ac943e2..f9a192000 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -129,9 +129,9 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [ | `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | | `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ | | `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ | -| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | | `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ | | `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ | +| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | The config file for training. | From a2c8cda26ffbc6ba0e15b0872b8691ee4f366994 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 17 Sep 2020 17:12:51 +0200 Subject: [PATCH 037/133] Update docs [ci skip] --- website/docs/usage/embeddings-transformers.md | 60 ++++++++++--------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 6a239cb1e..9f73661c3 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -31,18 +31,18 @@ to predict. 
Otherwise, you could try using a "one-shot learning" approach using [Transformers](#transformers) are large and powerful neural networks that give -you better accuracy, but are harder to deploy in production, as they require a GPU to run -effectively. [Word vectors](#word-vectors) are a slightly older technique that -can give your models a smaller improvement in accuracy, and can also provide -some additional capabilities. +you better accuracy, but are harder to deploy in production, as they require a +GPU to run effectively. [Word vectors](#word-vectors) are a slightly older +technique that can give your models a smaller improvement in accuracy, and can +also provide some additional capabilities. -The key difference between word-vectors and contextual language -models such as transformers is that word vectors model **lexical types**, rather -than _tokens_. If you have a list of terms with no context around them, a transformer -model like BERT can't really help you. BERT is designed to understand language -**in context**, which isn't what you have. A word vectors table will be a much -better fit for your task. However, if you do have words in context — whole sentences -or paragraphs of running text — word vectors will only provide a very rough +The key difference between word-vectors and contextual language models such as +transformers is that word vectors model **lexical types**, rather than _tokens_. +If you have a list of terms with no context around them, a transformer model +like BERT can't really help you. BERT is designed to understand language **in +context**, which isn't what you have. A word vectors table will be a much better +fit for your task. However, if you do have words in context — whole sentences or +paragraphs of running text — word vectors will only provide a very rough approximation of what the text is about. Word vectors are also very computationally efficient, as they map a word to a @@ -484,28 +484,32 @@ training. ## Static vectors {#static-vectors} -If your pipeline includes a word vectors table, you'll be able to use the -`.similarity()` method on the `Doc`, `Span`, `Token` and `Lexeme` objects. -You'll also be able to access the vectors using the `.vector` attribute, or you -can look up one or more vectors directly using the `Vocab` object. Pipelines -with word vectors can also use the vectors as features for the statistical -models, which can improve the accuracy of your components. +If your pipeline includes a **word vectors table**, you'll be able to use the +`.similarity()` method on the [`Doc`](/api/doc), [`Span`](/api/span), +[`Token`](/api/token) and [`Lexeme`](/api/lexeme) objects. You'll also be able +to access the vectors using the `.vector` attribute, or you can look up one or +more vectors directly using the [`Vocab`](/api/vocab) object. Pipelines with +word vectors can also **use the vectors as features** for the statistical +models, which can **improve the accuracy** of your components. Word vectors in spaCy are "static" in the sense that they are not learned parameters of the statistical models, and spaCy itself does not feature any algorithms for learning word vector tables. You can train a word vectors table -using tools such as Gensim, word2vec, FastText or GloVe. There are also many -word vector tables available for download. Once you have a word vectors table -you want to use, you can convert it for use with spaCy using the `spacy init vocab` -command, which will give you a directory you can load or refer to in your training -configs. 
+using tools such as [Gensim](https://radimrehurek.com/gensim/), +[FastText](https://fasttext.cc/) or +[GloVe](https://nlp.stanford.edu/projects/glove/), or download existing +pretrained vectors. The [`init vocab`](/api/cli#init-vocab) command lets you +convert vectors for use with spaCy and will give you a directory you can load or +refer to in your [training configs](/usage/training#config). -When converting the vectors, there are two ways you can trim them down to make -your package smaller. You can _truncate_ the vectors with the `--truncate-vectors` -option, which will remove entries for rarer words from the table. Alternatively, -you can use the `--prune-vectors` option to remap rarer words to the closest vector -that remains in the table. This allows the vectors table to return meaningful -(albeit imperfect) results for more words than you have rows in the table. + + +For more details on loading word vectors into spaCy, using them for similarity +and improving word vector coverage by truncating and pruning the vectors, see +the usage guide on +[word vectors and similarity](/usage/linguistic-features#vectors-similarity). + + ### Using word vectors in your models {#word-vectors-models} From ed0fb034cb487a1fcc206e250ca34c8a38b7e0de Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 17 Sep 2020 18:11:10 +0200 Subject: [PATCH 038/133] ml_datasets v0.2.0a0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 69477c2d3..55fe627b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.0a33,<8.0.0a40 blis>=0.4.0,<0.5.0 -ml_datasets>=0.2.0 +ml_datasets==0.2.0a0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 From 6efb7688a65faae489de33073c1c40b11ec4f432 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 17 Sep 2020 18:17:03 +0200 Subject: [PATCH 039/133] Draft pretrain usage --- website/docs/usage/embeddings-transformers.md | 86 ++++++++++++++++--- 1 file changed, 76 insertions(+), 10 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 9f73661c3..678237dc2 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -610,17 +610,83 @@ def MyCustomVectors( ## Pretraining {#pretraining} - - +The `spacy pretrain` command lets you initialize your models with information +from raw text. Without pretraining, the models for your components will usually +be initialized randomly. The idea behind pretraining is simple: random probably +isn't optimal, so if we have some text to learn from, we can probably find +a way to get the model off to a better start. The impact of `spacy pretrain` varies, +but it will usually be worth trying if you're not using a transformer model and +you have relatively little training data (for instance, fewer than 5,000 sentences). +A good rule of thumb is that pretraining will generally give you a similar accuracy +improvement to using word vectors in your model. If word vectors have given you +a 10% error reduction, the `spacy pretrain` command might give you another 10%, +for a 20% error reduction in total. - +The `spacy pretrain` command will take a specific subnetwork within one of your +components, and add additional layers to build a network for a temporary task +that forces the model to learn something about sentence structure and word +cooccurrence statistics. 
Pretraining produces a binary weights file that can be +loaded back in at the start of training. The weights file specifies an initial +set of weights. Training then proceeds as normal. + +You can only pretrain one subnetwork from your pipeline at a time, and the subnetwork +must be typed `Model[List[Doc], List[Floats2d]]` (i.e., it has to be a "tok2vec" layer). +The most common workflow is to use the `Tok2Vec` component to create a shared +token-to-vector layer for several components of your pipeline, and apply +pretraining to its whole model. + +The `spacy pretrain` command is configured using the `[pretraining]` section of +your config file. The `pretraining.component` and `pretraining.layer` settings +tell spaCy how to find the subnetwork to pretrain. The `pretraining.layer` +setting should be either the empty string (to use the whole model), or a +[node reference](https://thinc.ai/docs/usage-models#model-state). Most of spaCy's +built-in model architectures have a reference named `"tok2vec"` that will refer +to the right layer. + +```ini +# Pretrain nlp.get_pipe("tok2vec").model +[pretraining] +component = "tok2vec" +layer = "" + +[pretraining] +# Pretrain nlp.get_pipe("textcat").model.get_ref("tok2vec") +component = "textcat" +layer = "tok2vec" +``` + +two pretraining objectives are available, both of which are variants of the cloze +task Devlin et al (2018) introduced for BERT. + +* The *characters* objective asks the model to predict some number of leading and + trailing UTF-8 bytes for the words. For instance, setting `n_characters=2`, the + model will try to predict the first two and last two characters of the word. + +* The *vectors* objective asks the model to predict the word's vector, from + a static embeddings table. This requires a word vectors model to be trained + and loaded. The vectors objective can optimize either a cosine or an L2 loss. + We've generally found cosine loss to perform better. + +These pretraining objectives use a trick that we term _language modelling with +approximate outputs (LMAO)_. The motivation for the trick is that predicting +an exact word ID introduces a lot of incidental complexity. You need a large +output layer, and even then, the vocabulary is too large, which motivates +tokenization schemes that do not align to actual word boundaries. At the end of +training, the output layer will be thrown away regardless: we just want a task +that forces the network to model something about word cooccurrence statistics. +Predicting leading and trailing characters does that more than adequately, as +the exact word sequence could be recovered with high accuracy if the initial +and trailing characters are predicted accurately. With the vectors objective, +the pretraining is use the embedding space learned by an algorithm such as +GloVe or word2vec, allowing the model to focus on the contextual +modelling we actual care about. + +The `[pretraining]` section has several configuration subsections that are +familiar from the training block: the `[pretraining.batcher]`, +[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and +expect the same types of objects, although for pretraining your corpus does not +need to have any annotations, so you will often use a different reader, such as +`spacy.training.JsonlReader1`. 
> #### Raw text format > From a0b4389a3845a1692b934a6ca79caf54bb29b1a3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 17 Sep 2020 19:24:48 +0200 Subject: [PATCH 040/133] Update docs [ci skip] --- website/docs/usage/embeddings-transformers.md | 200 +++++++++++------- 1 file changed, 121 insertions(+), 79 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 678237dc2..4adcd927c 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -610,99 +610,141 @@ def MyCustomVectors( ## Pretraining {#pretraining} -The `spacy pretrain` command lets you initialize your models with information -from raw text. Without pretraining, the models for your components will usually -be initialized randomly. The idea behind pretraining is simple: random probably -isn't optimal, so if we have some text to learn from, we can probably find -a way to get the model off to a better start. The impact of `spacy pretrain` varies, -but it will usually be worth trying if you're not using a transformer model and -you have relatively little training data (for instance, fewer than 5,000 sentence). -A good rule of thumb is that pretraining will generally give you a similar accuracy -improvement to using word vectors in your model. If word vectors have given you -a 10% error reduction, the `spacy pretrain` command might give you another 10%, -for a 20% error reduction in total. +The [`spacy pretrain`](/api/cli#pretrain) command lets you initialize your +models with **information from raw text**. Without pretraining, the models for +your components will usually be initialized randomly. The idea behind +pretraining is simple: random probably isn't optimal, so if we have some text to +learn from, we can probably find a way to get the model off to a better start. -The `spacy pretrain` command will take a specific subnetwork within one of your -components, and add additional layers to build a network for a temporary task, -that forces the model to learn something about sentence structure and word -cooccurrence statistics. Pretraining produces a binary weights file that can be -loaded back in at the start of training. The weights file specifies an initial -set of weights. Training then proceeds as normal. - -You can only pretrain one subnetwork from your pipeline at a time, and the subnetwork -must be typed `Model[List[Doc], List[Floats2d]]` (i.e., it has to be a "tok2vec" layer). -The most common workflow is to use the `Tok2Vec` component to create a shared -token-to-vector layer for several components of your pipeline, and apply -pretraining to its whole model. - -The `spacy pretrain` command is configured using the `[pretraining]` section of -your config file. The `pretraining.component` and `pretraining.layer` settings -tell spaCy how to find the subnetwork to pretrain. The `pretraining.layer` -setting should be either the empty string (to use the whole model), or a -[node reference](https://thinc.ai/docs/usage-models#model-state). Most of spaCy's -built-in model architectures have a reference named `"tok2vec"` that will refer -to the right layer. 
- -```ini -# Pretrain nlp.get_pipe("tok2vec").model -[pretraining] -component = "tok2vec" -layer = "" - -[pretraining] -# Pretrain nlp.get_pipe("textcat").model.get_ref("tok2vec") -component = "textcat" -layer = "tok2vec" -``` - -two pretraining objectives are available, both of which are variants of the cloze -task Devlin et al (2018) introduced for BERT. - -* The *characters* objective asks the model to predict some number of leading and - trailing UTF-8 bytes for the words. For instance, setting `n_characters=2`, the - model will try to predict the first two and last two characters of the word. - -* The *vectors* objective asks the model to predict the word's vector, from - a static embeddings table. This requires a word vectors model to be trained - and loaded. The vectors objective can optimize either a cosine or an L2 loss. - We've generally found cosine loss to perform better. - -These pretraining objectives use a trick that we term _language modelling with -approximate outputs (LMAO)_. The motivation for the trick is that predicting -an exact word ID introduces a lot of incidental complexity. You need a large -output layer, and even then, the vocabulary is too large, which motivates -tokenization schemes that do not align to actual word boundaries. At the end of -training, the output layer will be thrown away regardless: we just want a task -that forces the network to model something about word cooccurrence statistics. -Predicting leading and trailing characters does that more than adequately, as -the exact word sequence could be recovered with high accuracy if the initial -and trailing characters are predicted accurately. With the vectors objective, -the pretraining is use the embedding space learned by an algorithm such as -GloVe or word2vec, allowing the model to focus on the contextual -modelling we actual care about. - -The `[pretraining]` section has several configuration subsections that are -familiar from the training block: the `[pretraining.batcher]`, -[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and +Pretraining uses the same [`config.cfg`](/usage/training#config) file as the +regular training, which helps keep the settings and hyperparameters consistent. +The additional `[pretraining]` section has several configuration subsections +that are familiar from the training block: the `[pretraining.batcher]`, +`[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and expect the same types of objects, although for pretraining your corpus does not -need to have any annotations, so you will often use a different reader, such as -`spacy.training.JsonlReader1`. +need to have any annotations, so you will often use a different reader, such as +the [`JsonlReader`](/api/toplevel#jsonlreader). > #### Raw text format > -> The raw text can be provided as JSONL (newline-delimited JSON) with a key -> `"text"` per entry. This allows the data to be read in line by line, while -> also allowing you to include newlines in the texts. +> The raw text can be provided in spaCy's +> [binary `.spacy` format](/api/data-formats#training) consisting of serialized +> `Doc` objects or as a JSONL (newline-delimited JSON) with a key `"text"` per +> entry. This allows the data to be read in line by line, while also allowing +> you to include newlines in the texts. 
> > ```json > {"text": "Can I ask where you work now and what you do, and if you enjoy it?"} > {"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."} > ``` +> +> You can also use your own custom corpus loader instead. + +You can add a `[pretraining]` block to your config by setting the +`--pretraining` flag on [`init config`](/api/cli#init-config) or +[`init fill-config`](/api/cli#init-fill-config): ```cli $ python -m spacy init fill-config config.cfg config_pretrain.cfg --pretraining ``` +You can then run [`spacy pretrain`](/api/cli#pretrain) with the updated config +and pass in optional config overrides, like the path to the raw text file: + ```cli -$ python -m spacy pretrain raw_text.jsonl /output config_pretrain.cfg +$ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl ``` + +### How pretraining works {#pretraining-details} + +The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually +be worth trying if you're **not using a transformer** model and you have +**relatively little training data** (for instance, fewer than 5,000 sentences). +A good rule of thumb is that pretraining will generally give you a similar +accuracy improvement to using word vectors in your model. If word vectors have +given you a 10% error reduction, pretraining with spaCy might give you another +10%, for a 20% error reduction in total. + +The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific +subnetwork** within one of your components, and add additional layers to build a +network for a temporary task, that forces the model to learn something about +sentence structure and word cooccurrence statistics. Pretraining produces a +**binary weights file** that can be loaded back in at the start of training. The +weights file specifies an initial set of weights. Training then proceeds as +normal. + +You can only pretrain one subnetwork from your pipeline at a time, and the +subnetwork must be typed ~~Model[List[Doc], List[Floats2d]]~~ (i.e. it has to be +a "tok2vec" layer). The most common workflow is to use the +[`Tok2Vec`](/api/tok2vec) component to create a shared token-to-vector layer for +several components of your pipeline, and apply pretraining to its whole model. + +#### Configuring the pretraining {#pretraining-configure} + +The [`spacy pretrain`](/api/cli#pretrain) command is configured using the +`[pretraining]` section of your [config file](/usage/training#config). The +`component` and `layer` settings tell spaCy how to **find the subnetwork** to +pretrain. The `layer` setting should be either the empty string (to use the +whole model), or a +[node reference](https://thinc.ai/docs/usage-models#model-state). Most of +spaCy's built-in model architectures have a reference named `"tok2vec"` that +will refer to the right layer. + +```ini +### config.cfg +# 1. Use the whole model of the "tok2vec" component +[pretraining] +component = "tok2vec" +layer = "" + +# 2. Pretrain the "tok2vec" node of the "textcat" component +[pretraining] +component = "textcat" +layer = "tok2vec" +``` + +#### Pretraining objectives {#pretraining-details} + +Two pretraining objectives are available, both of which are variants of the +cloze task [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805) introduced +for BERT. The objective can be defined and configured via the +`[pretraining.objective]` config block. 
+ +> ```ini +> ### Characters objective +> [pretraining.objective] +> type = "characters" +> n_characters = 4 +> ``` +> +> ```ini +> ### Vectors objective +> [pretraining.objective] +> type = "vectors" +> loss = "cosine" +> ``` + +- **Characters:** The `"characters"` objective asks the model to predict some + number of leading and trailing UTF-8 bytes for the words. For instance, + setting `n_characters = 2`, the model will try to predict the first two and + last two characters of the word. + +- **Vectors:** The `"vectors"` objective asks the model to predict the word's + vector, from a static embeddings table. This requires a word vectors model to + be trained and loaded. The vectors objective can optimize either a cosine or + an L2 loss. We've generally found cosine loss to perform better. + +These pretraining objectives use a trick that we term **language modelling with +approximate outputs (LMAO)**. The motivation for the trick is that predicting an +exact word ID introduces a lot of incidental complexity. You need a large output +layer, and even then, the vocabulary is too large, which motivates tokenization +schemes that do not align to actual word boundaries. At the end of training, the +output layer will be thrown away regardless: we just want a task that forces the +network to model something about word cooccurrence statistics. Predicting +leading and trailing characters does that more than adequately, as the exact +word sequence could be recovered with high accuracy if the initial and trailing +characters are predicted accurately. With the vectors objective, the pretraining +is use the embedding space learned by an algorithm such as +[GloVe](https://nlp.stanford.edu/projects/glove/) or +[Word2vec](https://code.google.com/archive/p/word2vec/), allowing the model to +focus on the contextual modelling we actual care about. 
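To make the "characters" objective documented in the patch above more concrete, the following is a minimal, hypothetical sketch of how such a target could be built for a single word. It is not spaCy's internal implementation, which operates on UTF-8 bytes inside the pretraining code; the function name and the padding scheme are invented purely for illustration.

```python
# Illustrative sketch only: not spaCy's actual "characters" objective,
# which works on UTF-8 bytes rather than Python string characters.
def character_target(word: str, n_characters: int = 4) -> str:
    # Pad short words so every target has the same width.
    padded = word + " " * max(0, n_characters - len(word))
    # Keep the first and last n_characters of the (padded) word.
    return padded[:n_characters] + padded[-n_characters:]

# "statistics" -> "stattics": the original word is usually recoverable from
# its leading and trailing characters, which is why this cheap target still
# forces the network to model word cooccurrence statistics.
print(character_target("statistics"))
```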
From e4fc7e0222621c40b6d0aa025d3fc0450a672079 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 22:34:36 +0200 Subject: [PATCH 041/133] fixing output sample to proper 2D array --- spacy/cli/debug_model.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 58908c5e8..04a14bdc9 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -60,13 +60,12 @@ def debug_model_cli( msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) pipe = nlp.get_pipe(component) - if hasattr(pipe, "model"): - model = pipe.model - else: + if not hasattr(pipe, "model"): msg.fail( f"The component '{component}' does not specify an object that holds a Model.", exits=1, ) + model = pipe.model debug_model(model, print_settings=print_settings) @@ -87,7 +86,7 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None # STEP 1: Initializing the model and printing again X = _get_docs() - Y = _get_output(model.ops.xp) + Y = _get_output(model.ops) # The output vector might differ from the official type of the output layer with data_validation(False): model.initialize(X=X, Y=Y) @@ -113,9 +112,11 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None msg.divider(f"STEP 3 - prediction") msg.info(str(prediction)) + msg.good(f"Succesfully ended analysis - model looks good!") + def get_gradient(model, Y): - goldY = _get_output(model.ops.xp) + goldY = _get_output(model.ops) return Y - goldY @@ -133,8 +134,14 @@ def _get_docs(lang: str = "en"): return list(nlp.pipe(_sentences())) -def _get_output(xp): - return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32") +def _get_output(ops): + docs = len(_get_docs()) + labels = 6 + output = ops.alloc2f(d0=docs, d1=labels) + for i in range(docs): + for j in range(labels): + output[i, j] = 1 / (i+j+0.01) + return ops.xp.asarray(output) def _print_model(model, print_settings): From a88106e852b08bcbbe607d5bb83929e5a13120f4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 18 Sep 2020 03:01:29 +0200 Subject: [PATCH 042/133] Remove W106: HEAD and SENT_START in doc.from_array (#6086) * Remove W106: HEAD and SENT_START in doc.from_array This warning was hacky and being triggered too often. * Fix test --- spacy/errors.py | 3 --- spacy/tests/doc/test_doc_api.py | 5 ++--- spacy/tokens/doc.pyx | 2 -- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 173aedab9..81e3616be 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -119,9 +119,6 @@ class Warnings: W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you " "need to match on a stream of documents, you can use nlp.pipe and " "call the {matcher} on each Doc object.") - W106 = ("Both HEAD and SENT_START are included as attributes in " - "doc.from_array(). The parse trees based on the HEAD attribute " - "will override the values in SENT_START.") W107 = ("The property Doc.{prop} is deprecated. 
Use " "Doc.has_annotation(\"{attr}\") instead.") diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index ce979d3d1..c979931b1 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -274,12 +274,11 @@ def test_doc_from_array_sent_starts(en_vocab): # fmt: on doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - # HEAD overrides SENT_START with warning + # HEAD overrides SENT_START without warning attrs = [SENT_START, HEAD] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) - with pytest.warns(UserWarning): - new_doc.from_array(attrs, arr) + new_doc.from_array(attrs, arr) # no warning using default attrs attrs = doc._get_array_attrs() diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5c5443258..2d9de278b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -817,8 +817,6 @@ cdef class Doc: if array.dtype != numpy.uint64: warnings.warn(Warnings.W028.format(type=array.dtype)) - if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs: - warnings.warn(Warnings.W106) cdef int i, col cdef int32_t abs_head_index cdef attr_id_t attr_id From d32ce121beb38d05e1e926053f1fdf9cce8d2aa6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 18 Sep 2020 13:41:12 +0200 Subject: [PATCH 043/133] Fix docs [ci skip] --- website/docs/api/top-level.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index f52c63f18..a37f24213 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -84,7 +84,7 @@ Create a blank pipeline of a given language class. This function is the twin of | _keyword-only_ | | | `vocab` 3 | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | | `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | -| `meta` 3 | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ | +| `meta` 3 | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ | | **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~ | ### spacy.info {#spacy.info tag="function"} From bbdb5f62b70e9e12c6d4a8d9581e064ce846d19c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 18 Sep 2020 14:26:42 +0200 Subject: [PATCH 044/133] Temporary work-around for scoring a subset of components (#6090) * Try hacking the scorer to work around sentence boundaries * Upd scorer * Set dev version * Upd scorer hack * Fix version * Improve comment on hack --- spacy/scorer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/spacy/scorer.py b/spacy/scorer.py index 7f7418237..da22d59d4 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -270,6 +270,18 @@ class Scorer: for example in examples: pred_doc = example.predicted gold_doc = example.reference + # TODO + # This is a temporary hack to work around the problem that the scorer + # fails if you have examples that are not fully annotated for all + # the tasks in your pipeline. For instance, you might have a corpus + # of NER annotations that does not set sentence boundaries, but the + # pipeline includes a parser or senter, and then the score_weights + # are used to evaluate that component. 
When the scorer attempts + # to read the sentences from the gold document, it fails. + try: + list(getter(gold_doc, attr)) + except ValueError: + continue # Find all labels in gold and doc labels = set( [k.label_ for k in getter(gold_doc, attr)] From 0406200a1ea1c960cf6d07c11f91f3b4d7f2d551 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 18 Sep 2020 15:13:13 +0200 Subject: [PATCH 045/133] Update docs [ci skip] --- website/docs/api/data-formats.md | 43 +++++++++++++++++++------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index b9e185d9c..3ed846b9e 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -123,20 +123,11 @@ $ python -m spacy train config.cfg --paths.train ./corpus/train.spacy ### corpora {#config-corpora tag="section"} -This section defines a dictionary mapping of string keys to `Callable` -functions. Each callable takes an `nlp` object and yields -[`Example`](/api/example) objects. By default, the two keys `train` and `dev` -are specified and each refer to a [`Corpus`](/api/top-level#Corpus). When -pretraining, an additional pretrain section is added that defaults to a -[`JsonlReader`](/api/top-level#JsonlReader). - -These subsections can be expanded with additional subsections, each referring to -a callback of type `Callable[[Language], Iterator[Example]]`: - > #### Example > > ```ini > [corpora] +> > [corpora.train] > @readers = "spacy.Corpus.v1" > path = ${paths:train} @@ -148,28 +139,44 @@ a callback of type `Callable[[Language], Iterator[Example]]`: > [corpora.pretrain] > @readers = "spacy.JsonlReader.v1" > path = ${paths.raw} -> min_length = 5 -> max_length = 500 > -> [corpora.mydata] -> @readers = "my_reader.v1" -> shuffle = true +> [corpora.my_custom_data] +> @readers = "my_custom_reader.v1" > ``` -Alternatively, the `corpora` block could refer to one function with return type -`Dict[str, Callable[[Language], Iterator[Example]]]`: +This section defines a **dictionary** mapping of string keys to functions. Each +function takes an `nlp` object and yields [`Example`](/api/example) objects. By +default, the two keys `train` and `dev` are specified and each refer to a +[`Corpus`](/api/top-level#Corpus). When pretraining, an additional `pretrain` +section is added that defaults to a [`JsonlReader`](/api/top-level#JsonlReader). +You can also register custom functions that return a callable. + +| Name | Description | +| ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `train` | Training data corpus, typically used in `[training]` block. ~~Callable[[Language], Iterator[Example]]~~ | +| `dev` | Development data corpus, typically used in `[training]` block. ~~Callable[[Language], Iterator[Example]]~~ | +| `pretrain` | Raw text for [pretraining](/usage/embeddings-transformers#pretraining), typically used in `[pretraining]` block (if available). ~~Callable[[Language], Iterator[Example]]~~ | +| ... | Any custom or alternative corpora. ~~Callable[[Language], Iterator[Example]]~~ | + +Alternatively, the `[corpora]` block can refer to **one function** that returns +a dictionary keyed by the corpus names. This can be useful if you want to load a +single corpus once and then divide it up into `train` and `dev` partitions. 
> #### Example > > ```ini > [corpora] -> @readers = "my_dict_reader.v1" +> @readers = "my_custom_reader.v1" > train_path = ${paths:train} > dev_path = ${paths:dev} > shuffle = true > > ``` +| Name | Description | +| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `corpora` | A dictionary keyed by string names, mapped to corpus functions that receive the current `nlp` object and return an iterator of [`Example`](/api/example) objects. ~~Dict[str, Callable[[Language], Iterator[Example]]]~~ | + ### training {#config-training tag="section"} This section defines settings and controls for the training and evaluation From eed4b785f51fcff2783e06306441f55437fc95fb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 18 Sep 2020 15:45:55 +0200 Subject: [PATCH 046/133] Load vocab lookups tables at beginning of training Similar to how vectors are handled, move the vocab lookups to be loaded at the start of training rather than when the vocab is initialized, since the vocab doesn't have access to the full config when it's created. The option moves from `nlp.load_vocab_data` to `training.lookups`. Typically these tables will come from `spacy-lookups-data`, but any `Lookups` object can be provided. The loading from `spacy-lookups-data` is now strict, so configs for each language should specify the exact tables required. This also makes it easier to control whether the larger clusters and probs tables are included. To load `lexeme_norm` from `spacy-lookups-data`: ``` [training.lookups] @misc = "spacy.LoadLookupsData.v1" lang = ${nlp.lang} tables = ["lexeme_norm"] ``` --- spacy/cli/train.py | 1 + spacy/default_config.cfg | 2 +- spacy/language.py | 8 +++++++- spacy/schemas.py | 3 ++- spacy/tests/test_util.py | 7 ++----- spacy/util.py | 8 ++++++++ spacy/vocab.pyx | 17 +++++++++-------- 7 files changed, 30 insertions(+), 16 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 50306b350..c6b39c289 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -88,6 +88,7 @@ def train( sourced_components = get_sourced_components(config) with show_validation_error(config_path): nlp, config = util.load_model_from_config(config) + util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"]) if config["training"]["vectors"] is not None: util.load_vectors_into_model(nlp, config["training"]["vectors"]) raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index c7c9593d7..1517421f0 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -12,7 +12,6 @@ use_pytorch_for_gpu_memory = false lang = null pipeline = [] disabled = [] -load_vocab_data = true before_creation = null after_creation = null after_pipeline_creation = null @@ -58,6 +57,7 @@ accumulate_gradient = 1 init_tok2vec = ${paths.init_tok2vec} raw_text = ${paths.raw} vectors = null +lookups = null # Controls early-stopping. 0 or -1 mean unlimited. patience = 1600 max_epochs = 0 diff --git a/spacy/language.py b/spacy/language.py index d530e6b92..1d0990c55 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -31,6 +31,7 @@ from .schemas import ConfigSchema from .git_info import GIT_VERSION from . import util from . import about +from .lookups import load_lookups # This is the base config will all settings (training etc.) 
@@ -86,6 +87,12 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: return tokenizer_factory +@registry.misc("spacy.LoadLookupsData.v1") +def load_lookups_data(lang, tables): + lookups = load_lookups(lang=lang, tables=tables) + return lookups + + class Language: """A text-processing pipeline. Usually you'll load this once per process, and pass the instance around your application. @@ -152,7 +159,6 @@ class Language: self.lang, self.Defaults, vectors_name=vectors_name, - load_data=self._config["nlp"]["load_vocab_data"], ) else: if (self.lang and vocab.lang) and (self.lang != vocab.lang): diff --git a/spacy/schemas.py b/spacy/schemas.py index 06bc4beed..c72b5ca8b 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -8,6 +8,7 @@ from collections import defaultdict from thinc.api import Optimizer from .attrs import NAMES +from .lookups import Lookups if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports @@ -198,6 +199,7 @@ class ModelMetaSchema(BaseModel): class ConfigSchemaTraining(BaseModel): # fmt: off vectors: Optional[StrictStr] = Field(..., title="Path to vectors") + lookups: Optional[Lookups] = Field(..., title="Vocab lookups") dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data") train_corpus: StrictStr = Field(..., title="Path in the config to the training data") batcher: Batcher = Field(..., title="Batcher for the training data") @@ -227,7 +229,6 @@ class ConfigSchemaNlp(BaseModel): pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default") tokenizer: Callable = Field(..., title="The tokenizer to use") - load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data") before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization") after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed") after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed") diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 1f073ab32..8c931d31e 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -69,7 +69,6 @@ def test_util_dot_section(): [nlp] lang = "en" pipeline = ["textcat"] - load_vocab_data = false [components] @@ -95,15 +94,13 @@ def test_util_dot_section(): # not exclusive_classes assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False # Test that default values got overwritten - assert not en_config["nlp"]["load_vocab_data"] - assert nl_config["nlp"]["load_vocab_data"] # default value True + assert en_config["nlp"]["pipeline"] == ["textcat"] + assert nl_config["nlp"]["pipeline"] == [] # default value [] # Test proper functioning of 'dot_to_object' with pytest.raises(KeyError): dot_to_object(en_config, "nlp.pipeline.tagger") with pytest.raises(KeyError): dot_to_object(en_config, "nlp.unknownattribute") - assert not dot_to_object(en_config, "nlp.load_vocab_data") - assert dot_to_object(nl_config, "nlp.load_vocab_data") assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer) diff --git a/spacy/util.py b/spacy/util.py index 18b34e4d6..2e285a128 100644 --- a/spacy/util.py +++ 
b/spacy/util.py @@ -253,6 +253,14 @@ def load_vectors_into_model( nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) +def load_vocab_data_into_model( + nlp: "Language", *, lookups: Optional["Lookups"]=None +) -> None: + """Load vocab data.""" + if lookups: + nlp.vocab.load_lookups(lookups) + + def load_model( name: Union[str, Path], *, diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ef0847e54..94289036a 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang -def create_vocab(lang, defaults, vectors_name=None, load_data=True): +def create_vocab(lang, defaults, vectors_name=None): # If the spacy-lookups-data package is installed, we pre-populate the lookups # with lexeme data, if available - if load_data: - tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"] - lookups = load_lookups(lang, tables=tables, strict=False) - else: - lookups = Lookups() lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters} # This is messy, but it's the minimal working fix to Issue #639. lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words) @@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True): lex_attrs[NORM] = util.add_lookups( lex_attrs.get(NORM, LEX_ATTRS[NORM]), BASE_NORMS, - lookups.get_table("lexeme_norm", {}), ) return Vocab( lex_attr_getters=lex_attrs, - lookups=lookups, writing_system=defaults.writing_system, get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"), vectors_name=vectors_name, @@ -424,6 +417,14 @@ cdef class Vocab: orth = self.strings.add(orth) return orth in self.vectors + def load_lookups(self, lookups): + self.lookups = lookups + if lookups.has_table("lexeme_norm"): + self.lex_attr_getters[NORM] = util.add_lookups( + self.lex_attr_getters[NORM], + lookups.get_table("lexeme_norm"), + ) + def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. 
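The `[training.lookups]` mechanism introduced in this patch also works with user-defined functions registered via `registry.misc`, following the same pattern as the `spacy.LoadLookupsData.v1` function above. The snippet below is a hypothetical example: the registered name and the table contents are invented, but the `Lookups` API and registry calls are the same ones used in the patch.

```python
# Hypothetical custom lookups loader, mirroring the pattern of the
# "spacy.LoadLookupsData.v1" function registered in this patch.
# The registered name and the table contents are made up for illustration.
from spacy.lookups import Lookups
from spacy.util import registry

@registry.misc("my_project.CustomLookupsLoader.v1")
def load_custom_lookups() -> Lookups:
    lookups = Lookups()
    # A "lexeme_norm" table feeds the NORM lexeme attribute.
    lookups.add_table("lexeme_norm", {"gonna": "going to", "Jan.": "January"})
    return lookups

# In the training config this would be referenced as:
#   [training.lookups]
#   @misc = "my_project.CustomLookupsLoader.v1"
```

Because the tables are now loaded at the start of training rather than when the vocab is created, and loading from `spacy-lookups-data` is strict, configs should list exactly the tables they need.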
From 73ff52b9ec9e61ae2d7faeacfef1b7bee53ea10e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 18 Sep 2020 16:43:15 +0200 Subject: [PATCH 047/133] hack for tok2vec listener --- spacy/cli/debug_model.py | 26 +++++++++++++++++--------- spacy/errors.py | 3 ++- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 04a14bdc9..1d8d043fd 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -66,10 +66,12 @@ def debug_model_cli( exits=1, ) model = pipe.model - debug_model(model, print_settings=print_settings) + # call _link_components directly as we won't call nlp.begin_training + nlp._link_components() + debug_model(nlp, model, print_settings=print_settings) -def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None): +def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None): if not isinstance(model, Model): msg.fail( f"Requires a Thinc Model to be analysed, but found {type(model)} instead.", @@ -86,10 +88,10 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None # STEP 1: Initializing the model and printing again X = _get_docs() - Y = _get_output(model.ops) + goldY = _get_output(model.ops) # The output vector might differ from the official type of the output layer with data_validation(False): - model.initialize(X=X, Y=Y) + model.initialize(X=X, Y=goldY) if print_settings.get("print_after_init"): msg.divider(f"STEP 1 - after initialization") _print_model(model, print_settings) @@ -97,9 +99,16 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None # STEP 2: Updating the model and printing again optimizer = Adam(0.001) set_dropout_rate(model, 0.2) + # ugly hack to deal with Tok2Vec listeners + tok2vec = None + if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener": + tok2vec = nlp.get_pipe("tok2vec") + tok2vec.model.initialize(X=X) for e in range(3): - Y, get_dX = model.begin_update(_get_docs()) - dY = get_gradient(model, Y) + if tok2vec: + tok2vec.predict(X) + Y, get_dX = model.begin_update(X) + dY = get_gradient(goldY, Y) get_dX(dY) model.finish_update(optimizer) if print_settings.get("print_after_training"): @@ -107,7 +116,7 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None _print_model(model, print_settings) # STEP 3: the final prediction - prediction = model.predict(_get_docs()) + prediction = model.predict(X) if print_settings.get("print_prediction"): msg.divider(f"STEP 3 - prediction") msg.info(str(prediction)) @@ -115,8 +124,7 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None msg.good(f"Succesfully ended analysis - model looks good!") -def get_gradient(model, Y): - goldY = _get_output(model.ops) +def get_gradient(goldY, Y): return Y - goldY diff --git a/spacy/errors.py b/spacy/errors.py index 173aedab9..af307e069 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -545,7 +545,8 @@ class Errors: E949 = ("Can only create an alignment when the texts are the same.") E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. 
{id2}") - E954 = ("The Tok2Vec listener did not receive a valid input.") + E954 = ("The Tok2Vec listener did not receive any valid input from an upstream " + "component.") E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.") E956 = ("Can't find component '{name}' in [components] block in the config. " "Available components: {opts}") From 47080fba98bf7efd7432a0ac831d5715fad91a59 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 18 Sep 2020 19:43:19 +0200 Subject: [PATCH 048/133] Minor renaming / refactoring * Rename loader to `spacy.LookupsDataLoader.v1`, add debugging message * Make `Vocab.lookups` a property --- spacy/language.py | 3 ++- spacy/util.py | 2 +- spacy/vocab.pxd | 2 +- spacy/vocab.pyx | 19 ++++++++++++------- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 1d0990c55..7d463731a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -87,8 +87,9 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: return tokenizer_factory -@registry.misc("spacy.LoadLookupsData.v1") +@registry.misc("spacy.LookupsDataLoader.v1") def load_lookups_data(lang, tables): + util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}") lookups = load_lookups(lang=lang, tables=tables) return lookups diff --git a/spacy/util.py b/spacy/util.py index 2e285a128..88162b23a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -258,7 +258,7 @@ def load_vocab_data_into_model( ) -> None: """Load vocab data.""" if lookups: - nlp.vocab.load_lookups(lookups) + nlp.vocab.lookups = lookups def load_model( diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 69cec7d3d..7d8dfd5d6 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -28,7 +28,7 @@ cdef class Vocab: cpdef readonly StringStore strings cpdef public Morphology morphology cpdef public object vectors - cpdef public object lookups + cpdef public object _lookups cpdef public object writing_system cpdef public object get_noun_chunks cdef readonly int length diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 94289036a..ce104d9db 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -417,13 +417,18 @@ cdef class Vocab: orth = self.strings.add(orth) return orth in self.vectors - def load_lookups(self, lookups): - self.lookups = lookups - if lookups.has_table("lexeme_norm"): - self.lex_attr_getters[NORM] = util.add_lookups( - self.lex_attr_getters[NORM], - lookups.get_table("lexeme_norm"), - ) + property lookups: + def __get__(self): + return self._lookups + + def __set__(self, lookups): + self._lookups = lookups + if lookups.has_table("lexeme_norm"): + self.lex_attr_getters[NORM] = util.add_lookups( + self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), + self.lookups.get_table("lexeme_norm"), + ) + def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. 
From 39872de1f6e49c4b59ed747a2f15ca448a52f7db Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sat, 19 Sep 2020 01:17:02 +0200 Subject: [PATCH 049/133] Introducing the gpu_allocator (#6091) * rename 'use_pytorch_for_gpu_memory' to 'gpu_allocator' * --code instead of --code-path * update documentation * avoid querying the "system" section directly * add explanation of gpu_allocator to TF/PyTorch section in docs * fix typo * fix typo 2 * use set_gpu_allocator from thinc 8.0.0a34 * default null instead of empty string --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- spacy/cli/debug_model.py | 9 +++++++-- spacy/cli/pretrain.py | 17 +++++++++-------- spacy/cli/templates/quickstart_training.jinja | 2 +- spacy/cli/train.py | 13 ++++++------- spacy/default_config.cfg | 4 ++-- spacy/schemas.py | 1 + website/docs/api/cli.md | 4 +++- website/docs/api/data-formats.md | 1 + website/docs/api/top-level.md | 14 ++++++++------ website/docs/usage/layers-architectures.md | 12 ++++++++++++ 13 files changed, 54 insertions(+), 31 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a413a099c..5290660aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a33,<8.0.0a40", + "thinc>=8.0.0a34,<8.0.0a40", "blis>=0.4.0,<0.5.0", "pytokenizations", "pathy" diff --git a/requirements.txt b/requirements.txt index 55fe627b8..4d6c1dfd0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a33,<8.0.0a40 +thinc>=8.0.0a34,<8.0.0a40 blis>=0.4.0,<0.5.0 ml_datasets==0.2.0a0 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index 359e63172..dd0975800 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a33,<8.0.0a40 + thinc>=8.0.0a34,<8.0.0a40 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a33,<8.0.0a40 + thinc>=8.0.0a34,<8.0.0a40 blis>=0.4.0,<0.5.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index a4899a458..349849f58 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -2,7 +2,7 @@ from typing import Dict, Any, Optional from pathlib import Path from wasabi import msg from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam -from thinc.api import Model, data_validation +from thinc.api import Model, data_validation, set_gpu_allocator import typer from ._util import Arg, Opt, debug_cli, show_validation_error @@ -53,7 +53,12 @@ def debug_model_cli( } config_overrides = parse_config_overrides(ctx.args) with show_validation_error(config_path): - config = util.load_config(config_path, overrides=config_overrides) + config = util.load_config( + config_path, overrides=config_overrides, interpolate=True + ) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) nlp, config = util.load_model_from_config(config_path) seed = config["training"]["seed"] if seed is not None: diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index aec077eb7..9e913396e 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -4,10 +4,9 @@ import time import re from collections import Counter from pathlib import Path -from thinc.api import Config -from thinc.api import 
use_pytorch_for_gpu_memory, require_gpu +from thinc.api import require_gpu, set_gpu_allocator from thinc.api import set_dropout_rate, to_categorical, fix_random_seed -from thinc.api import CosineDistance, L2Distance +from thinc.api import Config, CosineDistance, L2Distance from wasabi import msg import srsly from functools import partial @@ -32,7 +31,7 @@ def pretrain_cli( ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False), output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"), - code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), @@ -99,10 +98,12 @@ def pretrain( epoch_resume: Optional[int] = None, use_gpu: int = -1, ): - if config["system"].get("seed") is not None: - fix_random_seed(config["system"]["seed"]) - if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"): - use_pytorch_for_gpu_memory() + if config["training"]["seed"] is not None: + fix_random_seed(config["training"]["seed"]) + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) + nlp, config = util.load_model_from_config(config) P_cfg = config["pretraining"] corpus = dot_to_object(config, P_cfg["corpus"]) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 00b77af4d..ef608e5e8 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -8,7 +8,7 @@ train = "" dev = "" [system] -use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }} +gpu_allocator = {{ "pytorch" if use_transformer else "" }} [nlp] lang = "{{ lang }}" diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 50306b350..debecd0b1 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -6,8 +6,7 @@ from pathlib import Path from wasabi import msg import thinc import thinc.schedules -from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed -from thinc.api import Config, Optimizer +from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator import random import typer import logging @@ -29,7 +28,7 @@ def train_cli( ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True), output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), - code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for 
debugging purposes"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), resume: bool = Opt(False, "--resume", "-R", help="Resume training"), @@ -79,11 +78,11 @@ def train( config = util.load_config( config_path, overrides=config_overrides, interpolate=True ) - if config.get("training", {}).get("seed") is not None: + if config["training"]["seed"] is not None: fix_random_seed(config["training"]["seed"]) - if config.get("system", {}).get("use_pytorch_for_gpu_memory"): - # It feels kind of weird to not have a default for this. - use_pytorch_for_gpu_memory() + allocator = config["training"]["gpu_allocator"] + if use_gpu >= 0 and allocator: + set_gpu_allocator(allocator) # Use original config here before it's resolved to functions sourced_components = get_sourced_components(config) with show_validation_error(config_path): diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index c7c9593d7..f4a453f2a 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -6,7 +6,7 @@ init_tok2vec = null [system] seed = 0 -use_pytorch_for_gpu_memory = false +gpu_allocator = null [nlp] lang = null @@ -52,6 +52,7 @@ limit = 0 # Training hyper-parameters and additional features. [training] seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} dropout = 0.1 accumulate_gradient = 1 # Extra resources for transfer-learning or pseudo-rehearsal @@ -75,7 +76,6 @@ train_corpus = "corpora.train" [training.logger] @loggers = "spacy.ConsoleLogger.v1" - [training.batcher] @batchers = "spacy.batch_by_words.v1" discard_oversize = false diff --git a/spacy/schemas.py b/spacy/schemas.py index 06bc4beed..db71af9ca 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -207,6 +207,7 @@ class ConfigSchemaTraining(BaseModel): max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for") eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)") seed: Optional[StrictInt] = Field(..., title="Random seed") + gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU") accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index bd65a1516..7374e1e3f 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -763,6 +763,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides | `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. 
~~Any (option/flag)~~ | | **CREATES** | The final trained pipeline and the best trained pipeline. | @@ -798,11 +799,12 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [ | Name | Description | | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ | | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | +| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ | | `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ | +| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ | | **CREATES** | The pretrained weights that can be used to initialize `spacy train`. | diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 3ed846b9e..6e80bb409 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -189,6 +189,7 @@ process that are used when you run [`spacy train`](/api/cli#train). | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be "pytorch" or "tensorflow". Defaults to variable `${system.gpu_allocator}`. ~~str~~ | | `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | | `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | | `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 5d850be01..3f51d21aa 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -145,9 +145,10 @@ pipelines. > nlp = spacy.load("en_core_web_sm") > ``` -| Name | Description | -| ----------- | --------------------------------------- | -| **RETURNS** | Whether the GPU was activated. ~~bool~~ | +| Name | Description | +| ----------- | ------------------------------------------------ | +| `gpu_id` | Device index to select. Defaults to `0`. ~~int~~ | +| **RETURNS** | Whether the GPU was activated. 
~~bool~~ | ### spacy.require_gpu {#spacy.require_gpu tag="function" new="2.0.14"} @@ -164,9 +165,10 @@ and _before_ loading any pipelines. > nlp = spacy.load("en_core_web_sm") > ``` -| Name | Description | -| ----------- | --------------- | -| **RETURNS** | `True` ~~bool~~ | +| Name | Description | +| ----------- | ------------------------------------------------ | +| `gpu_id` | Device index to select. Defaults to `0`. ~~int~~ | +| **RETURNS** | `True` ~~bool~~ | ## displaCy {#displacy source="spacy/displacy"} diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index aefc64ece..f9787d815 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -356,6 +356,18 @@ that training configs are complete and experiments fully reproducible. +Note that when using a PyTorch or Tensorflow model, it is recommended to set the GPU +memory allocator accordingly. When `gpu_allocator` is set to "pytorch" or +"tensorflow" in the training config, cupy will allocate memory via those respective libraries, +preventing OOM errors when there's available memory sitting in the other +library's pool. + +```ini +### config.cfg (excerpt) +[training] +gpu_allocator = "pytorch" +``` + ## Custom models with Thinc {#thinc} Of course it's also possible to define the `Model` from the previous section From 6db1d5dc0dff848dded3d2990543f749707afc45 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 19 Sep 2020 19:11:30 +0200 Subject: [PATCH 050/133] trying some stuff --- spacy/cli/debug_model.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 1d8d043fd..09feaf671 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -89,6 +89,7 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = # STEP 1: Initializing the model and printing again X = _get_docs() goldY = _get_output(model.ops) + # _set_output_dim(nO=goldY.shape[-1], model=model) # The output vector might differ from the official type of the output layer with data_validation(False): model.initialize(X=X, Y=goldY) @@ -108,6 +109,7 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = if tok2vec: tok2vec.predict(X) Y, get_dX = model.begin_update(X) + print("get_dX", get_dX) dY = get_gradient(goldY, Y) get_dX(dY) model.finish_update(optimizer) @@ -152,6 +154,10 @@ def _get_output(ops): return ops.xp.asarray(output) +def _get_output_old(xp): + return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32") + + def _print_model(model, print_settings): layers = print_settings.get("layers", "") parameters = print_settings.get("parameters", False) @@ -200,3 +206,12 @@ def _print_matrix(value): sample_matrix = sample_matrix[0:5] result = result + str(sample_matrix) return result + + +def _set_output_dim(model, nO): + # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx + if model.has_dim("nO") is None: + model.set_dim("nO", nO) + if model.has_ref("output_layer"): + if model.get_ref("output_layer").has_dim("nO") is None: + model.get_ref("output_layer").set_dim("nO", nO) \ No newline at end of file From 554c9a24978d968113da02783c7257b5133ec5e6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Sep 2020 12:30:53 +0200 Subject: [PATCH 051/133] Update docs [ci skip] --- spacy/cli/templates/quickstart_training.jinja | 6 +++++- website/docs/api/data-formats.md | 7 +++---- 
website/docs/api/top-level.md | 10 ++++++++++ website/docs/usage/embeddings-transformers.md | 10 ++++++++++ website/docs/usage/projects.md | 10 ++++------ 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index ef608e5e8..0db4c8a59 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -8,7 +8,11 @@ train = "" dev = "" [system] -gpu_allocator = {{ "pytorch" if use_transformer else "" }} +{% if use_transformer -%} +gpu_allocator = "pytorch" +{% else -%} +gpu_allocator = null +{% endif %} [nlp] lang = "{{ lang }}" diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 6e80bb409..3a214428b 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -60,7 +60,6 @@ your config and check that it's valid, you can run the > [nlp] > lang = "en" > pipeline = ["tagger", "parser", "ner"] -> load_vocab_data = true > before_creation = null > after_creation = null > after_pipeline_creation = null @@ -77,7 +76,6 @@ Defines the `nlp` object, its tokenizer and | `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ | | `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ | | `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a pipeline is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ | -| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ | | `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ | | `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | | `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | @@ -189,9 +187,10 @@ process that are used when you run [`spacy train`](/api/cli#train). | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be "pytorch" or "tensorflow". Defaults to variable `${system.gpu_allocator}`. ~~str~~ | | `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. 
Defaults to variable `${system.gpu_allocator}`. ~~str~~ | | `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | | `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | | `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | | `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | @@ -476,7 +475,7 @@ lexical data. Here's an example of the 20 most frequent lexemes in the English training data: ```json -%%GITHUB_SPACY / extra / example_data / vocab - data.jsonl +%%GITHUB_SPACY/extra/example_data/vocab-data.jsonl ``` ## Pipeline meta {#meta} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 3f51d21aa..7afe02403 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -458,6 +458,16 @@ remain in the config file stored on your local system. | `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | + + +Get started with tracking your spaCy training runs in Weights & Biases using our +project template. It trains on the IMDB Movie Review Dataset and includes a +simple config with the built-in `WandbLogger`, as well as a custom example of +creating variants of the config for a simple hyperparameter grid search and +logging the results. + + + ## Readers {#readers source="spacy/training/corpus.py" new="3"} Corpus readers are registered functions that load data and return a function diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 4adcd927c..c6c703842 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -655,6 +655,16 @@ and pass in optional config overrides, like the path to the raw text file: $ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl ``` +The following defaults are used for the `[pretraining]` block and merged into +your existing config when you run [`init config`](/api/cli#init-config) or +[`init fill-config`](/api/cli#init-fill-config) with `--pretraining`. If needed, +you can [configure](#pretraining-configure) the settings and hyperparameters or +change the [objective](#pretraining-details). + +```ini +%%GITHUB_SPACY/spacy/default_config_pretraining.cfg +``` + ### How pretraining works {#pretraining-details} The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 665caa15b..08bfb9da2 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -976,14 +976,12 @@ your results. 
![Screenshot: Parameter importance using config values](../images/wandb2.jpg 'Parameter importance using config values') - From 889128e5c586f39eb6f18ae6a6b6fbe1505f4080 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 20 Sep 2020 16:20:57 +0200 Subject: [PATCH 052/133] Improve error handling in run_command --- spacy/util.py | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 88162b23a..6e7b28fec 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -659,8 +659,8 @@ def join_command(command: List[str]) -> str: def run_command( command: Union[str, List[str]], *, - capture: bool = False, stdin: Optional[Any] = None, + capture: bool=False, ) -> Optional[subprocess.CompletedProcess]: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. @@ -668,33 +668,46 @@ def run_command( command (str / List[str]): The command. If provided as a string, the string will be split using shlex.split. stdin (Optional[Any]): stdin to read from or None. - capture (bool): Whether to capture the output. + capture (bool): Whether to capture the output and errors. If False, + the stdout and stderr will not be redirected, and if there's an error, + sys.exit will be called with the returncode. You should use capture=False + when you want to turn over execution to the command, and capture=True + when you want to run the command more like a function. RETURNS (Optional[CompletedProcess]): The process object. """ if isinstance(command, str): - command = split_command(command) + cmd_list = split_command(command) + cmd_str = command + else: + cmd_list = command + cmd_str = " ".join(command) try: ret = subprocess.run( - command, + cmd_list, env=os.environ.copy(), input=stdin, encoding="utf8", - check=True, + check=False, stdout=subprocess.PIPE if capture else None, - stderr=subprocess.PIPE if capture else None, + stderr=subprocess.STDOUT if capture else None, ) except FileNotFoundError: + # Indicates the *command* wasn't found, it's an error before the command + # is run. raise FileNotFoundError( - Errors.E970.format(str_command=" ".join(command), tool=command[0]) + Errors.E970.format(str_command=cmd_str, tool=cmd_list[0]) ) from None - except subprocess.CalledProcessError as e: - # We don't want a duplicate traceback here so we're making sure the - # CalledProcessError isn't re-raised. We also print both the string - # message and the stderr, in case the error only has one of them. 
- print(e.stderr) - print(e) - sys.exit(1) - if ret.returncode != 0: + if ret.returncode != 0 and capture: + message = f"Error running command:\n\n{cmd_str}\n\n" + message += f"Subprocess exited with status {ret.returncode}" + if ret.stdout is not None: + message += f"\n\nProcess log (stdout and stderr):\n\n" + message += ret.stdout + error = subprocess.SubprocessError(message) + error.ret = ret + error.command = cmd_str + raise error + elif ret.returncode != 0: sys.exit(ret.returncode) return ret From 2c24d633d0f81e17dca2158b5185f316ae910130 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 20 Sep 2020 16:21:43 +0200 Subject: [PATCH 053/133] Use updated run_command --- spacy/cli/package.py | 2 +- spacy/cli/project/run.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 8d6cd84c1..49a0ab75d 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -110,7 +110,7 @@ def package( msg.good(f"Successfully created package '{model_name_v}'", main_path) if create_sdist: with util.working_dir(main_path): - util.run_command([sys.executable, "setup.py", "sdist"]) + util.run_command([sys.executable, "setup.py", "sdist"], capture=False) zip_file = main_path / "dist" / f"{model_name_v}.tar.gz" msg.good(f"Successfully created zipped Python package", zip_file) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index eb7b8cc5b..13c28f1da 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -144,7 +144,7 @@ def run_commands( if not silent: print(f"Running command: {join_command(command)}") if not dry: - run_command(command) + run_command(command, capture=False) def validate_subcommand( From a0fb5e50dbb1e24901f7b1470ee53cc6bce7a4d6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 20 Sep 2020 16:22:04 +0200 Subject: [PATCH 054/133] Use simple git clone call if not sparse --- spacy/cli/_util.py | 77 ++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 44 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index e8f3be995..6675f4d50 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -308,6 +308,31 @@ def git_checkout( msg.fail("Destination of checkout must not exist", exits=1) if not dest.parent.exists(): raise IOError("Parent of destination of checkout must exist") + + if sparse and git_version >= (2, 22): + return git_sparse_checkout(repo, subpath, dest, branch) + elif sparse: + # Only show warnings if the user explicitly wants sparse checkout but + # the Git version doesn't support it + err_old = ( + f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " + f"that doesn't fully support sparse checkout yet." + ) + err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." + msg.warn( + f"{err_unk if git_version == (0, 0) else err_old} " + f"This means that more files than necessary may be downloaded " + f"temporarily. To only download the files needed, make sure " + f"you're using Git v2.22 or above." + ) + with make_tempdir() as tmp_dir: + cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}" + ret = run_command(cmd, capture=True) + # We need Path(name) to make sure we also support subdirectories + shutil.copytree(str(tmp_dir / Path(subpath)), str(dest)) + + +def git_sparse_checkout(repo, subpath, dest, branch): # We're using Git, partial clone and sparse checkout to # only clone the files we need # This ends up being RIDICULOUS. omg. 
@@ -324,47 +349,28 @@ def git_checkout( # *that* we can do by path. # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: - supports_sparse = git_version >= (2, 22) - use_sparse = supports_sparse and sparse # This is the "clone, but don't download anything" part. - cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} " - if use_sparse: - cmd += f"--filter=blob:none" # <-- The key bit - # Only show warnings if the user explicitly wants sparse checkout but - # the Git version doesn't support it - elif sparse: - err_old = ( - f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " - f"that doesn't fully support sparse checkout yet." - ) - err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." - msg.warn( - f"{err_unk if git_version == (0, 0) else err_old} " - f"This means that more files than necessary may be downloaded " - f"temporarily. To only download the files needed, make sure " - f"you're using Git v2.22 or above." - ) - try_run_command(cmd) + cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} --filter=blob:none" + run_command(cmd) # Now we need to find the missing filenames for the subpath we want. # Looking for this 'rev-list' command in the git --help? Hah. cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}" - ret = try_run_command(cmd) + ret = run_command(cmd, capture=True) git_repo = _from_http_to_git(repo) # Now pass those missings into another bit of git internals missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) - if use_sparse and not missings: + if not missings: err = ( f"Could not find any relevant files for '{subpath}'. " f"Did you specify a correct and complete path within repo '{repo}' " f"and branch {branch}?" ) msg.fail(err, exits=1) - if use_sparse: - cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" - try_run_command(cmd) + cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" + run_command(cmd, capture=True) # And finally, we can checkout our subpath cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" - try_run_command(cmd) + run_command(cmd, capture=True) # We need Path(name) to make sure we also support subdirectories shutil.move(str(tmp_dir / Path(subpath)), str(dest)) @@ -378,7 +384,7 @@ def get_git_version( RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns (0, 0) if the version couldn't be determined. """ - ret = try_run_command(["git", "--version"], error=error) + ret = run_command("git --version", capture=True) stdout = ret.stdout.strip() if not stdout or not stdout.startswith("git version"): return (0, 0) @@ -386,23 +392,6 @@ def get_git_version( return (int(version[0]), int(version[1])) -def try_run_command( - cmd: Union[str, List[str]], error: str = "Could not run command" -) -> subprocess.CompletedProcess: - """Try running a command and raise an error if it fails. - - cmd (Union[str, List[str]]): The command to run. - error (str): The error message. - RETURNS (CompletedProcess): The completed process if the command ran. 
- """ - try: - return run_command(cmd, capture=True) - except subprocess.CalledProcessError as e: - msg.fail(error) - print(cmd) - sys.exit(1) - - def _from_http_to_git(repo: str) -> str: if repo.startswith("http://"): repo = repo.replace(r"http://", r"https://") From dc22771f879455a81d8338588aa726a58b08bf50 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 20 Sep 2020 16:30:05 +0200 Subject: [PATCH 055/133] Fix sparse checkout --- spacy/cli/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 6675f4d50..cc7be1144 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -354,7 +354,7 @@ def git_sparse_checkout(repo, subpath, dest, branch): run_command(cmd) # Now we need to find the missing filenames for the subpath we want. # Looking for this 'rev-list' command in the git --help? Hah. - cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}" + cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" ret = run_command(cmd, capture=True) git_repo = _from_http_to_git(repo) # Now pass those missings into another bit of git internals From 8fb59d958c9676f32d84227c0b042a26b088da35 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 20 Sep 2020 16:31:48 +0200 Subject: [PATCH 056/133] Format --- spacy/cli/_util.py | 5 ++++- spacy/util.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index cc7be1144..c67863ef1 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -350,7 +350,10 @@ def git_sparse_checkout(repo, subpath, dest, branch): # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: # This is the "clone, but don't download anything" part. - cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} --filter=blob:none" + cmd = ( + f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " + f"-b {branch} --filter=blob:none" + ) run_command(cmd) # Now we need to find the missing filenames for the subpath we want. # Looking for this 'rev-list' command in the git --help? Hah. diff --git a/spacy/util.py b/spacy/util.py index 6e7b28fec..93000ea27 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -254,7 +254,7 @@ def load_vectors_into_model( def load_vocab_data_into_model( - nlp: "Language", *, lookups: Optional["Lookups"]=None + nlp: "Language", *, lookups: Optional["Lookups"] = None ) -> None: """Load vocab data.""" if lookups: @@ -660,7 +660,7 @@ def run_command( command: Union[str, List[str]], *, stdin: Optional[Any] = None, - capture: bool=False, + capture: bool = False, ) -> Optional[subprocess.CompletedProcess]: """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. 
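The reworked `run_command` helper in the patches above draws a line between two modes: with `capture=False` it hands execution over to the command and exits with the command's return code on failure, while with `capture=True` it behaves more like a function, returning the completed process and raising a `subprocess.SubprocessError` that carries the process and the command string. A minimal usage sketch of that behavior, using `git --version` purely as an illustrative command:

```python
import subprocess
from spacy.util import run_command

# Function-style call: capture stdout/stderr and handle failures ourselves.
try:
    ret = run_command("git --version", capture=True)
    print(ret.stdout.strip())
except subprocess.SubprocessError as err:
    # Per the patch above, the error carries the completed process and the
    # command string as `err.ret` and `err.command`.
    print(err.command, err.ret.returncode)

# Hand-over call: output streams straight to the terminal, and a non-zero
# exit code triggers sys.exit(returncode) instead of raising.
run_command("git --version", capture=False)
```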
From 744f259b9c93858d97937157414cb67641d4c846 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Sep 2020 16:37:23 +0200 Subject: [PATCH 057/133] Update landing [ci skip] --- website/src/widgets/landing.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 77fcdfd81..41b009010 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -30,7 +30,6 @@ import Benchmarks from 'usage/_benchmarks-models.md' const CODE_EXAMPLE = `# pip install spacy # python -m spacy download en_core_web_sm - import spacy # Load English tokenizer, tagger, parser and NER @@ -120,7 +119,7 @@ const Landing = ({ data }) => {
  • ✅ Components for named entity recognition, - part-of-speech-tagging, dependency parsing, sentence segmentation,{' '} + part-of-speech tagging, dependency parsing, sentence segmentation,{' '} text classification, lemmatization, morphological analysis, entity linking and more
  • From b2302c0a1ce7bacafdde22039cbd8da9782a3f27 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Sep 2020 17:44:51 +0200 Subject: [PATCH 058/133] Improve error for missing dependency --- spacy/cli/project/run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 13c28f1da..d7e1075f3 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -59,8 +59,9 @@ def project_run( for dep in cmd.get("deps", []): if not (project_dir / dep).exists(): err = f"Missing dependency specified by command '{subcommand}': {dep}" + err_help = "Maybe you forgot to run the 'project assets' command?" err_kwargs = {"exits": 1} if not dry else {} - msg.fail(err, **err_kwargs) + msg.fail(err, err_help, **err_kwargs) with working_dir(project_dir) as current_dir: rerun = check_rerun(current_dir, cmd) if not rerun and not force: From 012b3a709636224534e44720bca00cb0cc6e3f92 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Sep 2020 17:44:58 +0200 Subject: [PATCH 059/133] Update docs [ci skip] --- website/docs/api/cli.md | 4 +-- website/docs/usage/embeddings-transformers.md | 5 ++- website/docs/usage/facts-figures.md | 6 ++-- website/docs/usage/layers-architectures.md | 12 +++---- website/docs/usage/models.md | 2 -- website/docs/usage/projects.md | 18 ++++------ website/docs/usage/saving-loading.md | 13 +++++-- website/docs/usage/training.md | 11 +++++- website/docs/usage/v3.md | 34 +++++++++++-------- website/meta/site.json | 1 + website/src/components/tag.js | 2 +- website/src/components/util.js | 1 + website/src/widgets/landing.js | 9 ++--- website/src/widgets/project.js | 18 ++++++---- 14 files changed, 77 insertions(+), 59 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 7374e1e3f..53cd954be 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -895,8 +895,6 @@ what you need. By default, spaCy's can provide any other repo (public or private) that you have access to using the `--repo` option. - - ```cli $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse] ``` @@ -904,7 +902,7 @@ $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse] > #### Example > > ```cli -> $ python -m spacy project clone some_example +> $ python -m spacy project clone pipelines/ner_wikiner > ``` > > Clone from custom repo: diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index c6c703842..a855d703c 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -289,8 +289,7 @@ of objects by referring to creation functions, including functions you register yourself. For details on how to get started with training your own model, check out the [training quickstart](/usage/training#quickstart). - + > #### Evaluation details > @@ -68,6 +68,6 @@ our project template. - +--> diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index f9787d815..a58ba2ba9 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -356,11 +356,11 @@ that training configs are complete and experiments fully reproducible. -Note that when using a PyTorch or Tensorflow model, it is recommended to set the GPU -memory allocator accordingly. 
When `gpu_allocator` is set to "pytorch" or -"tensorflow" in the training config, cupy will allocate memory via those respective libraries, -preventing OOM errors when there's available memory sitting in the other -library's pool. +Note that when using a PyTorch or Tensorflow model, it is recommended to set the +GPU memory allocator accordingly. When `gpu_allocator` is set to "pytorch" or +"tensorflow" in the training config, cupy will allocate memory via those +respective libraries, preventing OOM errors when there's available memory +sitting in the other library's pool. ```ini ### config.cfg (excerpt) @@ -489,7 +489,7 @@ with Model.define_operators({">>": chain}): - - ### Downloading and requiring package dependencies {#models-download} spaCy's built-in [`download`](/api/cli#download) command is mostly intended as a diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 08bfb9da2..f8d5a3761 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -29,15 +29,13 @@ and share your results with your team. spaCy projects can be used via the new ![Illustration of project workflow and commands](../images/projects.svg) - spaCy projects make it easy to integrate with many other **awesome tools** in the data science and machine learning ecosystem to track and manage your data @@ -65,10 +63,8 @@ project template and copies the files to a local directory. You can then run the project, e.g. to train a pipeline and edit the commands and scripts to build fully custom workflows. - - ```cli -python -m spacy project clone some_example_project +python -m spacy project clone pipelines/tagger_parser_ud ``` By default, the project will be cloned into the current working directory. You @@ -216,10 +212,8 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up a quick web demo. It looks pretty similar to a config file used to define CI pipelines. - - ```yaml -https://github.com/explosion/projects/tree/v3/tutorials/ner_fashion_brands/project.yml +https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml ``` | Section | Description | diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index c0fe1323c..3a95bf6aa 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -574,7 +574,7 @@ The directory will be created if it doesn't exist, and the whole pipeline data, meta and configuration will be written out. To make the pipeline more convenient to deploy, we recommend wrapping it as a [Python package](/api/cli#package). - + When you save a pipeline in spaCy v3.0+, two files will be exported: a [`config.cfg`](/api/data-formats#config) based on @@ -596,6 +596,15 @@ based on [`nlp.meta`](/api/language#meta). + + +The easiest way to get started with an end-to-end workflow is to clone a +[project template](/usage/projects) and run it – for example, this template that +lets you train a **part-of-speech tagger** and **dependency parser** on a +Universal Dependencies treebank and generates an installable Python package. + + + ### Generating a pipeline package {#models-generating} @@ -699,5 +708,3 @@ class and call [`from_disk`](/api/language#from_disk) instead. 
```python nlp = spacy.blank("en").from_disk("/path/to/data") ``` - - diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index c0f4caad7..6e9de62c5 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -92,7 +92,7 @@ spaCy's binary `.spacy` format. You can either include the data paths in the $ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy ``` - + The recommended config settings generated by the quickstart widget and the [`init config`](/api/cli#init-config) command are based on some general **best @@ -112,6 +112,15 @@ as we run more experiments. + + +The easiest way to get started is to clone a [project template](/usage/projects) +and run it – for example, this end-to-end template that lets you train a +**part-of-speech tagger** and **dependency parser** on a Universal Dependencies +treebank. + + + ## Training config {#config} Training config files include all **settings and hyperparameters** for training diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 24babc9bd..5abeb5707 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -176,18 +176,16 @@ freely combine implementations from different frameworks into a single model. ### Manage end-to-end workflows with projects {#features-projects} - - > #### Example > > ```cli > # Clone a project template -> $ python -m spacy project clone example -> $ cd example +> $ python -m spacy project clone pipelines/tagger_parser_ud +> $ cd tagger_parser_ud > # Download data assets > $ python -m spacy project assets > # Run a workflow -> $ python -m spacy project run train +> $ python -m spacy project run all > ``` spaCy projects let you manage and share **end-to-end spaCy workflows** for @@ -207,14 +205,6 @@ data, [Streamlit](/usage/projects#streamlit) for building interactive apps, [Ray](/usage/projects#ray) for parallel training, [Weights & Biases](/usage/projects#wandb) for experiment tracking, and more! - - - **Usage:** [spaCy projects](/usage/projects), @@ -224,6 +214,15 @@ workflows, from data preprocessing to training and packaging your pipeline. + + +The easiest way to get started is to clone a [project template](/usage/projects) +and run it – for example, this end-to-end template that lets you train a +**part-of-speech tagger** and **dependency parser** on a Universal Dependencies +treebank. + + + ### Parallel and distributed training with Ray {#features-parallel-training} > #### Example @@ -875,7 +874,14 @@ values. You can then use the auto-generated `config.cfg` for training: + python -m spacy train ./config.cfg --output ./output ``` - + + +The easiest way to get started is to clone a [project template](/usage/projects) +and run it – for example, this end-to-end template that lets you train a +**part-of-speech tagger** and **dependency parser** on a Universal Dependencies +treebank. 
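For reference, once the `spacy train` run shown above finishes, the resulting pipeline in the output directory can be loaded back with `spacy.load`. The `model-best` subdirectory name below is an assumption about the default output layout, so check what your run actually produced:

```python
import spacy

# Load the best pipeline written by `spacy train ./config.cfg --output ./output`.
# "model-best" is an assumed subdirectory name; adjust it to match the
# contents of your own output directory.
nlp = spacy.load("./output/model-best")
doc = nlp("This is a sentence.")
print([(token.text, token.pos_, token.dep_) for token in doc])
```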
+ + #### Training via the Python API {#migrating-training-python} diff --git a/website/meta/site.json b/website/meta/site.json index 1955932b9..1a96ca660 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -12,6 +12,7 @@ "companyUrl": "https://explosion.ai", "repo": "explosion/spaCy", "modelsRepo": "explosion/spacy-models", + "projectsRepo": "explosion/projects/tree/v3", "social": { "twitter": "spacy_io", "github": "explosion" diff --git a/website/src/components/tag.js b/website/src/components/tag.js index 3f2b4e994..b406e771e 100644 --- a/website/src/components/tag.js +++ b/website/src/components/tag.js @@ -13,7 +13,7 @@ export default function Tag({ spaced = false, variant, tooltip, children }) { const isValid = isString(children) && !isNaN(children) const version = isValid ? Number(children).toFixed(1) : children const tooltipText = `This feature is new and was introduced in spaCy v${version}` - // TODO: we probably want to handle this more elegantly, but the idea is + // We probably want to handle this more elegantly, but the idea is // that we can hide tags referring to old versions const major = isString(version) ? Number(version.split('.')[0]) : version return major < MIN_VERSION ? null : ( diff --git a/website/src/components/util.js b/website/src/components/util.js index 3d86cf37e..be55f0bb3 100644 --- a/website/src/components/util.js +++ b/website/src/components/util.js @@ -10,6 +10,7 @@ const htmlToReactParser = new HtmlToReactParser() const DEFAULT_BRANCH = 'develop' export const repo = siteMetadata.repo export const modelsRepo = siteMetadata.modelsRepo +export const projectsRepo = siteMetadata.projectsRepo /** * This is used to provide selectors for headings so they can be crawled by diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 41b009010..2e75c893a 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -222,10 +222,11 @@ const Landing = ({ data }) => {


    - {/** TODO: update with actual example */} - - Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum - sodales lectus. + + The easiest way to get started is to clone a project template and run it + – for example, this template for training a{' '} + part-of-speech tagger and{' '} + dependency parser on a Universal Dependencies treebank. diff --git a/website/src/widgets/project.js b/website/src/widgets/project.js index 0bd74bc90..8d309394d 100644 --- a/website/src/widgets/project.js +++ b/website/src/widgets/project.js @@ -4,25 +4,29 @@ import CopyInput from '../components/copy' import Infobox from '../components/infobox' import Link from '../components/link' import { InlineCode } from '../components/code' +import { projectsRepo } from '../components/util' -// TODO: move to meta? -const DEFAULT_REPO = 'https://github.com/explosion/projects/tree/v3' const COMMAND = 'python -m spacy project clone' -export default function Project({ id, repo, children }) { +export default function Project({ + title = 'Get started with a project template', + id, + repo, + children, +}) { const repoArg = repo ? ` --repo ${repo}` : '' const text = `${COMMAND} ${id}${repoArg}` - const url = `${repo || DEFAULT_REPO}/${id}` - const title = ( + const url = `${repo || projectsRepo}/${id}` + const header = ( <> - Get started with a project template:{' '} + {title}:{' '} {id} ) return ( - + {children} From b9d2b29684c051f956ec808705a2e7288ccf27dd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Sep 2020 17:49:09 +0200 Subject: [PATCH 060/133] Update docs [ci skip] --- website/src/styles/copy.module.sass | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/website/src/styles/copy.module.sass b/website/src/styles/copy.module.sass index c6d2f68cb..3a942552d 100644 --- a/website/src/styles/copy.module.sass +++ b/website/src/styles/copy.module.sass @@ -15,6 +15,10 @@ background: transparent resize: none font: inherit + overflow: hidden + white-space: nowrap + text-overflow: ellipsis + margin-right: 1rem .prefix margin-right: 0.75em From 3aa57ce6c9ab162715cad72563b25f5aecb28966 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 09:07:20 +0200 Subject: [PATCH 061/133] Update alignment mode in Doc.char_span docs --- website/docs/api/doc.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 380f6a172..44316ea1e 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -187,8 +187,8 @@ Remove a previously registered extension. ## Doc.char_span {#char_span tag="method" new="2"} Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns -`None` if the character indices don't map to a valid span using the default mode -`"strict". +`None` if the character indices don't map to a valid span using the default +alignment mode `"strict". > #### Example > @@ -198,15 +198,15 @@ Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns > assert span.text == "New York" > ``` -| Name | Description | -| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. 
~int~~ | -| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | -| `kb_id` 2.2 | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | -| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| `mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"inside"` (span of all tokens completely within the character span), `"outside"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | -| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | +| Name | Description | +| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `start` | The index of the first character of the span. ~~int~~ | +| `end` | The index of the last character after the span. ~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| `kb_id` 2.2 | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | +| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | +| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Doc.similarity {#similarity tag="method" model="vectors"} From cc71ec901f26ae1c3bfb62b6bd776295200f418e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 09:08:55 +0200 Subject: [PATCH 062/133] Fix typo in saving and loading usage docs --- website/docs/usage/saving-loading.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 3a95bf6aa..06fb18591 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -299,9 +299,10 @@ installed in the same environment – that's it. When you load a pipeline, spaCy will generally use its `config.cfg` to set up the language class and construct the pipeline. The pipeline is specified as a -list of strings, e.g. `pipeline = ["tagger", "paser", "ner"]`. For each of those -strings, spaCy will call `nlp.add_pipe` and look up the name in all factories -defined by the decorators [`@Language.component`](/api/language#component) and +list of strings, e.g. `pipeline = ["tagger", "parser", "ner"]`. For each of +those strings, spaCy will call `nlp.add_pipe` and look up the name in all +factories defined by the decorators +[`@Language.component`](/api/language#component) and [`@Language.factory`](/api/language#factory). This means that you have to import your custom components _before_ loading the pipeline. 
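As the saving-loading passage just above explains, the names in the `pipeline` list are resolved through `nlp.add_pipe` and the factories registered with `@Language.component` or `@Language.factory`, so custom components have to be registered and imported before the pipeline is loaded. A minimal sketch of that registration, with a component name made up for illustration:

```python
import spacy
from spacy.language import Language

@Language.component("my_custom_component")  # illustrative name
def my_custom_component(doc):
    # A stateless component receives the Doc, may modify it, and must return it.
    print(f"Processing {len(doc)} tokens")
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("my_custom_component")
doc = nlp("This is a sentence.")
```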
From 9d32cac736da47351e3f38f961aae2fc9e591401 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 10:55:36 +0200 Subject: [PATCH 063/133] Update docs [ci skip] --- website/docs/usage/projects.md | 12 ++++++++---- website/docs/usage/training.md | 8 ++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index f8d5a3761..95e20525a 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -921,6 +921,14 @@ package is installed in the same environment as spaCy, it will automatically add [parallel training](/usage/training#parallel-training) for more details on how it works under the hood. + + +Get started with parallel training using our project template. It trains a +simple model on a Universal Dependencies Treebank and lets you parallelize the +training with Ray. + + + You can integrate [`spacy ray train`](/api/cli#ray-train) into your `project.yml` just like the regular training command and pass it the config, and optional output directory or remote storage URL and config overrides if needed. @@ -940,10 +948,6 @@ commands: - "training/model-best" ``` - - --- ### Weights & Biases {#wandb} diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 6e9de62c5..071434162 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -895,9 +895,13 @@ cluster. If it's not set, Ray will run locally. python -m spacy ray train config.cfg --n-workers 2 ``` - +Get started with parallel training using our project template. It trains a +simple model on a Universal Dependencies Treebank and lets you parallelize the +training with Ray. + + ### How parallel training works {#parallel-training-details} From 1114219ae3034a9bec070967cdbf03001ea747d8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 10:59:07 +0200 Subject: [PATCH 064/133] Tidy up and auto-format --- spacy/cli/_util.py | 3 +-- spacy/language.py | 8 ++------ spacy/ml/models/tok2vec.py | 16 +++++++++++----- spacy/schemas.py | 4 ++-- spacy/tests/doc/test_span.py | 7 ++++++- spacy/tests/parser/test_parse_navigate.py | 7 ++++++- spacy/tests/pipeline/test_pipe_factories.py | 15 +++------------ spacy/tests/regression/test_issue1501-2000.py | 12 ++++++++++-- .../tests/serialize/test_serialize_pipeline.py | 8 +++++++- spacy/tests/test_cli.py | 1 - spacy/tests/test_language.py | 3 +-- spacy/tests/test_util.py | 2 +- spacy/tests/training/test_readers.py | 17 ++++++++--------- spacy/tests/training/test_training.py | 12 +++++++++++- 14 files changed, 69 insertions(+), 46 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index c67863ef1..040434c05 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -6,7 +6,6 @@ from wasabi import msg import srsly import hashlib import typer -import subprocess from click import NoSuchOption from typer.main import get_command from contextlib import contextmanager @@ -327,7 +326,7 @@ def git_checkout( ) with make_tempdir() as tmp_dir: cmd = f"git -C {tmp_dir} clone {repo} . 
-b {branch}" - ret = run_command(cmd, capture=True) + run_command(cmd, capture=True) # We need Path(name) to make sure we also support subdirectories shutil.copytree(str(tmp_dir / Path(subpath)), str(dest)) diff --git a/spacy/language.py b/spacy/language.py index 7d463731a..4dffd9679 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -156,11 +156,7 @@ class Language: raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab))) if vocab is True: vectors_name = meta.get("vectors", {}).get("name") - vocab = create_vocab( - self.lang, - self.Defaults, - vectors_name=vectors_name, - ) + vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name) else: if (self.lang and vocab.lang) and (self.lang != vocab.lang): raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) @@ -1462,7 +1458,7 @@ class Language: # here :( for i, (name1, proc1) in enumerate(self.pipeline): if hasattr(proc1, "find_listeners"): - for name2, proc2 in self.pipeline[i+1:]: + for name2, proc2 in self.pipeline[i + 1 :]: if isinstance(getattr(proc2, "model", None), Model): proc1.find_listeners(proc2.model) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 7ced4bd04..fec478e21 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -164,7 +164,9 @@ def MultiHashEmbed( @registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool): +def CharacterEmbed( + width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool +): """Construct an embedded representation based on character embeddings, using a feed-forward network. A fixed number of UTF-8 byte characters are used for each word, taken from the beginning and end of the word equally. Padding is @@ -202,9 +204,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect ), StaticVectors(width, dropout=0.0), ), - with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)), + with_array( + Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0) + ), ragged2list(), - ) + ) else: model = chain( concatenate( @@ -215,9 +219,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), ), ), - with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), + with_array( + Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0) + ), ragged2list(), - ) + ) return model diff --git a/spacy/schemas.py b/spacy/schemas.py index 60655da8c..b0f26dcd7 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type, Tuple +from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple from typing import Iterable, TypeVar, TYPE_CHECKING from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator @@ -255,7 +255,7 @@ class ConfigSchemaPretrain(BaseModel): batcher: Batcher = Field(..., title="Batcher for the training data") component: str = Field(..., title="Component to find the layer to pretrain") layer: str = Field(..., title="Layer to pretrain. Whole model if empty.") - + # TODO: use a more detailed schema for this? 
objective: Dict[str, Any] = Field(..., title="Pretraining objective") # fmt: on diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index ad4f49042..0c538a0eb 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -92,7 +92,12 @@ def test_spans_span_sent(doc, doc_not_parsed): def test_spans_lca_matrix(en_tokenizer): """Test span's lca matrix generation""" tokens = en_tokenizer("the lazy dog slept") - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4) + doc = get_doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=[2, 1, 1, 0], + deps=["dep"] * 4, + ) lca = doc[:2].get_lca_matrix() assert lca.shape == (2, 2) assert lca[0, 0] == 0 # the & the -> the diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index db1e98ba0..f181a799a 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -63,7 +63,12 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads): def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads): tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads)) + doc = get_doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=heads, + deps=["dep"] * len(heads), + ) lefts = {} rights = {} diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 1cf06d97f..881460704 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -345,10 +345,7 @@ def test_language_factories_invalid(): [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}], {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25}, ), - ( - [{"a": 0.5, "b": 0.5}, {"b": 1.0}], - {"a": 0.25, "b": 0.75}, - ), + ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},), ], ) def test_language_factories_combine_score_weights(weights, expected): @@ -363,16 +360,10 @@ def test_language_factories_scores(): weights1 = {"a1": 0.5, "a2": 0.5} weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1} Language.factory( - f"{name}1", - scores=list(weights1), - default_score_weights=weights1, - func=func, + f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func, ) Language.factory( - f"{name}2", - scores=list(weights2), - default_score_weights=weights2, - func=func, + f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func, ) meta1 = Language.get_factory_meta(f"{name}1") assert meta1.default_score_weights == weights1 diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index e226c8524..71ed2ea03 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -212,9 +212,17 @@ def test_issue1834(): heads=[0, -1, -2, -3, -4, -5, 0, -1, -2], deps=["dep"] * len(words), ) - print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc]) + print( + doc.has_annotation("DEP"), + [t.head.i for t in doc], + [t.is_sent_start for t in doc], + ) new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) - print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc]) + print( + new_doc.has_annotation("DEP"), + [t.head.i for t in new_doc], + [t.is_sent_start for t in new_doc], + ) assert new_doc[6].sent_start assert new_doc.has_annotation("DEP") assert 
new_doc.has_annotation("TAG") diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index eedad31e0..d1c4553be 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -136,7 +136,13 @@ def test_serialize_textcat_empty(en_vocab): # See issue #1105 cfg = {"model": DEFAULT_TEXTCAT_MODEL} model = registry.make_from_config(cfg, validate=True)["model"] - textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5, positive_label=None) + textcat = TextCategorizer( + en_vocab, + model, + labels=["ENTITY", "ACTION", "MODIFIER"], + threshold=0.5, + positive_label=None, + ) textcat.to_bytes(exclude=["vocab"]) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 0a2300455..422ae74b4 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -3,7 +3,6 @@ from click import NoSuchOption from spacy.training import docs_to_json, biluo_tags_from_offsets from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs -from spacy.lang.en import English from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 2a24d368a..da46ad424 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -291,8 +291,7 @@ def test_spacy_blank(): @pytest.mark.parametrize( - "value", - [False, None, ["x", "y"], Language, Vocab], + "value", [False, None, ["x", "y"], Language, Vocab], ) def test_language_init_invalid_vocab(value): err_fragment = "invalid value" diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 8c931d31e..1668991cd 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -95,7 +95,7 @@ def test_util_dot_section(): assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False # Test that default values got overwritten assert en_config["nlp"]["pipeline"] == ["textcat"] - assert nl_config["nlp"]["pipeline"] == [] # default value [] + assert nl_config["nlp"]["pipeline"] == [] # default value [] # Test proper functioning of 'dot_to_object' with pytest.raises(KeyError): dot_to_object(en_config, "nlp.pipeline.tagger") diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 898746c2a..d20a032e8 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -1,7 +1,6 @@ from typing import Dict, Iterable, Callable import pytest from thinc.api import Config - from spacy import Language from spacy.util import load_model_from_config, registry, dot_to_object from spacy.training import Example @@ -10,19 +9,19 @@ from spacy.training import Example def test_readers(): config_string = """ [training] - + [corpora] @readers = "myreader.v1" [nlp] lang = "en" pipeline = ["tok2vec", "textcat"] - + [components] - + [components.tok2vec] factory = "tok2vec" - + [components.textcat] factory = "textcat" """ @@ -69,19 +68,19 @@ def test_readers(): def test_cat_readers(reader, additional_config): nlp_config_string = """ [training] - + [corpora] @readers = "PLACEHOLDER" [nlp] lang = "en" pipeline = ["tok2vec", "textcat"] - + [components] - + [components.tok2vec] factory = "tok2vec" - + [components.textcat] factory = "textcat" """ diff --git a/spacy/tests/training/test_training.py 
b/spacy/tests/training/test_training.py index 1d3c72a8b..b09487965 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -34,7 +34,17 @@ def doc(): # fmt: on nlp = English() words = [t.text for t in nlp.make_doc(text)] - doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents) + doc = get_doc( + nlp.vocab, + words=words, + tags=tags, + pos=pos, + morphs=morphs, + heads=heads, + deps=deps, + lemmas=lemmas, + ents=ents, + ) doc.cats = cats return doc From 5497acf49aef93a1d6d451da11cc9f3d2841b345 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 11:25:10 +0200 Subject: [PATCH 065/133] Support config overrides via environment variables --- spacy/cli/_util.py | 58 ++++++++++++++++++++++++++++++++--------- spacy/tests/test_cli.py | 16 ++++++++++-- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 040434c05..0159dd473 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -11,9 +11,10 @@ from typer.main import get_command from contextlib import contextmanager from thinc.config import Config, ConfigValidationError from configparser import InterpolationError +import os from ..schemas import ProjectConfigSchema, validate -from ..util import import_file, run_command, make_tempdir, registry +from ..util import import_file, run_command, make_tempdir, registry, logger if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -61,16 +62,38 @@ def setup_cli() -> None: command(prog_name=COMMAND) -def parse_config_overrides(args: List[str]) -> Dict[str, Any]: +def parse_config_env_overrides( + *, prefix: str = "SPACY_CONFIG_", dot: str = "__" +) -> Dict[str, Any]: + """Generate a dictionary of config overrides based on environment variables, + e.g. SPACY_CONFIG_TRAINING__BATCH_SIZE=123 overrides the training.batch_size + setting. + + prefix (str): The env variable prefix for config overrides. + dot (str): String used to represent the "dot", e.g. in training.batch_size. + RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting. + """ + result = {} + for env_key, value in os.environ.items(): + if env_key.startswith(prefix): + opt = env_key[len(prefix) :].lower().replace(dot, ".") + if "." in opt: + result[opt] = try_json_loads(value) + return result + + +def parse_config_overrides(args: List[str], env_vars: bool = True) -> Dict[str, Any]: """Generate a dictionary of config overrides based on the extra arguments provided on the CLI, e.g. --training.batch_size to override "training.batch_size". Arguments without a "." are considered invalid, since the config only allows top-level sections to exist. args (List[str]): The extra arguments from the command line. + env_vars (bool): Include environment variables. RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting. """ - result = {} + env_overrides = parse_config_env_overrides() if env_vars else {} + cli_overrides = {} while args: opt = args.pop(0) err = f"Invalid CLI argument '{opt}'" @@ -87,18 +110,27 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]: value = "true" else: value = args.pop(0) - # Just like we do in the config, we're calling json.loads on the - # values. But since they come from the CLI, it'd be unintuitive to - # explicitly mark strings with escaped quotes. So we're working - # around that here by falling back to a string if parsing fails. - # TODO: improve logic to handle simple types like list of strings? 
- try: - result[opt] = srsly.json_loads(value) - except ValueError: - result[opt] = str(value) + if opt not in env_overrides: + cli_overrides[opt] = try_json_loads(value) else: msg.fail(f"{err}: override option should start with --", exits=1) - return result + if cli_overrides: + logger.debug(f"Config overrides from CLI: {list(cli_overrides)}") + if env_overrides: + logger.debug(f"Config overrides from env variables: {list(env_overrides)}") + return {**cli_overrides, **env_overrides} + + +def try_json_loads(value: Any) -> Any: + # Just like we do in the config, we're calling json.loads on the + # values. But since they come from the CLI, it'd be unintuitive to + # explicitly mark strings with escaped quotes. So we're working + # around that here by falling back to a string if parsing fails. + # TODO: improve logic to handle simple types like list of strings? + try: + return srsly.json_loads(value) + except ValueError: + return str(value) def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 422ae74b4..d81437f18 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,15 +1,15 @@ import pytest from click import NoSuchOption - from spacy.training import docs_to_json, biluo_tags_from_offsets from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables -from spacy.cli._util import string_to_list +from spacy.cli._util import string_to_list, parse_config_env_overrides from thinc.config import ConfigValidationError import srsly +import os from .util import make_tempdir @@ -341,6 +341,18 @@ def test_parse_config_overrides_invalid_2(args): parse_config_overrides(args) +def test_parse_cli_overrides(): + prefix = "SPACY_CONFIG_" + dot = "__" + os.environ[f"{prefix}TRAINING{dot}BATCH_SIZE"] = "123" + os.environ[f"{prefix}FOO{dot}BAR{dot}BAZ"] = "hello" + os.environ[prefix] = "bad" + result = parse_config_env_overrides(prefix=prefix, dot=dot) + assert len(result) == 2 + assert result["training.batch_size"] == 123 + assert result["foo.bar.baz"] == "hello" + + @pytest.mark.parametrize("lang", ["en", "nl"]) @pytest.mark.parametrize( "pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]] From 758ead8a476fa5f5e55c64c3c4bd242c7cb83d1e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 12:50:13 +0200 Subject: [PATCH 066/133] Sync overrides with CLI overrides --- spacy/cli/_util.py | 80 ++++++++++++++++++----------------------- spacy/tests/test_cli.py | 26 ++++++++------ 2 files changed, 50 insertions(+), 56 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 0159dd473..0dd2ee380 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -7,6 +7,7 @@ import srsly import hashlib import typer from click import NoSuchOption +from click.parser import split_arg_string from typer.main import get_command from contextlib import contextmanager from thinc.config import Config, ConfigValidationError @@ -38,6 +39,7 @@ commands to check and validate your config files, training and evaluation data, and custom model implementations. 
""" INIT_HELP = """Commands for initializing configs and pipeline packages.""" +OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES" # Wrappers for Typer's annotations. Initially created to set defaults and to # keep the names short, but not needed at the moment. @@ -62,46 +64,41 @@ def setup_cli() -> None: command(prog_name=COMMAND) -def parse_config_env_overrides( - *, prefix: str = "SPACY_CONFIG_", dot: str = "__" +def parse_config_overrides( + args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR ) -> Dict[str, Any]: - """Generate a dictionary of config overrides based on environment variables, - e.g. SPACY_CONFIG_TRAINING__BATCH_SIZE=123 overrides the training.batch_size - setting. - - prefix (str): The env variable prefix for config overrides. - dot (str): String used to represent the "dot", e.g. in training.batch_size. - RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting. - """ - result = {} - for env_key, value in os.environ.items(): - if env_key.startswith(prefix): - opt = env_key[len(prefix) :].lower().replace(dot, ".") - if "." in opt: - result[opt] = try_json_loads(value) - return result - - -def parse_config_overrides(args: List[str], env_vars: bool = True) -> Dict[str, Any]: """Generate a dictionary of config overrides based on the extra arguments provided on the CLI, e.g. --training.batch_size to override "training.batch_size". Arguments without a "." are considered invalid, since the config only allows top-level sections to exist. - args (List[str]): The extra arguments from the command line. - env_vars (bool): Include environment variables. + env_vars (Optional[str]): Optional environment variable to read from. RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting. """ - env_overrides = parse_config_env_overrides() if env_vars else {} - cli_overrides = {} + env_string = os.environ.get(env_var, "") if env_var else "" + env_overrides = _parse_overrides(split_arg_string(env_string)) + cli_overrides = _parse_overrides(args, is_cli=True) + if cli_overrides: + keys = [k for k in cli_overrides if k not in env_overrides] + logger.debug(f"Config overrides from CLI: {keys}") + if env_overrides: + logger.debug(f"Config overrides from env variables: {list(env_overrides)}") + return {**cli_overrides, **env_overrides} + + +def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]: + result = {} while args: opt = args.pop(0) - err = f"Invalid CLI argument '{opt}'" + err = f"Invalid config override '{opt}'" if opt.startswith("--"): # new argument orig_opt = opt opt = opt.replace("--", "") if "." not in opt: - raise NoSuchOption(orig_opt) + if is_cli: + raise NoSuchOption(orig_opt) + else: + msg.fail(f"{err}: can't override top-level sections", exits=1) if "=" in opt: # we have --opt=value opt, value = opt.split("=", 1) opt = opt.replace("-", "_") @@ -110,27 +107,18 @@ def parse_config_overrides(args: List[str], env_vars: bool = True) -> Dict[str, value = "true" else: value = args.pop(0) - if opt not in env_overrides: - cli_overrides[opt] = try_json_loads(value) + # Just like we do in the config, we're calling json.loads on the + # values. But since they come from the CLI, it'd be unintuitive to + # explicitly mark strings with escaped quotes. So we're working + # around that here by falling back to a string if parsing fails. + # TODO: improve logic to handle simple types like list of strings? 
+ try: + result[opt] = srsly.json_loads(value) + except ValueError: + result[opt] = str(value) else: - msg.fail(f"{err}: override option should start with --", exits=1) - if cli_overrides: - logger.debug(f"Config overrides from CLI: {list(cli_overrides)}") - if env_overrides: - logger.debug(f"Config overrides from env variables: {list(env_overrides)}") - return {**cli_overrides, **env_overrides} - - -def try_json_loads(value: Any) -> Any: - # Just like we do in the config, we're calling json.loads on the - # values. But since they come from the CLI, it'd be unintuitive to - # explicitly mark strings with escaped quotes. So we're working - # around that here by falling back to a string if parsing fails. - # TODO: improve logic to handle simple types like list of strings? - try: - return srsly.json_loads(value) - except ValueError: - return str(value) + msg.fail(f"{err}: name should start with --", exits=1) + return result def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index d81437f18..a9c9d8ca5 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -6,7 +6,7 @@ from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables -from spacy.cli._util import string_to_list, parse_config_env_overrides +from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR from thinc.config import ConfigValidationError import srsly import os @@ -342,15 +342,21 @@ def test_parse_config_overrides_invalid_2(args): def test_parse_cli_overrides(): - prefix = "SPACY_CONFIG_" - dot = "__" - os.environ[f"{prefix}TRAINING{dot}BATCH_SIZE"] = "123" - os.environ[f"{prefix}FOO{dot}BAR{dot}BAZ"] = "hello" - os.environ[prefix] = "bad" - result = parse_config_env_overrides(prefix=prefix, dot=dot) - assert len(result) == 2 - assert result["training.batch_size"] == 123 - assert result["foo.bar.baz"] == "hello" + os.environ[OVERRIDES_ENV_VAR] = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello" + result = parse_config_overrides([]) + assert len(result) == 4 + assert result["x.foo"] == "bar" + assert result["x.bar"] == 12 + assert result["x.baz"] is False + assert result["y.foo"] == "hello" + os.environ[OVERRIDES_ENV_VAR] = "--x" + assert parse_config_overrides([], env_var=None) == {} + with pytest.raises(SystemExit): + parse_config_overrides([]) + os.environ[OVERRIDES_ENV_VAR] = "hello world" + with pytest.raises(SystemExit): + parse_config_overrides([]) + del os.environ[OVERRIDES_ENV_VAR] @pytest.mark.parametrize("lang", ["en", "nl"]) From bc02e864943a790cfc7ec991c67d20cc774417df Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 13:01:26 +0200 Subject: [PATCH 067/133] Extend Doc.__init__ with additional annotation Mostly copying from `spacy.tests.util.get_doc`, add additional kwargs to `Doc.__init__` to initialize the most common doc/token values. 
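As a quick illustration of the extended constructor this commit describes — a minimal sketch with made-up example values, not code taken from the patch itself (note that `heads` are absolute token indices into the doc, not relative offsets):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

words = ["Give", "it", "back", "!"]
tags = ["VB", "PRP", "RP", "."]
pos = ["VERB", "PRON", "PART", "PUNCT"]
deps = ["ROOT", "dobj", "prt", "punct"]
heads = [0, 0, 0, 0]  # absolute token indices, not relative offsets

# annotations are passed straight to the constructor instead of being set afterwards
doc = Doc(Vocab(), words=words, tags=tags, pos=pos, deps=deps, heads=heads)
assert doc[1].head.text == "Give"
assert doc[0].dep_ == "ROOT"
```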
--- spacy/errors.py | 5 +- spacy/tests/util.py | 60 ++---------------- spacy/tokens/doc.pyx | 77 ++++++++++++++++++++++-- spacy/training/converters/conllu2docs.py | 35 ++++++----- website/docs/api/doc.md | 19 ++++-- 5 files changed, 118 insertions(+), 78 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 81e3616be..f219496a5 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -57,7 +57,10 @@ class Warnings: "incorrect. Modify PhraseMatcher._terminal_hash to fix.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " "the Knowledge Base.") - W026 = ("Unable to set all sentence boundaries from dependency parses.") + W026 = ("Unable to set all sentence boundaries from dependency parses. If " + "you are constructing a parse tree incrementally by setting " + "token.head values, you can probably ignore this warning. Consider " + "using Doc(words, ..., heads=heads, deps=deps) instead.") W027 = ("Found a large training file of {size} bytes. Note that it may " "be more efficient to split your training data into multiple " "smaller JSON files instead.") diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 741753c89..7bc32bf34 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -30,60 +30,12 @@ def get_doc( morphs=None, ): """Create Doc object from given vocab, words and annotations.""" - if deps and not heads: - heads = [0] * len(deps) - headings = [] - values = [] - annotations = [pos, heads, deps, lemmas, tags, morphs] - possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH] - for a, annot in enumerate(annotations): - if annot is not None: - if len(annot) != len(words): - raise ValueError(Errors.E189) - headings.append(possible_headings[a]) - if annot is not heads: - values.extend(annot) - for value in values: - vocab.strings.add(value) - - doc = Doc(vocab, words=words) - - # if there are any other annotations, set them - if headings: - attrs = doc.to_array(headings) - - j = 0 - for annot in annotations: - if annot: - if annot is heads: - for i in range(len(words)): - if attrs.ndim == 1: - attrs[i] = heads[i] - else: - attrs[i, j] = heads[i] - elif annot is morphs: - for i in range(len(words)): - morph_key = vocab.morphology.add(morphs[i]) - if attrs.ndim == 1: - attrs[i] = morph_key - else: - attrs[i, j] = morph_key - else: - for i in range(len(words)): - if attrs.ndim == 1: - attrs[i] = doc.vocab.strings[annot[i]] - else: - attrs[i, j] = doc.vocab.strings[annot[i]] - j += 1 - doc.from_array(headings, attrs) - - # finally, set the entities - if ents: - doc.ents = [ - Span(doc, start, end, label=doc.vocab.strings[label]) - for start, end, label in ents - ] - return doc + if heads is not None: + heads = [i + head for i, head in enumerate(heads)] + if ents is not None: + ents = [(vocab.strings[ent_type], start, end) for start, end, ent_type in ents] + return Doc(vocab, words=words, pos=pos, heads=heads, deps=deps, tags=tags, + ents=ents, lemmas=lemmas, morphs=morphs) def get_batch(batch_size): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2d9de278b..de7e0f862 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -158,17 +158,38 @@ cdef class Doc: raise ValueError(Errors.E046.format(name=name)) return Underscore.doc_extensions.pop(name) - def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None): + def __init__( + self, + Vocab vocab, + words=None, + spaces=None, + user_data=None, + *, + tags=None, + pos=None, + morphs=None, + lemmas=None, + heads=None, + deps=None, + ents=None, + ): """Create a Doc 
object. vocab (Vocab): A vocabulary object, which must match any models you want to use (e.g. tokenizer, parser, entity recognizer). - words (list or None): A list of unicode strings to add to the document + words (Optional[List[str]]): A list of unicode strings to add to the document as words. If `None`, defaults to empty list. - spaces (list or None): A list of boolean values, of the same length as + spaces (Optional[List[bool]]): A list of boolean values, of the same length as words. True means that the word is followed by a space, False means it is not. If `None`, defaults to `[True]*len(words)` user_data (dict or None): Optional extra data to attach to the Doc. + tags (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.tag. Defaults to None. + pos (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.pos. Defaults to None. + morphs (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.morph. Defaults to None. + lemmas (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.lemma. Defaults to None. + heads (Optional[List[int]]): A list of values, of the same length as words, to assign as heads. Head indices are the position of the head in the doc. Defaults to None. + deps (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.dep. Defaults to None. + ents (Optional[List[Span]]): A list of spans to assign as doc.ents. Defaults to None. DOCS: https://nightly.spacy.io/api/doc#init """ @@ -217,6 +238,55 @@ cdef class Doc: lexeme = self.vocab.get_by_orth(self.mem, word) self.push_back(lexeme, has_space) + if heads is not None: + heads = [head - i for i, head in enumerate(heads)] + if deps and not heads: + heads = [0] * len(deps) + headings = [] + values = [] + annotations = [pos, heads, deps, lemmas, tags, morphs] + possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH] + for a, annot in enumerate(annotations): + if annot is not None: + if len(annot) != len(words): + raise ValueError(Errors.E189) + headings.append(possible_headings[a]) + if annot is not heads: + values.extend(annot) + for value in values: + self.vocab.strings.add(value) + + # if there are any other annotations, set them + if headings: + attrs = self.to_array(headings) + + j = 0 + for annot in annotations: + if annot: + if annot is heads: + for i in range(len(words)): + if attrs.ndim == 1: + attrs[i] = heads[i] + else: + attrs[i, j] = heads[i] + elif annot is morphs: + for i in range(len(words)): + morph_key = vocab.morphology.add(morphs[i]) + if attrs.ndim == 1: + attrs[i] = morph_key + else: + attrs[i, j] = morph_key + else: + for i in range(len(words)): + if attrs.ndim == 1: + attrs[i] = self.vocab.strings[annot[i]] + else: + attrs[i, j] = self.vocab.strings[annot[i]] + j += 1 + self.from_array(headings, attrs) + if ents is not None: + self.ents = ents + @property def _(self): """Custom extension attributes registered via `set_extension`.""" @@ -1344,7 +1414,6 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1: if tokens[i].head == 0: tokens[tokens[i].l_edge].sent_start = 1 - cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1: # May be called multiple times due to non-projectivity. See issues #3170 # and #4688. 
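The converter diff that follows swaps per-token attribute assignment for direct construction. A minimal sketch of the two equivalent patterns, using toy values rather than anything from the patch:

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

words = ["She", "runs"]
tags = ["PRP", "VBZ"]
deps = ["nsubj", "ROOT"]
heads = [1, 1]  # absolute indices into the doc

# Previous pattern: build a bare Doc, then assign annotations token by token
# (setting token.head incrementally may emit the W026 warning updated above)
doc_a = Doc(Vocab(), words=words)
for i in range(len(doc_a)):
    doc_a[i].tag_ = tags[i]
    doc_a[i].dep_ = deps[i]
    doc_a[i].head = doc_a[heads[i]]

# New pattern: hand the annotations to the constructor directly
doc_b = Doc(Vocab(), words=words, tags=tags, deps=deps, heads=heads)
assert [t.head.i for t in doc_a] == [t.head.i for t in doc_b]
assert [t.dep_ for t in doc_a] == [t.dep_ for t in doc_b]
```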
diff --git a/spacy/training/converters/conllu2docs.py b/spacy/training/converters/conllu2docs.py index ebd123375..b4d8b3ac4 100644 --- a/spacy/training/converters/conllu2docs.py +++ b/spacy/training/converters/conllu2docs.py @@ -199,13 +199,17 @@ def doc_from_conllu_sentence( heads.append(head) deps.append(dep) - doc = Doc(vocab, words=words, spaces=spaces) + doc = Doc( + vocab, + words=words, + spaces=spaces, + tags=tags, + pos=poses, + deps=deps, + lemmas=lemmas, + heads=heads, + ) for i in range(len(doc)): - doc[i].tag_ = tags[i] - doc[i].pos_ = poses[i] - doc[i].dep_ = deps[i] - doc[i].lemma_ = lemmas[i] - doc[i].head = doc[heads[i]] doc[i]._.merged_orth = words[i] doc[i]._.merged_morph = morphs[i] doc[i]._.merged_lemma = lemmas[i] @@ -232,14 +236,17 @@ def doc_from_conllu_sentence( heads.append(t.head.i) deps.append(t.dep_) - doc_x = Doc(vocab, words=words, spaces=spaces) - for i in range(len(doc)): - doc_x[i].tag_ = tags[i] - doc_x[i].morph_ = morphs[i] - doc_x[i].lemma_ = lemmas[i] - doc_x[i].pos_ = poses[i] - doc_x[i].dep_ = deps[i] - doc_x[i].head = doc_x[heads[i]] + doc_x = Doc( + vocab, + words=words, + spaces=spaces, + tags=tags, + morphs=morphs, + lemmas=lemmas, + pos=poses, + deps=deps, + heads=heads, + ) doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] return doc_x diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 380f6a172..680523c60 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -30,11 +30,20 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the > doc = Doc(nlp.vocab, words=words, spaces=spaces) > ``` -| Name | Description | -| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | A storage container for lexical types. ~~Vocab~~ | -| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | -| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | A storage container for lexical types. ~~Vocab~~ | +| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | +| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | +| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ | +| _keyword-only_ | | +| tags | A list of strings, of the same length as words, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| pos | A list of strings, of the same length as words, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| morphs | A list of strings, of the same length as words, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| lemmas | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. 
~~Optional[List[str]]~~ | +| heads | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ | +| deps | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| ents | A list of spans to assign as doc.ents. Defaults to `None`. ~~Optional[List[Span]]~~ | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} From 9b8d0b7f904f8751a804f112825a38cebe102ce9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 13:46:21 +0200 Subject: [PATCH 068/133] Alphabetize API sidebars --- website/meta/sidebars.json | 50 +++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 76d5e63d6..e27817c92 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -75,63 +75,63 @@ { "label": "Containers", "items": [ - { "text": "Language", "url": "/api/language" }, { "text": "Doc", "url": "/api/doc" }, - { "text": "Token", "url": "/api/token" }, - { "text": "Span", "url": "/api/span" }, - { "text": "Lexeme", "url": "/api/lexeme" }, + { "text": "DocBin", "url": "/api/docbin" }, { "text": "Example", "url": "/api/example" }, - { "text": "DocBin", "url": "/api/docbin" } + { "text": "Language", "url": "/api/language" }, + { "text": "Lexeme", "url": "/api/lexeme" }, + { "text": "Span", "url": "/api/span" }, + { "text": "Token", "url": "/api/token" } ] }, { "label": "Pipeline", "items": [ - { "text": "Tokenizer", "url": "/api/tokenizer" }, - { "text": "Tok2Vec", "url": "/api/tok2vec" }, - { "text": "Transformer", "url": "/api/transformer" }, - { "text": "Lemmatizer", "url": "/api/lemmatizer" }, - { "text": "Morphologizer", "url": "/api/morphologizer" }, - { "text": "Tagger", "url": "/api/tagger" }, { "text": "AttributeRuler", "url": "/api/attributeruler" }, { "text": "DependencyParser", "url": "/api/dependencyparser" }, + { "text": "EntityLinker", "url": "/api/entitylinker" }, { "text": "EntityRecognizer", "url": "/api/entityrecognizer" }, { "text": "EntityRuler", "url": "/api/entityruler" }, - { "text": "EntityLinker", "url": "/api/entitylinker" }, - { "text": "TextCategorizer", "url": "/api/textcategorizer" }, - { "text": "Sentencizer", "url": "/api/sentencizer" }, + { "text": "Lemmatizer", "url": "/api/lemmatizer" }, + { "text": "Morphologizer", "url": "/api/morphologizer" }, + { "text": "Pipe", "url": "/api/pipe" }, { "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" }, - { "text": "Other Functions", "url": "/api/pipeline-functions" }, - { "text": "Pipe", "url": "/api/pipe" } + { "text": "Sentencizer", "url": "/api/sentencizer" }, + { "text": "Tagger", "url": "/api/tagger" }, + { "text": "TextCategorizer", "url": "/api/textcategorizer" }, + { "text": "Tok2Vec", "url": "/api/tok2vec" }, + { "text": "Tokenizer", "url": "/api/tokenizer" }, + { "text": "Transformer", "url": "/api/transformer" }, + { "text": "Other Functions", "url": "/api/pipeline-functions" } ] }, { "label": "Matchers", "items": [ + { "text": "DependencyMatcher", "url": "/api/dependencymatcher" }, { "text": "Matcher", "url": "/api/matcher" }, - { "text": "PhraseMatcher", "url": "/api/phrasematcher" }, - { "text": "DependencyMatcher", "url": "/api/dependencymatcher" } + { "text": "PhraseMatcher", "url": "/api/phrasematcher" } ] }, { "label": "Other", "items": [ - { "text": "Vocab", "url": "/api/vocab" }, 
- { "text": "StringStore", "url": "/api/stringstore" }, - { "text": "Vectors", "url": "/api/vectors" }, + { "text": "Corpus", "url": "/api/corpus" }, + { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "Lookups", "url": "/api/lookups" }, { "text": "Morphology", "url": "/api/morphology" }, - { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "Scorer", "url": "/api/scorer" }, - { "text": "Corpus", "url": "/api/corpus" } + { "text": "StringStore", "url": "/api/stringstore" }, + { "text": "Vectors", "url": "/api/vectors" }, + { "text": "Vocab", "url": "/api/vocab" } ] }, { "label": "Cython", "items": [ { "text": "Architecture", "url": "/api/cython" }, - { "text": "Structs", "url": "/api/cython-structs" }, - { "text": "Classes", "url": "/api/cython-classes" } + { "text": "Classes", "url": "/api/cython-classes" }, + { "text": "Structs", "url": "/api/cython-structs" } ] } ] From ce455f30ca847fc8038d034f39977cb6f3ed53c3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 13:52:46 +0200 Subject: [PATCH 069/133] Fix formatting --- spacy/tests/util.py | 13 +++++++++++-- spacy/tokens/doc.pyx | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 7bc32bf34..6c67d2ee1 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -34,8 +34,17 @@ def get_doc( heads = [i + head for i, head in enumerate(heads)] if ents is not None: ents = [(vocab.strings[ent_type], start, end) for start, end, ent_type in ents] - return Doc(vocab, words=words, pos=pos, heads=heads, deps=deps, tags=tags, - ents=ents, lemmas=lemmas, morphs=morphs) + return Doc( + vocab, + words=words, + pos=pos, + heads=heads, + deps=deps, + tags=tags, + ents=ents, + lemmas=lemmas, + morphs=morphs, + ) def get_batch(batch_size): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index de7e0f862..13167c2d4 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1414,6 +1414,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1: if tokens[i].head == 0: tokens[tokens[i].l_edge].sent_start = 1 + cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1: # May be called multiple times due to non-projectivity. See issues #3170 # and #4688. From e548654aca291621ddcbd8739f620b74c9932166 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 14:46:55 +0200 Subject: [PATCH 070/133] Update docs [ci skip] --- website/docs/usage/training.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 071434162..b63145636 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -214,6 +214,24 @@ overrides. Overrides are added before [variables](#config-interpolation) are resolved, by the way – so if you need to use a value in multiple places, reference it across your config and override it on the CLI once. +> #### 💡 Tip: Verbose logging +> +> If you're using config overrides, you can set the `--verbose` flag on +> [`spacy train`](/api/cli#train) to make spaCy log more info, including which +> overrides were set via the CLI and environment variables. + +#### Adding overrides via environment variables {#config-overrides-env} + +Instead of defining the overrides as CLI arguments, you can also use the +`SPACY_CONFIG_OVERRIDES` environment variable using the same argument syntax. +This is especially useful if you're training models as part of an automated +process. 
Environment variables **take precedence** over CLI overrides and values +defined in the config file. + +```cli +$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh +``` + ### Defining pipeline components {#config-components} You typically train a [pipeline](/usage/processing-pipelines) of **one or more From 6aa91c7ca02acd0df8d5dfba236faf09c3a5a477 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 16:00:06 +0200 Subject: [PATCH 071/133] Make user_data keyword-only --- spacy/tokens/doc.pyx | 2 +- website/docs/api/doc.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 13167c2d4..27efa6cef 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -163,8 +163,8 @@ cdef class Doc: Vocab vocab, words=None, spaces=None, - user_data=None, *, + user_data=None, tags=None, pos=None, morphs=None, diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 680523c60..baf264b80 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -35,8 +35,8 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the | `vocab` | A storage container for lexical types. ~~Vocab~~ | | `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | -| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ | | _keyword-only_ | | +| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | | tags | A list of strings, of the same length as words, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | pos | A list of strings, of the same length as words, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | morphs | A list of strings, of the same length as words, to assign as `token.morph` for each word. Defaults to `None`. 
~~Optional[List[str]]~~ | From e8bcaa44f17be63302feca946997a6fe20761cd7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 16:01:46 +0200 Subject: [PATCH 072/133] Don't auto-decompress archives with smart_open [ci skip] --- spacy/cli/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 0dd2ee380..797a701b9 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -306,7 +306,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) if dest.exists() and not force: return None src = str(src) - with smart_open.open(src, mode="rb") as input_file: + with smart_open.open(src, mode="rb", ignore_ext=True) as input_file: with dest.open(mode="wb") as output_file: output_file.write(input_file.read()) From b3327c1e45d14c6ef03c70455e09f449ed8ad6f0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 16:04:30 +0200 Subject: [PATCH 073/133] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 4fb6dfff1..ec3c168a5 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a19" +__version__ = "3.0.0a20" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 447b3e5787dec59f2ed4b8a96c4b2ceb808d182f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 21 Sep 2020 16:58:40 +0200 Subject: [PATCH 074/133] Merge remote-tracking branch 'upstream/develop' into fix/debug_model # Conflicts: # spacy/cli/debug_model.py --- spacy/cli/debug_model.py | 48 ++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index af961d033..3d76cdbde 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -1,4 +1,4 @@ -from typing import Dict, Any, Optional +from typing import Dict, Any, Optional, Iterable from pathlib import Path from wasabi import msg from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam @@ -93,11 +93,10 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = # STEP 1: Initializing the model and printing again X = _get_docs() - goldY = _get_output(model.ops) - # _set_output_dim(nO=goldY.shape[-1], model=model) + _set_output_dim(nO=7, model=model) # The output vector might differ from the official type of the output layer with data_validation(False): - model.initialize(X=X, Y=goldY) + model.initialize(X=X) if print_settings.get("print_after_init"): msg.divider(f"STEP 1 - after initialization") _print_model(model, print_settings) @@ -110,12 +109,15 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener": tok2vec = nlp.get_pipe("tok2vec") tok2vec.model.initialize(X=X) + goldY = None for e in range(3): if tok2vec: tok2vec.predict(X) Y, get_dX = model.begin_update(X) - print("get_dX", get_dX) - dY = get_gradient(goldY, Y) + # simulate a goldY value + if not goldY: + goldY = _simulate_gold(Y) + dY = get_gradient(goldY, Y, model.ops) get_dX(dY) model.finish_update(optimizer) if print_settings.get("print_after_training"): @@ -128,11 +130,20 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, 
Any]] = msg.divider(f"STEP 3 - prediction") msg.info(str(prediction)) - msg.good(f"Succesfully ended analysis - model looks good!") + msg.good(f"Succesfully ended analysis - model looks good.") -def get_gradient(goldY, Y): - return Y - goldY +def _simulate_gold(element, counter=1): + if isinstance(element, Iterable): + for i in range(len(element)): + element[i] = _simulate_gold(element[i], counter+i) + return element + else: + return 1/counter + + +def get_gradient(goldY, Y, ops): + return ops.asarray(Y) - ops.asarray(goldY) def _sentences(): @@ -149,18 +160,13 @@ def _get_docs(lang: str = "en"): return list(nlp.pipe(_sentences())) -def _get_output(ops): - docs = len(_get_docs()) - labels = 6 - output = ops.alloc2f(d0=docs, d1=labels) - for i in range(docs): - for j in range(labels): - output[i, j] = 1 / (i+j+0.01) - return ops.xp.asarray(output) - - -def _get_output_old(xp): - return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32") +def _set_output_dim(model, nO): + # simulating dim inference by directly setting the nO argument of the model + if model.has_dim("nO") is None: + model.set_dim("nO", nO) + if model.has_ref("output_layer"): + if model.get_ref("output_layer").has_dim("nO") is None: + model.get_ref("output_layer").set_dim("nO", nO) def _print_model(model, print_settings): From f212303729cb0775bb00eebb6eef0a6c646f92da Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 17:59:09 +0200 Subject: [PATCH 075/133] Add sent_starts to Doc.__init__ Add sent_starts to `Doc.__init__`. Officially specify `is_sent_start` values but also convert to and accept `sent_start` internally. --- spacy/tests/doc/test_doc_api.py | 20 ++++++++++++++ spacy/tokens/doc.pyx | 46 +++++++++++++++++++++++---------- website/docs/api/doc.md | 1 + 3 files changed, 54 insertions(+), 13 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index c979931b1..0579642c4 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -9,6 +9,26 @@ from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH from ..util import get_doc +def test_doc_api_init(en_vocab): + # set sent_start by sent_starts + doc = Doc( + en_vocab, words=["a", "b", "c", "d"], sent_starts=[True, False, True, False] + ) + assert [t.is_sent_start for t in doc] == [True, False, True, False] + + # set sent_start by heads + doc = Doc( + en_vocab, words=["a", "b", "c", "d"], heads=[0, 0, 2, 2], deps=["dep"] * 4 + ) + assert [t.is_sent_start for t in doc] == [True, False, True, False] + + # heads override sent_starts + doc = Doc( + en_vocab, words=["a", "b", "c", "d"], sent_starts=[True] * 4, heads=[0, 0, 2, 2], deps=["dep"] * 4 + ) + assert [t.is_sent_start for t in doc] == [True, False, True, False] + + @pytest.mark.parametrize("text", [["one", "two", "three"]]) def test_doc_api_compare_by_string_position(en_vocab, text): doc = Doc(en_vocab, words=text) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 27efa6cef..c5f1f6801 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -171,6 +171,7 @@ cdef class Doc: lemmas=None, heads=None, deps=None, + sent_starts=None, ents=None, ): """Create a Doc object. @@ -183,13 +184,24 @@ cdef class Doc: words. True means that the word is followed by a space, False means it is not. If `None`, defaults to `[True]*len(words)` user_data (dict or None): Optional extra data to attach to the Doc. 
- tags (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.tag. Defaults to None. - pos (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.pos. Defaults to None. - morphs (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.morph. Defaults to None. - lemmas (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.lemma. Defaults to None. - heads (Optional[List[int]]): A list of values, of the same length as words, to assign as heads. Head indices are the position of the head in the doc. Defaults to None. - deps (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.dep. Defaults to None. - ents (Optional[List[Span]]): A list of spans to assign as doc.ents. Defaults to None. + tags (Optional[List[str]]): A list of unicode strings, of the same + length as words, to assign as token.tag. Defaults to None. + pos (Optional[List[str]]): A list of unicode strings, of the same + length as words, to assign as token.pos. Defaults to None. + morphs (Optional[List[str]]): A list of unicode strings, of the same + length as words, to assign as token.morph. Defaults to None. + lemmas (Optional[List[str]]): A list of unicode strings, of the same + length as words, to assign as token.lemma. Defaults to None. + heads (Optional[List[int]]): A list of values, of the same length as + words, to assign as heads. Head indices are the position of the + head in the doc. Defaults to None. + deps (Optional[List[str]]): A list of unicode strings, of the same + length as words, to assign as token.dep. Defaults to None. + sent_starts (Optional[List[Union[bool, None]]]): A list of values, of + the same length as words, to assign as token.is_sent_start. Will be + overridden by heads if heads is provided. Defaults to None. + ents (Optional[List[Span]]): A list of spans to assign as doc.ents. + Defaults to None. 
DOCS: https://nightly.spacy.io/api/doc#init """ @@ -242,16 +254,24 @@ cdef class Doc: heads = [head - i for i, head in enumerate(heads)] if deps and not heads: heads = [0] * len(deps) + if sent_starts is not None: + for i in range(len(sent_starts)): + if sent_starts[i] is True: + sent_starts[i] = 1 + elif sent_starts[i] is False: + sent_starts[i] = -1 + elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]: + sent_starts[i] = 0 headings = [] values = [] - annotations = [pos, heads, deps, lemmas, tags, morphs] - possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH] + annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts] + possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START] for a, annot in enumerate(annotations): if annot is not None: if len(annot) != len(words): raise ValueError(Errors.E189) headings.append(possible_headings[a]) - if annot is not heads: + if annot is not heads and annot is not sent_starts: values.extend(annot) for value in values: self.vocab.strings.add(value) @@ -263,12 +283,12 @@ cdef class Doc: j = 0 for annot in annotations: if annot: - if annot is heads: + if annot is heads or annot is sent_starts: for i in range(len(words)): if attrs.ndim == 1: - attrs[i] = heads[i] + attrs[i] = annot[i] else: - attrs[i, j] = heads[i] + attrs[i, j] = annot[i] elif annot is morphs: for i in range(len(words)): morph_key = vocab.morphology.add(morphs[i]) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index baf264b80..52f94a83d 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -43,6 +43,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the | lemmas | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | heads | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ | | deps | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| sent_starts | A list of values, of the same length as words, to assign as token.is_sent_start. Will be overridden by heads if heads is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ | | ents | A list of spans to assign as doc.ents. Defaults to `None`. 
~~Optional[List[Span]]~~ | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} From 67fbcb3da57c9830be34bf56518d8ec659ed65b6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 20:43:54 +0200 Subject: [PATCH 076/133] Tidy up tests and docs --- CONTRIBUTING.md | 4 +- spacy/errors.py | 2 +- spacy/tests/README.md | 84 ++++------- spacy/tests/conftest.py | 5 + spacy/tests/doc/test_add_entities.py | 10 +- spacy/tests/doc/test_array.py | 8 +- spacy/tests/doc/test_doc_api.py | 74 ++++------ spacy/tests/doc/test_retokenize_merge.py | 78 ++++------ spacy/tests/doc/test_retokenize_split.py | 10 +- spacy/tests/doc/test_span.py | 14 +- spacy/tests/doc/test_to_json.py | 7 +- spacy/tests/doc/test_token_api.py | 89 ++++------- spacy/tests/lang/de/test_parser.py | 26 ++-- spacy/tests/lang/en/test_noun_chunks.py | 9 +- spacy/tests/lang/en/test_parser.py | 57 +++----- spacy/tests/lang/en/test_sbd.py | 22 +-- spacy/tests/lang/ru/test_lemmatizer.py | 15 +- spacy/tests/lang/sv/test_noun_chunks.py | 16 +- .../tests/matcher/test_dependency_matcher.py | 13 +- spacy/tests/matcher/test_phrase_matcher.py | 9 +- spacy/tests/parser/test_nonproj.py | 25 +--- spacy/tests/parser/test_parse.py | 94 ++++++------ spacy/tests/parser/test_parse_navigate.py | 120 ++++++++------- spacy/tests/parser/test_space_attachment.py | 46 +++--- spacy/tests/pipeline/test_attributeruler.py | 19 +-- spacy/tests/pipeline/test_functions.py | 47 ++---- spacy/tests/regression/test_issue1-1000.py | 9 +- spacy/tests/regression/test_issue1501-2000.py | 21 +-- spacy/tests/regression/test_issue2001-2500.py | 11 +- spacy/tests/regression/test_issue2501-3000.py | 10 +- spacy/tests/regression/test_issue3001-3500.py | 26 +--- spacy/tests/regression/test_issue3501-4000.py | 12 +- spacy/tests/regression/test_issue5001-5500.py | 138 ++++++++++++++++++ spacy/tests/regression/test_issue5048.py | 32 ---- spacy/tests/regression/test_issue5082.py | 37 ----- spacy/tests/regression/test_issue5137.py | 32 ---- spacy/tests/regression/test_issue5141.py | 11 -- spacy/tests/regression/test_issue5152.py | 20 --- spacy/tests/regression/test_issue5458.py | 23 --- spacy/tests/regression/test_issue5918.py | 4 +- spacy/tests/test_displacy.py | 18 +-- spacy/tests/test_scorer.py | 23 +-- spacy/tests/training/test_training.py | 55 ++----- spacy/tests/util.py | 35 +---- spacy/tokens/doc.pyx | 10 +- spacy/training/example.pyx | 4 +- website/docs/api/doc.md | 44 ++++-- website/docs/usage/v3.md | 9 +- 48 files changed, 612 insertions(+), 875 deletions(-) create mode 100644 spacy/tests/regression/test_issue5001-5500.py delete mode 100644 spacy/tests/regression/test_issue5048.py delete mode 100644 spacy/tests/regression/test_issue5082.py delete mode 100644 spacy/tests/regression/test_issue5137.py delete mode 100644 spacy/tests/regression/test_issue5141.py delete mode 100644 spacy/tests/regression/test_issue5152.py delete mode 100644 spacy/tests/regression/test_issue5458.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0abde2abf..70324d8fd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -224,7 +224,7 @@ for that particular code. Here's an example: ```python # fmt: off text = "I look forward to using Thingamajig. I've been told it will make my life easier..." 
-heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7] +heads = [1, 1, 1, 1, 3, 4, 1, 6, 11, 11, 11, 11, 14, 14, 11, 16, 17, 14, 11] deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "", "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp", "poss", "nsubj", "ccomp", "punct"] @@ -421,7 +421,7 @@ Tests that require the model to be loaded should be marked with `@pytest.mark.models`. Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need is a `Doc` object with annotations like heads, POS tags or the dependency parse, you can -use the `get_doc()` utility function to construct it manually. +use the `Doc` constructor to construct it manually. 📖 **For more guidelines and information on how to add tests, check out the [tests README](spacy/tests/README.md).** diff --git a/spacy/errors.py b/spacy/errors.py index f219496a5..406ea603b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -455,7 +455,7 @@ class Errors: "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.") E186 = ("'{tok_a}' and '{tok_b}' are different texts.") E187 = ("Only unicode strings are supported as labels.") - E189 = ("Each argument to `get_doc` should be of equal length.") + E189 = ("Each argument to Doc.__init__ should be of equal length.") E190 = ("Token head out of range in `Doc.from_array()` for token index " "'{index}' with value '{value}' (equivalent to relative head " "index: '{rel_head_index}'). The head indices should be relative " diff --git a/spacy/tests/README.md b/spacy/tests/README.md index 7aa7f6166..86bbd52da 100644 --- a/spacy/tests/README.md +++ b/spacy/tests/README.md @@ -17,7 +17,6 @@ Tests for spaCy modules and classes live in their own directories of the same na 5. [Helpers and utilities](#helpers-and-utilities) 6. [Contributing to the tests](#contributing-to-the-tests) - ## Running the tests To show print statements, run the tests with `py.test -s`. To abort after the @@ -41,17 +40,16 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji # To keep the behaviour of the tests consistent and predictable, we try to follow a few basic conventions: -* **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`. -* If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory. -* Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behaviour, use `assert not` in your test. -* Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behaviour, consider adding an additional simpler version. -* If tests require **loading the models**, they should be added to the [`spacy-models`](https://github.com/explosion/spacy-models) tests. -* Before requiring the models, always make sure there is no other way to test the particular behaviour. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this. -* **Avoid unnecessary imports.** There should never be a need to explicitly import spaCy at the top of a file, and many components are available as [fixtures](#fixtures). 
You should also avoid wildcard imports (`from module import *`). -* If you're importing from spaCy, **always use absolute imports**. For example: `from spacy.language import Language`. -* Don't forget the **unicode declarations** at the top of each file. This way, unicode strings won't have to be prefixed with `u`. -* Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behaviour at a time. - +- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`. +- If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory. +- Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behaviour, use `assert not` in your test. +- Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behaviour, consider adding an additional simpler version. +- If tests require **loading the models**, they should be added to the [`spacy-models`](https://github.com/explosion/spacy-models) tests. +- Before requiring the models, always make sure there is no other way to test the particular behaviour. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this. +- **Avoid unnecessary imports.** There should never be a need to explicitly import spaCy at the top of a file, and many components are available as [fixtures](#fixtures). You should also avoid wildcard imports (`from module import *`). +- If you're importing from spaCy, **always use absolute imports**. For example: `from spacy.language import Language`. +- Don't forget the **unicode declarations** at the top of each file. This way, unicode strings won't have to be prefixed with `u`. +- Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behaviour at a time. ## Parameters @@ -64,7 +62,7 @@ def test_tokenizer_keep_urls(tokenizer, text): assert len(tokens) == 1 ``` -This will run the test once for each `text` value. Even if you're only testing one example, it's usually best to specify it as a parameter. This will later make it easier for others to quickly add additional test cases without having to modify the test. +This will run the test once for each `text` value. Even if you're only testing one example, it's usually best to specify it as a parameter. This will later make it easier for others to quickly add additional test cases without having to modify the test. You can also specify parameters as tuples to test with multiple values per test: @@ -81,18 +79,17 @@ To test for combinations of parameters, you can add several `parametrize` marker This will run the test with all combinations of the two parameters `text` and `punct`. **Use this feature sparingly**, though, as it can easily cause unneccessary or undesired test bloat. - ## Fixtures Fixtures to create instances of spaCy objects and other components should only be defined once in the global [`conftest.py`](conftest.py). We avoid having per-directory conftest files, as this can easily lead to confusion. 
These are the main fixtures that are currently available: -| Fixture | Description | -| --- | --- | -| `tokenizer` | Basic, language-independent tokenizer. Identical to the `xx` language class. | -| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. | -| `en_vocab` | Creates an instance of the English `Vocab`. | +| Fixture | Description | +| ----------------------------------- | ---------------------------------------------------------------------------- | +| `tokenizer` | Basic, language-independent tokenizer. Identical to the `xx` language class. | +| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. | +| `en_vocab` | Creates an instance of the English `Vocab`. | The fixtures can be used in all tests by simply setting them as an argument, like this: @@ -107,59 +104,32 @@ If all tests in a file require a specific configuration, or use the same complex Our new test setup comes with a few handy utility functions that can be imported from [`util.py`](util.py). +### Constructing a `Doc` object manually with -### Constructing a `Doc` object manually with `get_doc()` - -Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need ia a `Doc` object with annotations like heads, POS tags or the dependency parse, you can use `get_doc()` to construct it manually. +Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need ia a `Doc` object with annotations like heads, POS tags or the dependency parse, you can construct it manually. ```python -def test_doc_token_api_strings(en_tokenizer): +def test_doc_token_api_strings(en_vocab): text = "Give it back! He pleaded." pos = ['VERB', 'PRON', 'PART', 'PUNCT', 'PRON', 'VERB', 'PUNCT'] - heads = [0, -1, -2, -3, 1, 0, -1] + heads = [0, 0, 0, 0, 5, 5, 5] deps = ['ROOT', 'dobj', 'prt', 'punct', 'nsubj', 'ROOT', 'punct'] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, [t.text for t in tokens], pos=pos, heads=heads, deps=deps) + doc = Doc(en_vocab, [t.text for t in tokens], pos=pos, heads=heads, deps=deps) assert doc[0].text == 'Give' assert doc[0].lower_ == 'give' assert doc[0].pos_ == 'VERB' assert doc[0].dep_ == 'ROOT' ``` -You can construct a `Doc` with the following arguments: - -| Argument | Description | -| --- | --- | -| `vocab` | `Vocab` instance to use. If you're tokenizing before creating a `Doc`, make sure to use the tokenizer's vocab. Otherwise, you can also use the `en_vocab` fixture. **(required)** | -| `words` | List of words, for example `[t.text for t in tokens]`. **(required)** | -| `heads` | List of heads as integers. | -| `pos` | List of POS tags as text values. | -| `tag` | List of tag names as text values. | -| `dep` | List of dependencies as text values. | -| `ents` | List of entity tuples with `start`, `end`, `label` (for example `(0, 2, 'PERSON')`). The `label` will be looked up in `vocab.strings[label]`. | - -Here's how to quickly get these values from within spaCy: - -```python -doc = nlp(u'Some text here') -print([token.head.i-token.i for token in doc]) -print([token.tag_ for token in doc]) -print([token.pos_ for token in doc]) -print([token.dep_ for token in doc]) -print([(ent.start, ent.end, ent.label_) for ent in doc.ents]) -``` - -**Note:** There's currently no way of setting the serializer data for the parser without loading the models. If this is relevant to your test, constructing the `Doc` via `get_doc()` won't work. 
- ### Other utilities -| Name | Description | -| --- | --- | -| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state. | -| `add_vecs_to_vocab(vocab, vectors)` | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. | -| `get_cosine(vec1, vec2)` | Get cosine for two given vectors. | -| `assert_docs_equal(doc1, doc2)` | Compare two `Doc` objects and `assert` that they're equal. Tests for tokens, tags, dependencies and entities. | +| Name | Description | +| -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- | +| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state. | +| `add_vecs_to_vocab(vocab, vectors)` | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. | +| `get_cosine(vec1, vec2)` | Get cosine for two given vectors. | +| `assert_docs_equal(doc1, doc2)` | Compare two `Doc` objects and `assert` that they're equal. Tests for tokens, tags, dependencies and entities. | ## Contributing to the tests diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index e17199a08..3a9a1f26b 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -59,6 +59,11 @@ def de_tokenizer(): return get_lang_class("de")().tokenizer +@pytest.fixture(scope="session") +def de_vocab(): + return get_lang_class("de")().vocab + + @pytest.fixture(scope="session") def el_tokenizer(): return get_lang_class("el")().tokenizer diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 0c2a2a40b..40aff8e31 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -1,12 +1,10 @@ +from spacy.pipeline.ner import DEFAULT_NER_MODEL from spacy.training import Example from spacy.pipeline import EntityRecognizer from spacy.tokens import Span, Doc from spacy import registry import pytest -from ..util import get_doc -from spacy.pipeline.ner import DEFAULT_NER_MODEL - def _ner_example(ner): doc = Doc( @@ -19,7 +17,7 @@ def _ner_example(ner): def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] - doc = get_doc(en_vocab, text) + doc = Doc(en_vocab, words=text) config = { "learn_tokens": False, "min_action_freq": 30, @@ -41,7 +39,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab): def test_ents_reset(en_vocab): """Ensure that resetting doc.ents does not change anything""" text = ["This", "is", "a", "lion"] - doc = get_doc(en_vocab, text) + doc = Doc(en_vocab, words=text) config = { "learn_tokens": False, "min_action_freq": 30, @@ -59,7 +57,7 @@ def test_ents_reset(en_vocab): def test_add_overlapping_entities(en_vocab): text = ["Louisiana", "Office", "of", "Conservation"] - doc = get_doc(en_vocab, text) + doc = Doc(en_vocab, words=text) entity = Span(doc, 0, 4, label=391) doc.ents = [entity] diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index e721b3f09..9c050f740 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -2,8 +2,6 @@ import pytest from spacy.tokens import Doc from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH -from ..util import get_doc - def test_doc_array_attr_of_token(en_vocab): doc = Doc(en_vocab, words=["An", "example", "sentence"]) @@ -35,7 +33,7 @@ def 
test_doc_scalar_attr_of_token(en_vocab): def test_doc_array_tag(en_vocab): words = ["A", "nice", "sentence", "."] pos = ["DET", "ADJ", "NOUN", "PUNCT"] - doc = get_doc(en_vocab, words=words, pos=pos) + doc = Doc(en_vocab, words=words, pos=pos) assert doc[0].pos != doc[1].pos != doc[2].pos != doc[3].pos feats_array = doc.to_array((ORTH, POS)) assert feats_array[0][1] == doc[0].pos @@ -47,7 +45,7 @@ def test_doc_array_tag(en_vocab): def test_doc_array_morph(en_vocab): words = ["Eat", "blue", "ham"] morph = ["Feat=V", "Feat=J", "Feat=N"] - doc = get_doc(en_vocab, words=words, morphs=morph) + doc = Doc(en_vocab, words=words, morphs=morph) assert morph[0] == doc[0].morph_ assert morph[1] == doc[1].morph_ assert morph[2] == doc[2].morph_ @@ -61,7 +59,7 @@ def test_doc_array_morph(en_vocab): def test_doc_array_dep(en_vocab): words = ["A", "nice", "sentence", "."] deps = ["det", "amod", "ROOT", "punct"] - doc = get_doc(en_vocab, words=words, deps=deps) + doc = Doc(en_vocab, words=words, deps=deps) feats_array = doc.to_array((ORTH, DEP)) assert feats_array[0][1] == doc[0].dep assert feats_array[1][1] == doc[1].dep diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 0579642c4..2c22926e9 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -6,25 +6,20 @@ from spacy.lexeme import Lexeme from spacy.lang.en import English from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH -from ..util import get_doc - def test_doc_api_init(en_vocab): + words = ["a", "b", "c", "d"] + heads = [0, 0, 2, 2] # set sent_start by sent_starts - doc = Doc( - en_vocab, words=["a", "b", "c", "d"], sent_starts=[True, False, True, False] - ) + doc = Doc(en_vocab, words=words, sent_starts=[True, False, True, False]) assert [t.is_sent_start for t in doc] == [True, False, True, False] # set sent_start by heads - doc = Doc( - en_vocab, words=["a", "b", "c", "d"], heads=[0, 0, 2, 2], deps=["dep"] * 4 - ) + doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * 4) assert [t.is_sent_start for t in doc] == [True, False, True, False] - # heads override sent_starts doc = Doc( - en_vocab, words=["a", "b", "c", "d"], sent_starts=[True] * 4, heads=[0, 0, 2, 2], deps=["dep"] * 4 + en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4, ) assert [t.is_sent_start for t in doc] == [True, False, True, False] @@ -178,7 +173,7 @@ def test_doc_api_runtime_error(en_tokenizer): "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"] # fmt: on tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) nps = [] for np in doc.noun_chunks: while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): @@ -195,17 +190,19 @@ def test_doc_api_runtime_error(en_tokenizer): retokenizer.merge(np, attrs=attrs) -def test_doc_api_right_edge(en_tokenizer): +def test_doc_api_right_edge(en_vocab): """Test for bug occurring from Unshift action, causing incorrect right edge""" # fmt: off - text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue." 
- heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1, - -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26] + words = [ + "I", "have", "proposed", "to", "myself", ",", "for", "the", "sake", + "of", "such", "as", "live", "under", "the", "government", "of", "the", + "Romans", ",", "to", "translate", "those", "books", "into", "the", + "Greek", "tongue", "." + ] + heads = [2, 2, 2, 2, 3, 2, 21, 8, 6, 8, 11, 8, 11, 12, 15, 13, 15, 18, 16, 12, 21, 2, 23, 21, 21, 27, 27, 24, 2] deps = ["dep"] * len(heads) # fmt: on - - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert doc[6].text == "for" subtree = [w.text for w in doc[6].subtree] # fmt: off @@ -233,16 +230,16 @@ def test_doc_api_similarity_match(): @pytest.mark.parametrize( - "sentence,heads,lca_matrix", + "words,heads,lca_matrix", [ ( - "the lazy dog slept", - [2, 1, 1, 0], + ["the", "lazy", "dog", "slept"], + [2, 2, 3, 3], numpy.array([[0, 2, 2, 3], [2, 1, 2, 3], [2, 2, 2, 3], [3, 3, 3, 3]]), ), ( - "The lazy dog slept. The quick fox jumped", - [2, 1, 1, 0, -1, 2, 1, 1, 0], + ["The", "lazy", "dog", "slept", ".", "The", "quick", "fox", "jumped"], + [2, 2, 3, 3, 3, 7, 7, 8, 8], numpy.array( [ [0, 2, 2, 3, 3, -1, -1, -1, -1], @@ -259,11 +256,8 @@ def test_doc_api_similarity_match(): ), ], ) -def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix): - tokens = en_tokenizer(sentence) - doc = get_doc( - tokens.vocab, [t.text for t in tokens], heads=heads, deps=["dep"] * len(heads) - ) +def test_lowest_common_ancestor(en_vocab, words, heads, lca_matrix): + doc = Doc(en_vocab, words, heads=heads, deps=["dep"] * len(heads)) lca = doc.get_lca_matrix() assert (lca == lca_matrix).all() assert lca[1, 1] == 1 @@ -287,26 +281,23 @@ def test_doc_is_nered(en_vocab): def test_doc_from_array_sent_starts(en_vocab): - words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] - heads = [0, -1, -2, -3, -4, -5, 0, -1, -2, -3] # fmt: off + words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] + heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"] # fmt: on - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) # HEAD overrides SENT_START without warning attrs = [SENT_START, HEAD] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) new_doc.from_array(attrs, arr) - # no warning using default attrs attrs = doc._get_array_attrs() arr = doc.to_array(attrs) with pytest.warns(None) as record: new_doc.from_array(attrs, arr) assert len(record) == 0 - # only SENT_START uses SENT_START attrs = [SENT_START] arr = doc.to_array(attrs) @@ -314,7 +305,6 @@ def test_doc_from_array_sent_starts(en_vocab): new_doc.from_array(attrs, arr) assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc] assert not new_doc.has_annotation("DEP") - # only HEAD uses HEAD attrs = [HEAD, DEP] arr = doc.to_array(attrs) @@ -325,19 +315,17 @@ def test_doc_from_array_sent_starts(en_vocab): def test_doc_from_array_morph(en_vocab): - words = ["I", "live", "in", "New", "York", "."] # fmt: off + words = ["I", "live", "in", "New", "York", "."] morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"] # fmt: on doc = Doc(en_vocab, words=words) for i, morph in enumerate(morphs): doc[i].morph_ = morph - attrs = [MORPH] arr = 
doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) new_doc.from_array(attrs, arr) - assert [t.morph_ for t in new_doc] == morphs assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc] @@ -349,15 +337,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): en_docs = [en_tokenizer(text) for text in en_texts] docs_idx = en_texts[0].index("docs") de_doc = de_tokenizer(de_text) - en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = ( - True, - None, - None, - None, - ) - + expected = (True, None, None, None) + en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = expected assert Doc.from_docs([]) is None - assert de_doc is not Doc.from_docs([de_doc]) assert str(de_doc) == str(Doc.from_docs([de_doc])) diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index bc9567b2a..806c4b46f 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -3,8 +3,6 @@ from spacy.attrs import LEMMA from spacy.vocab import Vocab from spacy.tokens import Doc, Token -from ..util import get_doc - def test_doc_retokenize_merge(en_tokenizer): text = "WKRO played songs by the beach boys all night" @@ -88,9 +86,9 @@ def test_doc_retokenize_lex_attrs(en_tokenizer): def test_doc_retokenize_spans_merge_tokens(en_tokenizer): text = "Los Angeles start." - heads = [1, 1, 0, -1] + heads = [1, 2, 2, 2] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) assert len(doc) == 4 assert doc[0].head.text == "Angeles" assert doc[1].head.text == "start" @@ -103,17 +101,12 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer): assert doc[0].ent_type_ == "GPE" -def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer): - text = "The players start." - heads = [1, 1, 0, -1] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, - words=[t.text for t in tokens], - tags=["DT", "NN", "VBZ", "."], - pos=["DET", "NOUN", "VERB", "PUNCT"], - heads=heads, - ) +def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab): + words = ["The", "players", "start", "."] + heads = [1, 2, 2, 2] + tags = ["DT", "NN", "VBZ", "."] + pos = ["DET", "NOUN", "VERB", "PUNCT"] + doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads) assert len(doc) == 4 assert doc[0].text == "The" assert doc[0].tag_ == "DT" @@ -124,13 +117,7 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer): assert doc[0].text == "The players" assert doc[0].tag_ == "NN" assert doc[0].pos_ == "NOUN" - doc = get_doc( - tokens.vocab, - words=[t.text for t in tokens], - tags=["DT", "NN", "VBZ", "."], - pos=["DET", "NOUN", "VERB", "PUNCT"], - heads=heads, - ) + doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads) assert len(doc) == 4 assert doc[0].text == "The" assert doc[0].tag_ == "DT" @@ -147,11 +134,10 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer): assert doc[1].pos_ == "VERB" -def test_doc_retokenize_spans_merge_heads(en_tokenizer): - text = "I found a pilates class near work." 
- heads = [1, 0, 2, 1, -3, -1, -1, -6] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) +def test_doc_retokenize_spans_merge_heads(en_vocab): + words = ["I", "found", "a", "pilates", "class", "near", "work", "."] + heads = [1, 1, 4, 6, 1, 4, 5, 1] + doc = Doc(en_vocab, words=words, heads=heads) assert len(doc) == 8 with doc.retokenize() as retokenizer: attrs = {"tag": doc[4].tag_, "lemma": "pilates class", "ent_type": "O"} @@ -182,9 +168,9 @@ def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer): def test_doc_retokenize_span_np_merges(en_tokenizer): text = "displaCy is a parse tool built with Javascript" - heads = [1, 0, 2, 1, -3, -1, -1, -1] + heads = [1, 1, 4, 4, 1, 4, 5, 6] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) assert doc[4].head.i == 1 with doc.retokenize() as retokenizer: attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"} @@ -192,18 +178,18 @@ def test_doc_retokenize_span_np_merges(en_tokenizer): assert doc[2].head.i == 1 text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript." - heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15] + heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) with doc.retokenize() as retokenizer: for ent in doc.ents: attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_} retokenizer.merge(ent, attrs=attrs) text = "One test with entities like New York City so the ents list is not void" - heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2] + heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) with doc.retokenize() as retokenizer: for ent in doc.ents: retokenizer.merge(ent) @@ -212,12 +198,12 @@ def test_doc_retokenize_span_np_merges(en_tokenizer): def test_doc_retokenize_spans_entity_merge(en_tokenizer): # fmt: off text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n" - heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1] + heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15] tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"] - ents = [(0, 2, "PERSON"), (10, 11, "GPE"), (13, 15, "PERSON")] + ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)] # fmt: on tokens = en_tokenizer(text) - doc = get_doc( + doc = Doc( tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents ) assert len(doc) == 17 @@ -282,13 +268,13 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab): # if there is a parse, span.root provides default values words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] - heads = [0, -1, 1, -3, -4, -5, -1, -7, -8] - ents = [(3, 5, "ent-de"), (5, 7, "ent-fg")] + heads = [0, 0, 3, 0, 0, 0, 5, 0, 0] + ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)] deps = ["dep"] * len(words) en_vocab.strings.add("ent-de") en_vocab.strings.add("ent-fg") en_vocab.strings.add("dep") - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, 
ents=ents) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents) assert doc[2:4].root == doc[3] # root of 'c d' is d assert doc[4:6].root == doc[4] # root is 'e f' is e with doc.retokenize() as retokenizer: @@ -305,10 +291,10 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab): # check that B is preserved if span[start] is B words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] - heads = [0, -1, 1, 1, -4, -5, -1, -7, -8] - ents = [(3, 5, "ent-de"), (5, 7, "ent-de")] + heads = [0, 0, 3, 4, 0, 0, 5, 0, 0] + ents = [("ent-de", 3, 5), ("ent-de", 5, 7)] deps = ["dep"] * len(words) - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents) with doc.retokenize() as retokenizer: retokenizer.merge(doc[3:5]) retokenizer.merge(doc[5:7]) @@ -322,13 +308,13 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab): def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer): # fmt: off text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale." - heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7] + heads = [1, 2, 2, 4, 2, 4, 4, 2, 9, 9, 9, 10, 9, 9, 15, 13, 9] deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr', 'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj', 'compound', 'dobj', 'punct'] # fmt: on tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) sent1, sent2 = list(doc.sents) init_len = len(sent1) init_len2 = len(sent2) @@ -343,13 +329,13 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer): def test_doc_retokenize_spans_subtree_size_check(en_tokenizer): # fmt: off text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale" - heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2] + heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12] deps = ["compound", "nsubj", "ROOT", "det", "amod", "prt", "attr", "nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound", "dobj"] # fmt: on tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) sent1 = list(doc.sents)[0] init_len = len(list(sent1.root.subtree)) with doc.retokenize() as retokenizer: diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index 5f40da425..4d4b170f9 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -2,13 +2,11 @@ import pytest from spacy.vocab import Vocab from spacy.tokens import Doc, Token -from ..util import get_doc - def test_doc_retokenize_split(en_vocab): words = ["LosAngeles", "start", "."] - heads = [1, 1, 0] - doc = get_doc(en_vocab, words=words, heads=heads) + heads = [1, 2, 2] + doc = Doc(en_vocab, words=words, heads=heads) assert len(doc) == 3 assert len(str(doc)) == 19 assert doc[0].head.text == "start" @@ -88,11 +86,11 @@ def test_doc_retokenize_spans_sentence_update_after_split(en_vocab): # fmt: off words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He", "lives", "in", "England", "and", "loves", "JoePasquale", "."] - heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2] + heads = [1, 1, 3, 5, 3, 1, 1, 8, 8, 8, 9, 8, 8, 14, 12] deps = ["nsubj", "ROOT", "det", "amod", "prt", 
"attr", "punct", "nsubj", "ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"] # fmt: on - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) sent1, sent2 = list(doc.sents) init_len = len(sent1) init_len2 = len(sent2) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 0c538a0eb..2f562deb7 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -4,19 +4,17 @@ from spacy.tokens import Doc, Span from spacy.vocab import Vocab from spacy.util import filter_spans -from ..util import get_doc - @pytest.fixture def doc(en_tokenizer): # fmt: off text = "This is a sentence. This is another sentence. And a third." - heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1] + heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12] deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det", "attr", "punct", "ROOT", "det", "npadvmod", "punct"] # fmt: on tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) @pytest.fixture @@ -69,10 +67,10 @@ def test_spans_string_fn(doc): def test_spans_root2(en_tokenizer): text = "through North and South Carolina" - heads = [0, 3, -1, -2, -4] + heads = [0, 4, 1, 1, 0] deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert doc[-2:].root.text == "Carolina" @@ -92,10 +90,10 @@ def test_spans_span_sent(doc, doc_not_parsed): def test_spans_lca_matrix(en_tokenizer): """Test span's lca matrix generation""" tokens = en_tokenizer("the lazy dog slept") - doc = get_doc( + doc = Doc( tokens.vocab, words=[t.text for t in tokens], - heads=[2, 1, 1, 0], + heads=[2, 2, 3, 3], deps=["dep"] * 4, ) lca = doc[:2].get_lca_matrix() diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py index da3bc7dbb..c9bcafcfa 100644 --- a/spacy/tests/doc/test_to_json.py +++ b/spacy/tests/doc/test_to_json.py @@ -1,6 +1,5 @@ import pytest from spacy.tokens import Doc -from ..util import get_doc @pytest.fixture() @@ -8,10 +7,10 @@ def doc(en_vocab): words = ["c", "d", "e"] pos = ["VERB", "NOUN", "NOUN"] tags = ["VBP", "NN", "NN"] - heads = [0, -1, -2] + heads = [0, 0, 0] deps = ["ROOT", "dobj", "dobj"] - ents = [(1, 2, "ORG")] - return get_doc( + ents = [("ORG", 1, 2)] + return Doc( en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents ) diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 1308df67b..3c5c063bd 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -5,31 +5,24 @@ from spacy.symbols import VERB from spacy.vocab import Vocab from spacy.tokens import Doc -from ..util import get_doc - @pytest.fixture -def doc(en_tokenizer): +def doc(en_vocab): # fmt: off - text = "This is a sentence. This is another sentence. And a third." 
- heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1] + words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "sentence", ".", "And", "a", "third", "."] + heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 10, 12, 10, 12] deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det", "attr", "punct", "ROOT", "det", "npadvmod", "punct"] # fmt: on - tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + return Doc(en_vocab, words=words, heads=heads, deps=deps) -def test_doc_token_api_strings(en_tokenizer): - text = "Give it back! He pleaded." +def test_doc_token_api_strings(en_vocab): + words = ["Give", "it", "back", "!", "He", "pleaded", "."] pos = ["VERB", "PRON", "PART", "PUNCT", "PRON", "VERB", "PUNCT"] - heads = [0, -1, -2, -3, 1, 0, -1] + heads = [0, 0, 0, 0, 5, 5, 5] deps = ["ROOT", "dobj", "prt", "punct", "nsubj", "ROOT", "punct"] - - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps - ) + doc = Doc(en_vocab, words=words, pos=pos, heads=heads, deps=deps) assert doc[0].orth_ == "Give" assert doc[0].text == "Give" assert doc[0].text_with_ws == "Give " @@ -97,88 +90,70 @@ def test_doc_token_api_vectors(): assert doc[0].similarity(doc[1]) == cosine -def test_doc_token_api_ancestors(en_tokenizer): +def test_doc_token_api_ancestors(en_vocab): # the structure of this sentence depends on the English annotation scheme - text = "Yesterday I saw a dog that barked loudly." - heads = [2, 1, 0, 1, -2, 1, -2, -1, -6] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."] + heads = [2, 2, 2, 4, 2, 6, 4, 6, 2] + doc = Doc(en_vocab, words=words, heads=heads) assert [t.text for t in doc[6].ancestors] == ["dog", "saw"] assert [t.text for t in doc[1].ancestors] == ["saw"] assert [t.text for t in doc[2].ancestors] == [] - assert doc[2].is_ancestor(doc[7]) assert not doc[6].is_ancestor(doc[2]) -def test_doc_token_api_head_setter(en_tokenizer): - text = "Yesterday I saw a dog that barked loudly." 
- heads = [2, 1, 0, 1, -2, 1, -2, -1, -6] +def test_doc_token_api_head_setter(en_vocab): + words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."] + heads = [2, 2, 2, 4, 2, 6, 4, 6, 2] deps = ["dep"] * len(heads) - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) - + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert doc[6].n_lefts == 1 assert doc[6].n_rights == 1 assert doc[6].left_edge.i == 5 assert doc[6].right_edge.i == 7 - assert doc[4].n_lefts == 1 assert doc[4].n_rights == 1 assert doc[4].left_edge.i == 3 assert doc[4].right_edge.i == 7 - assert doc[3].n_lefts == 0 assert doc[3].n_rights == 0 assert doc[3].left_edge.i == 3 assert doc[3].right_edge.i == 3 - assert doc[2].left_edge.i == 0 assert doc[2].right_edge.i == 8 doc[6].head = doc[3] - assert doc[6].n_lefts == 1 assert doc[6].n_rights == 1 assert doc[6].left_edge.i == 5 assert doc[6].right_edge.i == 7 - assert doc[3].n_lefts == 0 assert doc[3].n_rights == 1 assert doc[3].left_edge.i == 3 assert doc[3].right_edge.i == 7 - assert doc[4].n_lefts == 1 assert doc[4].n_rights == 0 assert doc[4].left_edge.i == 3 assert doc[4].right_edge.i == 7 - assert doc[2].left_edge.i == 0 assert doc[2].right_edge.i == 8 doc[0].head = doc[5] - assert doc[5].left_edge.i == 0 assert doc[6].left_edge.i == 0 assert doc[3].left_edge.i == 0 assert doc[4].left_edge.i == 0 assert doc[2].left_edge.i == 0 - # head token must be from the same document - doc2 = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc2 = Doc(en_vocab, words=words, heads=heads) with pytest.raises(ValueError): doc[0].head = doc2[0] - # test sentence starts when two sentences are joined - text = "This is one sentence. This is another sentence." 
- heads = [0, -1, -2, -3, -4, 0, -1, -2, -3, -4] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, - words=[t.text for t in tokens], - heads=heads, - deps=["dep"] * len(heads), - ) + # fmt: off + words = ["This", "is", "one", "sentence", ".", "This", "is", "another", "sentence", "."] + heads = [0, 0, 0, 0, 0, 5, 5, 5, 5, 5] + # fmt: on + doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads)) # initially two sentences assert doc[0].is_sent_start assert doc[5].is_sent_start @@ -186,7 +161,6 @@ def test_doc_token_api_head_setter(en_tokenizer): assert doc[0].right_edge == doc[4] assert doc[5].left_edge == doc[5] assert doc[5].right_edge == doc[9] - # modifying with a sentence doesn't change sent starts doc[2].head = doc[3] assert doc[0].is_sent_start @@ -195,7 +169,6 @@ def test_doc_token_api_head_setter(en_tokenizer): assert doc[0].right_edge == doc[4] assert doc[5].left_edge == doc[5] assert doc[5].right_edge == doc[9] - # attach the second sentence to the first, resulting in one sentence doc[5].head = doc[0] assert doc[0].is_sent_start @@ -252,28 +225,28 @@ def test_tokenlast_has_sent_end_true(): def test_token_api_conjuncts_chain(en_vocab): - words = "The boy and the girl and the man went .".split() - heads = [1, 7, -1, 1, -3, -1, 1, -3, 0, -1] + words = ["The", "boy", "and", "the", "girl", "and", "the", "man", "went", "."] + heads = [1, 8, 1, 4, 1, 4, 7, 4, 8, 8] deps = ["det", "nsubj", "cc", "det", "conj", "cc", "det", "conj", "ROOT", "punct"] - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert [w.text for w in doc[1].conjuncts] == ["girl", "man"] assert [w.text for w in doc[4].conjuncts] == ["boy", "man"] assert [w.text for w in doc[7].conjuncts] == ["boy", "girl"] def test_token_api_conjuncts_simple(en_vocab): - words = "They came and went .".split() - heads = [1, 0, -1, -2, -1] + words = ["They", "came", "and", "went", "."] + heads = [1, 1, 1, 1, 3] deps = ["nsubj", "ROOT", "cc", "conj", "dep"] - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert [w.text for w in doc[1].conjuncts] == ["went"] assert [w.text for w in doc[3].conjuncts] == ["came"] def test_token_api_non_conjuncts(en_vocab): - words = "They came .".split() - heads = [1, 0, -1] + words = ["They", "came", "."] + heads = [1, 1, 1] deps = ["nsubj", "ROOT", "punct"] - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert [w.text for w in doc[0].conjuncts] == [] assert [w.text for w in doc[1].conjuncts] == [] diff --git a/spacy/tests/lang/de/test_parser.py b/spacy/tests/lang/de/test_parser.py index e2154b4c0..8c858a4cb 100644 --- a/spacy/tests/lang/de/test_parser.py +++ b/spacy/tests/lang/de/test_parser.py @@ -1,30 +1,26 @@ -from ...util import get_doc +from spacy.tokens import Doc -def test_de_parser_noun_chunks_standard_de(de_tokenizer): - text = "Eine Tasse steht auf dem Tisch." 
- heads = [1, 1, 0, -1, 1, -2, -4] +def test_de_parser_noun_chunks_standard_de(de_vocab): + words = ["Eine", "Tasse", "steht", "auf", "dem", "Tisch", "."] + heads = [1, 2, 2, 2, 5, 3, 2] pos = ["DET", "NOUN", "VERB", "ADP", "DET", "NOUN", "PUNCT"] deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "punct"] - tokens = de_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + doc = Doc(de_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 2 assert chunks[0].text_with_ws == "Eine Tasse " assert chunks[1].text_with_ws == "dem Tisch " -def test_de_extended_chunk(de_tokenizer): - text = "Die Sängerin singt mit einer Tasse Kaffee Arien." - heads = [1, 1, 0, -1, 1, -2, -1, -5, -6] +def test_de_extended_chunk(de_vocab): + # fmt: off + words = ["Die", "Sängerin", "singt", "mit", "einer", "Tasse", "Kaffee", "Arien", "."] + heads = [1, 2, 2, 2, 5, 3, 5, 2, 2] pos = ["DET", "NOUN", "VERB", "ADP", "DET", "NOUN", "NOUN", "NOUN", "PUNCT"] deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "nk", "oa", "punct"] - tokens = de_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + # fmt: on + doc = Doc(de_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 3 assert chunks[0].text_with_ws == "Die Sängerin " diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index fa3a134bd..0189a26d4 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -2,13 +2,10 @@ import numpy from spacy.attrs import HEAD, DEP from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root from spacy.lang.en.syntax_iterators import noun_chunks - +from spacy.tokens import Doc import pytest -from ...util import get_doc - - def test_noun_chunks_is_parsed(en_tokenizer): """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed. """ @@ -19,9 +16,9 @@ def test_noun_chunks_is_parsed(en_tokenizer): def test_en_noun_chunks_not_nested(en_vocab): words = ["Peter", "has", "chronic", "command", "and", "control", "issues"] - heads = [1, 0, 4, 3, -1, -2, -5] + heads = [1, 1, 6, 6, 3, 3, 1] deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"] - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) doc.from_array( [HEAD, DEP], numpy.asarray( diff --git a/spacy/tests/lang/en/test_parser.py b/spacy/tests/lang/en/test_parser.py index 4d06ff8ef..426605566 100644 --- a/spacy/tests/lang/en/test_parser.py +++ b/spacy/tests/lang/en/test_parser.py @@ -1,63 +1,51 @@ -from ...util import get_doc +from spacy.tokens import Doc -def test_en_parser_noun_chunks_standard(en_tokenizer): - text = "A base phrase should be recognized." 
- heads = [2, 1, 3, 2, 1, 0, -1] +def test_en_parser_noun_chunks_standard(en_vocab): + words = ["A", "base", "phrase", "should", "be", "recognized", "."] + heads = [2, 2, 5, 5, 5, 5, 5] pos = ["DET", "ADJ", "NOUN", "AUX", "VERB", "VERB", "PUNCT"] deps = ["det", "amod", "nsubjpass", "aux", "auxpass", "ROOT", "punct"] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 1 assert chunks[0].text_with_ws == "A base phrase " -def test_en_parser_noun_chunks_coordinated(en_tokenizer): +def test_en_parser_noun_chunks_coordinated(en_vocab): # fmt: off - text = "A base phrase and a good phrase are often the same." - heads = [2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4] + words = ["A", "base", "phrase", "and", "a", "good", "phrase", "are", "often", "the", "same", "."] + heads = [2, 2, 7, 2, 6, 6, 2, 7, 7, 10, 7, 7] pos = ["DET", "NOUN", "NOUN", "CCONJ", "DET", "ADJ", "NOUN", "VERB", "ADV", "DET", "ADJ", "PUNCT"] deps = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT", "advmod", "det", "attr", "punct"] # fmt: on - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 2 assert chunks[0].text_with_ws == "A base phrase " assert chunks[1].text_with_ws == "a good phrase " -def test_en_parser_noun_chunks_pp_chunks(en_tokenizer): - text = "A phrase with another phrase occurs." - heads = [1, 4, -1, 1, -2, 0, -1] +def test_en_parser_noun_chunks_pp_chunks(en_vocab): + words = ["A", "phrase", "with", "another", "phrase", "occurs", "."] + heads = [1, 5, 1, 4, 2, 5, 5] pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB", "PUNCT"] deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT", "punct"] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 2 assert chunks[0].text_with_ws == "A phrase " assert chunks[1].text_with_ws == "another phrase " -def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer): +def test_en_parser_noun_chunks_appositional_modifiers(en_vocab): # fmt: off - text = "Sam, my brother, arrived to the house." - heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4] + words = ["Sam", ",", "my", "brother", ",", "arrived", "to", "the", "house", "."] + heads = [5, 0, 3, 0, 0, 5, 5, 8, 6, 5] pos = ["PROPN", "PUNCT", "DET", "NOUN", "PUNCT", "VERB", "ADP", "DET", "NOUN", "PUNCT"] deps = ["nsubj", "punct", "poss", "appos", "punct", "ROOT", "prep", "det", "pobj", "punct"] # fmt: on - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 3 assert chunks[0].text_with_ws == "Sam " @@ -65,15 +53,12 @@ def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer): assert chunks[2].text_with_ws == "the house " -def test_en_parser_noun_chunks_dative(en_tokenizer): - text = "She gave Bob a raise." 
- heads = [1, 0, -1, 1, -3, -4] +def test_en_parser_noun_chunks_dative(en_vocab): + words = ["She", "gave", "Bob", "a", "raise", "."] + heads = [1, 1, 1, 4, 1, 1] pos = ["PRON", "VERB", "PROPN", "DET", "NOUN", "PUNCT"] deps = ["nsubj", "ROOT", "dative", "det", "dobj", "punct"] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 3 assert chunks[0].text_with_ws == "She " diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py index ee1e6be17..39d8d3b59 100644 --- a/spacy/tests/lang/en/test_sbd.py +++ b/spacy/tests/lang/en/test_sbd.py @@ -1,15 +1,16 @@ import pytest +from spacy.tokens import Doc -from ...util import get_doc, apply_transition_sequence +from ...util import apply_transition_sequence -@pytest.mark.parametrize("text", ["A test sentence"]) +@pytest.mark.parametrize("words", [["A", "test", "sentence"]]) @pytest.mark.parametrize("punct", [".", "!", "?", ""]) -def test_en_sbd_single_punct(en_tokenizer, text, punct): - heads = [2, 1, 0, -1] if punct else [2, 1, 0] +def test_en_sbd_single_punct(en_vocab, words, punct): + heads = [2, 2, 2, 2] if punct else [2, 2, 2] deps = ["dep"] * len(heads) - tokens = en_tokenizer(text + punct) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + words = [*words, punct] if punct else words + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert len(doc) == 4 if punct else 3 assert len(list(doc.sents)) == 1 assert sum(len(sent) for sent in doc.sents) == len(doc) @@ -18,17 +19,16 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct): @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -def test_en_sentence_breaks(en_tokenizer, en_parser): +def test_en_sentence_breaks(en_vocab, en_parser): # fmt: off - text = "This is a sentence . This is another one ." 
- heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3] + words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "one", "."] + heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6] deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det", "attr", "punct"] transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct", "B-ROOT", "L-nsubj", "S", "L-attr", "R-attr", "D", "R-punct"] # fmt: on - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) apply_transition_sequence(en_parser, doc, transition) assert len(list(doc.sents)) == 2 for token in doc: diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index bcf103b65..3810323bf 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -1,6 +1,5 @@ import pytest - -from ...util import get_doc +from spacy.tokens import Doc def test_ru_doc_lemmatization(ru_lemmatizer): @@ -11,7 +10,7 @@ def test_ru_doc_lemmatization(ru_lemmatizer): "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing", ] - doc = get_doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs) + doc = Doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs) doc = ru_lemmatizer(doc) lemmas = [token.lemma_ for token in doc] assert lemmas == ["мама", "мыть", "рама"] @@ -28,7 +27,7 @@ def test_ru_doc_lemmatization(ru_lemmatizer): ], ) def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas): - doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"]) + doc = Doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"]) result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0]) assert sorted(result_lemmas) == lemmas @@ -51,7 +50,7 @@ def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas): def test_ru_lemmatizer_works_with_different_pos_homonyms( ru_lemmatizer, text, pos, morph, lemma ): - doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph]) + doc = Doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph]) result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0]) assert result_lemmas == [lemma] @@ -66,13 +65,13 @@ def test_ru_lemmatizer_works_with_different_pos_homonyms( ], ) def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma): - doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph]) + doc = Doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph]) result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0]) assert result_lemmas == [lemma] def test_ru_lemmatizer_punct(ru_lemmatizer): - doc = get_doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"]) + doc = Doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"]) assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"'] - doc = get_doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"]) + doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"]) assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"'] diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py index 458cdadd5..3791d8021 100644 --- a/spacy/tests/lang/sv/test_noun_chunks.py +++ b/spacy/tests/lang/sv/test_noun_chunks.py @@ -1,6 +1,5 @@ import pytest - -from ...util import get_doc +from spacy.tokens import Doc def test_noun_chunks_is_parsed_sv(sv_tokenizer): @@ -16,21 +15,21 @@ SV_NP_TEST_EXAMPLES = [ "En student läste en bok", # A student read a book ["DET", 
"NOUN", "VERB", "DET", "NOUN"], ["det", "nsubj", "ROOT", "det", "dobj"], - [1, 1, 0, 1, -2], + [1, 2, 2, 4, 2], ["En student", "en bok"], ), ( "Studenten läste den bästa boken.", # The student read the best book ["NOUN", "VERB", "DET", "ADJ", "NOUN", "PUNCT"], ["nsubj", "ROOT", "det", "amod", "dobj", "punct"], - [1, 0, 2, 1, -3, -4], + [1, 1, 4, 4, 1, 1], ["Studenten", "den bästa boken"], ), ( "De samvetslösa skurkarna hade stulit de största juvelerna på söndagen", # The remorseless crooks had stolen the largest jewels that sunday ["DET", "ADJ", "NOUN", "VERB", "VERB", "DET", "ADJ", "NOUN", "ADP", "NOUN"], ["det", "amod", "nsubj", "aux", "root", "det", "amod", "dobj", "case", "nmod"], - [2, 1, 2, 1, 0, 2, 1, -3, 1, -5], + [2, 2, 4, 4, 4, 7, 7, 4, 9, 4], ["De samvetslösa skurkarna", "de största juvelerna", "på söndagen"], ), ] @@ -41,12 +40,9 @@ SV_NP_TEST_EXAMPLES = [ ) def test_sv_noun_chunks(sv_tokenizer, text, pos, deps, heads, expected_noun_chunks): tokens = sv_tokenizer(text) - assert len(heads) == len(pos) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, pos=pos - ) - + words = [t.text for t in tokens] + doc = Doc(tokens.vocab, words=words, heads=heads, deps=deps, pos=pos) noun_chunks = list(doc.noun_chunks) assert len(noun_chunks) == len(expected_noun_chunks) for i, np in enumerate(noun_chunks): diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index 6361a10ce..e18a8f6d8 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -4,16 +4,15 @@ import re import copy from mock import Mock from spacy.matcher import DependencyMatcher -from ..util import get_doc +from spacy.tokens import Doc @pytest.fixture def doc(en_vocab): - text = "The quick brown fox jumped over the lazy fox" - heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] + words = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "fox"] + heads = [3, 3, 3, 4, 4, 4, 8, 8, 5] deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"] - doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) - return doc + return Doc(en_vocab, words=words, heads=heads, deps=deps) @pytest.fixture @@ -236,10 +235,10 @@ def test_dependency_matcher_callback(en_vocab, doc): @pytest.mark.parametrize("op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20)]) def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches): # two sentences to test that all matches are within the same sentence - doc = get_doc( + doc = Doc( en_vocab, words=["a", "b", "c", "d", "e"] * 2, - heads=[0, -1, -2, -3, -4] * 2, + heads=[0, 0, 0, 0, 0, 5, 5, 5, 5, 5], deps=["dep"] * 10, ) match_count = 0 diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 9caf284a3..522356ffc 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -3,7 +3,6 @@ import srsly from mock import Mock from spacy.matcher import PhraseMatcher from spacy.tokens import Doc, Span -from ..util import get_doc def test_matcher_phrase_matcher(en_vocab): @@ -140,10 +139,10 @@ def test_phrase_matcher_string_attrs(en_vocab): pos1 = ["PRON", "VERB", "NOUN"] words2 = ["Yes", ",", "you", "hate", "dogs", "very", "much"] pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"] - pattern = get_doc(en_vocab, words=words1, pos=pos1) + pattern = Doc(en_vocab, words=words1, pos=pos1) matcher = PhraseMatcher(en_vocab, 
attr="POS") matcher.add("TEST", [pattern]) - doc = get_doc(en_vocab, words=words2, pos=pos2) + doc = Doc(en_vocab, words=words2, pos=pos2) matches = matcher(doc) assert len(matches) == 1 match_id, start, end = matches[0] @@ -158,10 +157,10 @@ def test_phrase_matcher_string_attrs_negative(en_vocab): pos1 = ["PRON", "VERB", "NOUN"] words2 = ["matcher:POS-PRON", "matcher:POS-VERB", "matcher:POS-NOUN"] pos2 = ["X", "X", "X"] - pattern = get_doc(en_vocab, words=words1, pos=pos1) + pattern = Doc(en_vocab, words=words1, pos=pos1) matcher = PhraseMatcher(en_vocab, attr="POS") matcher.add("TEST", [pattern]) - doc = get_doc(en_vocab, words=words2, pos=pos2) + doc = Doc(en_vocab, words=words2, pos=pos2) matches = matcher(doc) assert len(matches) == 0 diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 41da7cf49..544701a4c 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -2,8 +2,7 @@ import pytest from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree, is_nonproj_arc from spacy.pipeline._parser_internals import nonproj - -from ..util import get_doc +from spacy.tokens import Doc @pytest.fixture @@ -74,16 +73,10 @@ def test_parser_is_nonproj_tree( assert is_nonproj_tree(multirooted_tree) is True -def test_parser_pseudoprojectivity(en_tokenizer): +def test_parser_pseudoprojectivity(en_vocab): def deprojectivize(proj_heads, deco_labels): - tokens = en_tokenizer("whatever " * len(proj_heads)) - rel_proj_heads = [head - i for i, head in enumerate(proj_heads)] - doc = get_doc( - tokens.vocab, - words=[t.text for t in tokens], - deps=deco_labels, - heads=rel_proj_heads, - ) + words = ["whatever "] * len(proj_heads) + doc = Doc(en_vocab, words=words, deps=deco_labels, heads=proj_heads) nonproj.deprojectivize(doc) return [t.head.i for t in doc], [token.dep_ for token in doc] @@ -94,49 +87,39 @@ def test_parser_pseudoprojectivity(en_tokenizer): labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"] labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"] # fmt: on - assert nonproj.decompose("X||Y") == ("X", "Y") assert nonproj.decompose("X") == ("X", "") assert nonproj.is_decorated("X||Y") is True assert nonproj.is_decorated("X") is False - nonproj._lift(0, tree) assert tree == [2, 2, 2] - assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7 assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10 - # fmt: off proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels) assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2] assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl||dobj", "punct"] - deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels) assert deproj_heads == nonproj_tree assert undeco_labels == labels - proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2) assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1] assert deco_labels == ["advmod||aux", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod||dobj", "advmod", "det", "amod", "punct"] - deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels) assert deproj_heads == nonproj_tree2 assert undeco_labels == labels2 - # if decoration is wrong such that there is no head with the desired label # the structure is kept and the label is undecorated 
proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2] deco_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl||iobj", "punct"] - deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels) assert deproj_heads == proj_heads assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"] - # if there are two potential new heads, the first one is chosen even if # it"s wrong proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1] diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 9e760c1e7..8648f2018 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,9 +1,11 @@ import pytest - from spacy.lang.en import English -from ..util import get_doc, apply_transition_sequence, make_tempdir -from ... import util -from ...training import Example +from spacy.training import Example +from spacy.tokens import Doc +from spacy import util + +from ..util import apply_transition_sequence, make_tempdir + TRAIN_DATA = [ ( @@ -23,12 +25,11 @@ TRAIN_DATA = [ ] -def test_parser_root(en_tokenizer): - text = "i don't have other assistance" - heads = [3, 2, 1, 0, 1, -2] +def test_parser_root(en_vocab): + words = ["i", "do", "n't", "have", "other", "assistance"] + heads = [3, 3, 3, 3, 5, 3] deps = ["nsubj", "aux", "neg", "ROOT", "amod", "dobj"] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) for t in doc: assert t.dep != 0, t.text @@ -36,13 +37,9 @@ def test_parser_root(en_tokenizer): @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -@pytest.mark.parametrize("text", ["Hello"]) -def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"] - ) - +@pytest.mark.parametrize("words", [["Hello"]]) +def test_parser_parse_one_word_sentence(en_vocab, en_parser, words): + doc = Doc(en_vocab, words=words, heads=[0], deps=["ROOT"]) assert len(doc) == 1 with en_parser.step_through(doc) as _: # noqa: F841 pass @@ -52,24 +49,22 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -def test_parser_initial(en_tokenizer, en_parser): - text = "I ate the pizza with anchovies." 
- # heads = [1, 0, 1, -2, -3, -1, -5] +def test_parser_initial(en_vocab, en_parser): + words = ["I", "ate", "the", "pizza", "with", "anchovies", "."] transition = ["L-nsubj", "S", "L-det"] - tokens = en_tokenizer(text) - apply_transition_sequence(en_parser, tokens, transition) - assert tokens[0].head.i == 1 - assert tokens[1].head.i == 1 - assert tokens[2].head.i == 3 - assert tokens[3].head.i == 3 + doc = Doc(en_vocab, words=words) + apply_transition_sequence(en_parser, doc, transition) + assert doc[0].head.i == 1 + assert doc[1].head.i == 1 + assert doc[2].head.i == 3 + assert doc[3].head.i == 3 -def test_parser_parse_subtrees(en_tokenizer, en_parser): - text = "The four wheels on the bus turned quickly" - heads = [2, 1, 4, -1, 1, -2, 0, -1] +def test_parser_parse_subtrees(en_vocab, en_parser): + words = ["The", "four", "wheels", "on", "the", "bus", "turned", "quickly"] + heads = [2, 2, 6, 2, 5, 3, 6, 6] deps = ["dep"] * len(heads) - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert len(list(doc[2].lefts)) == 2 assert len(list(doc[2].rights)) == 1 assert len(list(doc[2].children)) == 3 @@ -79,15 +74,12 @@ def test_parser_parse_subtrees(en_tokenizer, en_parser): assert len(list(doc[2].subtree)) == 6 -def test_parser_merge_pp(en_tokenizer): - text = "A phrase with another phrase occurs" - heads = [1, 4, -1, 1, -2, 0] +def test_parser_merge_pp(en_vocab): + words = ["A", "phrase", "with", "another", "phrase", "occurs"] + heads = [1, 5, 1, 4, 2, 5] deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"] pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos - ) + doc = Doc(en_vocab, words=words, deps=deps, heads=heads, pos=pos) with doc.retokenize() as retokenizer: for np in doc.noun_chunks: retokenizer.merge(np, attrs={"lemma": np.lemma_}) @@ -100,12 +92,11 @@ def test_parser_merge_pp(en_tokenizer): @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): - text = "a b c d e" - +def test_parser_arc_eager_finalize_state(en_vocab, en_parser): + words = ["a", "b", "c", "d", "e"] # right branching transition = ["R-nsubj", "D", "R-nsubj", "R-nsubj", "D", "R-ROOT"] - tokens = en_tokenizer(text) + tokens = Doc(en_vocab, words=words) apply_transition_sequence(en_parser, tokens, transition) assert tokens[0].n_lefts == 0 @@ -140,7 +131,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): # left branching transition = ["S", "S", "S", "L-nsubj", "L-nsubj", "L-nsubj", "L-nsubj"] - tokens = en_tokenizer(text) + tokens = Doc(en_vocab, words=words) apply_transition_sequence(en_parser, tokens, transition) assert tokens[0].n_lefts == 0 @@ -177,10 +168,10 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): def test_parser_set_sent_starts(en_vocab): # fmt: off words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n'] - heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, 
-32, -1] + heads = [1, 1, 1, 30, 4, 4, 7, 4, 7, 17, 14, 14, 11, 14, 17, 16, 17, 6, 17, 20, 11, 20, 26, 22, 26, 26, 20, 26, 29, 31, 31, 25, 31, 32, 17, 4, 4, 36] deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', ''] # fmt: on - doc = get_doc(en_vocab, words=words, deps=deps, heads=heads) + doc = Doc(en_vocab, words=words, deps=deps, heads=heads) for i in range(len(words)): if i == 0 or i == 3: assert doc[i].is_sent_start is True @@ -201,24 +192,21 @@ def test_overfitting_IO(): for dep in annotations.get("deps", []): parser.add_label(dep) optimizer = nlp.begin_training() - for i in range(100): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["parser"] < 0.0001 - # test the trained model test_text = "I like securities." doc = nlp(test_text) - assert doc[0].dep_ is "nsubj" - assert doc[2].dep_ is "dobj" - assert doc[3].dep_ is "punct" - + assert doc[0].dep_ == "nsubj" + assert doc[2].dep_ == "dobj" + assert doc[3].dep_ == "punct" # Also test the results are still the same after IO with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) - assert doc2[0].dep_ is "nsubj" - assert doc2[2].dep_ is "dobj" - assert doc2[3].dep_ is "punct" + assert doc2[0].dep_ == "nsubj" + assert doc2[2].dep_ == "dobj" + assert doc2[3].dep_ == "punct" diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index f181a799a..8ca4039a2 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -1,59 +1,75 @@ import pytest - -from ..util import get_doc +from spacy.tokens import Doc @pytest.fixture -def text(): - return """ -It was a bright cold day in April, and the clocks were striking thirteen. -Winston Smith, his chin nuzzled into his breast in an effort to escape the -vile wind, slipped quickly through the glass doors of Victory Mansions, -though not quickly enough to prevent a swirl of gritty dust from entering -along with him. - -The hallway smelt of boiled cabbage and old rag mats. At one end of it a -coloured poster, too large for indoor display, had been tacked to the wall. -It depicted simply an enormous face, more than a metre wide: the face of a -man of about forty-five, with a heavy black moustache and ruggedly handsome -features. Winston made for the stairs. It was no use trying the lift. Even at -the best of times it was seldom working, and at present the electric current -was cut off during daylight hours. It was part of the economy drive in -preparation for Hate Week. The flat was seven flights up, and Winston, who -was thirty-nine and had a varicose ulcer above his right ankle, went slowly, -resting several times on the way. On each landing, opposite the lift-shaft, -the poster with the enormous face gazed from the wall. It was one of those -pictures which are so contrived that the eyes follow you about when you move. -BIG BROTHER IS WATCHING YOU, the caption beneath it ran. 
-""" +def words(): + # fmt: off + return [ + "\n", "It", "was", "a", "bright", "cold", "day", "in", "April", ",", + "and", "the", "clocks", "were", "striking", "thirteen", ".", "\n", + "Winston", "Smith", ",", "his", "chin", "nuzzled", "into", "his", + "breast", "in", "an", "effort", "to", "escape", "the", "\n", "vile", + "wind", ",", "slipped", "quickly", "through", "the", "glass", "doors", + "of", "Victory", "Mansions", ",", "\n", "though", "not", "quickly", + "enough", "to", "prevent", "a", "swirl", "of", "gritty", "dust", + "from", "entering", "\n", "along", "with", "him", ".", "\n\n", "The", + "hallway", "smelt", "of", "boiled", "cabbage", "and", "old", "rag", + "mats", ".", "At", "one", "end", "of", "it", "a", "\n", "coloured", + "poster", ",", "too", "large", "for", "indoor", "display", ",", "had", + "been", "tacked", "to", "the", "wall", ".", "\n", "It", "depicted", + "simply", "an", "enormous", "face", ",", "more", "than", "a", "metre", + "wide", ":", "the", "face", "of", "a", "\n", "man", "of", "about", + "forty", "-", "five", ",", "with", "a", "heavy", "black", "moustache", + "and", "ruggedly", "handsome", "\n", "features", ".", "Winston", "made", + "for", "the", "stairs", ".", "It", "was", "no", "use", "trying", "the", + "lift", ".", "Even", "at", "\n", "the", "best", "of", "times", "it", + "was", "seldom", "working", ",", "and", "at", "present", "the", + "electric", "current", "\n", "was", "cut", "off", "during", "daylight", + "hours", ".", "It", "was", "part", "of", "the", "economy", "drive", + "in", "\n", "preparation", "for", "Hate", "Week", ".", "The", "flat", + "was", "seven", "flights", "up", ",", "and", "Winston", ",", "who", + "\n", "was", "thirty", "-", "nine", "and", "had", "a", "varicose", + "ulcer", "above", "his", "right", "ankle", ",", "went", "slowly", ",", + "\n", "resting", "several", "times", "on", "the", "way", ".", "On", + "each", "landing", ",", "opposite", "the", "lift", "-", "shaft", ",", + "\n", "the", "poster", "with", "the", "enormous", "face", "gazed", + "from", "the", "wall", ".", "It", "was", "one", "of", "those", "\n", + "pictures", "which", "are", "so", "contrived", "that", "the", "eyes", + "follow", "you", "about", "when", "you", "move", ".", "\n", "BIG", + "BROTHER", "IS", "WATCHING", "YOU", ",", "the", "caption", "beneath", + "it", "ran", ".", "\n", ] + # fmt: on @pytest.fixture def heads(): # fmt: off - return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2, - -1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1, - -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14, - 1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1, - 0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10, - 9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1, - 2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1, - 3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0, - -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1, - -1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1, - -2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1, - 1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2, - 1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2, - -10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3, - 0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1, - 1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1, - -1, 0, -1, -1] + return [ + 1, 2, 2, 6, 6, 6, 2, 6, 7, 2, 2, 12, 14, 14, 2, 14, 14, 16, 19, 23, 23, + 22, 23, 
23, 23, 26, 24, 23, 29, 27, 31, 29, 35, 32, 35, 31, 23, 23, 37, + 37, 42, 42, 39, 42, 45, 43, 37, 46, 37, 50, 51, 37, 53, 51, 55, 53, 55, + 58, 56, 53, 59, 60, 60, 62, 63, 23, 65, 68, 69, 69, 69, 72, 70, 72, 76, + 76, 72, 69, 96, 80, 78, 80, 81, 86, 83, 86, 96, 96, 89, 96, 89, 92, 90, + 96, 96, 96, 96, 96, 99, 97, 96, 100, 103, 103, 103, 107, 107, 103, 107, + 111, 111, 112, 113, 107, 103, 116, 136, 116, 120, 118, 117, 120, 125, + 125, 125, 121, 116, 116, 131, 131, 131, 127, 131, 134, 131, 134, 136, + 136, 139, 139, 139, 142, 140, 139, 145, 145, 147, 145, 147, 150, 148, + 145, 153, 162, 153, 156, 162, 156, 157, 162, 162, 162, 162, 162, 162, + 172, 165, 169, 169, 172, 169, 172, 162, 172, 172, 176, 174, 172, 179, + 179, 179, 180, 183, 181, 179, 184, 185, 185, 187, 190, 188, 179, 193, + 194, 194, 196, 194, 196, 194, 194, 218, 200, 204, 202, 200, 207, 207, + 204, 204, 204, 212, 212, 209, 212, 216, 216, 213, 200, 194, 218, 218, + 220, 218, 224, 222, 222, 227, 225, 218, 246, 231, 229, 246, 246, 237, + 237, 237, 233, 246, 238, 241, 246, 241, 245, 245, 242, 246, 246, 249, + 247, 246, 252, 252, 252, 253, 257, 255, 254, 259, 257, 261, 259, 265, + 264, 265, 261, 265, 265, 270, 270, 267, 252, 271, 274, 275, 275, 276, + 283, 283, 280, 283, 280, 281, 283, 283, 284] # fmt: on -def test_parser_parse_navigate_consistency(en_tokenizer, text, heads): - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) +def test_parser_parse_navigate_consistency(en_vocab, words, heads): + doc = Doc(en_vocab, words=words, heads=heads) for head in doc: for child in head.lefts: assert child.head == head @@ -61,15 +77,8 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads): assert child.head == head -def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads): - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, - words=[t.text for t in tokens], - heads=heads, - deps=["dep"] * len(heads), - ) - +def test_parser_parse_navigate_child_consistency(en_vocab, words, heads): + doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads)) lefts = {} rights = {} for head in doc: @@ -99,9 +108,8 @@ def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads): assert not children -def test_parser_parse_navigate_edges(en_tokenizer, text, heads): - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) +def test_parser_parse_navigate_edges(en_vocab, words, heads): + doc = Doc(en_vocab, words=words, heads=heads) for token in doc: subtree = list(token.subtree) debug = "\t".join((token.text, token.left_edge.text, subtree[0].text)) diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py index 3672dabea..2b80272d6 100644 --- a/spacy/tests/parser/test_space_attachment.py +++ b/spacy/tests/parser/test_space_attachment.py @@ -1,42 +1,40 @@ import pytest +from spacy.tokens import Doc -from spacy.tokens.doc import Doc - -from ..util import get_doc, apply_transition_sequence +from ..util import apply_transition_sequence -def test_parser_space_attachment(en_tokenizer): - text = "This is a test.\nTo ensure spaces are attached well." 
- heads = [1, 0, 1, -2, -3, -1, 1, 4, -1, 2, 1, 0, -1, -2] +def test_parser_space_attachment(en_vocab): + # fmt: off + words = ["This", "is", "a", "test", ".", "\n", "To", "ensure", " ", "spaces", "are", "attached", "well", "."] + heads = [1, 1, 3, 1, 1, 4, 7, 11, 7, 11, 11, 11, 11, 11] + # fmt: on deps = ["dep"] * len(heads) - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) for sent in doc.sents: if len(sent) == 1: assert not sent[-1].is_space -def test_parser_sentence_space(en_tokenizer): +def test_parser_sentence_space(en_vocab): # fmt: off - text = "I look forward to using Thingamajig. I've been told it will make my life easier..." - heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7] + words = ["I", "look", "forward", "to", "using", "Thingamajig", ".", " ", "I", "'ve", "been", "told", "it", "will", "make", "my", "life", "easier", "..."] + heads = [1, 1, 1, 1, 3, 4, 1, 6, 11, 11, 11, 11, 14, 14, 11, 16, 17, 14, 11] deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "", "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp", "poss", "nsubj", "ccomp", "punct"] # fmt: on - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert len(list(doc.sents)) == 2 @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -def test_parser_space_attachment_leading(en_tokenizer, en_parser): - text = "\t \n This is a sentence ." - heads = [1, 1, 0, 1, -2, -3] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads) +def test_parser_space_attachment_leading(en_vocab, en_parser): + words = ["\t", "\n", "This", "is", "a", "sentence", "."] + heads = [1, 2, 2, 4, 2, 2] + doc = Doc(en_vocab, words=words, heads=heads) assert doc[0].is_space assert doc[1].is_space assert doc[2].text == "This" @@ -50,18 +48,16 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser): @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser): - text = "This is \t a \t\n \n sentence . 
\n\n \n" - heads = [1, 0, -1, 2, -1, -4, -5, -1] +def test_parser_space_attachment_intermediate_trailing(en_vocab, en_parser): + words = ["This", "is", "\t", "a", "\t\n", "\n", "sentence", ".", "\n\n", "\n"] + heads = [1, 1, 1, 5, 3, 1, 1, 6] transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct"] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads) + doc = Doc(en_vocab, words=words, heads=heads) assert doc[2].is_space assert doc[4].is_space assert doc[5].is_space assert doc[8].is_space assert doc[9].is_space - apply_transition_sequence(en_parser, doc, transition) for token in doc: assert token.dep != 0 or token.is_space @@ -72,7 +68,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser): @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length): +def test_parser_space_attachment_space(en_parser, text, length): doc = Doc(en_parser.vocab, words=text) assert len(doc) == length with en_parser.step_through(doc) as _: # noqa: F841 diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index a66b34bc0..b9e5894dd 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -4,8 +4,9 @@ from spacy.training import Example from spacy.lang.en import English from spacy.pipeline import AttributeRuler from spacy import util, registry +from spacy.tokens import Doc -from ..util import get_doc, make_tempdir +from ..util import make_tempdir @pytest.fixture @@ -66,7 +67,6 @@ def test_attributeruler_init(nlp, pattern_dicts): a = nlp.add_pipe("attribute_ruler") for p in pattern_dicts: a.add(**p) - doc = nlp("This is a test.") assert doc[2].lemma_ == "the" assert doc[2].morph_ == "Case=Nom|Number=Plur" @@ -129,7 +129,7 @@ def test_attributeruler_rule_order(nlp): {"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "NOUN"}}, ] a.add_patterns(patterns) - doc = get_doc( + doc = Doc( nlp.vocab, words=["This", "is", "a", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."], @@ -141,13 +141,12 @@ def test_attributeruler_rule_order(nlp): def test_attributeruler_tag_map(nlp, tag_map): a = AttributeRuler(nlp.vocab) a.load_from_tag_map(tag_map) - doc = get_doc( + doc = Doc( nlp.vocab, words=["This", "is", "a", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."], ) doc = a(doc) - for i in range(len(doc)): if i == 4: assert doc[i].pos_ == "PUNCT" @@ -160,13 +159,12 @@ def test_attributeruler_tag_map(nlp, tag_map): def test_attributeruler_morph_rules(nlp, morph_rules): a = AttributeRuler(nlp.vocab) a.load_from_morph_rules(morph_rules) - doc = get_doc( + doc = Doc( nlp.vocab, words=["This", "is", "the", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."], ) doc = a(doc) - for i in range(len(doc)): if i != 2: assert doc[i].pos_ == "" @@ -193,7 +191,6 @@ def test_attributeruler_indices(nlp): text = "This is a test." 
doc = nlp(text) - for i in range(len(doc)): if i == 1: assert doc[i].lemma_ == "was" @@ -205,12 +202,10 @@ def test_attributeruler_indices(nlp): assert doc[i].lemma_ == "cat" else: assert doc[i].morph_ == "" - # raises an error when trying to modify a token outside of the match a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2) with pytest.raises(ValueError): doc = nlp(text) - # raises an error when trying to modify a token outside of the match a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=10) with pytest.raises(ValueError): @@ -220,7 +215,6 @@ def test_attributeruler_indices(nlp): def test_attributeruler_patterns_prop(nlp, pattern_dicts): a = nlp.add_pipe("attribute_ruler") a.add_patterns(pattern_dicts) - for p1, p2 in zip(pattern_dicts, a.patterns): assert p1["patterns"] == p2["patterns"] assert p1["attrs"] == p2["attrs"] @@ -231,18 +225,15 @@ def test_attributeruler_patterns_prop(nlp, pattern_dicts): def test_attributeruler_serialize(nlp, pattern_dicts): a = nlp.add_pipe("attribute_ruler") a.add_patterns(pattern_dicts) - text = "This is a test." attrs = ["ORTH", "LEMMA", "MORPH"] doc = nlp(text) - # bytes roundtrip a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes()) assert a.to_bytes() == a_reloaded.to_bytes() doc1 = a_reloaded(nlp.make_doc(text)) numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs)) assert a.patterns == a_reloaded.patterns - # disk roundtrip with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py index ee9e34df3..025ac04af 100644 --- a/spacy/tests/pipeline/test_functions.py +++ b/spacy/tests/pipeline/test_functions.py @@ -1,57 +1,38 @@ import pytest from spacy.pipeline.functions import merge_subtokens from spacy.language import Language -from spacy.tokens import Span - -from ..util import get_doc +from spacy.tokens import Span, Doc @pytest.fixture -def doc(en_tokenizer): +def doc(en_vocab): # fmt: off - text = "This is a sentence. This is another sentence. And a third." - heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 1, 1, 1, 0] + words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "sentence", ".", "And", "a", "third", "."] + heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 11, 12, 13, 13] deps = ["nsubj", "ROOT", "subtok", "attr", "punct", "nsubj", "ROOT", "subtok", "attr", "punct", "subtok", "subtok", "subtok", "ROOT"] # fmt: on - tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + return Doc(en_vocab, words=words, heads=heads, deps=deps) @pytest.fixture -def doc2(en_tokenizer): - text = "I like New York in Autumn." - heads = [1, 0, 1, -2, -3, -1, -5] +def doc2(en_vocab): + words = ["I", "like", "New", "York", "in", "Autumn", "."] + heads = [1, 1, 3, 1, 1, 4, 1] tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."] pos = ["PRON", "VERB", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"] deps = ["ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, - words=[t.text for t in tokens], - heads=heads, - tags=tags, - pos=pos, - deps=deps, - ) - doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])] + doc = Doc(en_vocab, words=words, heads=heads, tags=tags, pos=pos, deps=deps) + doc.ents = [Span(doc, 2, 4, label="GPE")] return doc def test_merge_subtokens(doc): doc = merge_subtokens(doc) - # get_doc() doesn't set spaces, so the result is "And a third ." 
- assert [t.text for t in doc] == [ - "This", - "is", - "a sentence", - ".", - "This", - "is", - "another sentence", - ".", - "And a third .", - ] + # Doc doesn't have spaces, so the result is "And a third ." + # fmt: off + assert [t.text for t in doc] == ["This", "is", "a sentence", ".", "This", "is", "another sentence", ".", "And a third ."] + # fmt: on def test_factories_merge_noun_chunks(doc2): diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 30f66fb1d..d841ee24b 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -9,7 +9,7 @@ from spacy.lang.en import English from spacy.lookups import Lookups from spacy.tokens import Doc, Span -from ..util import get_doc, make_tempdir +from ..util import make_tempdir @pytest.mark.parametrize( @@ -88,12 +88,9 @@ def test_issue242(en_tokenizer): doc.ents += tuple(matches) -def test_issue309(en_tokenizer): +def test_issue309(en_vocab): """Test Issue #309: SBD fails on empty string""" - tokens = en_tokenizer(" ") - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"] - ) + doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"]) assert len(doc) == 1 sents = list(doc.sents) assert len(sents) == 1 diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 71ed2ea03..dce3e8298 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -14,7 +14,7 @@ from spacy.tokens import Doc, Span, Token from spacy.attrs import HEAD, DEP from spacy.matcher import Matcher -from ..util import make_tempdir, get_doc +from ..util import make_tempdir def test_issue1506(): @@ -197,32 +197,21 @@ def test_issue1807(): def test_issue1834(): """Test that sentence boundaries & parse/tag flags are not lost during serialization.""" - string = "This is a first sentence . 
And another one" - words = string.split() - doc = get_doc(Vocab(), words=words) + words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"] + doc = Doc(Vocab(), words=words) doc[6].is_sent_start = True new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) assert new_doc[6].sent_start assert not new_doc.has_annotation("DEP") assert not new_doc.has_annotation("TAG") - doc = get_doc( + doc = Doc( Vocab(), words=words, tags=["TAG"] * len(words), - heads=[0, -1, -2, -3, -4, -5, 0, -1, -2], + heads=[0, 0, 0, 0, 0, 0, 6, 6, 6], deps=["dep"] * len(words), ) - print( - doc.has_annotation("DEP"), - [t.head.i for t in doc], - [t.is_sent_start for t in doc], - ) new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) - print( - new_doc.has_annotation("DEP"), - [t.head.i for t in new_doc], - [t.is_sent_start for t in new_doc], - ) assert new_doc[6].sent_start assert new_doc.has_annotation("DEP") assert new_doc.has_annotation("TAG") diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 3bea5d3f6..c4c755153 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -7,7 +7,7 @@ from spacy.training import iob_to_biluo from spacy.lang.it import Italian from spacy.lang.en import English -from ..util import add_vecs_to_vocab, get_doc +from ..util import add_vecs_to_vocab @pytest.mark.skip( @@ -69,9 +69,10 @@ def test_issue2219(en_vocab): assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) -def test_issue2361(de_tokenizer): +def test_issue2361(de_vocab): chars = ("<", ">", "&", """) - doc = de_tokenizer('< > & " ') + words = ["<", ">", "&", '"'] + doc = Doc(de_vocab, words=words, deps=["dep"] * len(words)) html = render(doc) for char in chars: assert char in html @@ -105,7 +106,7 @@ def test_issue2385_biluo(tags): def test_issue2396(en_vocab): words = ["She", "created", "a", "test", "for", "spacy"] - heads = [1, 0, 1, -2, -1, -1] + heads = [1, 1, 3, 1, 3, 4] deps = ["dep"] * len(heads) matrix = numpy.array( [ @@ -118,7 +119,7 @@ def test_issue2396(en_vocab): ], dtype=numpy.int32, ) - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) span = doc[:] assert (doc.get_lca_matrix() == matrix).all() assert (span.get_lca_matrix() == matrix).all() diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 9267a7346..5895b616e 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -12,8 +12,6 @@ from spacy.compat import pickle import numpy import random -from ..util import get_doc - def test_issue2564(): """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe.""" @@ -117,12 +115,14 @@ def test_issue2754(en_tokenizer): def test_issue2772(en_vocab): """Test that deprojectivization doesn't mess up sentence boundaries.""" - words = "When we write or communicate virtually , we can hide our true feelings .".split() + # fmt: off + words = ["When", "we", "write", "or", "communicate", "virtually", ",", "we", "can", "hide", "our", "true", "feelings", "."] + # fmt: on # A tree with a non-projective (i.e. crossing) arc # The arcs (0, 4) and (2, 9) cross. 
- heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4] + heads = [4, 2, 9, 2, 2, 4, 9, 9, 9, 9, 12, 12, 9, 9] deps = ["dep"] * len(heads) - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert doc[1].is_sent_start is False diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index d848467dd..a64dc53e4 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -10,10 +10,8 @@ from spacy.vocab import Vocab from spacy.attrs import ENT_IOB, ENT_TYPE from spacy.compat import pickle from spacy import displacy -import numpy - from spacy.vectors import Vectors -from ..util import get_doc +import numpy def test_issue3002(): @@ -47,7 +45,7 @@ def test_issue3009(en_vocab): words = ["also", "has", "to", "do", "with"] tags = ["RB", "VBZ", "TO", "VB", "IN"] pos = ["ADV", "VERB", "ADP", "VERB", "ADP"] - doc = get_doc(en_vocab, words=words, tags=tags, pos=pos) + doc = Doc(en_vocab, words=words, tags=tags, pos=pos) matcher = Matcher(en_vocab) for i, pattern in enumerate(patterns): matcher.add(str(i), [pattern]) @@ -61,19 +59,15 @@ def test_issue3012(en_vocab): words = ["This", "is", "10", "%", "."] tags = ["DT", "VBZ", "CD", "NN", "."] pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] - ents = [(2, 4, "PERCENT")] - doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) + ents = [("PERCENT", 2, 4)] + doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) assert doc.has_annotation("TAG") - expected = ("10", "NUM", "CD", "PERCENT") assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - header = [ENT_IOB, ENT_TYPE] ent_array = doc.to_array(header) doc.from_array(header, ent_array) - assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - # Serializing then deserializing doc_bytes = doc.to_bytes() doc2 = Doc(en_vocab).from_bytes(doc_bytes) @@ -85,12 +79,8 @@ def test_issue3199(): is available. To make this test future-proof, we're constructing a Doc with a new Vocab here and a parse tree to make sure the noun chunks run. 
""" - doc = get_doc( - Vocab(), - words=["This", "is", "a", "sentence"], - heads=[0, -1, -2, -3], - deps=["dep"] * 4, - ) + words = ["This", "is", "a", "sentence"] + doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words)) assert list(doc[0:3].noun_chunks) == [] @@ -147,9 +137,9 @@ def test_issue3288(en_vocab): """Test that retokenization works correctly via displaCy when punctuation is merged onto the preceeding token and tensor is resized.""" words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"] - heads = [1, 0, -1, 1, 0, 1, -2, -3] + heads = [1, 1, 1, 4, 4, 6, 4, 4] deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"] - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) doc.tensor = numpy.zeros((len(words), 96), dtype="float32") displacy.render(doc) diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index 8c483d877..a79be6638 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -20,7 +20,7 @@ import spacy import srsly import numpy -from ..util import make_tempdir, get_doc +from ..util import make_tempdir @pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) @@ -355,7 +355,7 @@ def test_issue3882(en_vocab): """Test that displaCy doesn't serialize the doc.user_data when making a copy of the Doc. """ - doc = Doc(en_vocab, words=["Hello", "world"]) + doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"]) doc.user_data["test"] = set() parse_deps(doc) @@ -398,10 +398,10 @@ def test_issue3962(en_vocab): This is achieved by setting the head to itself if it would lie out of the span otherwise.""" # fmt: off words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] - heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3] + heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7] deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] # fmt: on - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) span2 = doc[1:5] # "jests at scars ," doc2 = span2.as_doc() doc2_json = doc2.to_json() @@ -436,10 +436,10 @@ def test_issue3962_long(en_vocab): This is achieved by setting the head to itself if it would lie out of the span otherwise.""" # fmt: off words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] - heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3] + heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7] deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] # fmt: on - two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps) span2 = two_sent_doc[1:7] # "jests at scars. 
They never" doc2 = span2.as_doc() doc2_json = doc2.to_json() diff --git a/spacy/tests/regression/test_issue5001-5500.py b/spacy/tests/regression/test_issue5001-5500.py new file mode 100644 index 000000000..dbfe78679 --- /dev/null +++ b/spacy/tests/regression/test_issue5001-5500.py @@ -0,0 +1,138 @@ +import numpy +from spacy.tokens import Doc, DocBin +from spacy.attrs import DEP, POS, TAG +from spacy.lang.en import English +from spacy.language import Language +from spacy.lang.en.syntax_iterators import noun_chunks +from spacy.vocab import Vocab +import spacy +import pytest + +from ...util import make_tempdir + + +def test_issue5048(en_vocab): + words = ["This", "is", "a", "sentence"] + pos_s = ["DET", "VERB", "DET", "NOUN"] + spaces = [" ", " ", " ", ""] + deps_s = ["dep", "adj", "nn", "atm"] + tags_s = ["DT", "VBZ", "DT", "NN"] + strings = en_vocab.strings + for w in words: + strings.add(w) + deps = [strings.add(d) for d in deps_s] + pos = [strings.add(p) for p in pos_s] + tags = [strings.add(t) for t in tags_s] + attrs = [POS, DEP, TAG] + array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64") + doc = Doc(en_vocab, words=words, spaces=spaces) + doc.from_array(attrs, array) + v1 = [(token.text, token.pos_, token.tag_) for token in doc] + doc2 = Doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s) + v2 = [(token.text, token.pos_, token.tag_) for token in doc2] + assert v1 == v2 + + +def test_issue5082(): + # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens + nlp = English() + vocab = nlp.vocab + array1 = numpy.asarray([0.1, 0.5, 0.8], dtype=numpy.float32) + array2 = numpy.asarray([-0.2, -0.6, -0.9], dtype=numpy.float32) + array3 = numpy.asarray([0.3, -0.1, 0.7], dtype=numpy.float32) + array4 = numpy.asarray([0.5, 0, 0.3], dtype=numpy.float32) + array34 = numpy.asarray([0.4, -0.05, 0.5], dtype=numpy.float32) + vocab.set_vector("I", array1) + vocab.set_vector("like", array2) + vocab.set_vector("David", array3) + vocab.set_vector("Bowie", array4) + text = "I like David Bowie" + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]} + ] + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + parsed_vectors_1 = [t.vector for t in nlp(text)] + assert len(parsed_vectors_1) == 4 + numpy.testing.assert_array_equal(parsed_vectors_1[0], array1) + numpy.testing.assert_array_equal(parsed_vectors_1[1], array2) + numpy.testing.assert_array_equal(parsed_vectors_1[2], array3) + numpy.testing.assert_array_equal(parsed_vectors_1[3], array4) + nlp.add_pipe("merge_entities") + parsed_vectors_2 = [t.vector for t in nlp(text)] + assert len(parsed_vectors_2) == 3 + numpy.testing.assert_array_equal(parsed_vectors_2[0], array1) + numpy.testing.assert_array_equal(parsed_vectors_2[1], array2) + numpy.testing.assert_array_equal(parsed_vectors_2[2], array34) + + +def test_issue5137(): + @Language.factory("my_component") + class MyComponent: + def __init__(self, nlp, name="my_component", categories="all_categories"): + self.nlp = nlp + self.categories = categories + self.name = name + + def __call__(self, doc): + pass + + def to_disk(self, path, **kwargs): + pass + + def from_disk(self, path, **cfg): + pass + + nlp = English() + my_component = nlp.add_pipe("my_component") + assert my_component.categories == "all_categories" + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + overrides = {"components": {"my_component": {"categories": "my_categories"}}} + nlp2 = spacy.load(tmpdir, config=overrides) + 
assert nlp2.get_pipe("my_component").categories == "my_categories" + + +def test_issue5141(en_vocab): + """ Ensure an empty DocBin does not crash on serialization """ + doc_bin = DocBin(attrs=["DEP", "HEAD"]) + assert list(doc_bin.get_docs(en_vocab)) == [] + doc_bin_bytes = doc_bin.to_bytes() + doc_bin_2 = DocBin().from_bytes(doc_bin_bytes) + assert list(doc_bin_2.get_docs(en_vocab)) == [] + + +def test_issue5152(): + # Test that the comparison between a Span and a Token, goes well + # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) + nlp = English() + text = nlp("Talk about being boring!") + text_var = nlp("Talk of being boring!") + y = nlp("Let") + span = text[0:3] # Talk about being + span_2 = text[0:3] # Talk about being + span_3 = text_var[0:3] # Talk of being + token = y[0] # Let + with pytest.warns(UserWarning): + assert span.similarity(token) == 0.0 + assert span.similarity(span_2) == 1.0 + with pytest.warns(UserWarning): + assert span_2.similarity(span_3) < 1.0 + + +def test_issue5458(): + # Test that the noun chuncker does not generate overlapping spans + # fmt: off + words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] + vocab = Vocab(strings=words) + deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] + pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] + heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0] + # fmt: on + en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps) + en_doc.noun_chunks_iterator = noun_chunks + + # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" + nlp = English() + merge_nps = nlp.create_pipe("merge_noun_chunks") + merge_nps(en_doc) diff --git a/spacy/tests/regression/test_issue5048.py b/spacy/tests/regression/test_issue5048.py deleted file mode 100644 index bc52ae82f..000000000 --- a/spacy/tests/regression/test_issue5048.py +++ /dev/null @@ -1,32 +0,0 @@ -import numpy -from spacy.tokens import Doc -from spacy.attrs import DEP, POS, TAG - -from ..util import get_doc - - -def test_issue5048(en_vocab): - words = ["This", "is", "a", "sentence"] - pos_s = ["DET", "VERB", "DET", "NOUN"] - spaces = [" ", " ", " ", ""] - deps_s = ["dep", "adj", "nn", "atm"] - tags_s = ["DT", "VBZ", "DT", "NN"] - - strings = en_vocab.strings - - for w in words: - strings.add(w) - deps = [strings.add(d) for d in deps_s] - pos = [strings.add(p) for p in pos_s] - tags = [strings.add(t) for t in tags_s] - - attrs = [POS, DEP, TAG] - array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64") - - doc = Doc(en_vocab, words=words, spaces=spaces) - doc.from_array(attrs, array) - v1 = [(token.text, token.pos_, token.tag_) for token in doc] - - doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s) - v2 = [(token.text, token.pos_, token.tag_) for token in doc2] - assert v1 == v2 diff --git a/spacy/tests/regression/test_issue5082.py b/spacy/tests/regression/test_issue5082.py deleted file mode 100644 index 76f3a552e..000000000 --- a/spacy/tests/regression/test_issue5082.py +++ /dev/null @@ -1,37 +0,0 @@ -import numpy as np -from spacy.lang.en import English - - -def test_issue5082(): - # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens - nlp = English() - vocab = nlp.vocab - array1 = np.asarray([0.1, 0.5, 0.8], dtype=np.float32) - array2 = np.asarray([-0.2, -0.6, -0.9], 
dtype=np.float32) - array3 = np.asarray([0.3, -0.1, 0.7], dtype=np.float32) - array4 = np.asarray([0.5, 0, 0.3], dtype=np.float32) - array34 = np.asarray([0.4, -0.05, 0.5], dtype=np.float32) - - vocab.set_vector("I", array1) - vocab.set_vector("like", array2) - vocab.set_vector("David", array3) - vocab.set_vector("Bowie", array4) - - text = "I like David Bowie" - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]} - ] - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) - parsed_vectors_1 = [t.vector for t in nlp(text)] - assert len(parsed_vectors_1) == 4 - np.testing.assert_array_equal(parsed_vectors_1[0], array1) - np.testing.assert_array_equal(parsed_vectors_1[1], array2) - np.testing.assert_array_equal(parsed_vectors_1[2], array3) - np.testing.assert_array_equal(parsed_vectors_1[3], array4) - nlp.add_pipe("merge_entities") - parsed_vectors_2 = [t.vector for t in nlp(text)] - assert len(parsed_vectors_2) == 3 - np.testing.assert_array_equal(parsed_vectors_2[0], array1) - np.testing.assert_array_equal(parsed_vectors_2[1], array2) - np.testing.assert_array_equal(parsed_vectors_2[2], array34) diff --git a/spacy/tests/regression/test_issue5137.py b/spacy/tests/regression/test_issue5137.py deleted file mode 100644 index cc7a9bd38..000000000 --- a/spacy/tests/regression/test_issue5137.py +++ /dev/null @@ -1,32 +0,0 @@ -import spacy -from spacy.language import Language -from spacy.lang.en import English -from spacy.tests.util import make_tempdir - - -def test_issue5137(): - @Language.factory("my_component") - class MyComponent: - def __init__(self, nlp, name="my_component", categories="all_categories"): - self.nlp = nlp - self.categories = categories - self.name = name - - def __call__(self, doc): - pass - - def to_disk(self, path, **kwargs): - pass - - def from_disk(self, path, **cfg): - pass - - nlp = English() - my_component = nlp.add_pipe("my_component") - assert my_component.categories == "all_categories" - - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir) - overrides = {"components": {"my_component": {"categories": "my_categories"}}} - nlp2 = spacy.load(tmpdir, config=overrides) - assert nlp2.get_pipe("my_component").categories == "my_categories" diff --git a/spacy/tests/regression/test_issue5141.py b/spacy/tests/regression/test_issue5141.py deleted file mode 100644 index 845454583..000000000 --- a/spacy/tests/regression/test_issue5141.py +++ /dev/null @@ -1,11 +0,0 @@ -from spacy.tokens import DocBin - - -def test_issue5141(en_vocab): - """ Ensure an empty DocBin does not crash on serialization """ - doc_bin = DocBin(attrs=["DEP", "HEAD"]) - assert list(doc_bin.get_docs(en_vocab)) == [] - doc_bin_bytes = doc_bin.to_bytes() - - doc_bin_2 = DocBin().from_bytes(doc_bin_bytes) - assert list(doc_bin_2.get_docs(en_vocab)) == [] diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py deleted file mode 100644 index c7a70a99c..000000000 --- a/spacy/tests/regression/test_issue5152.py +++ /dev/null @@ -1,20 +0,0 @@ -from spacy.lang.en import English -import pytest - - -def test_issue5152(): - # Test that the comparison between a Span and a Token, goes well - # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) 
- nlp = English() - text = nlp("Talk about being boring!") - text_var = nlp("Talk of being boring!") - y = nlp("Let") - span = text[0:3] # Talk about being - span_2 = text[0:3] # Talk about being - span_3 = text_var[0:3] # Talk of being - token = y[0] # Let - with pytest.warns(UserWarning): - assert span.similarity(token) == 0.0 - assert span.similarity(span_2) == 1.0 - with pytest.warns(UserWarning): - assert span_2.similarity(span_3) < 1.0 diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py deleted file mode 100644 index a7a2959df..000000000 --- a/spacy/tests/regression/test_issue5458.py +++ /dev/null @@ -1,23 +0,0 @@ -from spacy.lang.en import English -from spacy.lang.en.syntax_iterators import noun_chunks -from spacy.tests.util import get_doc -from spacy.vocab import Vocab - - -def test_issue5458(): - # Test that the noun chuncker does not generate overlapping spans - # fmt: off - words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] - vocab = Vocab(strings=words) - dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] - pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] - heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10] - # fmt: on - - en_doc = get_doc(vocab, words, pos_tags, heads, dependencies) - en_doc.noun_chunks_iterator = noun_chunks - - # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" - nlp = English() - merge_nps = nlp.create_pipe("merge_noun_chunks") - merge_nps(en_doc) diff --git a/spacy/tests/regression/test_issue5918.py b/spacy/tests/regression/test_issue5918.py index 66280f012..db957709c 100644 --- a/spacy/tests/regression/test_issue5918.py +++ b/spacy/tests/regression/test_issue5918.py @@ -1,5 +1,6 @@ from spacy.lang.en import English from spacy.pipeline import merge_entities +import pytest def test_issue5918(): @@ -22,6 +23,7 @@ def test_issue5918(): assert len(doc.ents) == 3 # make it so that the third span's head is within the entity (ent_iob=I) # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents. 
- doc[29].head = doc[33] + with pytest.warns(UserWarning): + doc[29].head = doc[33] doc = merge_entities(doc) assert len(doc.ents) == 3 diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 1fa0eeaa1..040dd657f 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -1,15 +1,13 @@ import pytest from spacy import displacy from spacy.displacy.render import DependencyRenderer, EntityRenderer -from spacy.tokens import Span +from spacy.tokens import Span, Doc from spacy.lang.fa import Persian -from .util import get_doc - def test_displacy_parse_ents(en_vocab): """Test that named entities on a Doc are converted into displaCy's format.""" - doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) + doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])] ents = displacy.parse_ents(doc) assert isinstance(ents, dict) @@ -20,11 +18,11 @@ def test_displacy_parse_ents(en_vocab): def test_displacy_parse_deps(en_vocab): """Test that deps and tags on a Doc are converted into displaCy's format.""" words = ["This", "is", "a", "sentence"] - heads = [1, 0, 1, -2] + heads = [1, 1, 3, 1] pos = ["DET", "VERB", "DET", "NOUN"] tags = ["DT", "VBZ", "DT", "NN"] deps = ["nsubj", "ROOT", "det", "attr"] - doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps) deps = displacy.parse_deps(doc) assert isinstance(deps, dict) assert deps["words"] == [ @@ -53,7 +51,7 @@ def test_displacy_invalid_arcs(): def test_displacy_spans(en_vocab): """Test that displaCy can render Spans.""" - doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) + doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])] html = displacy.render(doc[1:4], style="ent") assert html.startswith(" > # Construction 2 > from spacy.tokens import Doc +> > words = ["hello", "world", "!"] > spaces = [True, False, False] > doc = Doc(nlp.vocab, words=words, spaces=spaces) > ``` -| Name | Description | -| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | A storage container for lexical types. ~~Vocab~~ | -| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | -| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | -| _keyword-only_ | | -| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | -| tags | A list of strings, of the same length as words, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| pos | A list of strings, of the same length as words, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| morphs | A list of strings, of the same length as words, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| lemmas | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. 
~~Optional[List[str]]~~ | -| heads | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ | -| deps | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| sent_starts | A list of values, of the same length as words, to assign as token.is_sent_start. Will be overridden by heads if heads is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ | -| ents | A list of spans to assign as doc.ents. Defaults to `None`. ~~Optional[List[Span]]~~ | +| Name | Description | +| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | A storage container for lexical types. ~~Vocab~~ | +| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | +| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | +| _keyword-only_ | | +| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | +| `tags` 3 | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `pos` 3 | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `morphs` 3 | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `lemmas` 3 | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `heads` 3 | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ | +| `deps` 3 | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `sent_starts` 3 | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ | +| `ents` 3 | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} @@ -281,6 +282,19 @@ ancestor is found, e.g. if span excludes a necessary ancestor. Check whether the doc contains annotation on a token attribute. + + +This method replaces the previous boolean attributes like `Doc.is_tagged`, +`Doc.is_parsed` or `Doc.is_sentenced`. + +```diff +doc = nlp("This is a text") +- assert doc.is_parsed ++ assert doc.has_annotation("DEP") +``` + + + | Name | Description | | ------------------ | --------------------------------------------------------------------------------------------------- | | `attr` | The attribute string name or int ID. 
~~Union[int, str]~~ | diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 5abeb5707..406ba4b75 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -530,6 +530,8 @@ Note that spaCy v3.0 now requires **Python 3.6+**. [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of patterns as the second argument (instead of a variable number of arguments). The `on_match` callback becomes an optional keyword argument. +- The `Doc` flags like `Doc.is_parsed` or `Doc.is_tagged` have been replaced by + [`Doc.has_annotation`](/api/doc#has_annotation). - The `spacy.gold` module has been renamed to [`spacy.training`](%%GITHUB_SPACY/spacy/training). - The `PRON_LEMMA` symbol and `-PRON-` as an indicator for pronoun lemmas has @@ -807,10 +809,11 @@ nlp = spacy.blank("en") ### Migrating Doc flags {#migrating-doc-flags} -The `Doc` flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and -`Doc.is_sentenced` are deprecated in v3 and replaced by +The [`Doc`](/api/doc) flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and +`Doc.is_sentenced` are deprecated in v3.0 and replaced by [`Doc.has_annotation`](/api/doc#has_annotation) method, which refers to the -token attribute symbols (the same symbols used in `Matcher` patterns): +token attribute symbols (the same symbols used in [`Matcher`](/api/matcher) +patterns): ```diff doc = nlp(text) From 3abc4a5adb9c29605de89ab984190f64d88190b4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 21 Sep 2020 22:58:03 +0200 Subject: [PATCH 077/133] Slightly tidy doc.ents.__set__ --- spacy/tokens/doc.pyx | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f81e4a96b..b82bab294 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -663,11 +663,14 @@ cdef class Doc: tokens_in_ents = {} cdef attr_t entity_type cdef attr_t kb_id - cdef int ent_start, ent_end + cdef int ent_start, ent_end, token_index for ent_info in ents: - entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info, self.vocab) + entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info) + if isinstance(entity_type_, str): + self.vocab.strings.add(entity_type_) + entity_type = self.vocab.strings.as_int(entity_type_) for token_index in range(ent_start, ent_end): - if token_index in tokens_in_ents.keys(): + if token_index in tokens_in_ents: raise ValueError(Errors.E103.format( span1=(tokens_in_ents[token_index][0], tokens_in_ents[token_index][1], @@ -1583,7 +1586,7 @@ def fix_attributes(doc, attributes): attributes[ENT_TYPE] = attributes["ent_type"] -def get_entity_info(ent_info, vocab): +def get_entity_info(ent_info): if isinstance(ent_info, Span): ent_type = ent_info.label ent_kb_id = ent_info.kb_id @@ -1596,6 +1599,4 @@ def get_entity_info(ent_info, vocab): ent_type, ent_kb_id, start, end = ent_info else: ent_id, ent_kb_id, ent_type, start, end = ent_info - if isinstance(ent_type, str): - ent_type = vocab.strings.add(ent_type) return ent_type, ent_kb_id, start, end From fa5c416db646b919153a362c02f842c7a19dbb9e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 21 Sep 2020 23:09:22 +0200 Subject: [PATCH 078/133] initialize through nlp object and with train_corpus --- spacy/cli/debug_model.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 3d76cdbde..017bcd239 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -1,5 +1,9 @@ 
+import warnings from typing import Dict, Any, Optional, Iterable from pathlib import Path + +from spacy.training import Example +from spacy.util import dot_to_object from wasabi import msg from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam from thinc.api import Model, data_validation, set_gpu_allocator @@ -71,12 +75,10 @@ def debug_model_cli( exits=1, ) model = pipe.model - # call _link_components directly as we won't call nlp.begin_training - nlp._link_components() - debug_model(nlp, model, print_settings=print_settings) + debug_model(config, nlp, model, print_settings=print_settings) -def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None): +def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None): if not isinstance(model, Model): msg.fail( f"Requires a Thinc Model to be analysed, but found {type(model)} instead.", @@ -93,10 +95,21 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = # STEP 1: Initializing the model and printing again X = _get_docs() - _set_output_dim(nO=7, model=model) # The output vector might differ from the official type of the output layer with data_validation(False): - model.initialize(X=X) + # msg.info(f"Could not initialize the model with dummy data - using the train_corpus.") + try: + train_corpus = dot_to_object(config, config["training"]["train_corpus"]) + nlp.begin_training(lambda: train_corpus(nlp)) + msg.info("Initialized the model with the training corpus.") + except ValueError: + try: + _set_output_dim(nO=7, model=model) + nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X]) + msg.info("Initialized the model with dummy data.") + except: + msg.fail("Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.", exits=1) + if print_settings.get("print_after_init"): msg.divider(f"STEP 1 - after initialization") _print_model(model, print_settings) @@ -114,8 +127,7 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = if tok2vec: tok2vec.predict(X) Y, get_dX = model.begin_update(X) - # simulate a goldY value - if not goldY: + if goldY is None: goldY = _simulate_gold(Y) dY = get_gradient(goldY, Y, model.ops) get_dX(dY) From 45b29c4a5b926c8f85b0a2ed4a9b8be13c5bf7eb Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 21 Sep 2020 23:17:23 +0200 Subject: [PATCH 079/133] cleanup --- spacy/cli/debug_model.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 017bcd239..1d27c7c52 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -78,7 +78,9 @@ def debug_model_cli( debug_model(config, nlp, model, print_settings=print_settings) -def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None): +def debug_model( + config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None +): if not isinstance(model, Model): msg.fail( f"Requires a Thinc Model to be analysed, but found {type(model)} instead.", @@ -97,7 +99,6 @@ def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str, X = _get_docs() # The output vector might differ from the official type of the output layer with data_validation(False): - # msg.info(f"Could not initialize the model with dummy data - using the train_corpus.") try: train_corpus = dot_to_object(config, config["training"]["train_corpus"]) 
nlp.begin_training(lambda: train_corpus(nlp)) @@ -108,7 +109,10 @@ def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str, nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X]) msg.info("Initialized the model with dummy data.") except: - msg.fail("Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.", exits=1) + msg.fail( + "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.", + exits=1, + ) if print_settings.get("print_after_init"): msg.divider(f"STEP 1 - after initialization") @@ -121,7 +125,6 @@ def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str, tok2vec = None if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener": tok2vec = nlp.get_pipe("tok2vec") - tok2vec.model.initialize(X=X) goldY = None for e in range(3): if tok2vec: @@ -145,17 +148,17 @@ def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str, msg.good(f"Succesfully ended analysis - model looks good.") +def get_gradient(goldY, Y, ops): + return ops.asarray(Y) - ops.asarray(goldY) + + def _simulate_gold(element, counter=1): if isinstance(element, Iterable): for i in range(len(element)): - element[i] = _simulate_gold(element[i], counter+i) + element[i] = _simulate_gold(element[i], counter + i) return element else: - return 1/counter - - -def get_gradient(goldY, Y, ops): - return ops.asarray(Y) - ops.asarray(goldY) + return 1 / counter def _sentences(): @@ -229,12 +232,3 @@ def _print_matrix(value): sample_matrix = sample_matrix[0:5] result = result + str(sample_matrix) return result - - -def _set_output_dim(model, nO): - # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx - if model.has_dim("nO") is None: - model.set_dim("nO", nO) - if model.has_ref("output_layer"): - if model.get_ref("output_layer").has_dim("nO") is None: - model.get_ref("output_layer").set_dim("nO", nO) \ No newline at end of file From 69f7e52c26ef545fb9e39cd748666ae451318c77 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 22 Sep 2020 09:10:06 +0200 Subject: [PATCH 080/133] Update README.md --- spacy/tests/README.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/spacy/tests/README.md b/spacy/tests/README.md index 86bbd52da..833dc9266 100644 --- a/spacy/tests/README.md +++ b/spacy/tests/README.md @@ -38,18 +38,17 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji # ## Dos and don'ts -To keep the behaviour of the tests consistent and predictable, we try to follow a few basic conventions: +To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions: - **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`. - If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory. -- Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behaviour, use `assert not` in your test. -- Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behaviour, consider adding an additional simpler version. 
+- Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test. +- Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behavior, consider adding an additional simpler version. - If tests require **loading the models**, they should be added to the [`spacy-models`](https://github.com/explosion/spacy-models) tests. -- Before requiring the models, always make sure there is no other way to test the particular behaviour. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this. +- Before requiring the models, always make sure there is no other way to test the particular behavior. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this. - **Avoid unnecessary imports.** There should never be a need to explicitly import spaCy at the top of a file, and many components are available as [fixtures](#fixtures). You should also avoid wildcard imports (`from module import *`). - If you're importing from spaCy, **always use absolute imports**. For example: `from spacy.language import Language`. -- Don't forget the **unicode declarations** at the top of each file. This way, unicode strings won't have to be prefixed with `u`. -- Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behaviour at a time. +- Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behavior at a time. ## Parameters @@ -77,7 +76,7 @@ To test for combinations of parameters, you can add several `parametrize` marker @pytest.mark.parametrize('punct', ['.', '!', '?']) ``` -This will run the test with all combinations of the two parameters `text` and `punct`. **Use this feature sparingly**, though, as it can easily cause unneccessary or undesired test bloat. +This will run the test with all combinations of the two parameters `text` and `punct`. **Use this feature sparingly**, though, as it can easily cause unnecessary or undesired test bloat. ## Fixtures @@ -104,9 +103,9 @@ If all tests in a file require a specific configuration, or use the same complex Our new test setup comes with a few handy utility functions that can be imported from [`util.py`](util.py). -### Constructing a `Doc` object manually with +### Constructing a `Doc` object manually -Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need ia a `Doc` object with annotations like heads, POS tags or the dependency parse, you can construct it manually. +Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need is a `Doc` object with annotations like heads, POS tags or the dependency parse, you can construct it manually. 
```python def test_doc_token_api_strings(en_vocab): From beb766d0a09509a7d91518e60c990489789978e0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 22 Sep 2020 09:15:57 +0200 Subject: [PATCH 081/133] Add test --- spacy/tests/doc/test_doc_api.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 2c22926e9..163de5ab0 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -455,3 +455,16 @@ def test_is_flags_deprecated(en_tokenizer): doc.is_nered with pytest.deprecated_call(): doc.is_sentenced + + +def test_doc_set_ents(): + """Test that both strings and integers can be used to set entities in + tuple format via doc.ents.""" + words = ["a", "b", "c", "d", "e"] + doc = Doc(Vocab(), words=words) + doc.ents = [("HELLO", 0, 2), (doc.vocab.strings.add("WORLD"), 3, 5)] + assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] + vocab = Vocab() + ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)] + doc = Doc(vocab, words=words, ents=ents) + assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] From fc9c78da25202322c9ec042b529a6a3f91d48e4d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 22 Sep 2020 09:23:47 +0200 Subject: [PATCH 082/133] Add MorphAnalysis to API sidebar --- website/meta/sidebars.json | 1 + 1 file changed, 1 insertion(+) diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index e27817c92..28915ebb7 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -119,6 +119,7 @@ { "text": "Corpus", "url": "/api/corpus" }, { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "Lookups", "url": "/api/lookups" }, + { "text": "MorphAnalysis", "url": "/api/morphanalysis" }, { "text": "Morphology", "url": "/api/morphology" }, { "text": "Scorer", "url": "/api/scorer" }, { "text": "StringStore", "url": "/api/stringstore" }, From 844db6ff12441f63f51d4d9921cdaf4e6af61a04 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 22 Sep 2020 09:31:47 +0200 Subject: [PATCH 083/133] Update architecture overview --- website/docs/usage/101/_architecture.md | 32 ++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md index 98011f173..6e9120022 100644 --- a/website/docs/usage/101/_architecture.md +++ b/website/docs/usage/101/_architecture.md @@ -65,22 +65,22 @@ Matchers help you find and extract information from [`Doc`](/api/doc) objects based on match patterns describing the sequences you're looking for. A matcher operates on a `Doc` and gives you access to the matched tokens **in context**. -| Name | Description | -| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. | -| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. | -| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using the [Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). 
| +| Name | Description | +| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. | +| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. | +| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using [Semgrex operators](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). | ### Other classes {#architecture-other} -| Name | Description | -| ------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------- | -| [`Vocab`](/api/vocab) | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. | -| [`StringStore`](/api/stringstore) | Map strings to and from hash values. | -| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. | -| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. | -| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. | -| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. | -| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. | -| [`Scorer`](/api/scorer) | Compute evaluation scores. | -| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. | +| Name | Description | +| ------------------------------------------------ | -------------------------------------------------------------------------------------------------- | +| [`Vocab`](/api/vocab) | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. | +| [`StringStore`](/api/stringstore) | Map strings to and from hash values. | +| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. | +| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. | +| [`Morphology`](/api/morphology) | Store morphological analyses and map them to and from hash values. | +| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. | +| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. | +| [`Scorer`](/api/scorer) | Compute evaluation scores. | +| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. 
| From e05d6d358d04166779093d2acff0e2c3bb95fe04 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 22 Sep 2020 09:36:37 +0200 Subject: [PATCH 084/133] Update API sidebar MorphAnalysis link --- website/meta/sidebars.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 28915ebb7..c5404b68e 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -119,7 +119,7 @@ { "text": "Corpus", "url": "/api/corpus" }, { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "Lookups", "url": "/api/lookups" }, - { "text": "MorphAnalysis", "url": "/api/morphanalysis" }, + { "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" }, { "text": "Morphology", "url": "/api/morphology" }, { "text": "Scorer", "url": "/api/scorer" }, { "text": "StringStore", "url": "/api/stringstore" }, From 6316d5f3989a53e4868cd346256fa614bd49e711 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 22 Sep 2020 09:45:34 +0200 Subject: [PATCH 085/133] Improve messages in project CLI [ci skip] --- spacy/cli/project/assets.py | 1 + spacy/cli/project/run.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index 8a3aaff25..58f59a3f9 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -66,6 +66,7 @@ def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None: branch=asset["git"].get("branch"), sparse=sparse_checkout, ) + msg.good(f"Downloaded asset {dest}") else: url = asset.get("url") if not url: diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index d7e1075f3..69c49fba7 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -59,7 +59,7 @@ def project_run( for dep in cmd.get("deps", []): if not (project_dir / dep).exists(): err = f"Missing dependency specified by command '{subcommand}': {dep}" - err_help = "Maybe you forgot to run the 'project assets' command?" + err_help = "Maybe you forgot to run the 'project assets' command or a previous step?" err_kwargs = {"exits": 1} if not dry else {} msg.fail(err, err_help, **err_kwargs) with working_dir(project_dir) as current_dir: From f9af7d365c228a8113e6db66d5bc4941c2546d88 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 22 Sep 2020 09:45:41 +0200 Subject: [PATCH 086/133] Update docs [ci skip] --- website/docs/api/language.md | 2 +- website/docs/usage/linguistic-features.md | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/website/docs/api/language.md b/website/docs/api/language.md index ffdae9ec6..a7b9c0d88 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -187,7 +187,7 @@ more efficient than processing texts one-by-one. > ```python > texts = ["One document.", "...", "Lots of documents"] > for doc in nlp.pipe(texts, batch_size=50): -> assert doc.is_parsed +> assert doc.has_annotation("DEP") > ``` | Name | Description | diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index a229c18e9..914e18acb 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -205,9 +205,10 @@ acquired from [WordNet](https://wordnet.princeton.edu/). spaCy features a fast and accurate syntactic dependency parser, and has a rich API for navigating the tree. The parser also powers the sentence boundary detection, and lets you iterate over base noun phrases, or "chunks". 
You can -check whether a [`Doc`](/api/doc) object has been parsed with the -`doc.is_parsed` attribute, which returns a boolean value. If this attribute is -`False`, the default sentence iterator will raise an exception. +check whether a [`Doc`](/api/doc) object has been parsed by calling +`doc.has_annotation("DEP")`, which checks whether the attribute `Token.dep` has +been set returns a boolean value. If the result is `False`, the default sentence +iterator will raise an exception. @@ -1705,9 +1706,10 @@ and can still be overwritten by the parser. To prevent inconsistent state, you can only set boundaries **before** a document -is parsed (and `doc.is_parsed` is `False`). To ensure that your component is -added in the right place, you can set `before='parser'` or `first=True` when -adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe). +is parsed (and `doc.has_annotation("DEP")` is `False`). To ensure that your +component is added in the right place, you can set `before='parser'` or +`first=True` when adding it to the pipeline using +[`nlp.add_pipe`](/api/language#add_pipe). From 135de82a2d7073d535d1ffd1e4254e5dca37c046 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 10:22:06 +0200 Subject: [PATCH 087/133] add textcat to quickstart --- spacy/cli/templates/quickstart_training.jinja | 48 ++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 0db4c8a59..2c7ce024b 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -93,6 +93,29 @@ grad_factor = 1.0 @layers = "reduce_mean.v1" {% endif -%} +{% if "textcat" in components %} +[components.textcat] +factory = "textcat" + +{% if optimize == "accuracy" %} +[components.textcat.model] +@architectures = "spacy.TextCatEnsemble.v1" +exclusive_classes = false +width = 64 +conv_depth = 2 +embed_size = 2000 +window_size = 1 +ngram_size = 1 +nO = null + +{% else -%} +[components.textcat.model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +{%- endif %} +{%- endif %} + {# NON-TRANSFORMER PIPELINE #} {% else -%} @@ -167,10 +190,33 @@ nO = null @architectures = "spacy.Tok2VecListener.v1" width = ${components.tok2vec.model.encode.width} {% endif %} + +{% if "textcat" in components %} +[components.textcat] +factory = "textcat" + +{% if optimize == "accuracy" %} +[components.textcat.model] +@architectures = "spacy.TextCatEnsemble.v1" +exclusive_classes = false +width = 64 +conv_depth = 2 +embed_size = 2000 +window_size = 1 +ngram_size = 1 +nO = null + +{% else -%} +[components.textcat.model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = false +ngram_size = 1 +{%- endif %} +{%- endif %} {% endif %} {% for pipe in components %} -{% if pipe not in ["tagger", "parser", "ner"] %} +{% if pipe not in ["tagger", "parser", "ner", "textcat"] %} {# Other components defined by the user: we just assume they're factories #} [components.{{ pipe }}] factory = "{{ pipe }}" From db7126ead9675d70212c33ab9f09d2f67d72cf77 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 22 Sep 2020 10:31:26 +0200 Subject: [PATCH 088/133] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index ec3c168a5..b57bbeda2 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a20" 
+__version__ = "3.0.0a21" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 396b33257f7dff646040067c2ed7872d8c194f8b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 10:40:05 +0200 Subject: [PATCH 089/133] add entity_linker to jinja template --- spacy/cli/init_config.py | 2 +- spacy/cli/templates/quickstart_training.jinja | 34 ++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index e70195e15..5203c5dbb 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -36,7 +36,7 @@ def init_config_cli( """ Generate a starter config.cfg for training. Based on your requirements specified via the CLI arguments, this command generates a config with the - optimal settings for you use case. This includes the choice of architecture, + optimal settings for your use case. This includes the choice of architecture, pretrained weights and related hyperparameters. DOCS: https://nightly.spacy.io/api/cli#init-config diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 2c7ce024b..0674f0964 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -93,6 +93,22 @@ grad_factor = 1.0 @layers = "reduce_mean.v1" {% endif -%} +{% if "entity_linker" in components -%} +[components.entity_linker] +factory = "entity_linker" +get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} +incl_context = true +incl_prior = true + +[components.entity_linker.model] +@architectures = "spacy.EntityLinker.v1" +nO = null + +[components.entity_linker.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +{% endif -%} + {% if "textcat" in components %} [components.textcat] factory = "textcat" @@ -191,6 +207,22 @@ nO = null width = ${components.tok2vec.model.encode.width} {% endif %} +{% if "entity_linker" in components -%} +[components.entity_linker] +factory = "entity_linker" +get_candidates = {"@misc":"spacy.CandidateGenerator.v1"} +incl_context = true +incl_prior = true + +[components.entity_linker.model] +@architectures = "spacy.EntityLinker.v1" +nO = null + +[components.entity_linker.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +{% endif %} + {% if "textcat" in components %} [components.textcat] factory = "textcat" @@ -216,7 +248,7 @@ ngram_size = 1 {% endif %} {% for pipe in components %} -{% if pipe not in ["tagger", "parser", "ner", "textcat"] %} +{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %} {# Other components defined by the user: we just assume they're factories #} [components.{{ pipe }}] factory = "{{ pipe }}" From e931f4d75771dc63b2573e2cbd7c834de96def7d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 10:56:43 +0200 Subject: [PATCH 090/133] add textcat score --- spacy/cli/templates/quickstart_training.jinja | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 0674f0964..0e83b9bdb 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -323,3 +323,6 @@ ents_f = {{ (1.0 / components|length)|round(2) }} ents_p = 0.0 ents_r = 0.0 {%- endif -%} 
+{%- if "textcat" in components %} +cats_score = {{ (1.0 / components|length)|round(2) }} +{%- endif -%} From b556a1080893202651d473fc93c4b9010ee01665 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 11:50:19 +0200 Subject: [PATCH 091/133] rename converts in_to_out --- spacy/cli/_util.py | 4 +-- spacy/cli/convert.py | 14 ++++----- spacy/errors.py | 2 +- spacy/tests/regression/test_issue4001-4500.py | 4 +-- spacy/tests/regression/test_issue4501-5000.py | 6 ++-- spacy/tests/test_cli.py | 30 +++++++++---------- spacy/tests/test_scorer.py | 6 ++-- spacy/tests/training/test_training.py | 26 ++++++++-------- spacy/training/__init__.py | 4 +-- spacy/training/converters/__init__.py | 8 ++--- ...conll_ner2docs.py => conll_ner_to_docs.py} | 2 +- .../{conllu2docs.py => conllu_to_docs.py} | 12 ++++---- .../{iob2docs.py => iob_to_docs.py} | 4 +-- .../{json2docs.py => json_to_docs.py} | 6 ++-- spacy/training/example.pyx | 18 +++++------ spacy/training/gold_io.pyx | 4 +-- spacy/training/iob_utils.py | 14 ++++----- website/docs/api/data-formats.md | 2 +- website/docs/api/top-level.md | 18 +++++------ website/docs/usage/processing-pipelines.md | 6 ++-- website/docs/usage/v3.md | 15 +++++----- 21 files changed, 103 insertions(+), 102 deletions(-) rename spacy/training/converters/{conll_ner2docs.py => conll_ner_to_docs.py} (99%) rename spacy/training/converters/{conllu2docs.py => conllu_to_docs.py} (97%) rename spacy/training/converters/{iob2docs.py => iob_to_docs.py} (95%) rename spacy/training/converters/{json2docs.py => json_to_docs.py} (82%) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 797a701b9..21a4e54ce 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -378,7 +378,7 @@ def git_sparse_checkout(repo, subpath, dest, branch): # Looking for this 'rev-list' command in the git --help? Hah. cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" ret = run_command(cmd, capture=True) - git_repo = _from_http_to_git(repo) + git_repo = _http_to_git(repo) # Now pass those missings into another bit of git internals missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) if not missings: @@ -414,7 +414,7 @@ def get_git_version( return (int(version[0]), int(version[1])) -def _from_http_to_git(repo: str) -> str: +def _http_to_git(repo: str) -> str: if repo.startswith("http://"): repo = repo.replace(r"http://", r"https://") if repo.startswith(r"https://"): diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index ad89b9976..8f8234c61 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -9,7 +9,7 @@ import sys from ._util import app, Arg, Opt from ..training import docs_to_json from ..tokens import DocBin -from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs +from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs # Converters are matched by file extension except for ner/iob, which are @@ -18,12 +18,12 @@ from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2do # imported from /converters. 
CONVERTERS = { - "conllubio": conllu2docs, - "conllu": conllu2docs, - "conll": conllu2docs, - "ner": conll_ner2docs, - "iob": iob2docs, - "json": json2docs, + "conllubio": conllu_to_docs, + "conllu": conllu_to_docs, + "conll": conllu_to_docs, + "ner": conll_ner_to_docs, + "iob": iob_to_docs, + "json": json_to_docs, } diff --git a/spacy/errors.py b/spacy/errors.py index f276c4d1a..153f8da0c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -69,7 +69,7 @@ class Warnings: "in problems with the vocab further on in the pipeline.") W030 = ("Some entities could not be aligned in the text \"{text}\" with " "entities \"{entities}\". Use " - "`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" + "`spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)`" " to check the alignment. Misaligned entities ('-') will be " "ignored during training.") W033 = ("Training a new {model} using a model with no lexeme normalization " diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 4e58c347e..7b7ddfe0d 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -3,7 +3,7 @@ from spacy.pipeline import Pipe from spacy.matcher import PhraseMatcher, Matcher from spacy.tokens import Doc, Span, DocBin from spacy.training import Example, Corpus -from spacy.training.converters import json2docs +from spacy.training.converters import json_to_docs from spacy.vocab import Vocab from spacy.lang.en import English from spacy.util import minibatch, ensure_path, load_model @@ -425,7 +425,7 @@ def test_issue4402(): attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] with make_tempdir() as tmpdir: output_file = tmpdir / "test4402.spacy" - docs = json2docs([json_data]) + docs = json_to_docs([json_data]) data = DocBin(docs=docs, attrs=attrs).to_bytes() with output_file.open("wb") as file_: file_.write(data) diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index 9454d7f0c..e351858f5 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -1,7 +1,7 @@ import pytest from spacy.tokens import Doc, Span, DocBin from spacy.training import Example -from spacy.training.converters.conllu2docs import conllu2docs +from spacy.training.converters.conllu_to_docs import conllu_to_docs from spacy.lang.en import English from spacy.kb import KnowledgeBase from spacy.vocab import Vocab @@ -82,7 +82,7 @@ def test_issue4651_without_phrase_matcher_attr(): def test_issue4665(): """ - conllu2json should not raise an exception if the HEAD column contains an + conllu_to_docs should not raise an exception if the HEAD column contains an underscore """ input_data = """ @@ -105,7 +105,7 @@ def test_issue4665(): 17 . _ PUNCT . 
_ _ punct _ _ 18 ] _ PUNCT -RRB- _ _ punct _ _ """ - conllu2docs(input_data) + conllu_to_docs(input_data) def test_issue4674(): diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index a9c9d8ca5..7141a11ff 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,7 +1,7 @@ import pytest from click import NoSuchOption -from spacy.training import docs_to_json, biluo_tags_from_offsets -from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs +from spacy.training import docs_to_json, offsets_to_biluo_tags +from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides @@ -14,7 +14,7 @@ import os from .util import make_tempdir -def test_cli_converters_conllu2json(): +def test_cli_converters_conllu_to_json(): # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", @@ -23,7 +23,7 @@ def test_cli_converters_conllu2json(): "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO", ] input_data = "\n".join(lines) - converted_docs = conllu2docs(input_data, n_sents=1) + converted_docs = conllu_to_docs(input_data, n_sents=1) assert len(converted_docs) == 1 converted = [docs_to_json(converted_docs)] assert converted[0]["id"] == 0 @@ -39,7 +39,7 @@ def test_cli_converters_conllu2json(): ent_offsets = [ (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] ] - biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") + biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "B-PER", "L-PER", "O"] @@ -62,9 +62,9 @@ def test_cli_converters_conllu2json(): ), ], ) -def test_cli_converters_conllu2json_name_ner_map(lines): +def test_cli_converters_conllu_to_json_name_ner_map(lines): input_data = "\n".join(lines) - converted_docs = conllu2docs( + converted_docs = conllu_to_docs( input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""} ) assert len(converted_docs) == 1 @@ -83,11 +83,11 @@ def test_cli_converters_conllu2json_name_ner_map(lines): ent_offsets = [ (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] ] - biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") + biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"] -def test_cli_converters_conllu2json_subtokens(): +def test_cli_converters_conllu_to_json_subtokens(): # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", @@ -98,7 +98,7 @@ def test_cli_converters_conllu2json_subtokens(): "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", ] input_data = "\n".join(lines) - converted_docs = conllu2docs( + converted_docs = conllu_to_docs( input_data, n_sents=1, merge_subtokens=True, append_morphology=True ) assert len(converted_docs) == 1 @@ -132,11 +132,11 @@ def test_cli_converters_conllu2json_subtokens(): ent_offsets = [ (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"] ] - biluo_tags = 
biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") + biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O") assert biluo_tags == ["O", "U-PER", "O", "O"] -def test_cli_converters_iob2json(): +def test_cli_converters_iob_to_docs(): lines = [ "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", @@ -144,7 +144,7 @@ def test_cli_converters_iob2json(): "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O", ] input_data = "\n".join(lines) - converted_docs = iob2docs(input_data, n_sents=10) + converted_docs = iob_to_docs(input_data, n_sents=10) assert len(converted_docs) == 1 converted = docs_to_json(converted_docs) assert converted["id"] == 0 @@ -161,7 +161,7 @@ def test_cli_converters_iob2json(): assert ent.text in ["New York City", "London"] -def test_cli_converters_conll_ner2json(): +def test_cli_converters_conll_ner_to_docs(): lines = [ "-DOCSTART- -X- O O", "", @@ -211,7 +211,7 @@ def test_cli_converters_conll_ner2json(): ".\t.\t_\tO", ] input_data = "\n".join(lines) - converted_docs = conll_ner2docs(input_data, n_sents=10) + converted_docs = conll_ner_to_docs(input_data, n_sents=10) assert len(converted_docs) == 1 converted = docs_to_json(converted_docs) assert converted["id"] == 0 diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index a1406c14a..2825f1703 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -2,7 +2,7 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx from spacy.training import Example -from spacy.training.iob_utils import biluo_tags_from_offsets +from spacy.training.iob_utils import offsets_to_biluo_tags from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve from spacy.lang.en import English @@ -186,7 +186,7 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)], ) - entities = biluo_tags_from_offsets(doc, annot["entities"]) + entities = offsets_to_biluo_tags(doc, annot["entities"]) example = Example.from_dict(doc, {"entities": entities}) # a hack for sentence boundaries example.predicted[1].is_sent_start = False @@ -211,7 +211,7 @@ def test_ner_per_type(en_vocab): words=input_.split(" "), ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)], ) - entities = biluo_tags_from_offsets(doc, annot["entities"]) + entities = offsets_to_biluo_tags(doc, annot["entities"]) example = Example.from_dict(doc, {"entities": entities}) # a hack for sentence boundaries example.predicted[1].is_sent_start = False diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 4cab5b015..a04e6aadd 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -1,9 +1,9 @@ import numpy -from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment -from spacy.training import spans_from_biluo_tags, iob_to_biluo +from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment +from spacy.training import biluo_tags_to_spans, iob_to_biluo from spacy.training import Corpus, docs_to_json from spacy.training.example import Example -from spacy.training.converters import json2docs +from spacy.training.converters import json_to_docs from spacy.training.augment import make_orth_variants_example from spacy.lang.en import English 
from spacy.tokens import Doc, DocBin @@ -69,7 +69,7 @@ def test_gold_biluo_U(en_vocab): spaces = [True, True, True, False, True] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to London"), "LOC")] - tags = biluo_tags_from_offsets(doc, entities) + tags = offsets_to_biluo_tags(doc, entities) assert tags == ["O", "O", "O", "U-LOC", "O"] @@ -78,7 +78,7 @@ def test_gold_biluo_BL(en_vocab): spaces = [True, True, True, True, False, True] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")] - tags = biluo_tags_from_offsets(doc, entities) + tags = offsets_to_biluo_tags(doc, entities) assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"] @@ -87,7 +87,7 @@ def test_gold_biluo_BIL(en_vocab): spaces = [True, True, True, True, True, False, True] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - tags = biluo_tags_from_offsets(doc, entities) + tags = offsets_to_biluo_tags(doc, entities) assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] @@ -100,7 +100,7 @@ def test_gold_biluo_overlap(en_vocab): (len("I flew to "), len("I flew to San Francisco"), "LOC"), ] with pytest.raises(ValueError): - biluo_tags_from_offsets(doc, entities) + offsets_to_biluo_tags(doc, entities) def test_gold_biluo_misalign(en_vocab): @@ -109,7 +109,7 @@ def test_gold_biluo_misalign(en_vocab): doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] with pytest.warns(UserWarning): - tags = biluo_tags_from_offsets(doc, entities) + tags = offsets_to_biluo_tags(doc, entities) assert tags == ["O", "O", "O", "-", "-", "-"] @@ -155,7 +155,7 @@ def test_example_from_dict_some_ner(en_vocab): @pytest.mark.filterwarnings("ignore::UserWarning") -def test_json2docs_no_ner(en_vocab): +def test_json_to_docs_no_ner(en_vocab): data = [ { "id": 1, @@ -191,7 +191,7 @@ def test_json2docs_no_ner(en_vocab): ], } ] - docs = json2docs(data) + docs = json_to_docs(data) assert len(docs) == 1 for doc in docs: assert not doc.has_annotation("ENT_IOB") @@ -358,9 +358,9 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer): biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] offsets = [(10, 24, "LOC"), (29, 35, "GPE")] doc = en_tokenizer(text) - biluo_tags_converted = biluo_tags_from_offsets(doc, offsets) + biluo_tags_converted = offsets_to_biluo_tags(doc, offsets) assert biluo_tags_converted == biluo_tags - offsets_converted = offsets_from_biluo_tags(doc, biluo_tags) + offsets_converted = biluo_tags_to_offsets(doc, biluo_tags) offsets_converted = [ent for ent in offsets if ent[2]] assert offsets_converted == offsets @@ -368,7 +368,7 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer): def test_biluo_spans(en_tokenizer): doc = en_tokenizer("I flew to Silicon Valley via London.") biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] - spans = spans_from_biluo_tags(doc, biluo_tags) + spans = biluo_tags_to_spans(doc, biluo_tags) spans = [span for span in spans if span.label_] assert len(spans) == 2 assert spans[0].text == "Silicon Valley" diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 35e67f696..9172dde25 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -2,8 +2,8 @@ from .corpus import Corpus # noqa: F401 from .example import Example, validate_examples # noqa: F401 from .align import Alignment # noqa: 
F401 from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401 -from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags # noqa: F401 -from .iob_utils import spans_from_biluo_tags, tags_to_entities # noqa: F401 +from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401 +from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401 from .gold_io import docs_to_json, read_json_file # noqa: F401 from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401 from .loggers import console_logger, wandb_logger # noqa: F401 diff --git a/spacy/training/converters/__init__.py b/spacy/training/converters/__init__.py index 15f025a08..e91b6aaa6 100644 --- a/spacy/training/converters/__init__.py +++ b/spacy/training/converters/__init__.py @@ -1,4 +1,4 @@ -from .iob2docs import iob2docs # noqa: F401 -from .conll_ner2docs import conll_ner2docs # noqa: F401 -from .json2docs import json2docs # noqa: F401 -from .conllu2docs import conllu2docs # noqa: F401 +from .iob_to_docs import iob_to_docs # noqa: F401 +from .conll_ner_to_docs import conll_ner_to_docs # noqa: F401 +from .json_to_docs import json_to_docs # noqa: F401 +from .conllu_to_docs import conllu_to_docs # noqa: F401 diff --git a/spacy/training/converters/conll_ner2docs.py b/spacy/training/converters/conll_ner_to_docs.py similarity index 99% rename from spacy/training/converters/conll_ner2docs.py rename to spacy/training/converters/conll_ner_to_docs.py index 8dcaf2599..3b851039c 100644 --- a/spacy/training/converters/conll_ner2docs.py +++ b/spacy/training/converters/conll_ner_to_docs.py @@ -7,7 +7,7 @@ from ...tokens import Doc, Span from ...util import load_model -def conll_ner2docs( +def conll_ner_to_docs( input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs ): """ diff --git a/spacy/training/converters/conllu2docs.py b/spacy/training/converters/conllu_to_docs.py similarity index 97% rename from spacy/training/converters/conllu2docs.py rename to spacy/training/converters/conllu_to_docs.py index b4d8b3ac4..18a2b6a93 100644 --- a/spacy/training/converters/conllu2docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -1,13 +1,13 @@ import re -from .conll_ner2docs import n_sents_info -from ...training import iob_to_biluo, spans_from_biluo_tags +from .conll_ner_to_docs import n_sents_info +from ...training import iob_to_biluo, biluo_tags_to_spans from ...tokens import Doc, Token, Span from ...vocab import Vocab from wasabi import Printer -def conllu2docs( +def conllu_to_docs( input_data, n_sents=10, append_morphology=False, @@ -78,7 +78,7 @@ def read_conllx( if lines: while lines[0].startswith("#"): lines.pop(0) - doc = doc_from_conllu_sentence( + doc = conllu_sentence_to_doc( vocab, lines, ner_tag_pattern, @@ -128,7 +128,7 @@ def get_entities(lines, tag_pattern, ner_map=None): return iob_to_biluo(iob) -def doc_from_conllu_sentence( +def conllu_sentence_to_doc( vocab, lines, ner_tag_pattern, @@ -215,7 +215,7 @@ def doc_from_conllu_sentence( doc[i]._.merged_lemma = lemmas[i] doc[i]._.merged_spaceafter = spaces[i] ents = get_entities(lines, ner_tag_pattern, ner_map) - doc.ents = spans_from_biluo_tags(doc, ents) + doc.ents = biluo_tags_to_spans(doc, ents) if merge_subtokens: doc = merge_conllu_subtokens(lines, doc) diff --git a/spacy/training/converters/iob2docs.py b/spacy/training/converters/iob_to_docs.py similarity index 95% rename from spacy/training/converters/iob2docs.py rename to spacy/training/converters/iob_to_docs.py index 2f6742fea..bfd981649 
100644 --- a/spacy/training/converters/iob2docs.py +++ b/spacy/training/converters/iob_to_docs.py @@ -1,13 +1,13 @@ from wasabi import Printer -from .conll_ner2docs import n_sents_info +from .conll_ner_to_docs import n_sents_info from ...vocab import Vocab from ...training import iob_to_biluo, tags_to_entities from ...tokens import Doc, Span from ...util import minibatch -def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs): +def iob_to_docs(input_data, n_sents=10, no_print=False, *args, **kwargs): """ Convert IOB files with one sentence per line and tags separated with '|' into Doc objects so they can be saved. IOB and IOB2 are accepted. diff --git a/spacy/training/converters/json2docs.py b/spacy/training/converters/json_to_docs.py similarity index 82% rename from spacy/training/converters/json2docs.py rename to spacy/training/converters/json_to_docs.py index 342f94848..d7df1d6f9 100644 --- a/spacy/training/converters/json2docs.py +++ b/spacy/training/converters/json_to_docs.py @@ -1,12 +1,12 @@ import srsly from ..gold_io import json_iterate, json_to_annotations -from ..example import annotations2doc +from ..example import annotations_to_doc from ..example import _fix_legacy_dict_data, _parse_example_dict_data from ...util import load_model from ...lang.xx import MultiLanguage -def json2docs(input_data, model=None, **kwargs): +def json_to_docs(input_data, model=None, **kwargs): nlp = load_model(model) if model is not None else MultiLanguage() if not isinstance(input_data, bytes): if not isinstance(input_data, str): @@ -17,6 +17,6 @@ def json2docs(input_data, model=None, **kwargs): for json_para in json_to_annotations(json_doc): example_dict = _fix_legacy_dict_data(json_para) tok_dict, doc_dict = _parse_example_dict_data(example_dict) - doc = annotations2doc(nlp.vocab, tok_dict, doc_dict) + doc = annotations_to_doc(nlp.vocab, tok_dict, doc_dict) docs.append(doc) return docs diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 371b4a06a..fbf05b224 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -7,13 +7,13 @@ from ..tokens.span cimport Span from ..tokens.span import Span from ..attrs import IDS from .align import Alignment -from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc -from .iob_utils import spans_from_biluo_tags +from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags +from .iob_utils import biluo_tags_to_spans from ..errors import Errors, Warnings from ..pipeline._parser_internals import nonproj -cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): +cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot): """ Create a Doc from dictionaries with token and doc annotations. """ attrs, array = _annot2array(vocab, tok_annot, doc_annot) output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) @@ -92,7 +92,7 @@ cdef class Example: tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] return Example( predicted, - annotations2doc(predicted.vocab, tok_dict, doc_dict) + annotations_to_doc(predicted.vocab, tok_dict, doc_dict) ) @property @@ -176,7 +176,7 @@ cdef class Example: return [None] * len(self.x) # should this be 'missing' instead of 'None' ? 
x_ents = self.get_aligned_spans_y2x(self.y.ents) # Default to 'None' for missing values - x_tags = biluo_tags_from_offsets( + x_tags = offsets_to_biluo_tags( self.x, [(e.start_char, e.end_char, e.label_) for e in x_ents], missing=None @@ -195,7 +195,7 @@ cdef class Example: return { "doc_annotation": { "cats": dict(self.reference.cats), - "entities": biluo_tags_from_doc(self.reference), + "entities": doc_to_biluo_tags(self.reference), "links": self._links_to_dict() }, "token_annotation": { @@ -295,12 +295,12 @@ def _add_entities_to_doc(doc, ner_data): elif isinstance(ner_data[0], tuple): return _add_entities_to_doc( doc, - biluo_tags_from_offsets(doc, ner_data) + offsets_to_biluo_tags(doc, ner_data) ) elif isinstance(ner_data[0], str) or ner_data[0] is None: return _add_entities_to_doc( doc, - spans_from_biluo_tags(doc, ner_data) + biluo_tags_to_spans(doc, ner_data) ) elif isinstance(ner_data[0], Span): # Ugh, this is super messy. Really hard to set O entities @@ -388,7 +388,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): # This is annoying but to convert the offsets we need a Doc # that has the target tokenization. reference = Doc(vocab, words=words, spaces=spaces) - biluo = biluo_tags_from_offsets(reference, biluo_or_offsets) + biluo = offsets_to_biluo_tags(reference, biluo_or_offsets) else: biluo = biluo_or_offsets ent_iobs = [] diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx index b58df0d71..524da0a16 100644 --- a/spacy/training/gold_io.pyx +++ b/spacy/training/gold_io.pyx @@ -3,7 +3,7 @@ import srsly from .. import util from ..errors import Warnings from ..tokens import Doc -from .iob_utils import biluo_tags_from_offsets, tags_to_entities +from .iob_utils import offsets_to_biluo_tags, tags_to_entities import json @@ -32,7 +32,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): if ent.kb_id_: link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}} json_para["links"].append(link_dict) - biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag) + biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag) attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB") include_annotation = {attr: doc.has_annotation(attr) for attr in attrs} for j, sent in enumerate(doc.sents): diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index ceb5e16b8..63deed3a5 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -50,15 +50,15 @@ def _consume_ent(tags): return [start] + middle + [end] -def biluo_tags_from_doc(doc, missing="O"): - return biluo_tags_from_offsets( +def doc_to_biluo_tags(doc, missing="O"): + return offsets_to_biluo_tags( doc, [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents], missing=missing, ) -def biluo_tags_from_offsets(doc, entities, missing="O"): +def offsets_to_biluo_tags(doc, entities, missing="O"): """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out scheme (BILUO). @@ -80,7 +80,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): >>> text = 'I like London.' 
>>> entities = [(len('I like '), len('I like London'), 'LOC')] >>> doc = nlp.tokenizer(text) - >>> tags = biluo_tags_from_offsets(doc, entities) + >>> tags = offsets_to_biluo_tags(doc, entities) >>> assert tags == ["O", "O", 'U-LOC', "O"] """ # Ensure no overlapping entity labels exist @@ -143,7 +143,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): return biluo -def spans_from_biluo_tags(doc, tags): +def biluo_tags_to_spans(doc, tags): """Encode per-token tags following the BILUO scheme into Span object, e.g. to overwrite the doc.ents. @@ -161,7 +161,7 @@ def spans_from_biluo_tags(doc, tags): return spans -def offsets_from_biluo_tags(doc, tags): +def biluo_tags_to_offsets(doc, tags): """Encode per-token tags following the BILUO scheme into entity offsets. doc (Doc): The document that the BILUO tags refer to. @@ -172,7 +172,7 @@ def offsets_from_biluo_tags(doc, tags): `end` will be character-offset integers denoting the slice into the original string. """ - spans = spans_from_biluo_tags(doc, tags) + spans = biluo_tags_to_spans(doc, tags) return [(span.start_char, span.end_char, span.label_) for span in spans] diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 3a214428b..e3b3900be 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -275,7 +275,7 @@ $ python -m spacy convert ./data.json ./output.spacy > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token > representing a `PERSON` entity. The -> [`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets) function +> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function > can help you convert entity offsets to the right format. ```python diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 7afe02403..2c082ae0b 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -619,7 +619,7 @@ sequences in the batch. ## Training data and alignment {#gold source="spacy/training"} -### training.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} +### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"} Encode labelled spans into per-token tags, using the [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit, @@ -635,11 +635,11 @@ single-token entity. > #### Example > > ```python -> from spacy.training import biluo_tags_from_offsets +> from spacy.training import offsets_to_biluo_tags > > doc = nlp("I like London.") > entities = [(7, 13, "LOC")] -> tags = biluo_tags_from_offsets(doc, entities) +> tags = offsets_to_biluo_tags(doc, entities) > assert tags == ["O", "O", "U-LOC", "O"] > ``` @@ -649,7 +649,7 @@ single-token entity. | `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ | | **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ | -### training.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} +### training.biluo_tags_to_offsets {#biluo_tags_to_offsets tag="function"} Encode per-token tags following the [BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets. 
@@ -657,11 +657,11 @@ Encode per-token tags following the > #### Example > > ```python -> from spacy.training import offsets_from_biluo_tags +> from spacy.training import biluo_tags_to_offsets > > doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] -> entities = offsets_from_biluo_tags(doc, tags) +> entities = biluo_tags_to_offsets(doc, tags) > assert entities == [(7, 13, "LOC")] > ``` @@ -671,7 +671,7 @@ Encode per-token tags following the | `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ | -### training.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} +### training.biluo_tags_to_spans {#biluo_tags_to_spans tag="function" new="2.1"} Encode per-token tags following the [BILUO scheme](/usage/linguistic-features#accessing-ner) into @@ -681,11 +681,11 @@ token-based tags, e.g. to overwrite the `doc.ents`. > #### Example > > ```python -> from spacy.training import spans_from_biluo_tags +> from spacy.training import biluo_tags_to_spans > > doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] -> doc.ents = spans_from_biluo_tags(doc, tags) +> doc.ents = biluo_tags_to_spans(doc, tags) > ``` | Name | Description | diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 3d756215f..97806dc2a 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1501,7 +1501,7 @@ add those entities to the `doc.ents`, you can wrap it in a custom pipeline component function and pass it the token texts from the `Doc` object received by the component. -The [`training.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very +The [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans) is very helpful here, because it takes a `Doc` object and token-based BILUO tags and returns a sequence of `Span` objects in the `Doc` with added labels. So all your wrapper has to do is compute the entity spans and overwrite the `doc.ents`. @@ -1516,14 +1516,14 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`. ```python ### {highlight="1,8-9"} import your_custom_entity_recognizer -from spacy.training import offsets_from_biluo_tags +from spacy.training import biluo_tags_to_spans from spacy.language import Language @Language.component("custom_ner_wrapper") def custom_ner_wrapper(doc): words = [token.text for token in doc] custom_entities = your_custom_entity_recognizer(words) - doc.ents = spans_from_biluo_tags(doc, custom_entities) + doc.ents = biluo_tags_to_spans(doc, custom_entities) return doc ``` diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 406ba4b75..b3c586fe1 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -968,16 +968,17 @@ python -m spacy package ./output ./packages #### Data utilities and gold module {#migrating-gold} -The `spacy.gold` module has been renamed to `spacy.training`. This mostly +The `spacy.gold` module has been renamed to `spacy.training` and the conversion +utilities now follow the naming format of `x_to_y`. 
This mostly affects internals, but if you've been using the span offset conversion utilities -[`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets), -[`offsets_from_biluo_tags`](/api/top-level#offsets_from_biluo_tags) or -[`spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags), you'll have to -change your imports: +[`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags), +[`biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets) or +[`biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), you'll have to +change your names and imports: ```diff -- from spacy.gold import biluo_tags_from_offsets, spans_from_biluo_tags -+ from spacy.training import biluo_tags_from_offsets, spans_from_biluo_tags +- from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, spans_from_biluo_tags ++ from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, biluo_tags_to_spans ``` #### Migration notes for plugin maintainers {#migrating-plugins} From e1b8090b9bdc880ede79bab5f269e3c352e17183 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 12:01:06 +0200 Subject: [PATCH 092/133] few more fixes --- spacy/tests/test_cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 7141a11ff..99e83eccf 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -14,7 +14,7 @@ import os from .util import make_tempdir -def test_cli_converters_conllu_to_json(): +def test_cli_converters_conllu_to_docs(): # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", @@ -62,7 +62,7 @@ def test_cli_converters_conllu_to_json(): ), ], ) -def test_cli_converters_conllu_to_json_name_ner_map(lines): +def test_cli_converters_conllu_to_docs_name_ner_map(lines): input_data = "\n".join(lines) converted_docs = conllu_to_docs( input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""} @@ -87,7 +87,7 @@ def test_cli_converters_conllu_to_json_name_ner_map(lines): assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"] -def test_cli_converters_conllu_to_json_subtokens(): +def test_cli_converters_conllu_to_docs_subtokens(): # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu lines = [ "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O", From 085a1c8e2b4b3a136025ef693bb6e7537d88729f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 22 Sep 2020 12:06:40 +0200 Subject: [PATCH 093/133] add no_output_layer to TextCatBOW config --- spacy/cli/templates/quickstart_training.jinja | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 0e83b9bdb..a0d9f78ac 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -129,6 +129,7 @@ nO = null @architectures = "spacy.TextCatBOW.v1" exclusive_classes = false ngram_size = 1 +no_output_layer = false {%- endif %} {%- endif %} @@ -243,6 +244,7 @@ nO = null @architectures = "spacy.TextCatBOW.v1" exclusive_classes = false ngram_size = 1 +no_output_layer = false {%- endif %} {%- endif %} {% endif %} From 5e3b796b122fc9b1125f350b5dcda625fd9740f0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 22 Sep 2020 12:24:39 +0200 Subject: [PATCH 094/133] Validate 
section refs in debug config --- spacy/cli/debug_config.py | 27 +++++++++++++++++++++++++-- spacy/tests/test_cli.py | 15 ++++++++++++++- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index 7930d0674..d07a0bb2d 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -2,7 +2,7 @@ from typing import Optional, Dict, Any, Union, List from pathlib import Path from wasabi import msg, table from thinc.api import Config -from thinc.config import VARIABLE_RE +from thinc.config import VARIABLE_RE, ConfigValidationError import typer from ._util import Arg, Opt, show_validation_error, parse_config_overrides @@ -51,7 +51,10 @@ def debug_config( msg.divider("Config validation") with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) - nlp, _ = util.load_model_from_config(config) + nlp, resolved = util.load_model_from_config(config) + # Use the resolved config here in case user has one function returning + # a dict of corpora etc. + check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"]) msg.good("Config is valid") if show_vars: variables = get_variables(config) @@ -93,3 +96,23 @@ def get_variables(config: Config) -> Dict[str, Any]: value = util.dot_to_object(config, path) result[variable] = repr(value) return result + + +def check_section_refs(config: Config, fields: List[str]) -> None: + """Validate fields in the config that refer to other sections or values + (e.g. in the corpora) and make sure that those references exist. + """ + errors = [] + for field in fields: + # If the field doesn't exist in the config, we ignore it + try: + value = util.dot_to_object(config, field) + except KeyError: + continue + try: + util.dot_to_object(config, value) + except KeyError: + msg = f"not a valid section reference: {value}" + errors.append({"loc": field.split("."), "msg": msg}) + if errors: + raise ConfigValidationError(config, errors) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index a9c9d8ca5..1bc246fef 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -7,7 +7,8 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR -from thinc.config import ConfigValidationError +from spacy.cli.debug_config import check_section_refs +from thinc.config import ConfigValidationError, Config import srsly import os @@ -413,3 +414,15 @@ def test_string_to_list(value): def test_string_to_list_intify(value): assert string_to_list(value, intify=False) == ["1", "2", "3"] assert string_to_list(value, intify=True) == [1, 2, 3] + + +def test_check_section_refs(): + config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}} + config = Config(config) + # Valid section reference + check_section_refs(config, ["a.b.c"]) + # Section that doesn't exist in this config + check_section_refs(config, ["x.y.z"]) + # Invalid section reference + with pytest.raises(ConfigValidationError): + check_section_refs(config, ["a.b.c", "f.g"]) From d53c84b6d6717375ee91d2847a3d0f24beafd8d1 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 22 Sep 2020 13:54:44 +0200 Subject: [PATCH 095/133] avoid None callback (#6100) --- spacy/pipeline/tok2vec.py | 2 +- spacy/tests/pipeline/test_tok2vec.py | 19 +++++++++++++++++++ 2 files 
changed, 20 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 721c67a19..9ab4e42b7 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -127,7 +127,7 @@ class Tok2Vec(Pipe): tokvecs = self.model.predict(docs) batch_id = Tok2VecListener.get_batch_id(docs) for listener in self.listeners: - listener.receive(batch_id, tokvecs, None) + listener.receive(batch_id, tokvecs, lambda dX: []) return tokvecs def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None: diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 2e514f490..6041657d3 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -169,3 +169,22 @@ def test_tok2vec_listener(): nlp.select_pipes(disable="tok2vec") assert nlp.pipe_names == ["tagger"] nlp("Running the pipeline with the Tok2Vec component disabled.") + + +def test_tok2vec_listener_callback(): + orig_config = Config().from_str(cfg_string) + nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True) + assert nlp.pipe_names == ["tok2vec", "tagger"] + tagger = nlp.get_pipe("tagger") + tok2vec = nlp.get_pipe("tok2vec") + nlp._link_components() + docs = [nlp.make_doc("A random sentence")] + tok2vec.model.initialize(X=docs) + gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs] + label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")] + tagger.model.initialize(X=docs, Y=label_sample) + docs = [nlp.make_doc("Another entirely random sentence")] + tok2vec.predict(docs) + Y, get_dX = tagger.model.begin_update(docs) + # assure that the backprop call works (and doesn't hit a 'None' callback) + assert get_dX(Y) is not None From e0e793be4d8146768e722c23d16cf7c5b170155e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 22 Sep 2020 21:53:06 +0200 Subject: [PATCH 096/133] fix KB IO (#6118) --- spacy/kb.pxd | 1 - spacy/kb.pyx | 47 ++++++++++++---------- spacy/tests/pipeline/test_entity_linker.py | 23 +++++++++++ 3 files changed, 49 insertions(+), 22 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 695693666..4a71b26a2 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -140,7 +140,6 @@ cdef class KnowledgeBase: self._entries.push_back(entry) self._aliases_table.push_back(alias) - cpdef from_disk(self, loc) cpdef set_entities(self, entity_list, freq_list, vector_list) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index b24ed3a20..ff5382c24 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -9,7 +9,8 @@ from libcpp.vector cimport vector from pathlib import Path import warnings -from os import path + +from spacy import util from .typedefs cimport hash_t from .errors import Errors, Warnings @@ -319,8 +320,14 @@ cdef class KnowledgeBase: return 0.0 - def to_disk(self, loc): - cdef Writer writer = Writer(loc) + def to_disk(self, path): + path = util.ensure_path(path) + if path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + if not path.parent.exists(): + path.parent.mkdir(parents=True) + + cdef Writer writer = Writer(path) writer.write_header(self.get_size_entities(), self.entity_vector_length) # dumping the entity vectors in their original order @@ -359,7 +366,13 @@ cdef class KnowledgeBase: writer.close() - cpdef from_disk(self, loc): + def from_disk(self, path): + path = util.ensure_path(path) + if path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + if not path.exists(): + raise ValueError(Errors.E929.format(loc=path)) + cdef hash_t entity_hash cdef 
hash_t alias_hash cdef int64_t entry_index @@ -369,7 +382,7 @@ cdef class KnowledgeBase: cdef AliasC alias cdef float vector_element - cdef Reader reader = Reader(loc) + cdef Reader reader = Reader(path) # STEP 0: load header and initialize KB cdef int64_t nr_entities @@ -450,16 +463,13 @@ cdef class KnowledgeBase: cdef class Writer: - def __init__(self, object loc): - if isinstance(loc, Path): - loc = bytes(loc) - if path.exists(loc): - if path.isdir(loc): - raise ValueError(Errors.E928.format(loc=loc)) - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + def __init__(self, path): + assert isinstance(path, Path) + content = bytes(path) + cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content self._fp = fopen(bytes_loc, 'wb') if not self._fp: - raise IOError(Errors.E146.format(path=loc)) + raise IOError(Errors.E146.format(path=path)) fseek(self._fp, 0, 0) def close(self): @@ -496,14 +506,9 @@ cdef class Writer: cdef class Reader: - def __init__(self, object loc): - if isinstance(loc, Path): - loc = bytes(loc) - if not path.exists(loc): - raise ValueError(Errors.E929.format(loc=loc)) - if path.isdir(loc): - raise ValueError(Errors.E928.format(loc=loc)) - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + def __init__(self, path): + content = bytes(path) + cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content self._fp = fopen(bytes_loc, 'rb') if not self._fp: PyErr_SetFromErrno(IOError) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index c43d2c58e..88e0646b3 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -144,6 +144,29 @@ def test_kb_empty(nlp): entity_linker.begin_training(lambda: []) +def test_kb_serialize(nlp): + """Test serialization of the KB""" + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + with make_tempdir() as d: + # normal read-write behaviour + mykb.to_disk(d / "kb") + mykb.from_disk(d / "kb") + mykb.to_disk(d / "kb.file") + mykb.from_disk(d / "kb.file") + mykb.to_disk(d / "new" / "kb") + mykb.from_disk(d / "new" / "kb") + # allow overwriting an existing file + mykb.to_disk(d / "kb.file") + with pytest.raises(ValueError): + # can not write to a directory + mykb.to_disk(d) + with pytest.raises(ValueError): + # can not read from a directory + mykb.from_disk(d) + with pytest.raises(ValueError): + # can not read from an unknown file + mykb.from_disk(d / "unknown" / "kb") + def test_candidate_generation(nlp): """Test correct candidate generation""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) From 86a08f819d192e50beff97e1b90c12f0daba2975 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 22 Sep 2020 21:54:52 +0200 Subject: [PATCH 097/133] tok2vec.update instead of predict (#6113) --- spacy/cli/debug_model.py | 2 +- spacy/tests/pipeline/test_tok2vec.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 1d27c7c52..7f8e1dabc 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -128,7 +128,7 @@ def debug_model( goldY = None for e in range(3): if tok2vec: - tok2vec.predict(X) + tok2vec.update([Example.from_dict(x, {}) for x in X]) Y, get_dX = model.begin_update(X) if goldY is None: goldY = _simulate_gold(Y) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 6041657d3..985314217 100644 --- 
a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -184,7 +184,7 @@ def test_tok2vec_listener_callback(): label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")] tagger.model.initialize(X=docs, Y=label_sample) docs = [nlp.make_doc("Another entirely random sentence")] - tok2vec.predict(docs) + tok2vec.update([Example.from_dict(x, {}) for x in docs]) Y, get_dX = tagger.model.begin_update(docs) # assure that the backprop call works (and doesn't hit a 'None' callback) assert get_dX(Y) is not None From 4a56ea72b545ea1162ae85d3b1ccc37f809182ec Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 09:15:07 +0200 Subject: [PATCH 098/133] fallbacks for old names --- spacy/training/iob_utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index 63deed3a5..03a502912 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -50,6 +50,10 @@ def _consume_ent(tags): return [start] + middle + [end] +def biluo_tags_from_doc(doc, missing="O"): + return doc_to_biluo_tags(doc, missing) + + def doc_to_biluo_tags(doc, missing="O"): return offsets_to_biluo_tags( doc, @@ -58,6 +62,10 @@ def doc_to_biluo_tags(doc, missing="O"): ) +def biluo_tags_from_offsets(doc, entities, missing="O"): + return offsets_to_biluo_tags(doc, entities, missing) + + def offsets_to_biluo_tags(doc, entities, missing="O"): """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out scheme (BILUO). @@ -143,6 +151,10 @@ def offsets_to_biluo_tags(doc, entities, missing="O"): return biluo +def spans_from_biluo_tags(doc, tags): + return biluo_tags_to_spans(doc, tags) + + def biluo_tags_to_spans(doc, tags): """Encode per-token tags following the BILUO scheme into Span object, e.g. to overwrite the doc.ents. @@ -161,6 +173,10 @@ def biluo_tags_to_spans(doc, tags): return spans +def offsets_from_biluo_tags(doc, tags): + return biluo_tags_to_offsets(doc, tags) + + def biluo_tags_to_offsets(doc, tags): """Encode per-token tags following the BILUO scheme into entity offsets. 
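The fallback functions added in the patch above are plain aliases for the renamed `x_to_y` helpers, so the old `spacy.gold`-style names and the new `spacy.training` names stay interchangeable. A minimal sketch of what that means in practice — not part of the patch series; the blank English pipeline, example sentence and character offsets are only illustrative choices:

```python
# Sketch only: demonstrates that the backwards-compatible aliases delegate to
# the renamed helpers in spacy/training/iob_utils.py. The text and offsets
# below are illustrative, not taken from the patches.
import spacy
from spacy.training.iob_utils import offsets_to_biluo_tags, biluo_tags_from_offsets

nlp = spacy.blank("en")
doc = nlp("I like London.")
entities = [(7, 13, "LOC")]  # character offsets into the original string

# Old fallback name and new name return the same per-token BILUO tags
assert biluo_tags_from_offsets(doc, entities) == offsets_to_biluo_tags(doc, entities)
print(offsets_to_biluo_tags(doc, entities))  # ['O', 'O', 'U-LOC', 'O']
```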
From 556f3e4652a33eb1465e1f886310653d8e3d2fd2 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 09:24:28 +0200 Subject: [PATCH 099/133] add pooling to NEL's TransformerListener --- spacy/cli/templates/quickstart_training.jinja | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index a0d9f78ac..c55374899 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -107,6 +107,9 @@ nO = null [components.entity_linker.model.tok2vec] @architectures = "spacy-transformers.TransformerListener.v1" grad_factor = 1.0 + +[components.entity_linker.model.tok2vec.pooling] +@layers = "reduce_mean.v1" {% endif -%} {% if "textcat" in components %} From f976bab710dae664501e6fecd7360053a080090e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 09:30:09 +0200 Subject: [PATCH 100/133] Remove empty file [ci skip] --- spacy/lang/cs/test_text.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 spacy/lang/cs/test_text.py diff --git a/spacy/lang/cs/test_text.py b/spacy/lang/cs/test_text.py deleted file mode 100644 index e69de29bb..000000000 From d8f661c9103b6b0a09de5b0e25428782d6736006 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 09:30:26 +0200 Subject: [PATCH 101/133] Update docs [ci skip] --- README.md | 4 +- website/meta/languages.json | 239 +++++++++++++++++------------------- 2 files changed, 113 insertions(+), 130 deletions(-) diff --git a/README.md b/README.md index d23051af0..61cefb69a 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ be used in real products. spaCy comes with [pretrained pipelines](https://spacy.io/models) and vectors, and -currently supports tokenization for **59+ languages**. It features +currently supports tokenization for **60+ languages**. It features state-of-the-art speed, convolutional **neural network models** for tagging, parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management. spaCy is commercial open-source software, released under the MIT license. @@ -69,7 +69,7 @@ it. 
## Features -- Support for **59+ languages** +- Support for **60+ languages** - **Trained pipelines** - Multi-task learning with pretrained **transformers** like BERT - Pretrained **word vectors** diff --git a/website/meta/languages.json b/website/meta/languages.json index 493f96c49..5ef3a6469 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -1,21 +1,11 @@ { "languages": [ - { - "code": "zh", - "name": "Chinese", - "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"], - "dependencies": [ - { - "name": "Jieba", - "url": "https://github.com/fxsjy/jieba" - }, - { - "name": "PKUSeg", - "url": "https://github.com/lancopku/PKUSeg-python" - } - ], - "has_examples": true - }, + { "code": "af", "name": "Afrikaans" }, + { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true }, + { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true }, + { "code": "bn", "name": "Bengali", "has_examples": true }, + { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true }, + { "code": "cs", "name": "Czech", "has_examples": true }, { "code": "da", "name": "Danish", @@ -23,39 +13,10 @@ "has_examples": true, "models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"] }, - { - "code": "nl", - "name": "Dutch", - "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"], - "example": "Dit is een zin.", - "has_examples": true - }, - { - "code": "en", - "name": "English", - "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"], - "starters": [ - "en_vectors_web_lg", - "en_trf_bertbaseuncased_lg", - "en_trf_robertabase_lg", - "en_trf_distilbertbaseuncased_lg", - "en_trf_xlnetbasecased_lg" - ], - "example": "This is a sentence.", - "has_examples": true - }, - { - "code": "fr", - "name": "French", - "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"], - "example": "C'est une phrase.", - "has_examples": true - }, { "code": "de", "name": "German", - "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"], - "starters": ["de_trf_bertbasecased_lg"], + "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"], "example": "Dies ist ein Satz.", "has_examples": true }, @@ -66,6 +27,46 @@ "example": "Αυτή είναι μια πρόταση.", "has_examples": true }, + { + "code": "en", + "name": "English", + "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"], + "starters": ["en_vectors_web_lg"], + "example": "This is a sentence.", + "has_examples": true + }, + { + "code": "es", + "name": "Spanish", + "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"], + "example": "Esto es una frase.", + "has_examples": true + }, + { "code": "et", "name": "Estonian" }, + { "code": "eu", "name": "Basque", "has_examples": true }, + { "code": "fa", "name": "Persian", "has_examples": true }, + { "code": "fi", "name": "Finnish", "has_examples": true }, + { + "code": "fr", + "name": "French", + "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"], + "example": "C'est une phrase.", + "has_examples": true + }, + { "code": "ga", "name": "Irish" }, + { "code": "gu", "name": "Gujarati", "has_examples": true }, + { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true }, + { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true }, + { "code": "hr", "name": "Croatian", "has_examples": true }, + { "code": "hu", "name": 
"Hungarian", "example": "Ez egy mondat.", "has_examples": true }, + { "code": "hy", "name": "Armenian", "has_examples": true }, + { + "code": "id", + "name": "Indonesian", + "example": "Ini adalah sebuah kalimat.", + "has_examples": true + }, + { "code": "is", "name": "Icelandic" }, { "code": "it", "name": "Italian", @@ -88,12 +89,37 @@ "example": "これは文章です。", "has_examples": true }, + { "code": "kn", "name": "Kannada", "has_examples": true }, + { + "code": "ko", + "name": "Korean", + "dependencies": [ + { + "name": "mecab-ko", + "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" + }, + { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" }, + { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" } + ], + "example": "이것은 문장입니다.", + "has_examples": true + }, + { "code": "lb", "name": "Luxembourgish", "has_examples": true }, + { + "code": "lij", + "name": "Ligurian", + "example": "Sta chì a l'é unna fraxe.", + "has_examples": true + }, { "code": "lt", "name": "Lithuanian", "has_examples": true, "models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"] }, + { "code": "lv", "name": "Latvian" }, + { "code": "ml", "name": "Malayalam", "has_examples": true }, + { "code": "mr", "name": "Marathi" }, { "code": "nb", "name": "Norwegian Bokmål", @@ -101,6 +127,14 @@ "has_examples": true, "models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"] }, + { "code": "ne", "name": "Nepali", "has_examples": true }, + { + "code": "nl", + "name": "Dutch", + "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"], + "example": "Dit is een zin.", + "has_examples": true + }, { "code": "pl", "name": "Polish", @@ -122,69 +156,26 @@ "has_examples": true, "models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"] }, - { - "code": "es", - "name": "Spanish", - "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"], - "example": "Esto es una frase.", - "has_examples": true - }, - { "code": "sv", "name": "Swedish", "has_examples": true }, - { "code": "fi", "name": "Finnish", "has_examples": true }, - { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true }, { "code": "ru", "name": "Russian", "has_examples": true, "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] }, - { - "code": "uk", - "name": "Ukrainian", - "has_examples": true, - "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] - }, - { "code": "hr", "name": "Croatian", "has_examples": true }, - { "code": "eu", "name": "Basque", "has_examples": true }, - { "code": "yo", "name": "Yoruba", "has_examples": true }, - { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true }, - { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true }, - { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true }, - { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true }, - { "code": "fa", "name": "Persian", "has_examples": true }, - { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true }, - { "code": "tt", "name": "Tatar", "has_examples": true }, - { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true }, + { "code": "sa", "name": "Sanskrit", "has_examples": true }, { "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true }, - { "code": "ga", "name": "Irish" }, - { "code": "bn", 
"name": "Bengali", "has_examples": true }, - { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true }, - { "code": "mr", "name": "Marathi" }, - { "code": "kn", "name": "Kannada" }, - { "code": "ta", "name": "Tamil", "has_examples": true }, - { - "code": "id", - "name": "Indonesian", - "example": "Ini adalah sebuah kalimat.", - "has_examples": true - }, - { "code": "tl", "name": "Tagalog" }, - { "code": "af", "name": "Afrikaans" }, - { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true }, - { "code": "cs", "name": "Czech" }, - { "code": "is", "name": "Icelandic" }, - { "code": "lv", "name": "Latvian" }, - { "code": "sr", "name": "Serbian" }, - { "code": "sk", "name": "Slovak" }, + { "code": "sk", "name": "Slovak", "has_examples": true }, { "code": "sl", "name": "Slovenian" }, - { "code": "lb", "name": "Luxembourgish" }, { "code": "sq", "name": "Albanian", "example": "Kjo është një fjali.", "has_examples": true }, - { "code": "et", "name": "Estonian" }, + { "code": "sr", "name": "Serbian", "has_examples": true }, + { "code": "sv", "name": "Swedish", "has_examples": true }, + { "code": "ta", "name": "Tamil", "has_examples": true }, + { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true }, { "code": "th", "name": "Thai", @@ -194,51 +185,43 @@ "example": "นี่คือประโยค", "has_examples": true }, + { "code": "tl", "name": "Tagalog" }, + { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true }, + { "code": "tt", "name": "Tatar", "has_examples": true }, { - "code": "ko", - "name": "Korean", - "dependencies": [ - { - "name": "mecab-ko", - "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" - }, - { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" }, - { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" } - ], - "example": "이것은 문장입니다.", - "has_examples": true + "code": "uk", + "name": "Ukrainian", + "has_examples": true, + "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] }, + { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true }, { "code": "vi", "name": "Vietnamese", "dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }] }, - { - "code": "lij", - "name": "Ligurian", - "example": "Sta chì a l'é unna fraxe.", - "has_examples": true - }, - { - "code": "hy", - "name": "Armenian", - "has_examples": true - }, - { - "code": "gu", - "name": "Gujarati", - "has_examples": true - }, - { - "code": "ml", - "name": "Malayalam", - "has_examples": true - }, { "code": "xx", "name": "Multi-language", "models": ["xx_ent_wiki_sm"], "example": "This is a sentence about Facebook." 
+ }, + { "code": "yo", "name": "Yoruba", "has_examples": true }, + { + "code": "zh", + "name": "Chinese", + "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"], + "dependencies": [ + { + "name": "Jieba", + "url": "https://github.com/fxsjy/jieba" + }, + { + "name": "PKUSeg", + "url": "https://github.com/lancopku/PKUSeg-python" + } + ], + "has_examples": true } ], "licenses": [ From 930b116f004bf4413851da6710712a77ae118dbb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 09:35:21 +0200 Subject: [PATCH 102/133] Update docs [ci skip] --- website/docs/usage/v3.md | 5 ++++- website/src/widgets/languages.js | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 406ba4b75..28bd02e3e 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -88,7 +88,10 @@ import Benchmarks from 'usage/\_benchmarks-models.md' - **Architectures: ** [TransformerModel](/api/architectures#TransformerModel), [TransformerListener](/api/architectures#TransformerListener), [Tok2VecTransformer](/api/architectures#Tok2VecTransformer) -- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf) +- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf), + [`de_dep_news_trf`](/models/de#de_dep_news_trf), + [`es_dep_news_trf`](/models/es#es_dep_news_trf), + [`fr_dep_news_trf`](/models/fr#fr_dep_news_trf) - **Implementation:** [`spacy-transformers`](https://github.com/explosion/spacy-transformers) diff --git a/website/src/widgets/languages.js b/website/src/widgets/languages.js index bb26e57cd..74d850182 100644 --- a/website/src/widgets/languages.js +++ b/website/src/widgets/languages.js @@ -22,7 +22,7 @@ const Language = ({ name, code, models }) => ( {models && models.length ? ( - {models.length} {models.length === 1 ? 'model' : 'models'} + {models.length} {models.length === 1 ? 'package' : 'packages'} ) : ( none yet @@ -51,7 +51,7 @@ const Languages = () => ( Language Code Language Data - Models + Pipelines From 566d0487538c547dc40c14a80341c92a73378399 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 09:43:51 +0200 Subject: [PATCH 103/133] Fix project repo link [ci skip] --- website/src/widgets/project.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/src/widgets/project.js b/website/src/widgets/project.js index 8d309394d..9e23d60ea 100644 --- a/website/src/widgets/project.js +++ b/website/src/widgets/project.js @@ -16,7 +16,8 @@ export default function Project({ }) { const repoArg = repo ? ` --repo ${repo}` : '' const text = `${COMMAND} ${id}${repoArg}` - const url = `${repo || projectsRepo}/${id}` + const defaultRepo = `https://github.com/${projectsRepo}` + const url = `${repo || defaultRepo}/${id}` const header = ( <> {title}:{' '} From 61235445db66b66181d76d217c92d2501128f699 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 09:45:32 +0200 Subject: [PATCH 104/133] Update README.md [ci skip] --- README.md | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 61cefb69a..3e5e5febe 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ state-of-the-art speed, convolutional **neural network models** for tagging, parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management. 
spaCy is commercial open-source software, released under the MIT license. -💫 **Version 2.3 out now!** +💫 **Version 3.0 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) @@ -29,16 +29,17 @@ spaCy is commercial open-source software, released under the MIT license. ## 📖 Documentation -| Documentation | | -| --------------- | -------------------------------------------------------------- | -| [spaCy 101] | New to spaCy? Here's everything you need to know! | -| [Usage Guides] | How to use spaCy and its features. | -| [New in v3.0] | New features, backwards incompatibilities and migration guide. | -| [API Reference] | The detailed reference for spaCy's API. | -| [Models] | Download statistical language models for spaCy. | -| [Universe] | Libraries, extensions, demos, books and courses. | -| [Changelog] | Changes and version history. | -| [Contribute] | How to contribute to the spaCy project and code base. | +| Documentation | | +| ------------------- | -------------------------------------------------------------- | +| [spaCy 101] | New to spaCy? Here's everything you need to know! | +| [Usage Guides] | How to use spaCy and its features. | +| [New in v3.0] | New features, backwards incompatibilities and migration guide. | +| [Project Templates] | End-to-end workflows you can clone, modify and run. | +| [API Reference] | The detailed reference for spaCy's API. | +| [Models] | Download statistical language models for spaCy. | +| [Universe] | Libraries, extensions, demos, books and courses. | +| [Changelog] | Changes and version history. | +| [Contribute] | How to contribute to the spaCy project and code base. | [spacy 101]: https://spacy.io/usage/spacy-101 [new in v3.0]: https://spacy.io/usage/v3 @@ -46,6 +47,7 @@ spaCy is commercial open-source software, released under the MIT license. 
[api reference]: https://spacy.io/api/ [models]: https://spacy.io/models [universe]: https://spacy.io/universe +[project templates]: https://github.com/explosion/projects [changelog]: https://spacy.io/usage#changelog [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md From 6ca06cb62cdbcddd1071fcc05871d675704c47a2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 10:14:27 +0200 Subject: [PATCH 105/133] Update docs and formatting [ci skip] --- spacy/cli/templates/quickstart_training.jinja | 2 +- website/docs/api/top-level.md | 19 ++++++++++ website/docs/usage/v3.md | 30 ++++++++------- website/src/components/infobox.js | 37 ++++++++++--------- 4 files changed, 56 insertions(+), 32 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index c55374899..7241c5116 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -327,7 +327,7 @@ sents_f = 0.0 ents_f = {{ (1.0 / components|length)|round(2) }} ents_p = 0.0 ents_r = 0.0 -{%- endif -%} +{%- endif %} {%- if "textcat" in components %} cats_score = {{ (1.0 / components|length)|round(2) }} {%- endif -%} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 2c082ae0b..f36be0806 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -632,6 +632,12 @@ the beginning of a multi-token entity, `I` the inside of an entity of three or more tokens, and `L` the end of an entity of two or more tokens. `U` denotes a single-token entity. + + +This method was previously available as `spacy.gold.biluo_tags_from_offsets`. + + + > #### Example > > ```python @@ -647,6 +653,7 @@ single-token entity. | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. ~~Doc~~ | | `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ | +| `missing` | The label used for missing values, e.g. if tokenization doesn't align with the entity offsets. Defaults to `"O"`. ~~str~~ | | **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ | ### training.biluo_tags_to_offsets {#biluo_tags_to_offsets tag="function"} @@ -654,6 +661,12 @@ single-token entity. Encode per-token tags following the [BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets. + + +This method was previously available as `spacy.gold.offsets_from_biluo_tags`. + + + > #### Example > > ```python @@ -678,6 +691,12 @@ Encode per-token tags following the [`Span`](/api/span) objects. This can be used to create entity spans from token-based tags, e.g. to overwrite the `doc.ents`. + + +This method was previously available as `spacy.gold.spans_from_biluo_tags`. + + + > #### Example > > ```python diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 88935e720..91d97cae2 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -551,17 +551,19 @@ Note that spaCy v3.0 now requires **Python 3.6+**. 
### Removed or renamed API {#incompat-removed} -| Removed | Replacement | -| -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | -| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) | -| `GoldParse` | [`Example`](/api/example) | -| `GoldCorpus` | [`Corpus`](/api/corpus) | -| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | -| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed | -| `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) | -| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | -| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | -| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated | +| Removed | Replacement | +| -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) | +| `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) | +| `GoldParse` | [`Example`](/api/example) | +| `GoldCorpus` | [`Corpus`](/api/corpus) | +| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | +| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed | +| `gold.offsets_from_biluo_tags`, `gold.spans_from_biluo_tags`, `gold.biluo_tags_from_offsets` | [`training.biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets), [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), [`training.offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) | +| `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) | +| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | +| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | +| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated | The following deprecated methods, attributes and arguments were removed in v3.0. Most of them have been **deprecated for a while** and many would previously @@ -971,9 +973,9 @@ python -m spacy package ./output ./packages #### Data utilities and gold module {#migrating-gold} -The `spacy.gold` module has been renamed to `spacy.training` and the conversion -utilities now follow the naming format of `x_to_y`. This mostly -affects internals, but if you've been using the span offset conversion utilities +The `spacy.gold` module has been renamed to `spacy.training` and the conversion +utilities now follow the naming format of `x_to_y`. 
This mostly affects +internals, but if you've been using the span offset conversion utilities [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags), [`biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets) or [`biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), you'll have to diff --git a/website/src/components/infobox.js b/website/src/components/infobox.js index 968b6cea8..b5a7af545 100644 --- a/website/src/components/infobox.js +++ b/website/src/components/infobox.js @@ -20,24 +20,27 @@ export default function Infobox({ [classes.danger]: variant === 'danger', }) return ( - + ) } From ae5dacf75f490c1b64257235cc2e4c93306d226e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 10:14:34 +0200 Subject: [PATCH 106/133] Tidy up and add types --- spacy/training/iob_utils.py | 54 +++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index 03a502912..91fc40205 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -1,9 +1,11 @@ +from typing import List, Tuple, Iterable, Union, Iterator import warnings + from ..errors import Errors, Warnings -from ..tokens import Span +from ..tokens import Span, Doc -def iob_to_biluo(tags): +def iob_to_biluo(tags: Iterable[str]) -> List[str]: out = [] tags = list(tags) while tags: @@ -12,7 +14,7 @@ def iob_to_biluo(tags): return out -def biluo_to_iob(tags): +def biluo_to_iob(tags: Iterable[str]) -> List[str]: out = [] for tag in tags: if tag is None: @@ -23,12 +25,12 @@ def biluo_to_iob(tags): return out -def _consume_os(tags): +def _consume_os(tags: List[str]) -> Iterator[str]: while tags and tags[0] == "O": yield tags.pop(0) -def _consume_ent(tags): +def _consume_ent(tags: List[str]) -> List[str]: if not tags: return [] tag = tags.pop(0) @@ -50,11 +52,7 @@ def _consume_ent(tags): return [start] + middle + [end] -def biluo_tags_from_doc(doc, missing="O"): - return doc_to_biluo_tags(doc, missing) - - -def doc_to_biluo_tags(doc, missing="O"): +def doc_to_biluo_tags(doc: Doc, missing: str = "O"): return offsets_to_biluo_tags( doc, [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents], @@ -62,11 +60,9 @@ def doc_to_biluo_tags(doc, missing="O"): ) -def biluo_tags_from_offsets(doc, entities, missing="O"): - return offsets_to_biluo_tags(doc, entities, missing) - - -def offsets_to_biluo_tags(doc, entities, missing="O"): +def offsets_to_biluo_tags( + doc: Doc, entities: Iterable[Tuple[int, int, Union[str, int]]], missing: str = "O" +) -> List[str]: """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out scheme (BILUO). @@ -77,7 +73,7 @@ def offsets_to_biluo_tags(doc, entities, missing="O"): the original string. RETURNS (list): A list of unicode strings, describing the tags. Each tag string will be of the form either "", "O" or "{action}-{label}", where - action is one of "B", "I", "L", "U". The string "-" is used where the + action is one of "B", "I", "L", "U". The missing label is used where the entity offsets don't align with the tokenization in the `Doc` object. The training algorithm will view these as missing values. "O" denotes a non-entity token. 
"B" denotes the beginning of a multi-token entity, @@ -93,7 +89,6 @@ def offsets_to_biluo_tags(doc, entities, missing="O"): """ # Ensure no overlapping entity labels exist tokens_in_ents = {} - starts = {token.idx: token.i for token in doc} ends = {token.idx + len(token): token.i for token in doc} biluo = ["-" for _ in doc] @@ -117,7 +112,6 @@ def offsets_to_biluo_tags(doc, entities, missing="O"): ) ) tokens_in_ents[token_index] = (start_char, end_char, label) - start_token = starts.get(start_char) end_token = ends.get(end_char) # Only interested if the tokenization is correct @@ -151,11 +145,7 @@ def offsets_to_biluo_tags(doc, entities, missing="O"): return biluo -def spans_from_biluo_tags(doc, tags): - return biluo_tags_to_spans(doc, tags) - - -def biluo_tags_to_spans(doc, tags): +def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]: """Encode per-token tags following the BILUO scheme into Span object, e.g. to overwrite the doc.ents. @@ -173,11 +163,9 @@ def biluo_tags_to_spans(doc, tags): return spans -def offsets_from_biluo_tags(doc, tags): - return biluo_tags_to_offsets(doc, tags) - - -def biluo_tags_to_offsets(doc, tags): +def biluo_tags_to_offsets( + doc: Doc, tags: Iterable[str] +) -> List[Tuple[int, int, Union[str, int]]]: """Encode per-token tags following the BILUO scheme into entity offsets. doc (Doc): The document that the BILUO tags refer to. @@ -192,8 +180,8 @@ def biluo_tags_to_offsets(doc, tags): return [(span.start_char, span.end_char, span.label_) for span in spans] -def tags_to_entities(tags): - """ Note that the end index returned by this function is inclusive. +def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]: + """Note that the end index returned by this function is inclusive. To use it for Span creation, increment the end by 1.""" entities = [] start = None @@ -225,3 +213,9 @@ def tags_to_entities(tags): else: raise ValueError(Errors.E068.format(tag=tag)) return entities + + +# Fallbacks to make backwards-compat easier +offsets_from_biluo_tags = biluo_tags_to_offsets +spans_from_biluo_tags = biluo_tags_to_spans +biluo_tags_from_offsets = offsets_to_biluo_tags From 20b0ec5dcf5b97a3c406ec6bd7aa3f32223c63fa Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 10:37:12 +0200 Subject: [PATCH 107/133] avoid logging performance of frozen components --- spacy/cli/train.py | 6 ++++-- spacy/training/loggers.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index bf3749c9e..811a3ba86 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -152,7 +152,8 @@ def train( exclude=frozen_components, ) msg.info(f"Training. 
Initial learn rate: {optimizer.learn_rate}") - print_row, finalize_logger = train_logger(nlp) + with nlp.select_pipes(disable=[*frozen_components]): + print_row, finalize_logger = train_logger(nlp) try: progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) @@ -163,7 +164,8 @@ def train( progress.close() print_row(info) if is_best_checkpoint and output_path is not None: - update_meta(T_cfg, nlp, info) + with nlp.select_pipes(disable=[*frozen_components]): + update_meta(T_cfg, nlp, info) with nlp.use_params(optimizer.averages): nlp.to_disk(output_path / "model-best") progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 92b598033..dddf20169 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -11,9 +11,11 @@ def console_logger(): def setup_printer( nlp: "Language", ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: + # we assume here that only components are enabled that should be trained & logged + logged_pipes = nlp.pipe_names score_cols = list(nlp.config["training"]["score_weights"]) score_widths = [max(len(col), 6) for col in score_cols] - loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names] + loss_cols = [f"Loss {pipe}" for pipe in logged_pipes] loss_widths = [max(len(col), 8) for col in loss_cols] table_header = ["E", "#"] + loss_cols + score_cols + ["Score"] table_header = [col.upper() for col in table_header] @@ -26,7 +28,7 @@ def console_logger(): try: losses = [ "{0:.2f}".format(float(info["losses"][pipe_name])) - for pipe_name in nlp.pipe_names + for pipe_name in logged_pipes ] except KeyError as e: raise KeyError( From 6435458d517e1ca689d2bcf6f996df59218957bf Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 12:12:38 +0200 Subject: [PATCH 108/133] simplify expression --- spacy/cli/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 811a3ba86..2900ef379 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -152,7 +152,7 @@ def train( exclude=frozen_components, ) msg.info(f"Training. 
Initial learn rate: {optimizer.learn_rate}") - with nlp.select_pipes(disable=[*frozen_components]): + with nlp.select_pipes(disable=frozen_components): print_row, finalize_logger = train_logger(nlp) try: @@ -164,7 +164,7 @@ def train( progress.close() print_row(info) if is_best_checkpoint and output_path is not None: - with nlp.select_pipes(disable=[*frozen_components]): + with nlp.select_pipes(disable=frozen_components): update_meta(T_cfg, nlp, info) with nlp.use_params(optimizer.averages): nlp.to_disk(output_path / "model-best") From 02b69dd0d532fb4c8835868332268e2f6eead511 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 12:56:54 +0200 Subject: [PATCH 109/133] Update models directory [ci skip] --- website/src/templates/models.js | 108 +++++++++++++------------------- 1 file changed, 44 insertions(+), 64 deletions(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 5061972b8..5d705048b 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -12,7 +12,6 @@ import Tag from '../components/tag' import { H2, Label } from '../components/typography' import Icon from '../components/icon' import Link from '../components/link' -import Grid from '../components/grid' import Infobox from '../components/infobox' import Accordion from '../components/accordion' import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util' @@ -31,10 +30,16 @@ const MODEL_META = { wiki: 'Wikipedia', uas: 'Unlabelled dependencies', las: 'Labelled dependencies', + token_acc: 'Tokenization', + tok: 'Tokenization', tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', - ents_f: 'Entities (F-score)', - ents_p: 'Entities (precision)', - ents_r: 'Entities (recall)', + tag: 'Part-of-speech tags (fine grained tags, Token.tag)', + ents_f: 'Named entities (F-score)', + ents_p: 'Named entities (precision)', + ents_r: 'Named entities (recall)', + sent_f: 'Sentence segmentation (F-score)', + sent_p: 'Sentence segmentation (precision)', + sent_r: 'Sentence segmentation (recall)', cpu: 'words per second on CPU', gpu: 'words per second on GPU', pipeline: 'Active processing pipeline components in order', @@ -83,25 +88,19 @@ function formatVectors(data) { } function formatAccuracy(data) { - if (!data) return null - const labels = { - las: 'LAS', - uas: 'UAS', - tags_acc: 'TAG', - ents_f: 'NER F', - ents_p: 'NER P', - ents_r: 'NER R', - } - const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key) - const isNer = key => key.startsWith('ents_') + if (!data) return [] return Object.keys(data) - .filter(key => labels[key]) - .map(key => ({ - label: labels[key], - value: data[key].toFixed(2), - help: MODEL_META[key], - type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, - })) + .map(label => { + const value = data[label] + return isNaN(value) + ? null + : { + label, + value: value.toFixed(2), + help: MODEL_META[label], + } + }) + .filter(item => item) } function formatModelMeta(data) { @@ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl { label: 'Author', content: author }, { label: 'License', content: license }, ] - const accuracy = [ - { - label: 'Syntax Accuracy', - items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null, - }, - { - label: 'NER Accuracy', - items: meta.accuracy ? 
meta.accuracy.filter(a => a.type === 'ner') : null, - }, - ] const error = ( @@ -209,7 +198,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl

    ) - return (

    - - {accuracy && - accuracy.map(({ label, items }, i) => - !items ? null : ( - - - - - - - - {items.map((item, i) => ( - - - - - ))} - -
    {label}
    - - {item.value}
    - ) - )} -
    {meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)} {hasInteractiveCode && ( @@ -288,7 +249,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl `import spacy`, `from spacy.lang.${langId}.examples import sentences `, ``, - `nlp = spacy.load('${name}')`, + `nlp = spacy.load("${name}")`, `doc = nlp(sentences[0])`, `print(doc.text)`, `for token in doc:`, @@ -296,6 +257,25 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl ].join('\n')} )} + {meta.accuracy && ( + + + + {meta.accuracy.map(({ label, value, help }) => ( + + + + + + ))} + +
    + {label.toUpperCase()} + {help} + {value} +
    +
    + )} {labels && (

    @@ -313,7 +293,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl const labelNames = labels[pipe] || [] const help = LABEL_SCHEME_META[pipe] return ( - + }

    - + ) } From 7745d77a38a131f6ffec9b4ae43da8ef799c228e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 13:21:42 +0200 Subject: [PATCH 111/133] Fix whitespace in template [ci skip] --- spacy/cli/templates/quickstart_training.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 7241c5116..53fd99ee8 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -280,7 +280,7 @@ vectors = "{{ word_vectors }}" {% endif -%} {% if use_transformer -%} accumulate_gradient = {{ transformer["size_factor"] }} -{% endif %} +{% endif -%} dev_corpus = "corpora.dev" train_corpus = "corpora.train" From 6c85fab3167a468953b23b25d4a25a7fbdb478cd Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 13:35:09 +0200 Subject: [PATCH 112/133] state_type and extra_state_tokens instead of nr_feature_tokens --- spacy/cli/templates/quickstart_training.jinja | 12 ++++--- spacy/ml/models/parser.py | 31 +++++++++---------- spacy/pipeline/dep_parser.pyx | 3 +- spacy/pipeline/ner.pyx | 3 +- .../tests/serialize/test_serialize_config.py | 9 ++++-- website/docs/api/architectures.md | 22 +++++++------ website/docs/usage/embeddings-transformers.md | 3 +- 7 files changed, 48 insertions(+), 35 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 7241c5116..9dde2237b 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -59,7 +59,8 @@ factory = "parser" [components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 8 +state_type = "deps" +extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 use_upper = false @@ -79,7 +80,8 @@ factory = "ner" [components.ner.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 3 +state_type = "ner" +extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 use_upper = false @@ -183,7 +185,8 @@ factory = "parser" [components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 8 +state_type = "deps" +extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 use_upper = true @@ -200,7 +203,8 @@ factory = "ner" [components.ner.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 6 +state_type = "ner" +extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 use_upper = true diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 868f9d6d2..0e10932d5 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -11,7 +11,8 @@ from ...tokens import Doc @registry.architectures.register("spacy.TransitionBasedParser.v1") def build_tb_parser_model( tok2vec: Model[List[Doc], List[Floats2d]], - nr_feature_tokens: int, + state_type: str, + extra_state_tokens: bool, hidden_width: int, maxout_pieces: int, use_upper: bool = True, @@ -40,20 +41,12 @@ def build_tb_parser_model( tok2vec (Model[List[Doc], List[Floats2d]]): Subnetwork to map tokens into vector representations. - nr_feature_tokens (int): The number of tokens in the context to use to - construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The - 2, 8 and 13 feature sets are designed for the parser, while the 3 and 6 - feature sets are designed for the NER. The recommended feature sets are - 3 for NER, and 8 for the dependency parser. 
- - TODO: This feature should be split into two, state_type: ["deps", "ner"] - and extra_state_features: [True, False]. This would map into: - - (deps, False): 8 - (deps, True): 13 - (ner, False): 3 - (ner, True): 6 - + state_type (str): + String value denoting the type of parser model: "deps" or "ner" + extra_state_tokens (bool): Whether or not to use additional tokens in the context + to construct the state vector. Defaults to `False`, which means 3 and 8 + for the NER and parser respectively. When set to `True`, this would become 6 + feature sets (for the NER) or 13 (for the parser). hidden_width (int): The width of the hidden layer. maxout_pieces (int): How many pieces to use in the state prediction layer. Recommended values are 1, 2 or 3. If 1, the maxout non-linearity @@ -68,8 +61,14 @@ def build_tb_parser_model( Usually inferred from data at the beginning of training, or loaded from disk. """ + if state_type == "deps": + nr_feature_tokens = 13 if extra_state_tokens else 8 + elif state_type == "ner": + nr_feature_tokens = 6 if extra_state_tokens else 3 + else: + raise ValueError(f"unknown state type {state_type}") # TODO error t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),) + tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) tok2vec.set_dim("nO", hidden_width) lower = PrecomputableAffine( nO=hidden_width if use_upper else nO, diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index edd791e40..7d8c63815 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -15,7 +15,8 @@ from ..training import validate_examples default_model_config = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 8 +state_type = "deps" +extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 2fa5c6392..fc4f03473 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -13,7 +13,8 @@ from ..training import validate_examples default_model_config = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 6 +state_type = "ner" +extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 1e17b3212..abfd4d725 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -67,7 +67,8 @@ width = ${components.tok2vec.model.width} parser_config_string = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 99 +state_type = "deps" +extra_state_tokens = false hidden_width = 66 maxout_pieces = 2 @@ -95,7 +96,11 @@ def my_parser(): MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2), ) parser = build_tb_parser_model( - tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5 + tok2vec=tok2vec, + state_type="deps", + extra_state_tokens=True, + hidden_width=65, + maxout_pieces=5, ) return parser diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 30d863b17..0d283d805 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -414,7 +414,8 @@ one component. 
> ```ini > [model] > @architectures = "spacy.TransitionBasedParser.v1" -> nr_feature_tokens = 6 +> state_type = "ner" +> extra_state_tokens = false > hidden_width = 64 > maxout_pieces = 2 > @@ -446,15 +447,16 @@ consists of either two or three subnetworks: state representation. If not present, the output from the lower model is used as action scores directly. -| Name | Description | -| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ | -| `hidden_width` | The width of the hidden layer. ~~int~~ | -| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | -| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ | -| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | +| Name | Description | +| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | +| `state_type` | Which task to extract features for. Possible values are "ner" and "dependencies". ~~str~~ | +| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ | +| `hidden_width` | The width of the hidden layer. ~~int~~ | +| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | +| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. 
It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ | +| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index a855d703c..d61172a5b 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -448,7 +448,8 @@ factory = "ner" [nlp.pipeline.ner.model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 3 +state_type = "ner" +extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 use_upper = false From e4e7f5b00d46b0a6f75e419c509fbd0c73927121 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 15:44:40 +0200 Subject: [PATCH 113/133] Update docs [ci skip] --- website/docs/usage/_benchmarks-models.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index 33163f306..028746db0 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -28,7 +28,7 @@ on training Stanza on this corpus to allow direct comparison. | System | POS | UAS | LAS | | ------------------------------------------------------------------------------ | ---: | ---: | ---: | -| spaCy RoBERTa (2020) | | | | +| spaCy RoBERTa (2020) | 97.8 | 96.6 | 94.7 | | spaCy CNN (2020) | | | | | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 | | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 | @@ -37,7 +37,8 @@ on training Stanza on this corpus to allow direct comparison. **Accuracy on the Penn Treebank.** See [NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more -results. +results. For spaCy's evaluation, see the +[project template](https://github.com/explosion/projects/tree/v3/benchmarks/parsing_penn_treebank). 
From 76bbed3466519d384834715f48f240140c43e02e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 16:00:03 +0200 Subject: [PATCH 114/133] Use Literal type for nr_feature_tokens --- requirements.txt | 1 + setup.cfg | 1 + spacy/compat.py | 5 +++++ spacy/ml/models/parser.py | 3 ++- spacy/tests/serialize/test_serialize_config.py | 14 ++++++++++++-- 5 files changed, 21 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4d6c1dfd0..a8b237aa1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ pytokenizations setuptools packaging importlib_metadata>=0.20; python_version < "3.8" +typing_extensions>=3.7.4; python_version < "3.8" # Development dependencies cython>=0.25 pytest>=4.6.5 diff --git a/setup.cfg b/setup.cfg index dd0975800..9831402d1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,6 +57,7 @@ install_requires = setuptools packaging importlib_metadata>=0.20; python_version < "3.8" + typing_extensions>=3.7.4; python_version < "3.8" [options.entry_points] console_scripts = diff --git a/spacy/compat.py b/spacy/compat.py index 2d51ff0ae..6eca18b80 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -22,6 +22,11 @@ try: except ImportError: cupy = None +try: # Python 3.8+ + from typing import Literal +except ImportError: + from typing_extensions import Literal # noqa: F401 + from thinc.api import Optimizer # noqa: F401 pickle = pickle diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 868f9d6d2..68cc20e9b 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -2,6 +2,7 @@ from typing import Optional, List from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops from thinc.types import Floats2d +from ...compat import Literal from ...util import registry from .._precomputable_affine import PrecomputableAffine from ..tb_framework import TransitionModel @@ -11,7 +12,7 @@ from ...tokens import Doc @registry.architectures.register("spacy.TransitionBasedParser.v1") def build_tb_parser_model( tok2vec: Model[List[Doc], List[Floats2d]], - nr_feature_tokens: int, + nr_feature_tokens: Literal[3, 6, 8, 13], hidden_width: int, maxout_pieces: int, use_upper: bool = True, diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 1e17b3212..5f25cbfe1 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -67,7 +67,7 @@ width = ${components.tok2vec.model.width} parser_config_string = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 99 +nr_feature_tokens = 3 hidden_width = 66 maxout_pieces = 2 @@ -95,7 +95,7 @@ def my_parser(): MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2), ) parser = build_tb_parser_model( - tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5 + tok2vec=tok2vec, nr_feature_tokens=8, hidden_width=65, maxout_pieces=5 ) return parser @@ -340,3 +340,13 @@ def test_config_auto_fill_extra_fields(): assert "extra" not in nlp.config["training"] # Make sure the config generated is valid load_model_from_config(nlp.config) + + +def test_config_validate_literal(): + nlp = English() + config = Config().from_str(parser_config_string) + config["model"]["nr_feature_tokens"] = 666 + with pytest.raises(ConfigValidationError): + nlp.add_pipe("parser", config=config) + config["model"]["nr_feature_tokens"] = 13 + nlp.add_pipe("parser", config=config) From 50a4425cdaed350653368c9c350f95717e9414d9 Mon Sep 
17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 16:03:32 +0200 Subject: [PATCH 115/133] Adjust docs --- spacy/ml/models/parser.py | 4 ++-- website/docs/api/architectures.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 68cc20e9b..5d091c590 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -42,8 +42,8 @@ def build_tb_parser_model( tok2vec (Model[List[Doc], List[Floats2d]]): Subnetwork to map tokens into vector representations. nr_feature_tokens (int): The number of tokens in the context to use to - construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The - 2, 8 and 13 feature sets are designed for the parser, while the 3 and 6 + construct the state vector. Valid choices are 3, 6, 8 and 13. The + 8 and 13 feature sets are designed for the parser, while the 3 and 6 feature sets are designed for the NER. The recommended feature sets are 3 for NER, and 8 for the dependency parser. diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 30d863b17..8797b2f31 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -449,7 +449,7 @@ consists of either two or three subnetworks: | Name | Description | | ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ | +| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `3`, `6`, `8` and `13`. The `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ | | `hidden_width` | The width of the hidden layer. ~~int~~ | | `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | | `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. 
~~bool~~ | From dd2292793f3bbd7cdfd2cf42bad205ec7428016a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 16:53:49 +0200 Subject: [PATCH 116/133] 'parser' instead of 'deps' for state_type --- spacy/cli/templates/quickstart_training.jinja | 4 ++-- spacy/ml/models/parser.py | 4 ++-- spacy/pipeline/dep_parser.pyx | 2 +- spacy/tests/serialize/test_serialize_config.py | 4 ++-- website/docs/api/architectures.md | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 9dde2237b..bc7e206f5 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -59,7 +59,7 @@ factory = "parser" [components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" -state_type = "deps" +state_type = "parser" extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 @@ -185,7 +185,7 @@ factory = "parser" [components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" -state_type = "deps" +state_type = "parser" extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 0e10932d5..b6e4b8d8a 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -42,7 +42,7 @@ def build_tb_parser_model( tok2vec (Model[List[Doc], List[Floats2d]]): Subnetwork to map tokens into vector representations. state_type (str): - String value denoting the type of parser model: "deps" or "ner" + String value denoting the type of parser model: "parser" or "ner" extra_state_tokens (bool): Whether or not to use additional tokens in the context to construct the state vector. Defaults to `False`, which means 3 and 8 for the NER and parser respectively. When set to `True`, this would become 6 @@ -61,7 +61,7 @@ def build_tb_parser_model( Usually inferred from data at the beginning of training, or loaded from disk. 
""" - if state_type == "deps": + if state_type == "parser": nr_feature_tokens = 13 if extra_state_tokens else 8 elif state_type == "ner": nr_feature_tokens = 6 if extra_state_tokens else 3 diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 7d8c63815..a49475c8e 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -15,7 +15,7 @@ from ..training import validate_examples default_model_config = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -state_type = "deps" +state_type = "parser" extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index abfd4d725..10e0e132b 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -67,7 +67,7 @@ width = ${components.tok2vec.model.width} parser_config_string = """ [model] @architectures = "spacy.TransitionBasedParser.v1" -state_type = "deps" +state_type = "parser" extra_state_tokens = false hidden_width = 66 maxout_pieces = 2 @@ -97,7 +97,7 @@ def my_parser(): ) parser = build_tb_parser_model( tok2vec=tok2vec, - state_type="deps", + state_type="parser", extra_state_tokens=True, hidden_width=65, maxout_pieces=5, diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 0d283d805..ef2666ec0 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -450,7 +450,7 @@ consists of either two or three subnetworks: | Name | Description | | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| `state_type` | Which task to extract features for. Possible values are "ner" and "dependencies". ~~str~~ | +| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ | | `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ | | `hidden_width` | The width of the hidden layer. ~~int~~ | | `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. 
~~int~~ | From 3c3863654e2804223a30c8ed3cae3d2e73147ca6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 16:54:43 +0200 Subject: [PATCH 117/133] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index b57bbeda2..b0cdd562c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a21" +__version__ = "3.0.0a22" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 25b34bba9406a3185406e79e8b0e45048e7f3914 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 16:57:14 +0200 Subject: [PATCH 118/133] throw custom error when state_type is invalid --- spacy/errors.py | 2 ++ spacy/ml/models/parser.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index 153f8da0c..47a134c1f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -480,6 +480,8 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E917 = ("Received invalid value {value} for 'state_type' in " + "TransitionBasedParser: only 'parser' or 'ner' are valid options.") E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid " "values are an instance of spacy.vocab.Vocab or True to create one" " (default).") diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index b6e4b8d8a..dbea6b507 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -2,6 +2,7 @@ from typing import Optional, List from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops from thinc.types import Floats2d +from ... import Errors from ...util import registry from .._precomputable_affine import PrecomputableAffine from ..tb_framework import TransitionModel @@ -66,7 +67,7 @@ def build_tb_parser_model( elif state_type == "ner": nr_feature_tokens = 6 if extra_state_tokens else 3 else: - raise ValueError(f"unknown state type {state_type}") # TODO error + raise ValueError(Errors.E917.format(value=state_type)) t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) tok2vec.set_dim("nO", hidden_width) From 5a9fdbc8ad8e6e03968b78e026b8ee75e4c4a3e1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 17:32:14 +0200 Subject: [PATCH 119/133] state_type as Literal --- spacy/ml/models/parser.py | 5 +++-- spacy/tests/serialize/test_serialize_config.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index dbea6b507..2c40bb3ab 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -2,7 +2,8 @@ from typing import Optional, List from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops from thinc.types import Floats2d -from ... 
import Errors +from ...errors import Errors +from ...compat import Literal from ...util import registry from .._precomputable_affine import PrecomputableAffine from ..tb_framework import TransitionModel @@ -12,7 +13,7 @@ from ...tokens import Doc @registry.architectures.register("spacy.TransitionBasedParser.v1") def build_tb_parser_model( tok2vec: Model[List[Doc], List[Floats2d]], - state_type: str, + state_type: Literal["parser", "ner"], extra_state_tokens: bool, hidden_width: int, maxout_pieces: int, diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 10e0e132b..6aad59272 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -345,3 +345,13 @@ def test_config_auto_fill_extra_fields(): assert "extra" not in nlp.config["training"] # Make sure the config generated is valid load_model_from_config(nlp.config) + + +def test_config_validate_literal(): + nlp = English() + config = Config().from_str(parser_config_string) + config["model"]["state_type"] = "nonsense" + with pytest.raises(ConfigValidationError): + nlp.add_pipe("parser", config=config) + config["model"]["state_type"] = "ner" + nlp.add_pipe("parser", config=config) \ No newline at end of file From b816ace4bbd158524865b7e995da8fa23ee0bc2b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 23 Sep 2020 17:33:13 +0200 Subject: [PATCH 120/133] format --- spacy/tests/serialize/test_serialize_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 6aad59272..ec7544456 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -354,4 +354,4 @@ def test_config_validate_literal(): with pytest.raises(ConfigValidationError): nlp.add_pipe("parser", config=config) config["model"]["state_type"] = "ner" - nlp.add_pipe("parser", config=config) \ No newline at end of file + nlp.add_pipe("parser", config=config) From 3f77eb749c411f78dc21135deb446ad8d5fde76c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 19:50:15 +0200 Subject: [PATCH 121/133] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index b0cdd562c..8d019897b 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a22" +__version__ = "3.0.0a23" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From f25f05c503c83949c9831028e221f3d024358889 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 20:03:04 +0200 Subject: [PATCH 122/133] Adjust sort order [ci skip] --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 93000ea27..025fe5288 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -61,7 +61,7 @@ LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", # Default order of sections in the config.cfg. Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. 
# fmt: off -CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"] +CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"] # fmt: on From c8bda92243b7752ad88be46e071368376704fb2b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 20:05:02 +0200 Subject: [PATCH 123/133] Update benchmarks [ci skip] --- website/docs/usage/_benchmarks-models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index 028746db0..c5ce95e2f 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -28,7 +28,7 @@ on training Stanza on this corpus to allow direct comparison. | System | POS | UAS | LAS | | ------------------------------------------------------------------------------ | ---: | ---: | ---: | -| spaCy RoBERTa (2020) | 97.8 | 96.6 | 94.7 | +| spaCy RoBERTa (2020) | 98.0 | 96.8 | 95.0 | | spaCy CNN (2020) | | | | | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 | | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 | From 02008e9a55ea0d4a3ac41cb2324d89c9f837abcd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 23 Sep 2020 22:02:31 +0200 Subject: [PATCH 124/133] Update docs [ci skip] --- website/docs/usage/_benchmarks-models.md | 40 +++++++++++------------- website/docs/usage/facts-figures.md | 19 +++++++++++ website/src/widgets/landing.js | 2 +- 3 files changed, 39 insertions(+), 22 deletions(-) diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index c5ce95e2f..1fe6e2bff 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -4,21 +4,16 @@ import { Help } from 'components/typography'; import Link from 'components/link'
    -| System | Parser | Tagger | NER | WPS
    CPU words per second on CPU, higher is better | WPS
    GPU words per second on GPU, higher is better | -| ------------------------------------------------------------------------- | ----------------: | ----------------: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: | -| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k | -| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | | -| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | | -| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | _n/a_2 | _n/a_2 | 88.8 | 234 | 2k | -| Flair | - | 97.9 | 89.3 | | | +| System | Parser | Tagger | NER | WPS
    CPU words per second on CPU, higher is better | WPS
    GPU words per second on GPU, higher is better | +| ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: | +| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k | +| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | | +| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
    **Accuracy and speed on the -[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**
    **1. ** -[Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). **2. ** _Coming soon_: -Qi et al. don't report parsing and tagging results on OntoNotes. We're working -on training Stanza on this corpus to allow direct comparison. +[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**
    @@ -26,19 +21,22 @@ on training Stanza on this corpus to allow direct comparison.
    -| System | POS | UAS | LAS | -| ------------------------------------------------------------------------------ | ---: | ---: | ---: | -| spaCy RoBERTa (2020) | 98.0 | 96.8 | 95.0 | -| spaCy CNN (2020) | | | | -| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 | -| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 | +| Named Entity Recognition Model | OntoNotes | CoNLL '03 | +| ------------------------------------------------------------------------------ | --------: | --------- | +| spaCy RoBERTa (2020) | +| spaCy CNN (2020) | | +| spaCy CNN (2017) | 86.4 | +| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | 88.8 | +| Flair2 | 89.7 |
    -**Accuracy on the Penn Treebank.** See -[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more -results. For spaCy's evaluation, see the -[project template](https://github.com/explosion/projects/tree/v3/benchmarks/parsing_penn_treebank). +**Named entity recognition accuracy** on the +[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) and +[CoNLL-2003](https://www.aclweb.org/anthology/W03-0419.pdf) corpora. See +[NLP-progress](http://nlpprogress.com/english/named_entity_recognition.html) for +more results. **1. ** [Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). +**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/)
    diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index 75f92070a..ad6776b2c 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -61,6 +61,25 @@ import Benchmarks from 'usage/\_benchmarks-models.md' +
    + +| System | UAS | LAS | +| ------------------------------------------------------------------------------ | ---: | ---: | +| spaCy RoBERTa (2020) | 96.8 | 95.0 | +| spaCy CNN (2020) | 93.7 | 91.8 | +| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 | +| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.2 | 95.7 | + +
    + +**Accuracy on the Penn Treebank.** See +[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more +results. + +
    + +
    + The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 2e75c893a..6fe7f4cdf 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -297,7 +297,7 @@ const Landing = ({ data }) => { to run.

    - +

    From e2ffe51fb5c18b18397930d976fe323f75d02863 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 10:13:41 +0200 Subject: [PATCH 125/133] Update docs [ci skip] --- website/docs/usage/_benchmarks-models.md | 4 ++-- website/docs/usage/facts-figures.md | 16 +++++----------- website/docs/usage/projects.md | 2 +- website/gatsby-config.js | 1 + 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index 1fe6e2bff..a00229867 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -12,8 +12,8 @@ import { Help } from 'components/typography'; import Link from 'components/link'
    -**Accuracy and speed on the -[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.** +**Full pipeline accuracy and speed** on the +[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.
    diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index ad6776b2c..743dae74d 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -65,28 +65,22 @@ import Benchmarks from 'usage/\_benchmarks-models.md' | System | UAS | LAS | | ------------------------------------------------------------------------------ | ---: | ---: | -| spaCy RoBERTa (2020) | 96.8 | 95.0 | -| spaCy CNN (2020) | 93.7 | 91.8 | +| spaCy RoBERTa (2020)1 | 96.8 | 95.0 | +| spaCy CNN (2020)1 | 93.7 | 91.8 | | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 | | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.2 | 95.7 |
    -**Accuracy on the Penn Treebank.** See +**Dependency parsing accuracy** on the Penn Treebank. See [NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more -results. +results. **1. ** Project template: +[`benchmarks/parsing_penn_treebank`](%%GITHUB_PROJECTS/benchmarks/parsing_penn_treebank).
    - - -The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone -our project template. - - - diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 95e20525a..8e093e8d6 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -213,7 +213,7 @@ a quick web demo. It looks pretty similar to a config file used to define CI pipelines. ```yaml -https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml +%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml ``` | Section | Description | diff --git a/website/gatsby-config.js b/website/gatsby-config.js index 5e3b5b537..c1a2f9ab9 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -24,6 +24,7 @@ const branch = isNightly ? 'develop' : 'master' // Those variables are going to be replaced in the Markdown, e.g. %%GITHUB_SPACY const replacements = { GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`, + GITHUB_PROJECTS: `https://github.com/${site.projectsRepo}`, } /** From ae51f580c1cd8a4168253d326fd9c1356fc88844 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 10:27:33 +0200 Subject: [PATCH 126/133] Fix handling of score_weights --- spacy/cli/templates/quickstart_training.jinja | 18 --------- spacy/cli/train.py | 5 ++- spacy/lang/bn/__init__.py | 1 - spacy/lang/el/__init__.py | 1 - spacy/lang/en/__init__.py | 1 - spacy/lang/fa/__init__.py | 1 - spacy/lang/fr/__init__.py | 1 - spacy/lang/nb/__init__.py | 1 - spacy/lang/nl/__init__.py | 1 - spacy/lang/pl/__init__.py | 1 - spacy/lang/ru/__init__.py | 1 - spacy/lang/sv/__init__.py | 1 - spacy/lang/uk/__init__.py | 1 - spacy/language.py | 20 ++++++---- spacy/pipeline/dep_parser.pyx | 10 ++++- spacy/pipeline/entityruler.py | 8 +++- spacy/pipeline/lemmatizer.py | 1 - spacy/pipeline/morphologizer.pyx | 3 +- spacy/pipeline/ner.pyx | 3 +- spacy/pipeline/sentencizer.pyx | 1 - spacy/pipeline/senter.pyx | 1 - spacy/pipeline/tagger.pyx | 1 - spacy/pipeline/textcat.py | 23 ++++++----- spacy/schemas.py | 2 +- spacy/tests/pipeline/test_pipe_factories.py | 23 ++++++++--- spacy/util.py | 11 ++++++ website/docs/api/language.md | 39 +++++++++---------- website/docs/usage/training.md | 7 ++-- 28 files changed, 95 insertions(+), 92 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index a0ffa8f52..9a8b9d1d7 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -317,21 +317,3 @@ start = 100 stop = 1000 compound = 1.001 {% endif %} - -[training.score_weights] -{%- if "tagger" in components %} -tag_acc = {{ (1.0 / components|length)|round(2) }} -{%- endif -%} -{%- if "parser" in components %} -dep_uas = 0.0 -dep_las = {{ (1.0 / components|length)|round(2) }} -sents_f = 0.0 -{%- endif %} -{%- if "ner" in components %} -ents_f = {{ (1.0 / components|length)|round(2) }} -ents_p = 0.0 -ents_r = 0.0 -{%- endif %} -{%- if "textcat" in components %} -cats_score = {{ (1.0 / components|length)|round(2) }} -{%- endif -%} diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 2900ef379..3485a4ff2 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -209,6 +209,8 @@ def create_train_batches(iterator, batcher, max_epochs: int): def create_evaluation_callback( nlp: Language, dev_corpus: Callable, weights: Dict[str, float] ) -> Callable[[], Tuple[float, Dict[str, float]]]: + weights = {key: value for key, value in weights.items() if value is not 
None} + def evaluate() -> Tuple[float, Dict[str, float]]: dev_examples = list(dev_corpus(nlp)) scores = nlp.evaluate(dev_examples) @@ -368,7 +370,8 @@ def update_meta( ) -> None: nlp.meta["performance"] = {} for metric in training["score_weights"]: - nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0) + if metric is not None: + nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0) for pipe_name in nlp.pipe_names: nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index 270185a4b..923e29a17 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -25,7 +25,6 @@ class Bengali(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 0c5e0672b..1a7b19914 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -30,7 +30,6 @@ class Greek(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 1a595b6e7..bf7e9987f 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -29,7 +29,6 @@ class English(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 244534120..f3a6635dc 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -28,7 +28,6 @@ class Persian(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index 42241cd8a..72e641d1f 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -33,7 +33,6 @@ class French(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 28a2f0bf2..9672dfd6e 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -28,7 +28,6 @@ class Norwegian(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 1526e41f5..15b6b9de2 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -30,7 +30,6 @@ class Dutch(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 7ddad9893..573dbc6f9 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -35,7 +35,6 @@ class Polish(Language): "lemmatizer", 
assigns=["token.lemma"], default_config={"model": None, "mode": "pos_lookup", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index be770e3ec..4a296dd23 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -25,7 +25,6 @@ class Russian(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "pymorphy2", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 6db74cd39..ea314f487 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -31,7 +31,6 @@ class Swedish(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "rule", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index e9936cf7d..006a1cf7f 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -25,7 +25,6 @@ class Ukrainian(Language): "lemmatizer", assigns=["token.lemma"], default_config={"model": None, "mode": "pymorphy2", "lookups": None}, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/language.py b/spacy/language.py index 4dffd9679..0b7deacad 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -248,9 +248,15 @@ class Language: self._config["nlp"]["pipeline"] = list(self.component_names) self._config["nlp"]["disabled"] = list(self.disabled) self._config["components"] = pipeline - if not self._config["training"].get("score_weights"): - combined_score_weights = combine_score_weights(score_weights) - self._config["training"]["score_weights"] = combined_score_weights + # We're merging the existing score weights back into the combined + # weights to make sure we're preserving custom settings in the config + # but also reflect updates (e.g. new components added) + prev_score_weights = self._config["training"].get("score_weights", {}) + combined_score_weights = combine_score_weights(score_weights) + combined_score_weights.update(prev_score_weights) + # Combine the scores a second time to normalize them + combined_score_weights = combine_score_weights([combined_score_weights]) + self._config["training"]["score_weights"] = combined_score_weights if not srsly.is_json_serializable(self._config): raise ValueError(Errors.E961.format(config=self._config)) return self._config @@ -412,7 +418,6 @@ class Language: assigns: Iterable[str] = SimpleFrozenList(), requires: Iterable[str] = SimpleFrozenList(), retokenizes: bool = False, - scores: Iterable[str] = SimpleFrozenList(), default_score_weights: Dict[str, float] = SimpleFrozenDict(), func: Optional[Callable] = None, ) -> Callable: @@ -430,12 +435,11 @@ class Language: e.g. "token.ent_id". Used for pipeline analyis. retokenizes (bool): Whether the component changes the tokenization. Used for pipeline analysis. - scores (Iterable[str]): All scores set by the component if it's trainable, - e.g. ["ents_f", "ents_r", "ents_p"]. default_score_weights (Dict[str, float]): The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to 1.0 per component and - will be combined and normalized for the whole pipeline. + will be combined and normalized for the whole pipeline. 
If None, + the score won't be shown in the logs or be weighted. func (Optional[Callable]): Factory function if not used as a decorator. DOCS: https://nightly.spacy.io/api/language#factory @@ -475,7 +479,7 @@ class Language: default_config=default_config, assigns=validate_attrs(assigns), requires=validate_attrs(requires), - scores=scores, + scores=list(default_score_weights.keys()), default_score_weights=default_score_weights, retokenizes=retokenizes, ) diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index a49475c8e..a447434d2 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -43,8 +43,14 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, }, - scores=["dep_uas", "dep_las", "dep_las_per_type", "sents_p", "sents_r", "sents_f"], - default_score_weights={"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0}, + default_score_weights={ + "dep_uas": 0.5, + "dep_las": 0.5, + "dep_las_per_type": None, + "sents_p": None, + "sents_r": None, + "sents_f": 0.0, + }, ) def make_parser( nlp: Language, diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 24bbb067f..9166a69b8 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -25,8 +25,12 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] "overwrite_ents": False, "ent_id_sep": DEFAULT_ENT_ID_SEP, }, - scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0}, + default_score_weights={ + "ents_f": 1.0, + "ents_p": 0.0, + "ents_r": 0.0, + "ents_per_type": None, + }, ) def make_entity_ruler( nlp: Language, diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 0fd3482c4..c30d09f62 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -21,7 +21,6 @@ from .. import util "lookups": None, "overwrite": False, }, - scores=["lemma_acc"], default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 62ad9e0eb..5fee9a900 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -49,8 +49,7 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] "morphologizer", assigns=["token.morph", "token.pos"], default_config={"model": DEFAULT_MORPH_MODEL}, - scores=["pos_acc", "morph_acc", "morph_per_feat"], - default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5}, + default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( nlp: Language, diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index fc4f03473..c9b0a5031 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -39,8 +39,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "update_with_oracle_cut_size": 100, "model": DEFAULT_NER_MODEL, }, - scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0}, + default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, ) def make_ner( diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 5700c2b98..2882f6f8b 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -15,7 +15,6 @@ from .. 
import util "sentencizer", assigns=["token.is_sent_start", "doc.sents"], default_config={"punct_chars": None}, - scores=["sents_p", "sents_r", "sents_f"], default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_sentencizer( diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index a7eb721fd..da85a9cf2 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -36,7 +36,6 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] "senter", assigns=["token.is_sent_start"], default_config={"model": DEFAULT_SENTER_MODEL}, - scores=["sents_p", "sents_r", "sents_f"], default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_senter(nlp: Language, name: str, model: Model): diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 0d78047ae..3efe29916 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -42,7 +42,6 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] "tagger", assigns=["token.tag"], default_config={"model": DEFAULT_TAGGER_MODEL}, - scores=["tag_acc"], default_score_weights={"tag_acc": 1.0}, ) def make_tagger(nlp: Language, name: str, model: Model): diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index e7cb62a0d..6b8c0ca65 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -62,18 +62,17 @@ subword_features = true "positive_label": None, "model": DEFAULT_TEXTCAT_MODEL, }, - scores=[ - "cats_score", - "cats_score_desc", - "cats_p", - "cats_r", - "cats_f", - "cats_macro_f", - "cats_macro_auc", - "cats_f_per_type", - "cats_macro_auc_per_type", - ], - default_score_weights={"cats_score": 1.0}, + default_score_weights={ + "cats_score": 1.0, + "cats_score_desc": None, + "cats_p": None, + "cats_r": None, + "cats_f": None, + "cats_macro_f": None, + "cats_macro_auc": None, + "cats_f_per_type": None, + "cats_macro_auc_per_type": None, + }, ) def make_textcat( nlp: Language, diff --git a/spacy/schemas.py b/spacy/schemas.py index b0f26dcd7..e34841008 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -211,7 +211,7 @@ class ConfigSchemaTraining(BaseModel): seed: Optional[StrictInt] = Field(..., title="Random seed") gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU") accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps") - score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model") + score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") raw_text: Optional[StrictStr] = Field(default=None, title="Raw text") optimizer: Optimizer = Field(..., title="The optimizer to use") diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 881460704..4ab1c4248 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -359,12 +359,8 @@ def test_language_factories_scores(): func = lambda nlp, name: lambda doc: doc weights1 = {"a1": 0.5, "a2": 0.5} weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1} - Language.factory( - f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func, - ) - Language.factory( - f"{name}2", scores=list(weights2), 
default_score_weights=weights2, func=func, - ) + Language.factory(f"{name}1", default_score_weights=weights1, func=func) + Language.factory(f"{name}2", default_score_weights=weights2, func=func) meta1 = Language.get_factory_meta(f"{name}1") assert meta1.default_score_weights == weights1 meta2 = Language.get_factory_meta(f"{name}2") @@ -376,6 +372,21 @@ def test_language_factories_scores(): cfg = nlp.config["training"] expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05} assert cfg["score_weights"] == expected_weights + # Test with custom defaults + config = nlp.config.copy() + config["training"]["score_weights"]["a1"] = 0.0 + config["training"]["score_weights"]["b3"] = 1.0 + nlp = English.from_config(config) + score_weights = nlp.config["training"]["score_weights"] + expected = {"a1": 0.0, "a2": 0.15, "b1": 0.06, "b2": 0.21, "b3": 0.59} + assert score_weights == expected + # Test with null values + config = nlp.config.copy() + config["training"]["score_weights"]["a1"] = None + nlp = English.from_config(config) + score_weights = nlp.config["training"]["score_weights"] + expected = {"a1": None, "a2": 0.15, "b1": 0.06, "b2": 0.21, "b3": 0.58} # rounding :( + assert score_weights == expected def test_pipe_factories_from_source(): diff --git a/spacy/util.py b/spacy/util.py index 025fe5288..f7c5cff59 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1209,8 +1209,19 @@ def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]: weights (List[dict]): The weights defined by the components. RETURNS (Dict[str, float]): The combined and normalized weights. """ + # We first need to extract all None/null values for score weights that + # shouldn't be shown in the table *or* be weighted result = {} + all_weights = [] for w_dict in weights: + filtered_weights = {} + for key, value in w_dict.items(): + if value is None: + result[key] = None + else: + filtered_weights[key] = value + all_weights.append(filtered_weights) + for w_dict in all_weights: # We need to account for weights that don't sum to 1.0 and normalize # the score weights accordingly, then divide score by the number of # components. diff --git a/website/docs/api/language.md b/website/docs/api/language.md index a7b9c0d88..dd3cc57dd 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -145,17 +145,16 @@ examples, see the > ) > ``` -| Name | Description | -| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `name` | The name of the component factory. ~~str~~ | -| _keyword-only_ | | -| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | -| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | -| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | -| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ | -| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). 
~~Iterable[str]~~ | -| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ | -| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ | +| Name | Description | +| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | The name of the component factory. ~~str~~ | +| _keyword-only_ | | +| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | +| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ | +| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ | +| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ | ## Language.\_\_call\_\_ {#call tag="method"} @@ -1036,12 +1035,12 @@ provided by the [`@Language.component`](/api/language#component) or component is defined and stored on the `Language` class for each component instance and factory instance. -| Name | Description | -| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `factory` | The name of the registered component factory. ~~str~~ | -| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | -| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | -| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~  | -| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~  | -| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | -| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. 
Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ | +| Name | Description | +| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `factory` | The name of the registered component factory. ~~str~~ | +| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | +| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~  | +| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~  | +| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ | +| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Based on the `default_score_weights` and used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index b63145636..65afd0eb4 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -470,6 +470,7 @@ score. ```ini [training.score_weights] dep_las = 0.4 +dep_uas = null ents_f = 0.4 tag_acc = 0.2 token_acc = 0.0 @@ -481,9 +482,9 @@ you generate a config for a given pipeline, the score weights are generated by combining and normalizing the default score weights of the pipeline components. The default score weights are defined by each pipeline component via the `default_score_weights` setting on the -[`@Language.component`](/api/language#component) or -[`@Language.factory`](/api/language#factory). By default, all pipeline -components are weighted equally. +[`@Language.factory`](/api/language#factory) decorator. By default, all pipeline +components are weighted equally. If a score weight is set to `null`, it will be +excluded from the logs and the score won't be weighted. From 17a6b0a1731321380914d3638e7e3bc25fd23a28 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 24 Sep 2020 10:30:42 +0200 Subject: [PATCH 127/133] Make project pull order insensitive (#6131) --- spacy/cli/project/pull.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py index edcd410bd..3119d3a12 100644 --- a/spacy/cli/project/pull.py +++ b/spacy/cli/project/pull.py @@ -27,19 +27,32 @@ def project_pull_cli( def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): + # TODO: We don't have tests for this :(. It would take a bit of mockery to + # set up. I guess see if it breaks first? 
config = load_project_config(project_dir) if remote in config.get("remotes", {}): remote = config["remotes"][remote] storage = RemoteStorage(project_dir, remote) - for cmd in config.get("commands", []): - deps = [project_dir / dep for dep in cmd.get("deps", [])] - if any(not dep.exists() for dep in deps): - continue - cmd_hash = get_command_hash("", "", deps, cmd["script"]) - for output_path in cmd.get("outputs", []): - url = storage.pull(output_path, command_hash=cmd_hash) - yield url, output_path + commands = list(config.get("commands", [])) + # We use a while loop here because we don't know how the commands + # will be ordered. A command might need dependencies from one that's later + # in the list. + while commands: + for i, cmd in enumerate(list(commands)): + deps = [project_dir / dep for dep in cmd.get("deps", [])] + if all(dep.exists() for dep in deps): + cmd_hash = get_command_hash("", "", deps, cmd["script"]) + for output_path in cmd.get("outputs", []): + url = storage.pull(output_path, command_hash=cmd_hash) + yield url, output_path - out_locs = [project_dir / out for out in cmd.get("outputs", [])] - if all(loc.exists() for loc in out_locs): - update_lockfile(project_dir, cmd) + out_locs = [project_dir / out for out in cmd.get("outputs", [])] + if all(loc.exists() for loc in out_locs): + update_lockfile(project_dir, cmd) + # We remove the command from the list here, and break, so that + # we iterate over the loop again. + commands.remove(i) + break + else: + # If we didn't break the for loop, break the while loop. + break From c645c4e7ceddbd819b7a56e56f013bb8447dea4b Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 24 Sep 2020 10:31:17 +0200 Subject: [PATCH 128/133] fix micro PRF for textcat (#6130) * fix micro PRF for textcat * small fix --- spacy/scorer.py | 8 ++++---- spacy/tests/pipeline/test_textcat.py | 29 ++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index da22d59d4..c50de3d43 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -240,7 +240,7 @@ class Scorer: pred_per_feat[field].add((gold_i, feat)) for field in per_feat: per_feat[field].score_set( - pred_per_feat.get(field, set()), gold_per_feat.get(field, set()), + pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) ) result = {k: v.to_dict() for k, v in per_feat.items()} return {f"{attr}_per_feat": result} @@ -418,9 +418,9 @@ class Scorer: f_per_type[pred_label].fp += 1 micro_prf = PRFScore() for label_prf in f_per_type.values(): - micro_prf.tp = label_prf.tp - micro_prf.fn = label_prf.fn - micro_prf.fp = label_prf.fp + micro_prf.tp += label_prf.tp + micro_prf.fn += label_prf.fn + micro_prf.fp += label_prf.fp n_cats = len(f_per_type) + 1e-100 macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 99b5132ca..232b53e1d 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -8,6 +8,7 @@ from spacy.language import Language from spacy.pipeline import TextCategorizer from spacy.tokens import Doc from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL +from spacy.scorer import Scorer from ..util import make_tempdir from ...cli.train import verify_textcat_config @@ -224,3 +225,31 @@ def test_positive_class_not_binary(): assert textcat.labels == ("SOME", "THING", "POS") with pytest.raises(ValueError): 
verify_textcat_config(nlp, pipe_config) + +def test_textcat_evaluation(): + train_examples = [] + nlp = English() + ref1 = nlp("one") + ref1.cats = {"winter": 1.0, "summer": 1.0, "spring": 1.0, "autumn": 1.0} + pred1 = nlp("one") + pred1.cats = {"winter": 1.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0} + train_examples.append(Example(pred1, ref1)) + + ref2 = nlp("two") + ref2.cats = {"winter": 0.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0} + pred2 = nlp("two") + pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0} + train_examples.append(Example(pred2, ref2)) + + scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"]) + assert scores["cats_f_per_type"]["winter"]["p"] == 1/2 + assert scores["cats_f_per_type"]["winter"]["r"] == 1/1 + assert scores["cats_f_per_type"]["summer"]["p"] == 0 + assert scores["cats_f_per_type"]["summer"]["r"] == 0/1 + assert scores["cats_f_per_type"]["spring"]["p"] == 1/1 + assert scores["cats_f_per_type"]["spring"]["r"] == 1/2 + assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2 + assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2 + + assert scores["cats_micro_p"] == 4/5 + assert scores["cats_micro_r"] == 4/6 From 4bbe41f017ffc6334a35f2a682804cf6365dfd9e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 10:42:47 +0200 Subject: [PATCH 129/133] Fix combined scores and update test --- spacy/language.py | 7 ++----- spacy/tests/pipeline/test_pipe_factories.py | 4 ++-- spacy/util.py | 10 ++++++++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 0b7deacad..a52391419 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -251,11 +251,8 @@ class Language: # We're merging the existing score weights back into the combined # weights to make sure we're preserving custom settings in the config # but also reflect updates (e.g. 
new components added) - prev_score_weights = self._config["training"].get("score_weights", {}) - combined_score_weights = combine_score_weights(score_weights) - combined_score_weights.update(prev_score_weights) - # Combine the scores a second time to normalize them - combined_score_weights = combine_score_weights([combined_score_weights]) + prev_weights = self._config["training"].get("score_weights", {}) + combined_score_weights = combine_score_weights(score_weights, prev_weights) self._config["training"]["score_weights"] = combined_score_weights if not srsly.is_json_serializable(self._config): raise ValueError(Errors.E961.format(config=self._config)) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 4ab1c4248..4c197005e 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -378,14 +378,14 @@ def test_language_factories_scores(): config["training"]["score_weights"]["b3"] = 1.0 nlp = English.from_config(config) score_weights = nlp.config["training"]["score_weights"] - expected = {"a1": 0.0, "a2": 0.15, "b1": 0.06, "b2": 0.21, "b3": 0.59} + expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34} assert score_weights == expected # Test with null values config = nlp.config.copy() config["training"]["score_weights"]["a1"] = None nlp = English.from_config(config) score_weights = nlp.config["training"]["score_weights"] - expected = {"a1": None, "a2": 0.15, "b1": 0.06, "b2": 0.21, "b3": 0.58} # rounding :( + expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35} assert score_weights == expected diff --git a/spacy/util.py b/spacy/util.py index f7c5cff59..709da8d29 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1202,11 +1202,16 @@ def get_arg_names(func: Callable) -> List[str]: return list(set([*argspec.args, *argspec.kwonlyargs])) -def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]: +def combine_score_weights( + weights: List[Dict[str, float]], + overrides: Dict[str, Optional[Union[float, int]]] = SimpleFrozenDict(), +) -> Dict[str, float]: """Combine and normalize score weights defined by components, e.g. {"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}. weights (List[dict]): The weights defined by the components. + overrides (Dict[str, Optional[Union[float, int]]]): Existing scores that + should be preserved. RETURNS (Dict[str, float]): The combined and normalized weights. """ # We first need to extract all None/null values for score weights that @@ -1216,6 +1221,7 @@ def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]: for w_dict in weights: filtered_weights = {} for key, value in w_dict.items(): + value = overrides.get(key, value) if value is None: result[key] = None else: @@ -1227,7 +1233,7 @@ def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]: # components. 
total = sum(w_dict.values()) for key, value in w_dict.items(): - weight = round(value / total / len(weights), 2) + weight = round(value / total / len(all_weights), 2) result[key] = result.get(key, 0.0) + weight return result From 4eb39b5c43c74f8eabc1b2a8fa3b68e8baa02d3a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 11:04:35 +0200 Subject: [PATCH 130/133] Fix logging --- spacy/errors.py | 1 + spacy/training/loggers.py | 16 +++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 47a134c1f..ee2091225 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -480,6 +480,7 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})") E917 = ("Received invalid value {value} for 'state_type' in " "TransitionBasedParser: only 'parser' or 'ner' are valid options.") E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid " diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index dddf20169..d35b5a4bd 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -13,7 +13,8 @@ def console_logger(): ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: # we assume here that only components are enabled that should be trained & logged logged_pipes = nlp.pipe_names - score_cols = list(nlp.config["training"]["score_weights"]) + score_weights = nlp.config["training"]["score_weights"] + score_cols = [col for col, value in score_weights.items() if value is not None] score_widths = [max(len(col), 6) for col in score_cols] loss_cols = [f"Loss {pipe}" for pipe in logged_pipes] loss_widths = [max(len(col), 8) for col in loss_cols] @@ -40,10 +41,15 @@ def console_logger(): ) from None scores = [] for col in score_cols: - score = float(info["other_scores"].get(col, 0.0)) - if col != "speed": - score *= 100 - scores.append("{0:.2f}".format(score)) + score = info["other_scores"].get(col, 0.0) + try: + score = float(score) + if col != "speed": + score *= 100 + scores.append("{0:.2f}".format(score)) + except TypeError: + err = Errors.E916.format(name=col, score_type=type(score)) + raise TypeError(err) from None data = ( [info["epoch"], info["step"]] + losses From f69fea8b252ac5f28c4daac40046df507ab6f07f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 11:29:07 +0200 Subject: [PATCH 131/133] Improve error handling around non-number scores --- spacy/cli/train.py | 7 ++++++- spacy/errors.py | 4 ++++ spacy/training/loggers.py | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 3485a4ff2..eabc82be0 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -214,7 +214,12 @@ def create_evaluation_callback( def evaluate() -> Tuple[float, Dict[str, float]]: dev_examples = list(dev_corpus(nlp)) scores = nlp.evaluate(dev_examples) - # Calculate a weighted sum based on score_weights for the main score + # Calculate a weighted sum based on score_weights for the main score. + # We can only consider scores that are ints/floats, not dicts like + # entity scores per type etc. 
From f69fea8b252ac5f28c4daac40046df507ab6f07f Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 24 Sep 2020 11:29:07 +0200
Subject: [PATCH 131/133] Improve error handling around non-number scores

---
 spacy/cli/train.py        | 7 ++++++-
 spacy/errors.py           | 4 ++++
 spacy/training/loggers.py | 2 +-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 3485a4ff2..eabc82be0 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -214,7 +214,12 @@ def create_evaluation_callback(
     def evaluate() -> Tuple[float, Dict[str, float]]:
         dev_examples = list(dev_corpus(nlp))
         scores = nlp.evaluate(dev_examples)
-        # Calculate a weighted sum based on score_weights for the main score
+        # Calculate a weighted sum based on score_weights for the main score.
+        # We can only consider scores that are ints/floats, not dicts like
+        # entity scores per type etc.
+        for key, value in scores.items():
+            if key in weights and not isinstance(value, (int, float)):
+                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
         try:
             weighted_score = sum(
                 scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
diff --git a/spacy/errors.py b/spacy/errors.py
index ee2091225..dce5cf51c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -480,6 +480,10 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
+            "float or int but got: {score_type}. To exclude the score from the "
+            "final score, set its weight to null in the [training.score_weights] "
+            "section of your training config.")
     E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})")
     E917 = ("Received invalid value {value} for 'state_type' in "
             "TransitionBasedParser: only 'parser' or 'ner' are valid options.")
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index d35b5a4bd..0f054d433 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -49,7 +49,7 @@ def console_logger():
                 scores.append("{0:.2f}".format(score))
             except TypeError:
                 err = Errors.E916.format(name=col, score_type=type(score))
-                raise TypeError(err) from None
+                raise ValueError(err) from None
         data = (
             [info["epoch"], info["step"]]
             + losses
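The intent of PATCH 131 — excluding non-numeric scores such as per-type entity breakdowns from the weighted final score — can be sketched in isolation. This is an illustrative stand-in, not spaCy's `create_evaluation_callback`; the score names, values and weights below are invented:

```python
from typing import Dict, Optional, Union

Score = Union[float, int, dict]


def weighted_final_score(scores: Dict[str, Score], weights: Dict[str, Optional[float]]) -> float:
    # Dict-valued scores (e.g. a per-type breakdown) can't be folded into a
    # single number, so reject them unless their weight is null.
    for key, value in scores.items():
        if key in weights and weights[key] is not None and not isinstance(value, (int, float)):
            raise ValueError(f"Can't use score '{key}': expected float or int, got {type(value)}")
    return sum(
        scores.get(name, 0.0) * weight
        for name, weight in weights.items()
        if weight is not None
    )


scores = {"ents_f": 0.84, "ents_per_type": {"ORG": {"f": 0.9}}, "speed": 12000}
weights = {"ents_f": 1.0, "ents_per_type": None, "speed": None}
print(weighted_final_score(scores, weights))  # 0.84
```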
From d7ab6a2ffe8e11ee644286ea815bae8cf59bfabb Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 24 Sep 2020 12:37:21 +0200
Subject: [PATCH 132/133] Update docs [ci skip]

---
 website/docs/usage/_benchmarks-models.md | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index a00229867..4b25418b5 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -22,12 +22,13 @@ import { Help } from 'components/typography'; import Link from 'components/link'
 | Named Entity Recognition Model | OntoNotes | CoNLL '03 |
-| ------------------------------------------------------------------------------ | --------: | --------- |
-| spaCy RoBERTa (2020) |
-| spaCy CNN (2020) | |
-| spaCy CNN (2017) | 86.4 |
-| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | 88.8 |
-| Flair2 | 89.7 |
+| ------------------------------------------------------------------------------ | --------: | --------: |
+| spaCy RoBERTa (2020) | | 92.2 |
+| spaCy CNN (2020) | | 88.4 |
+| spaCy CNN (2017) | 86.4 | |
+| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | 88.8 | 92.1 |
+| Flair2 | 89.7 | 93.1 |
+| BERT Base3 | - | 92.4 |
@@ -36,7 +37,8 @@ import { Help } from 'components/typography'; import Link from 'components/link'
 [CoNLL-2003](https://www.aclweb.org/anthology/W03-0419.pdf) corpora. See
 [NLP-progress](http://nlpprogress.com/english/named_entity_recognition.html) for
 more results. **1. ** [Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf).
-**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/)
+**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/). **3.
+** [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805).
From 6836b664330926a401d05f16fe95cf475febff08 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 24 Sep 2020 13:41:25 +0200
Subject: [PATCH 133/133] Update docs and resolve todos [ci skip]

---
 website/docs/usage/_benchmarks-models.md      | 8 ++++----
 website/docs/usage/embeddings-transformers.md | 2 --
 website/docs/usage/facts-figures.md           | 2 +-
 website/docs/usage/linguistic-features.md     | 9 ++++++---
 website/docs/usage/processing-pipelines.md    | 7 +++++--
 website/docs/usage/projects.md                | 5 ++++-
 6 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index 4b25418b5..5b193d3a4 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -1,10 +1,10 @@
 import { Help } from 'components/typography'; import Link from 'components/link'
 
-
+
 
-| System | Parser | Tagger | NER | WPS CPU words per second on CPU, higher is better | WPS GPU words per second on GPU, higher is better |
+| Pipeline | Parser | Tagger | NER | WPS CPU words per second on CPU, higher is better | WPS GPU words per second on GPU, higher is better |
    -| Named Entity Recognition Model | OntoNotes | CoNLL '03 | +| Named Entity Recognition System | OntoNotes | CoNLL '03 | | ------------------------------------------------------------------------------ | --------: | --------: | | spaCy RoBERTa (2020) | | 92.2 | -| spaCy CNN (2020) | | 88.4 | +| spaCy CNN (2020) | 85.3 | 88.4 | | spaCy CNN (2017) | 86.4 | | | [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | 88.8 | 92.1 | | Flair2 | 89.7 | 93.1 | diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index d61172a5b..b00760e62 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -235,8 +235,6 @@ The `Transformer` component sets the [`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, which lets you access the transformers outputs at runtime. - - ```cli $ python -m spacy download en_core_trf_lg ``` diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index 743dae74d..a31559b04 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -63,7 +63,7 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 8e093e8d6..6d5746308 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -733,7 +733,10 @@ workflows, but only one can be tracked by DVC.
 
 The Prodigy integration will require a nightly version of Prodigy that supports
-spaCy v3+.
+spaCy v3+. You can already use annotations created with Prodigy in spaCy v3 by
+exporting your data with
+[`data-to-spacy`](https://prodi.gy/docs/recipes#data-to-spacy) and running
+[`spacy convert`](/api/cli#convert) to convert it to the binary format.
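The projects docs above point to `data-to-spacy` and `spacy convert` for producing training data in spaCy's binary format. As a rough illustration of what that binary format holds, here is a sketch that builds a small `.spacy` file directly with `DocBin` — the example text, entity offsets and output path are invented, and this is not a substitute for the CLI workflow the docs describe:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
# Invented annotations purely for illustration: (text, character-offset entities)
annotations = [
    ("Apple is looking at buying U.K. startup", {"entities": [(0, 5, "ORG")]}),
]
db = DocBin()
for text, annot in annotations:
    doc = nlp.make_doc(text)
    spans = [doc.char_span(start, end, label=label) for start, end, label in annot["entities"]]
    doc.ents = [span for span in spans if span is not None]
    db.add(doc)
db.to_disk("./train.spacy")  # the binary format consumed by `spacy train`
```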