From 35a393106404d8f69d69e6c12d62e21a7d517065 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 16:36:27 +0200 Subject: [PATCH 01/30] fix typo --- spacy/cli/debug_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index a4899a458..58908c5e8 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -54,7 +54,7 @@ def debug_model_cli( config_overrides = parse_config_overrides(ctx.args) with show_validation_error(config_path): config = util.load_config(config_path, overrides=config_overrides) - nlp, config = util.load_model_from_config(config_path) + nlp, config = util.load_model_from_config(config) seed = config["training"]["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") From e4fc7e0222621c40b6d0aa025d3fc0450a672079 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 17 Sep 2020 22:34:36 +0200 Subject: [PATCH 02/30] fixing output sample to proper 2D array --- spacy/cli/debug_model.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 58908c5e8..04a14bdc9 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -60,13 +60,12 @@ def debug_model_cli( msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) pipe = nlp.get_pipe(component) - if hasattr(pipe, "model"): - model = pipe.model - else: + if not hasattr(pipe, "model"): msg.fail( f"The component '{component}' does not specify an object that holds a Model.", exits=1, ) + model = pipe.model debug_model(model, print_settings=print_settings) @@ -87,7 +86,7 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None # STEP 1: Initializing the model and printing again X = _get_docs() - Y = _get_output(model.ops.xp) + Y = _get_output(model.ops) # The output vector might differ from the official type of the output layer with data_validation(False): model.initialize(X=X, Y=Y) @@ -113,9 +112,11 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None msg.divider(f"STEP 3 - prediction") msg.info(str(prediction)) + msg.good(f"Succesfully ended analysis - model looks good!") + def get_gradient(model, Y): - goldY = _get_output(model.ops.xp) + goldY = _get_output(model.ops) return Y - goldY @@ -133,8 +134,14 @@ def _get_docs(lang: str = "en"): return list(nlp.pipe(_sentences())) -def _get_output(xp): - return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32") +def _get_output(ops): + docs = len(_get_docs()) + labels = 6 + output = ops.alloc2f(d0=docs, d1=labels) + for i in range(docs): + for j in range(labels): + output[i, j] = 1 / (i+j+0.01) + return ops.xp.asarray(output) def _print_model(model, print_settings): From 73ff52b9ec9e61ae2d7faeacfef1b7bee53ea10e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 18 Sep 2020 16:43:15 +0200 Subject: [PATCH 03/30] hack for tok2vec listener --- spacy/cli/debug_model.py | 26 +++++++++++++++++--------- spacy/errors.py | 3 ++- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 04a14bdc9..1d8d043fd 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -66,10 +66,12 @@ def debug_model_cli( exits=1, ) model = pipe.model - debug_model(model, print_settings=print_settings) + # call _link_components directly as we won't call nlp.begin_training + nlp._link_components() + debug_model(nlp, model, print_settings=print_settings) 
-def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None): +def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None): if not isinstance(model, Model): msg.fail( f"Requires a Thinc Model to be analysed, but found {type(model)} instead.", @@ -86,10 +88,10 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None # STEP 1: Initializing the model and printing again X = _get_docs() - Y = _get_output(model.ops) + goldY = _get_output(model.ops) # The output vector might differ from the official type of the output layer with data_validation(False): - model.initialize(X=X, Y=Y) + model.initialize(X=X, Y=goldY) if print_settings.get("print_after_init"): msg.divider(f"STEP 1 - after initialization") _print_model(model, print_settings) @@ -97,9 +99,16 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None # STEP 2: Updating the model and printing again optimizer = Adam(0.001) set_dropout_rate(model, 0.2) + # ugly hack to deal with Tok2Vec listeners + tok2vec = None + if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener": + tok2vec = nlp.get_pipe("tok2vec") + tok2vec.model.initialize(X=X) for e in range(3): - Y, get_dX = model.begin_update(_get_docs()) - dY = get_gradient(model, Y) + if tok2vec: + tok2vec.predict(X) + Y, get_dX = model.begin_update(X) + dY = get_gradient(goldY, Y) get_dX(dY) model.finish_update(optimizer) if print_settings.get("print_after_training"): @@ -107,7 +116,7 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None _print_model(model, print_settings) # STEP 3: the final prediction - prediction = model.predict(_get_docs()) + prediction = model.predict(X) if print_settings.get("print_prediction"): msg.divider(f"STEP 3 - prediction") msg.info(str(prediction)) @@ -115,8 +124,7 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None msg.good(f"Succesfully ended analysis - model looks good!") -def get_gradient(model, Y): - goldY = _get_output(model.ops) +def get_gradient(goldY, Y): return Y - goldY diff --git a/spacy/errors.py b/spacy/errors.py index 173aedab9..af307e069 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -545,7 +545,8 @@ class Errors: E949 = ("Can only create an alignment when the texts are the same.") E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") - E954 = ("The Tok2Vec listener did not receive a valid input.") + E954 = ("The Tok2Vec listener did not receive any valid input from an upstream " + "component.") E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.") E956 = ("Can't find component '{name}' in [components] block in the config. 
" "Available components: {opts}") From 6db1d5dc0dff848dded3d2990543f749707afc45 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 19 Sep 2020 19:11:30 +0200 Subject: [PATCH 04/30] trying some stuff --- spacy/cli/debug_model.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 1d8d043fd..09feaf671 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -89,6 +89,7 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = # STEP 1: Initializing the model and printing again X = _get_docs() goldY = _get_output(model.ops) + # _set_output_dim(nO=goldY.shape[-1], model=model) # The output vector might differ from the official type of the output layer with data_validation(False): model.initialize(X=X, Y=goldY) @@ -108,6 +109,7 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = if tok2vec: tok2vec.predict(X) Y, get_dX = model.begin_update(X) + print("get_dX", get_dX) dY = get_gradient(goldY, Y) get_dX(dY) model.finish_update(optimizer) @@ -152,6 +154,10 @@ def _get_output(ops): return ops.xp.asarray(output) +def _get_output_old(xp): + return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32") + + def _print_model(model, print_settings): layers = print_settings.get("layers", "") parameters = print_settings.get("parameters", False) @@ -200,3 +206,12 @@ def _print_matrix(value): sample_matrix = sample_matrix[0:5] result = result + str(sample_matrix) return result + + +def _set_output_dim(model, nO): + # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx + if model.has_dim("nO") is None: + model.set_dim("nO", nO) + if model.has_ref("output_layer"): + if model.get_ref("output_layer").has_dim("nO") is None: + model.get_ref("output_layer").set_dim("nO", nO) \ No newline at end of file From 3aa57ce6c9ab162715cad72563b25f5aecb28966 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 09:07:20 +0200 Subject: [PATCH 05/30] Update alignment mode in Doc.char_span docs --- website/docs/api/doc.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 380f6a172..44316ea1e 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -187,8 +187,8 @@ Remove a previously registered extension. ## Doc.char_span {#char_span tag="method" new="2"} Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns -`None` if the character indices don't map to a valid span using the default mode -`"strict". +`None` if the character indices don't map to a valid span using the default +alignment mode `"strict". > #### Example > @@ -198,15 +198,15 @@ Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns > assert span.text == "New York" > ``` -| Name | Description | -| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~int~~ | -| `label` | A label to attach to the span, e.g. for named entities. 
~~Union[int, str]~~ | -| `kb_id` 2.2 | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | -| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| `mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"inside"` (span of all tokens completely within the character span), `"outside"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | -| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | +| Name | Description | +| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `start` | The index of the first character of the span. ~~int~~ | +| `end` | The index of the last character after the span. ~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| `kb_id` 2.2 | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | +| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | +| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Doc.similarity {#similarity tag="method" model="vectors"} From cc71ec901f26ae1c3bfb62b6bd776295200f418e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 09:08:55 +0200 Subject: [PATCH 06/30] Fix typo in saving and loading usage docs --- website/docs/usage/saving-loading.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 3a95bf6aa..06fb18591 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -299,9 +299,10 @@ installed in the same environment – that's it. When you load a pipeline, spaCy will generally use its `config.cfg` to set up the language class and construct the pipeline. The pipeline is specified as a -list of strings, e.g. `pipeline = ["tagger", "paser", "ner"]`. For each of those -strings, spaCy will call `nlp.add_pipe` and look up the name in all factories -defined by the decorators [`@Language.component`](/api/language#component) and +list of strings, e.g. `pipeline = ["tagger", "parser", "ner"]`. For each of +those strings, spaCy will call `nlp.add_pipe` and look up the name in all +factories defined by the decorators +[`@Language.component`](/api/language#component) and [`@Language.factory`](/api/language#factory). This means that you have to import your custom components _before_ loading the pipeline. 
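To make the passage above concrete, here is a minimal sketch of registering a custom component before loading a pipeline whose config refers to it (the component name and pipeline path are hypothetical):

```python
import spacy
from spacy.language import Language

@Language.component("my_component")  # hypothetical name; must match the config
def my_component(doc):
    # A no-op component: real ones would modify the Doc's annotations.
    return doc

# The decorator above must have run before spacy.load, so that the
# "my_component" entry in the pipeline's config can be resolved.
nlp = spacy.load("/path/to/pipeline")
```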
From 5497acf49aef93a1d6d451da11cc9f3d2841b345 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 11:25:10 +0200 Subject: [PATCH 07/30] Support config overrides via environment variables --- spacy/cli/_util.py | 58 ++++++++++++++++++++++++++++++++--------- spacy/tests/test_cli.py | 16 ++++++++++-- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 040434c05..0159dd473 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -11,9 +11,10 @@ from typer.main import get_command from contextlib import contextmanager from thinc.config import Config, ConfigValidationError from configparser import InterpolationError +import os from ..schemas import ProjectConfigSchema, validate -from ..util import import_file, run_command, make_tempdir, registry +from ..util import import_file, run_command, make_tempdir, registry, logger if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -61,16 +62,38 @@ def setup_cli() -> None: command(prog_name=COMMAND) -def parse_config_overrides(args: List[str]) -> Dict[str, Any]: +def parse_config_env_overrides( + *, prefix: str = "SPACY_CONFIG_", dot: str = "__" +) -> Dict[str, Any]: + """Generate a dictionary of config overrides based on environment variables, + e.g. SPACY_CONFIG_TRAINING__BATCH_SIZE=123 overrides the training.batch_size + setting. + + prefix (str): The env variable prefix for config overrides. + dot (str): String used to represent the "dot", e.g. in training.batch_size. + RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting. + """ + result = {} + for env_key, value in os.environ.items(): + if env_key.startswith(prefix): + opt = env_key[len(prefix) :].lower().replace(dot, ".") + if "." in opt: + result[opt] = try_json_loads(value) + return result + + +def parse_config_overrides(args: List[str], env_vars: bool = True) -> Dict[str, Any]: """Generate a dictionary of config overrides based on the extra arguments provided on the CLI, e.g. --training.batch_size to override "training.batch_size". Arguments without a "." are considered invalid, since the config only allows top-level sections to exist. args (List[str]): The extra arguments from the command line. + env_vars (bool): Include environment variables. RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting. """ - result = {} + env_overrides = parse_config_env_overrides() if env_vars else {} + cli_overrides = {} while args: opt = args.pop(0) err = f"Invalid CLI argument '{opt}'" @@ -87,18 +110,27 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]: value = "true" else: value = args.pop(0) - # Just like we do in the config, we're calling json.loads on the - # values. But since they come from the CLI, it'd be unintuitive to - # explicitly mark strings with escaped quotes. So we're working - # around that here by falling back to a string if parsing fails. - # TODO: improve logic to handle simple types like list of strings? 
- try: - result[opt] = srsly.json_loads(value) - except ValueError: - result[opt] = str(value) + if opt not in env_overrides: + cli_overrides[opt] = try_json_loads(value) else: msg.fail(f"{err}: override option should start with --", exits=1) - return result + if cli_overrides: + logger.debug(f"Config overrides from CLI: {list(cli_overrides)}") + if env_overrides: + logger.debug(f"Config overrides from env variables: {list(env_overrides)}") + return {**cli_overrides, **env_overrides} + + +def try_json_loads(value: Any) -> Any: + # Just like we do in the config, we're calling json.loads on the + # values. But since they come from the CLI, it'd be unintuitive to + # explicitly mark strings with escaped quotes. So we're working + # around that here by falling back to a string if parsing fails. + # TODO: improve logic to handle simple types like list of strings? + try: + return srsly.json_loads(value) + except ValueError: + return str(value) def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 422ae74b4..d81437f18 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,15 +1,15 @@ import pytest from click import NoSuchOption - from spacy.training import docs_to_json, biluo_tags_from_offsets from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables -from spacy.cli._util import string_to_list +from spacy.cli._util import string_to_list, parse_config_env_overrides from thinc.config import ConfigValidationError import srsly +import os from .util import make_tempdir @@ -341,6 +341,18 @@ def test_parse_config_overrides_invalid_2(args): parse_config_overrides(args) +def test_parse_cli_overrides(): + prefix = "SPACY_CONFIG_" + dot = "__" + os.environ[f"{prefix}TRAINING{dot}BATCH_SIZE"] = "123" + os.environ[f"{prefix}FOO{dot}BAR{dot}BAZ"] = "hello" + os.environ[prefix] = "bad" + result = parse_config_env_overrides(prefix=prefix, dot=dot) + assert len(result) == 2 + assert result["training.batch_size"] == 123 + assert result["foo.bar.baz"] == "hello" + + @pytest.mark.parametrize("lang", ["en", "nl"]) @pytest.mark.parametrize( "pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]] From 758ead8a476fa5f5e55c64c3c4bd242c7cb83d1e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 12:50:13 +0200 Subject: [PATCH 08/30] Sync overrides with CLI overrides --- spacy/cli/_util.py | 80 ++++++++++++++++++----------------------- spacy/tests/test_cli.py | 26 ++++++++------ 2 files changed, 50 insertions(+), 56 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 0159dd473..0dd2ee380 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -7,6 +7,7 @@ import srsly import hashlib import typer from click import NoSuchOption +from click.parser import split_arg_string from typer.main import get_command from contextlib import contextmanager from thinc.config import Config, ConfigValidationError @@ -38,6 +39,7 @@ commands to check and validate your config files, training and evaluation data, and custom model implementations. 
""" INIT_HELP = """Commands for initializing configs and pipeline packages.""" +OVERRIDES_ENV_VAR = "SPACY_CONFIG_OVERRIDES" # Wrappers for Typer's annotations. Initially created to set defaults and to # keep the names short, but not needed at the moment. @@ -62,46 +64,41 @@ def setup_cli() -> None: command(prog_name=COMMAND) -def parse_config_env_overrides( - *, prefix: str = "SPACY_CONFIG_", dot: str = "__" +def parse_config_overrides( + args: List[str], env_var: Optional[str] = OVERRIDES_ENV_VAR ) -> Dict[str, Any]: - """Generate a dictionary of config overrides based on environment variables, - e.g. SPACY_CONFIG_TRAINING__BATCH_SIZE=123 overrides the training.batch_size - setting. - - prefix (str): The env variable prefix for config overrides. - dot (str): String used to represent the "dot", e.g. in training.batch_size. - RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting. - """ - result = {} - for env_key, value in os.environ.items(): - if env_key.startswith(prefix): - opt = env_key[len(prefix) :].lower().replace(dot, ".") - if "." in opt: - result[opt] = try_json_loads(value) - return result - - -def parse_config_overrides(args: List[str], env_vars: bool = True) -> Dict[str, Any]: """Generate a dictionary of config overrides based on the extra arguments provided on the CLI, e.g. --training.batch_size to override "training.batch_size". Arguments without a "." are considered invalid, since the config only allows top-level sections to exist. - args (List[str]): The extra arguments from the command line. - env_vars (bool): Include environment variables. + env_vars (Optional[str]): Optional environment variable to read from. RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting. """ - env_overrides = parse_config_env_overrides() if env_vars else {} - cli_overrides = {} + env_string = os.environ.get(env_var, "") if env_var else "" + env_overrides = _parse_overrides(split_arg_string(env_string)) + cli_overrides = _parse_overrides(args, is_cli=True) + if cli_overrides: + keys = [k for k in cli_overrides if k not in env_overrides] + logger.debug(f"Config overrides from CLI: {keys}") + if env_overrides: + logger.debug(f"Config overrides from env variables: {list(env_overrides)}") + return {**cli_overrides, **env_overrides} + + +def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]: + result = {} while args: opt = args.pop(0) - err = f"Invalid CLI argument '{opt}'" + err = f"Invalid config override '{opt}'" if opt.startswith("--"): # new argument orig_opt = opt opt = opt.replace("--", "") if "." not in opt: - raise NoSuchOption(orig_opt) + if is_cli: + raise NoSuchOption(orig_opt) + else: + msg.fail(f"{err}: can't override top-level sections", exits=1) if "=" in opt: # we have --opt=value opt, value = opt.split("=", 1) opt = opt.replace("-", "_") @@ -110,27 +107,18 @@ def parse_config_overrides(args: List[str], env_vars: bool = True) -> Dict[str, value = "true" else: value = args.pop(0) - if opt not in env_overrides: - cli_overrides[opt] = try_json_loads(value) + # Just like we do in the config, we're calling json.loads on the + # values. But since they come from the CLI, it'd be unintuitive to + # explicitly mark strings with escaped quotes. So we're working + # around that here by falling back to a string if parsing fails. + # TODO: improve logic to handle simple types like list of strings? 
+ try: + result[opt] = srsly.json_loads(value) + except ValueError: + result[opt] = str(value) else: - msg.fail(f"{err}: override option should start with --", exits=1) - if cli_overrides: - logger.debug(f"Config overrides from CLI: {list(cli_overrides)}") - if env_overrides: - logger.debug(f"Config overrides from env variables: {list(env_overrides)}") - return {**cli_overrides, **env_overrides} - - -def try_json_loads(value: Any) -> Any: - # Just like we do in the config, we're calling json.loads on the - # values. But since they come from the CLI, it'd be unintuitive to - # explicitly mark strings with escaped quotes. So we're working - # around that here by falling back to a string if parsing fails. - # TODO: improve logic to handle simple types like list of strings? - try: - return srsly.json_loads(value) - except ValueError: - return str(value) + msg.fail(f"{err}: name should start with --", exits=1) + return result def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index d81437f18..a9c9d8ca5 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -6,7 +6,7 @@ from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables -from spacy.cli._util import string_to_list, parse_config_env_overrides +from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR from thinc.config import ConfigValidationError import srsly import os @@ -342,15 +342,21 @@ def test_parse_config_overrides_invalid_2(args): def test_parse_cli_overrides(): - prefix = "SPACY_CONFIG_" - dot = "__" - os.environ[f"{prefix}TRAINING{dot}BATCH_SIZE"] = "123" - os.environ[f"{prefix}FOO{dot}BAR{dot}BAZ"] = "hello" - os.environ[prefix] = "bad" - result = parse_config_env_overrides(prefix=prefix, dot=dot) - assert len(result) == 2 - assert result["training.batch_size"] == 123 - assert result["foo.bar.baz"] == "hello" + os.environ[OVERRIDES_ENV_VAR] = "--x.foo bar --x.bar=12 --x.baz false --y.foo=hello" + result = parse_config_overrides([]) + assert len(result) == 4 + assert result["x.foo"] == "bar" + assert result["x.bar"] == 12 + assert result["x.baz"] is False + assert result["y.foo"] == "hello" + os.environ[OVERRIDES_ENV_VAR] = "--x" + assert parse_config_overrides([], env_var=None) == {} + with pytest.raises(SystemExit): + parse_config_overrides([]) + os.environ[OVERRIDES_ENV_VAR] = "hello world" + with pytest.raises(SystemExit): + parse_config_overrides([]) + del os.environ[OVERRIDES_ENV_VAR] @pytest.mark.parametrize("lang", ["en", "nl"]) From bc02e864943a790cfc7ec991c67d20cc774417df Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 13:01:26 +0200 Subject: [PATCH 09/30] Extend Doc.__init__ with additional annotation Mostly copying from `spacy.tests.util.get_doc`, add additional kwargs to `Doc.__init__` to initialize the most common doc/token values. 
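For illustration, the extended constructor can then be called along these lines (a minimal sketch; the tag and dependency values are arbitrary):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

words = ["She", "likes", "cats"]
# Head indices are absolute token positions, so "likes" (index 1) heads all tokens.
doc = Doc(Vocab(), words=words, tags=["PRP", "VBZ", "NNS"],
          heads=[1, 1, 1], deps=["nsubj", "ROOT", "dobj"])
assert doc[0].head.i == 1
assert doc[1].dep_ == "ROOT"
```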
--- spacy/errors.py | 5 +- spacy/tests/util.py | 60 ++---------------- spacy/tokens/doc.pyx | 77 ++++++++++++++++++++++-- spacy/training/converters/conllu2docs.py | 35 ++++++----- website/docs/api/doc.md | 19 ++++-- 5 files changed, 118 insertions(+), 78 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 81e3616be..f219496a5 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -57,7 +57,10 @@ class Warnings: "incorrect. Modify PhraseMatcher._terminal_hash to fix.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " "the Knowledge Base.") - W026 = ("Unable to set all sentence boundaries from dependency parses.") + W026 = ("Unable to set all sentence boundaries from dependency parses. If " + "you are constructing a parse tree incrementally by setting " + "token.head values, you can probably ignore this warning. Consider " + "using Doc(words, ..., heads=heads, deps=deps) instead.") W027 = ("Found a large training file of {size} bytes. Note that it may " "be more efficient to split your training data into multiple " "smaller JSON files instead.") diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 741753c89..7bc32bf34 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -30,60 +30,12 @@ def get_doc( morphs=None, ): """Create Doc object from given vocab, words and annotations.""" - if deps and not heads: - heads = [0] * len(deps) - headings = [] - values = [] - annotations = [pos, heads, deps, lemmas, tags, morphs] - possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH] - for a, annot in enumerate(annotations): - if annot is not None: - if len(annot) != len(words): - raise ValueError(Errors.E189) - headings.append(possible_headings[a]) - if annot is not heads: - values.extend(annot) - for value in values: - vocab.strings.add(value) - - doc = Doc(vocab, words=words) - - # if there are any other annotations, set them - if headings: - attrs = doc.to_array(headings) - - j = 0 - for annot in annotations: - if annot: - if annot is heads: - for i in range(len(words)): - if attrs.ndim == 1: - attrs[i] = heads[i] - else: - attrs[i, j] = heads[i] - elif annot is morphs: - for i in range(len(words)): - morph_key = vocab.morphology.add(morphs[i]) - if attrs.ndim == 1: - attrs[i] = morph_key - else: - attrs[i, j] = morph_key - else: - for i in range(len(words)): - if attrs.ndim == 1: - attrs[i] = doc.vocab.strings[annot[i]] - else: - attrs[i, j] = doc.vocab.strings[annot[i]] - j += 1 - doc.from_array(headings, attrs) - - # finally, set the entities - if ents: - doc.ents = [ - Span(doc, start, end, label=doc.vocab.strings[label]) - for start, end, label in ents - ] - return doc + if heads is not None: + heads = [i + head for i, head in enumerate(heads)] + if ents is not None: + ents = [(vocab.strings[ent_type], start, end) for start, end, ent_type in ents] + return Doc(vocab, words=words, pos=pos, heads=heads, deps=deps, tags=tags, + ents=ents, lemmas=lemmas, morphs=morphs) def get_batch(batch_size): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2d9de278b..de7e0f862 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -158,17 +158,38 @@ cdef class Doc: raise ValueError(Errors.E046.format(name=name)) return Underscore.doc_extensions.pop(name) - def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None): + def __init__( + self, + Vocab vocab, + words=None, + spaces=None, + user_data=None, + *, + tags=None, + pos=None, + morphs=None, + lemmas=None, + heads=None, + deps=None, + ents=None, + ): """Create a Doc 
object. vocab (Vocab): A vocabulary object, which must match any models you want to use (e.g. tokenizer, parser, entity recognizer). - words (list or None): A list of unicode strings to add to the document + words (Optional[List[str]]): A list of unicode strings to add to the document as words. If `None`, defaults to empty list. - spaces (list or None): A list of boolean values, of the same length as + spaces (Optional[List[bool]]): A list of boolean values, of the same length as words. True means that the word is followed by a space, False means it is not. If `None`, defaults to `[True]*len(words)` user_data (dict or None): Optional extra data to attach to the Doc. + tags (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.tag. Defaults to None. + pos (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.pos. Defaults to None. + morphs (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.morph. Defaults to None. + lemmas (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.lemma. Defaults to None. + heads (Optional[List[int]]): A list of values, of the same length as words, to assign as heads. Head indices are the position of the head in the doc. Defaults to None. + deps (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.dep. Defaults to None. + ents (Optional[List[Span]]): A list of spans to assign as doc.ents. Defaults to None. DOCS: https://nightly.spacy.io/api/doc#init """ @@ -217,6 +238,55 @@ cdef class Doc: lexeme = self.vocab.get_by_orth(self.mem, word) self.push_back(lexeme, has_space) + if heads is not None: + heads = [head - i for i, head in enumerate(heads)] + if deps and not heads: + heads = [0] * len(deps) + headings = [] + values = [] + annotations = [pos, heads, deps, lemmas, tags, morphs] + possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH] + for a, annot in enumerate(annotations): + if annot is not None: + if len(annot) != len(words): + raise ValueError(Errors.E189) + headings.append(possible_headings[a]) + if annot is not heads: + values.extend(annot) + for value in values: + self.vocab.strings.add(value) + + # if there are any other annotations, set them + if headings: + attrs = self.to_array(headings) + + j = 0 + for annot in annotations: + if annot: + if annot is heads: + for i in range(len(words)): + if attrs.ndim == 1: + attrs[i] = heads[i] + else: + attrs[i, j] = heads[i] + elif annot is morphs: + for i in range(len(words)): + morph_key = vocab.morphology.add(morphs[i]) + if attrs.ndim == 1: + attrs[i] = morph_key + else: + attrs[i, j] = morph_key + else: + for i in range(len(words)): + if attrs.ndim == 1: + attrs[i] = self.vocab.strings[annot[i]] + else: + attrs[i, j] = self.vocab.strings[annot[i]] + j += 1 + self.from_array(headings, attrs) + if ents is not None: + self.ents = ents + @property def _(self): """Custom extension attributes registered via `set_extension`.""" @@ -1344,7 +1414,6 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1: if tokens[i].head == 0: tokens[tokens[i].l_edge].sent_start = 1 - cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1: # May be called multiple times due to non-projectivity. See issues #3170 # and #4688. 
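To spell out the head-index bookkeeping in this patch: `Doc.__init__` accepts absolute head positions and converts them to the per-token offsets spaCy stores internally, while the old `get_doc` test helper took offsets, hence the mirrored `head - i` and `i + head` conversions in the hunks above. A tiny worked example (values made up):

```python
words = ["I", "like", "pizza"]
absolute_heads = [1, 1, 1]                               # what Doc.__init__ takes
offsets = [h - i for i, h in enumerate(absolute_heads)]  # what is stored per token
assert offsets == [1, 0, -1]  # "like" is its own head, i.e. the sentence root
```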
diff --git a/spacy/training/converters/conllu2docs.py b/spacy/training/converters/conllu2docs.py index ebd123375..b4d8b3ac4 100644 --- a/spacy/training/converters/conllu2docs.py +++ b/spacy/training/converters/conllu2docs.py @@ -199,13 +199,17 @@ def doc_from_conllu_sentence( heads.append(head) deps.append(dep) - doc = Doc(vocab, words=words, spaces=spaces) + doc = Doc( + vocab, + words=words, + spaces=spaces, + tags=tags, + pos=poses, + deps=deps, + lemmas=lemmas, + heads=heads, + ) for i in range(len(doc)): - doc[i].tag_ = tags[i] - doc[i].pos_ = poses[i] - doc[i].dep_ = deps[i] - doc[i].lemma_ = lemmas[i] - doc[i].head = doc[heads[i]] doc[i]._.merged_orth = words[i] doc[i]._.merged_morph = morphs[i] doc[i]._.merged_lemma = lemmas[i] @@ -232,14 +236,17 @@ def doc_from_conllu_sentence( heads.append(t.head.i) deps.append(t.dep_) - doc_x = Doc(vocab, words=words, spaces=spaces) - for i in range(len(doc)): - doc_x[i].tag_ = tags[i] - doc_x[i].morph_ = morphs[i] - doc_x[i].lemma_ = lemmas[i] - doc_x[i].pos_ = poses[i] - doc_x[i].dep_ = deps[i] - doc_x[i].head = doc_x[heads[i]] + doc_x = Doc( + vocab, + words=words, + spaces=spaces, + tags=tags, + morphs=morphs, + lemmas=lemmas, + pos=poses, + deps=deps, + heads=heads, + ) doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] return doc_x diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 380f6a172..680523c60 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -30,11 +30,20 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the > doc = Doc(nlp.vocab, words=words, spaces=spaces) > ``` -| Name | Description | -| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | A storage container for lexical types. ~~Vocab~~ | -| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | -| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | A storage container for lexical types. ~~Vocab~~ | +| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | +| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | +| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ | +| _keyword-only_ | | +| tags | A list of strings, of the same length as words, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| pos | A list of strings, of the same length as words, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| morphs | A list of strings, of the same length as words, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| lemmas | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. 
~~Optional[List[str]]~~ | +| heads | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ | +| deps | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| ents | A list of spans to assign as doc.ents. Defaults to `None`. ~~Optional[List[Span]]~~ | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} From 9b8d0b7f904f8751a804f112825a38cebe102ce9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 13:46:21 +0200 Subject: [PATCH 10/30] Alphabetize API sidebars --- website/meta/sidebars.json | 50 +++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 76d5e63d6..e27817c92 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -75,63 +75,63 @@ { "label": "Containers", "items": [ - { "text": "Language", "url": "/api/language" }, { "text": "Doc", "url": "/api/doc" }, - { "text": "Token", "url": "/api/token" }, - { "text": "Span", "url": "/api/span" }, - { "text": "Lexeme", "url": "/api/lexeme" }, + { "text": "DocBin", "url": "/api/docbin" }, { "text": "Example", "url": "/api/example" }, - { "text": "DocBin", "url": "/api/docbin" } + { "text": "Language", "url": "/api/language" }, + { "text": "Lexeme", "url": "/api/lexeme" }, + { "text": "Span", "url": "/api/span" }, + { "text": "Token", "url": "/api/token" } ] }, { "label": "Pipeline", "items": [ - { "text": "Tokenizer", "url": "/api/tokenizer" }, - { "text": "Tok2Vec", "url": "/api/tok2vec" }, - { "text": "Transformer", "url": "/api/transformer" }, - { "text": "Lemmatizer", "url": "/api/lemmatizer" }, - { "text": "Morphologizer", "url": "/api/morphologizer" }, - { "text": "Tagger", "url": "/api/tagger" }, { "text": "AttributeRuler", "url": "/api/attributeruler" }, { "text": "DependencyParser", "url": "/api/dependencyparser" }, + { "text": "EntityLinker", "url": "/api/entitylinker" }, { "text": "EntityRecognizer", "url": "/api/entityrecognizer" }, { "text": "EntityRuler", "url": "/api/entityruler" }, - { "text": "EntityLinker", "url": "/api/entitylinker" }, - { "text": "TextCategorizer", "url": "/api/textcategorizer" }, - { "text": "Sentencizer", "url": "/api/sentencizer" }, + { "text": "Lemmatizer", "url": "/api/lemmatizer" }, + { "text": "Morphologizer", "url": "/api/morphologizer" }, + { "text": "Pipe", "url": "/api/pipe" }, { "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" }, - { "text": "Other Functions", "url": "/api/pipeline-functions" }, - { "text": "Pipe", "url": "/api/pipe" } + { "text": "Sentencizer", "url": "/api/sentencizer" }, + { "text": "Tagger", "url": "/api/tagger" }, + { "text": "TextCategorizer", "url": "/api/textcategorizer" }, + { "text": "Tok2Vec", "url": "/api/tok2vec" }, + { "text": "Tokenizer", "url": "/api/tokenizer" }, + { "text": "Transformer", "url": "/api/transformer" }, + { "text": "Other Functions", "url": "/api/pipeline-functions" } ] }, { "label": "Matchers", "items": [ + { "text": "DependencyMatcher", "url": "/api/dependencymatcher" }, { "text": "Matcher", "url": "/api/matcher" }, - { "text": "PhraseMatcher", "url": "/api/phrasematcher" }, - { "text": "DependencyMatcher", "url": "/api/dependencymatcher" } + { "text": "PhraseMatcher", "url": "/api/phrasematcher" } ] }, { "label": "Other", "items": [ - { "text": "Vocab", "url": "/api/vocab" }, - 
{ "text": "StringStore", "url": "/api/stringstore" }, - { "text": "Vectors", "url": "/api/vectors" }, + { "text": "Corpus", "url": "/api/corpus" }, + { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "Lookups", "url": "/api/lookups" }, { "text": "Morphology", "url": "/api/morphology" }, - { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "Scorer", "url": "/api/scorer" }, - { "text": "Corpus", "url": "/api/corpus" } + { "text": "StringStore", "url": "/api/stringstore" }, + { "text": "Vectors", "url": "/api/vectors" }, + { "text": "Vocab", "url": "/api/vocab" } ] }, { "label": "Cython", "items": [ { "text": "Architecture", "url": "/api/cython" }, - { "text": "Structs", "url": "/api/cython-structs" }, - { "text": "Classes", "url": "/api/cython-classes" } + { "text": "Classes", "url": "/api/cython-classes" }, + { "text": "Structs", "url": "/api/cython-structs" } ] } ] From ce455f30ca847fc8038d034f39977cb6f3ed53c3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 13:52:46 +0200 Subject: [PATCH 11/30] Fix formatting --- spacy/tests/util.py | 13 +++++++++++-- spacy/tokens/doc.pyx | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 7bc32bf34..6c67d2ee1 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -34,8 +34,17 @@ def get_doc( heads = [i + head for i, head in enumerate(heads)] if ents is not None: ents = [(vocab.strings[ent_type], start, end) for start, end, ent_type in ents] - return Doc(vocab, words=words, pos=pos, heads=heads, deps=deps, tags=tags, - ents=ents, lemmas=lemmas, morphs=morphs) + return Doc( + vocab, + words=words, + pos=pos, + heads=heads, + deps=deps, + tags=tags, + ents=ents, + lemmas=lemmas, + morphs=morphs, + ) def get_batch(batch_size): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index de7e0f862..13167c2d4 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1414,6 +1414,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1: if tokens[i].head == 0: tokens[tokens[i].l_edge].sent_start = 1 + cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1: # May be called multiple times due to non-projectivity. See issues #3170 # and #4688. From e548654aca291621ddcbd8739f620b74c9932166 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 14:46:55 +0200 Subject: [PATCH 12/30] Update docs [ci skip] --- website/docs/usage/training.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 071434162..b63145636 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -214,6 +214,24 @@ overrides. Overrides are added before [variables](#config-interpolation) are resolved, by the way – so if you need to use a value in multiple places, reference it across your config and override it on the CLI once. +> #### 💡 Tip: Verbose logging +> +> If you're using config overrides, you can set the `--verbose` flag on +> [`spacy train`](/api/cli#train) to make spaCy log more info, including which +> overrides were set via the CLI and environment variables. + +#### Adding overrides via environment variables {#config-overrides-env} + +Instead of defining the overrides as CLI arguments, you can also use the +`SPACY_CONFIG_OVERRIDES` environment variable using the same argument syntax. +This is especially useful if you're training models as part of an automated +process. 
Environment variables **take precedence** over CLI overrides and values +defined in the config file. + +```cli +$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh +``` + ### Defining pipeline components {#config-components} You typically train a [pipeline](/usage/processing-pipelines) of **one or more From 6aa91c7ca02acd0df8d5dfba236faf09c3a5a477 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 16:00:06 +0200 Subject: [PATCH 13/30] Make user_data keyword-only --- spacy/tokens/doc.pyx | 2 +- website/docs/api/doc.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 13167c2d4..27efa6cef 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -163,8 +163,8 @@ cdef class Doc: Vocab vocab, words=None, spaces=None, - user_data=None, *, + user_data=None, tags=None, pos=None, morphs=None, diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 680523c60..baf264b80 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -35,8 +35,8 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the | `vocab` | A storage container for lexical types. ~~Vocab~~ | | `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | -| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ | | _keyword-only_ | | +| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | | tags | A list of strings, of the same length as words, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | pos | A list of strings, of the same length as words, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | morphs | A list of strings, of the same length as words, to assign as `token.morph` for each word. Defaults to `None`. 
~~Optional[List[str]]~~ | From e8bcaa44f17be63302feca946997a6fe20761cd7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 16:01:46 +0200 Subject: [PATCH 14/30] Don't auto-decompress archives with smart_open [ci skip] --- spacy/cli/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 0dd2ee380..797a701b9 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -306,7 +306,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) if dest.exists() and not force: return None src = str(src) - with smart_open.open(src, mode="rb") as input_file: + with smart_open.open(src, mode="rb", ignore_ext=True) as input_file: with dest.open(mode="wb") as output_file: output_file.write(input_file.read()) From b3327c1e45d14c6ef03c70455e09f449ed8ad6f0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 16:04:30 +0200 Subject: [PATCH 15/30] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 4fb6dfff1..ec3c168a5 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a19" +__version__ = "3.0.0a20" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 447b3e5787dec59f2ed4b8a96c4b2ceb808d182f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 21 Sep 2020 16:58:40 +0200 Subject: [PATCH 16/30] Merge remote-tracking branch 'upstream/develop' into fix/debug_model # Conflicts: # spacy/cli/debug_model.py --- spacy/cli/debug_model.py | 48 ++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index af961d033..3d76cdbde 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -1,4 +1,4 @@ -from typing import Dict, Any, Optional +from typing import Dict, Any, Optional, Iterable from pathlib import Path from wasabi import msg from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam @@ -93,11 +93,10 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = # STEP 1: Initializing the model and printing again X = _get_docs() - goldY = _get_output(model.ops) - # _set_output_dim(nO=goldY.shape[-1], model=model) + _set_output_dim(nO=7, model=model) # The output vector might differ from the official type of the output layer with data_validation(False): - model.initialize(X=X, Y=goldY) + model.initialize(X=X) if print_settings.get("print_after_init"): msg.divider(f"STEP 1 - after initialization") _print_model(model, print_settings) @@ -110,12 +109,15 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener": tok2vec = nlp.get_pipe("tok2vec") tok2vec.model.initialize(X=X) + goldY = None for e in range(3): if tok2vec: tok2vec.predict(X) Y, get_dX = model.begin_update(X) - print("get_dX", get_dX) - dY = get_gradient(goldY, Y) + # simulate a goldY value + if not goldY: + goldY = _simulate_gold(Y) + dY = get_gradient(goldY, Y, model.ops) get_dX(dY) model.finish_update(optimizer) if print_settings.get("print_after_training"): @@ -128,11 +130,20 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = 
msg.divider(f"STEP 3 - prediction") msg.info(str(prediction)) - msg.good(f"Succesfully ended analysis - model looks good!") + msg.good(f"Succesfully ended analysis - model looks good.") -def get_gradient(goldY, Y): - return Y - goldY +def _simulate_gold(element, counter=1): + if isinstance(element, Iterable): + for i in range(len(element)): + element[i] = _simulate_gold(element[i], counter+i) + return element + else: + return 1/counter + + +def get_gradient(goldY, Y, ops): + return ops.asarray(Y) - ops.asarray(goldY) def _sentences(): @@ -149,18 +160,13 @@ def _get_docs(lang: str = "en"): return list(nlp.pipe(_sentences())) -def _get_output(ops): - docs = len(_get_docs()) - labels = 6 - output = ops.alloc2f(d0=docs, d1=labels) - for i in range(docs): - for j in range(labels): - output[i, j] = 1 / (i+j+0.01) - return ops.xp.asarray(output) - - -def _get_output_old(xp): - return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32") +def _set_output_dim(model, nO): + # simulating dim inference by directly setting the nO argument of the model + if model.has_dim("nO") is None: + model.set_dim("nO", nO) + if model.has_ref("output_layer"): + if model.get_ref("output_layer").has_dim("nO") is None: + model.get_ref("output_layer").set_dim("nO", nO) def _print_model(model, print_settings): From f212303729cb0775bb00eebb6eef0a6c646f92da Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 17:59:09 +0200 Subject: [PATCH 17/30] Add sent_starts to Doc.__init__ Add sent_starts to `Doc.__init__`. Officially specify `is_sent_start` values but also convert to and accept `sent_start` internally. --- spacy/tests/doc/test_doc_api.py | 20 ++++++++++++++ spacy/tokens/doc.pyx | 46 +++++++++++++++++++++++---------- website/docs/api/doc.md | 1 + 3 files changed, 54 insertions(+), 13 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index c979931b1..0579642c4 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -9,6 +9,26 @@ from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH from ..util import get_doc +def test_doc_api_init(en_vocab): + # set sent_start by sent_starts + doc = Doc( + en_vocab, words=["a", "b", "c", "d"], sent_starts=[True, False, True, False] + ) + assert [t.is_sent_start for t in doc] == [True, False, True, False] + + # set sent_start by heads + doc = Doc( + en_vocab, words=["a", "b", "c", "d"], heads=[0, 0, 2, 2], deps=["dep"] * 4 + ) + assert [t.is_sent_start for t in doc] == [True, False, True, False] + + # heads override sent_starts + doc = Doc( + en_vocab, words=["a", "b", "c", "d"], sent_starts=[True] * 4, heads=[0, 0, 2, 2], deps=["dep"] * 4 + ) + assert [t.is_sent_start for t in doc] == [True, False, True, False] + + @pytest.mark.parametrize("text", [["one", "two", "three"]]) def test_doc_api_compare_by_string_position(en_vocab, text): doc = Doc(en_vocab, words=text) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 27efa6cef..c5f1f6801 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -171,6 +171,7 @@ cdef class Doc: lemmas=None, heads=None, deps=None, + sent_starts=None, ents=None, ): """Create a Doc object. @@ -183,13 +184,24 @@ cdef class Doc: words. True means that the word is followed by a space, False means it is not. If `None`, defaults to `[True]*len(words)` user_data (dict or None): Optional extra data to attach to the Doc. 
- tags (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.tag. Defaults to None. - pos (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.pos. Defaults to None. - morphs (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.morph. Defaults to None. - lemmas (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.lemma. Defaults to None. - heads (Optional[List[int]]): A list of values, of the same length as words, to assign as heads. Head indices are the position of the head in the doc. Defaults to None. - deps (Optional[List[str]]): A list of unicode strings, of the same length as words, to assign as token.dep. Defaults to None. - ents (Optional[List[Span]]): A list of spans to assign as doc.ents. Defaults to None. + tags (Optional[List[str]]): A list of unicode strings, of the same + length as words, to assign as token.tag. Defaults to None. + pos (Optional[List[str]]): A list of unicode strings, of the same + length as words, to assign as token.pos. Defaults to None. + morphs (Optional[List[str]]): A list of unicode strings, of the same + length as words, to assign as token.morph. Defaults to None. + lemmas (Optional[List[str]]): A list of unicode strings, of the same + length as words, to assign as token.lemma. Defaults to None. + heads (Optional[List[int]]): A list of values, of the same length as + words, to assign as heads. Head indices are the position of the + head in the doc. Defaults to None. + deps (Optional[List[str]]): A list of unicode strings, of the same + length as words, to assign as token.dep. Defaults to None. + sent_starts (Optional[List[Union[bool, None]]]): A list of values, of + the same length as words, to assign as token.is_sent_start. Will be + overridden by heads if heads is provided. Defaults to None. + ents (Optional[List[Span]]): A list of spans to assign as doc.ents. + Defaults to None. 
DOCS: https://nightly.spacy.io/api/doc#init """ @@ -242,16 +254,24 @@ cdef class Doc: heads = [head - i for i, head in enumerate(heads)] if deps and not heads: heads = [0] * len(deps) + if sent_starts is not None: + for i in range(len(sent_starts)): + if sent_starts[i] is True: + sent_starts[i] = 1 + elif sent_starts[i] is False: + sent_starts[i] = -1 + elif sent_starts[i] is None or sent_starts[i] not in [-1, 0, 1]: + sent_starts[i] = 0 headings = [] values = [] - annotations = [pos, heads, deps, lemmas, tags, morphs] - possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH] + annotations = [pos, heads, deps, lemmas, tags, morphs, sent_starts] + possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH, SENT_START] for a, annot in enumerate(annotations): if annot is not None: if len(annot) != len(words): raise ValueError(Errors.E189) headings.append(possible_headings[a]) - if annot is not heads: + if annot is not heads and annot is not sent_starts: values.extend(annot) for value in values: self.vocab.strings.add(value) @@ -263,12 +283,12 @@ cdef class Doc: j = 0 for annot in annotations: if annot: - if annot is heads: + if annot is heads or annot is sent_starts: for i in range(len(words)): if attrs.ndim == 1: - attrs[i] = heads[i] + attrs[i] = annot[i] else: - attrs[i, j] = heads[i] + attrs[i, j] = annot[i] elif annot is morphs: for i in range(len(words)): morph_key = vocab.morphology.add(morphs[i]) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index baf264b80..52f94a83d 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -43,6 +43,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the | lemmas | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | heads | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ | | deps | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| sent_starts | A list of values, of the same length as words, to assign as token.is_sent_start. Will be overridden by heads if heads is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ | | ents | A list of spans to assign as doc.ents. Defaults to `None`. 
~~Optional[List[Span]]~~ | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} From 67fbcb3da57c9830be34bf56518d8ec659ed65b6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Sep 2020 20:43:54 +0200 Subject: [PATCH 18/30] Tidy up tests and docs --- CONTRIBUTING.md | 4 +- spacy/errors.py | 2 +- spacy/tests/README.md | 84 ++++------- spacy/tests/conftest.py | 5 + spacy/tests/doc/test_add_entities.py | 10 +- spacy/tests/doc/test_array.py | 8 +- spacy/tests/doc/test_doc_api.py | 74 ++++------ spacy/tests/doc/test_retokenize_merge.py | 78 ++++------ spacy/tests/doc/test_retokenize_split.py | 10 +- spacy/tests/doc/test_span.py | 14 +- spacy/tests/doc/test_to_json.py | 7 +- spacy/tests/doc/test_token_api.py | 89 ++++------- spacy/tests/lang/de/test_parser.py | 26 ++-- spacy/tests/lang/en/test_noun_chunks.py | 9 +- spacy/tests/lang/en/test_parser.py | 57 +++----- spacy/tests/lang/en/test_sbd.py | 22 +-- spacy/tests/lang/ru/test_lemmatizer.py | 15 +- spacy/tests/lang/sv/test_noun_chunks.py | 16 +- .../tests/matcher/test_dependency_matcher.py | 13 +- spacy/tests/matcher/test_phrase_matcher.py | 9 +- spacy/tests/parser/test_nonproj.py | 25 +--- spacy/tests/parser/test_parse.py | 94 ++++++------ spacy/tests/parser/test_parse_navigate.py | 120 ++++++++------- spacy/tests/parser/test_space_attachment.py | 46 +++--- spacy/tests/pipeline/test_attributeruler.py | 19 +-- spacy/tests/pipeline/test_functions.py | 47 ++---- spacy/tests/regression/test_issue1-1000.py | 9 +- spacy/tests/regression/test_issue1501-2000.py | 21 +-- spacy/tests/regression/test_issue2001-2500.py | 11 +- spacy/tests/regression/test_issue2501-3000.py | 10 +- spacy/tests/regression/test_issue3001-3500.py | 26 +--- spacy/tests/regression/test_issue3501-4000.py | 12 +- spacy/tests/regression/test_issue5001-5500.py | 138 ++++++++++++++++++ spacy/tests/regression/test_issue5048.py | 32 ---- spacy/tests/regression/test_issue5082.py | 37 ----- spacy/tests/regression/test_issue5137.py | 32 ---- spacy/tests/regression/test_issue5141.py | 11 -- spacy/tests/regression/test_issue5152.py | 20 --- spacy/tests/regression/test_issue5458.py | 23 --- spacy/tests/regression/test_issue5918.py | 4 +- spacy/tests/test_displacy.py | 18 +-- spacy/tests/test_scorer.py | 23 +-- spacy/tests/training/test_training.py | 55 ++----- spacy/tests/util.py | 35 +---- spacy/tokens/doc.pyx | 10 +- spacy/training/example.pyx | 4 +- website/docs/api/doc.md | 44 ++++-- website/docs/usage/v3.md | 9 +- 48 files changed, 612 insertions(+), 875 deletions(-) create mode 100644 spacy/tests/regression/test_issue5001-5500.py delete mode 100644 spacy/tests/regression/test_issue5048.py delete mode 100644 spacy/tests/regression/test_issue5082.py delete mode 100644 spacy/tests/regression/test_issue5137.py delete mode 100644 spacy/tests/regression/test_issue5141.py delete mode 100644 spacy/tests/regression/test_issue5152.py delete mode 100644 spacy/tests/regression/test_issue5458.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0abde2abf..70324d8fd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -224,7 +224,7 @@ for that particular code. Here's an example: ```python # fmt: off text = "I look forward to using Thingamajig. I've been told it will make my life easier..." 
-heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7] +heads = [1, 1, 1, 1, 3, 4, 1, 6, 11, 11, 11, 11, 14, 14, 11, 16, 17, 14, 11] deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "", "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp", "poss", "nsubj", "ccomp", "punct"] @@ -421,7 +421,7 @@ Tests that require the model to be loaded should be marked with `@pytest.mark.models`. Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need is a `Doc` object with annotations like heads, POS tags or the dependency parse, you can -use the `get_doc()` utility function to construct it manually. +use the `Doc` constructor to construct it manually. 📖 **For more guidelines and information on how to add tests, check out the [tests README](spacy/tests/README.md).** diff --git a/spacy/errors.py b/spacy/errors.py index f219496a5..406ea603b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -455,7 +455,7 @@ class Errors: "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.") E186 = ("'{tok_a}' and '{tok_b}' are different texts.") E187 = ("Only unicode strings are supported as labels.") - E189 = ("Each argument to `get_doc` should be of equal length.") + E189 = ("Each argument to Doc.__init__ should be of equal length.") E190 = ("Token head out of range in `Doc.from_array()` for token index " "'{index}' with value '{value}' (equivalent to relative head " "index: '{rel_head_index}'). The head indices should be relative " diff --git a/spacy/tests/README.md b/spacy/tests/README.md index 7aa7f6166..86bbd52da 100644 --- a/spacy/tests/README.md +++ b/spacy/tests/README.md @@ -17,7 +17,6 @@ Tests for spaCy modules and classes live in their own directories of the same na 5. [Helpers and utilities](#helpers-and-utilities) 6. [Contributing to the tests](#contributing-to-the-tests) - ## Running the tests To show print statements, run the tests with `py.test -s`. To abort after the @@ -41,17 +40,16 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji # To keep the behaviour of the tests consistent and predictable, we try to follow a few basic conventions: -* **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`. -* If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory. -* Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behaviour, use `assert not` in your test. -* Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behaviour, consider adding an additional simpler version. -* If tests require **loading the models**, they should be added to the [`spacy-models`](https://github.com/explosion/spacy-models) tests. -* Before requiring the models, always make sure there is no other way to test the particular behaviour. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this. -* **Avoid unnecessary imports.** There should never be a need to explicitly import spaCy at the top of a file, and many components are available as [fixtures](#fixtures). 
You should also avoid wildcard imports (`from module import *`).
-* If you're importing from spaCy, **always use absolute imports**. For example: `from spacy.language import Language`.
-* Don't forget the **unicode declarations** at the top of each file. This way, unicode strings won't have to be prefixed with `u`.
-* Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behaviour at a time.
-
+- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
+- If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
+- Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behaviour, use `assert not` in your test.
+- Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behaviour, consider adding an additional simpler version.
+- If tests require **loading the models**, they should be added to the [`spacy-models`](https://github.com/explosion/spacy-models) tests.
+- Before requiring the models, always make sure there is no other way to test the particular behaviour. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this.
+- **Avoid unnecessary imports.** There should never be a need to explicitly import spaCy at the top of a file, and many components are available as [fixtures](#fixtures). You should also avoid wildcard imports (`from module import *`).
+- If you're importing from spaCy, **always use absolute imports**. For example: `from spacy.language import Language`.
+- Don't forget the **unicode declarations** at the top of each file. This way, unicode strings won't have to be prefixed with `u`.
+- Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behaviour at a time.

## Parameters

@@ -64,7 +62,7 @@ def test_tokenizer_keep_urls(tokenizer, text):
    assert len(tokens) == 1
```

-This will run the test once for each `text` value. Even if you're only testing one example, it's usually best to specify it as a parameter. This will later make it easier for others to quickly add additional test cases without having to modify the test.
+This will run the test once for each `text` value. Even if you're only testing one example, it's usually best to specify it as a parameter. This will later make it easier for others to quickly add additional test cases without having to modify the test.

You can also specify parameters as tuples to test with multiple values per test:

@@ -81,18 +79,17 @@ To test for combinations of parameters, you can add several `parametrize` marker

This will run the test with all combinations of the two parameters `text` and `punct`. **Use this feature sparingly**, though, as it can easily cause unnecessary or undesired test bloat.

-
## Fixtures

Fixtures to create instances of spaCy objects and other components should only be defined once in the global [`conftest.py`](conftest.py). We avoid having per-directory conftest files, as this can easily lead to confusion.
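As an illustration, a session-scoped fixture of this kind might look as follows (a minimal sketch mirroring the `de_vocab` fixture added to `conftest.py` later in this patch):

```python
import pytest
from spacy.util import get_lang_class

@pytest.fixture(scope="session")
def en_vocab():
    # Instantiated once per test session and shared by every test that requests it
    return get_lang_class("en")().vocab
```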
These are the main fixtures that are currently available:

-| Fixture | Description |
-| --- | --- |
-| `tokenizer` | Basic, language-independent tokenizer. Identical to the `xx` language class. |
-| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. |
-| `en_vocab` | Creates an instance of the English `Vocab`. |
+| Fixture                             | Description                                                                   |
+| ----------------------------------- | ----------------------------------------------------------------------------- |
+| `tokenizer`                         | Basic, language-independent tokenizer. Identical to the `xx` language class.  |
+| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer.                                    |
+| `en_vocab`                          | Creates an instance of the English `Vocab`.                                   |

The fixtures can be used in all tests by simply setting them as an argument, like this:

@@ -107,59 +104,32 @@ If all tests in a file require a specific configuration, or use the same complex

Our new test setup comes with a few handy utility functions that can be imported from [`util.py`](util.py).

+### Constructing a `Doc` object manually

-### Constructing a `Doc` object manually with `get_doc()`
-
-Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need ia a `Doc` object with annotations like heads, POS tags or the dependency parse, you can use `get_doc()` to construct it manually.
+Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need is a `Doc` object with annotations like heads, POS tags or the dependency parse, you can construct it manually.

```python
-def test_doc_token_api_strings(en_tokenizer):
-    text = "Give it back! He pleaded."
+def test_doc_token_api_strings(en_vocab):
+    words = ["Give", "it", "back", "!", "He", "pleaded", "."]
    pos = ['VERB', 'PRON', 'PART', 'PUNCT', 'PRON', 'VERB', 'PUNCT']
-    heads = [0, -1, -2, -3, 1, 0, -1]
+    heads = [0, 0, 0, 0, 5, 5, 5]
    deps = ['ROOT', 'dobj', 'prt', 'punct', 'nsubj', 'ROOT', 'punct']
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], pos=pos, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, pos=pos, heads=heads, deps=deps)
    assert doc[0].text == 'Give'
    assert doc[0].lower_ == 'give'
    assert doc[0].pos_ == 'VERB'
    assert doc[0].dep_ == 'ROOT'
```

-You can construct a `Doc` with the following arguments:
-
-| Argument | Description |
-| --- | --- |
-| `vocab` | `Vocab` instance to use. If you're tokenizing before creating a `Doc`, make sure to use the tokenizer's vocab. Otherwise, you can also use the `en_vocab` fixture. **(required)** |
-| `words` | List of words, for example `[t.text for t in tokens]`. **(required)** |
-| `heads` | List of heads as integers. |
-| `pos` | List of POS tags as text values. |
-| `tag` | List of tag names as text values. |
-| `dep` | List of dependencies as text values. |
-| `ents` | List of entity tuples with `start`, `end`, `label` (for example `(0, 2, 'PERSON')`). The `label` will be looked up in `vocab.strings[label]`. |
-
-Here's how to quickly get these values from within spaCy:
-
-```python
-doc = nlp(u'Some text here')
-print([token.head.i-token.i for token in doc])
-print([token.tag_ for token in doc])
-print([token.pos_ for token in doc])
-print([token.dep_ for token in doc])
-print([(ent.start, ent.end, ent.label_) for ent in doc.ents])
-```
-
-**Note:** There's currently no way of setting the serializer data for the parser without loading the models. If this is relevant to your test, constructing the `Doc` via `get_doc()` won't work.
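To illustrate the new `sent_starts` constructor argument this patch series introduces, here is a minimal sketch of constructing a `Doc` with explicit sentence boundaries (the test name is invented; the assertions mirror `test_doc_api_init` in this patch series and assume the `en_vocab` fixture):

```python
from spacy.tokens import Doc

def test_doc_sent_starts_sketch(en_vocab):
    words = ["This", "is", "one", "sentence", "This", "is", "another"]
    # True is normalized to 1 (sentence start), False to -1 (not a start) and
    # None to 0 (unset), as in the normalization loop added to Doc.__init__
    sent_starts = [True, False, False, False, True, False, False]
    doc = Doc(en_vocab, words=words, sent_starts=sent_starts)
    assert [t.is_sent_start for t in doc] == sent_starts
```

If `heads` are passed as well, the boundaries derived from the dependency tree take precedence, which is what the "Will be overridden by heads" note in the parameter table refers to.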
- ### Other utilities -| Name | Description | -| --- | --- | -| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state. | -| `add_vecs_to_vocab(vocab, vectors)` | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. | -| `get_cosine(vec1, vec2)` | Get cosine for two given vectors. | -| `assert_docs_equal(doc1, doc2)` | Compare two `Doc` objects and `assert` that they're equal. Tests for tokens, tags, dependencies and entities. | +| Name | Description | +| -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- | +| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state. | +| `add_vecs_to_vocab(vocab, vectors)` | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. | +| `get_cosine(vec1, vec2)` | Get cosine for two given vectors. | +| `assert_docs_equal(doc1, doc2)` | Compare two `Doc` objects and `assert` that they're equal. Tests for tokens, tags, dependencies and entities. | ## Contributing to the tests diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index e17199a08..3a9a1f26b 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -59,6 +59,11 @@ def de_tokenizer(): return get_lang_class("de")().tokenizer +@pytest.fixture(scope="session") +def de_vocab(): + return get_lang_class("de")().vocab + + @pytest.fixture(scope="session") def el_tokenizer(): return get_lang_class("el")().tokenizer diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 0c2a2a40b..40aff8e31 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -1,12 +1,10 @@ +from spacy.pipeline.ner import DEFAULT_NER_MODEL from spacy.training import Example from spacy.pipeline import EntityRecognizer from spacy.tokens import Span, Doc from spacy import registry import pytest -from ..util import get_doc -from spacy.pipeline.ner import DEFAULT_NER_MODEL - def _ner_example(ner): doc = Doc( @@ -19,7 +17,7 @@ def _ner_example(ner): def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] - doc = get_doc(en_vocab, text) + doc = Doc(en_vocab, words=text) config = { "learn_tokens": False, "min_action_freq": 30, @@ -41,7 +39,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab): def test_ents_reset(en_vocab): """Ensure that resetting doc.ents does not change anything""" text = ["This", "is", "a", "lion"] - doc = get_doc(en_vocab, text) + doc = Doc(en_vocab, words=text) config = { "learn_tokens": False, "min_action_freq": 30, @@ -59,7 +57,7 @@ def test_ents_reset(en_vocab): def test_add_overlapping_entities(en_vocab): text = ["Louisiana", "Office", "of", "Conservation"] - doc = get_doc(en_vocab, text) + doc = Doc(en_vocab, words=text) entity = Span(doc, 0, 4, label=391) doc.ents = [entity] diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index e721b3f09..9c050f740 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -2,8 +2,6 @@ import pytest from spacy.tokens import Doc from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH -from ..util import get_doc - def test_doc_array_attr_of_token(en_vocab): doc = Doc(en_vocab, words=["An", "example", "sentence"]) @@ -35,7 +33,7 @@ def 
test_doc_scalar_attr_of_token(en_vocab): def test_doc_array_tag(en_vocab): words = ["A", "nice", "sentence", "."] pos = ["DET", "ADJ", "NOUN", "PUNCT"] - doc = get_doc(en_vocab, words=words, pos=pos) + doc = Doc(en_vocab, words=words, pos=pos) assert doc[0].pos != doc[1].pos != doc[2].pos != doc[3].pos feats_array = doc.to_array((ORTH, POS)) assert feats_array[0][1] == doc[0].pos @@ -47,7 +45,7 @@ def test_doc_array_tag(en_vocab): def test_doc_array_morph(en_vocab): words = ["Eat", "blue", "ham"] morph = ["Feat=V", "Feat=J", "Feat=N"] - doc = get_doc(en_vocab, words=words, morphs=morph) + doc = Doc(en_vocab, words=words, morphs=morph) assert morph[0] == doc[0].morph_ assert morph[1] == doc[1].morph_ assert morph[2] == doc[2].morph_ @@ -61,7 +59,7 @@ def test_doc_array_morph(en_vocab): def test_doc_array_dep(en_vocab): words = ["A", "nice", "sentence", "."] deps = ["det", "amod", "ROOT", "punct"] - doc = get_doc(en_vocab, words=words, deps=deps) + doc = Doc(en_vocab, words=words, deps=deps) feats_array = doc.to_array((ORTH, DEP)) assert feats_array[0][1] == doc[0].dep assert feats_array[1][1] == doc[1].dep diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 0579642c4..2c22926e9 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -6,25 +6,20 @@ from spacy.lexeme import Lexeme from spacy.lang.en import English from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH -from ..util import get_doc - def test_doc_api_init(en_vocab): + words = ["a", "b", "c", "d"] + heads = [0, 0, 2, 2] # set sent_start by sent_starts - doc = Doc( - en_vocab, words=["a", "b", "c", "d"], sent_starts=[True, False, True, False] - ) + doc = Doc(en_vocab, words=words, sent_starts=[True, False, True, False]) assert [t.is_sent_start for t in doc] == [True, False, True, False] # set sent_start by heads - doc = Doc( - en_vocab, words=["a", "b", "c", "d"], heads=[0, 0, 2, 2], deps=["dep"] * 4 - ) + doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * 4) assert [t.is_sent_start for t in doc] == [True, False, True, False] - # heads override sent_starts doc = Doc( - en_vocab, words=["a", "b", "c", "d"], sent_starts=[True] * 4, heads=[0, 0, 2, 2], deps=["dep"] * 4 + en_vocab, words=words, sent_starts=[True] * 4, heads=heads, deps=["dep"] * 4, ) assert [t.is_sent_start for t in doc] == [True, False, True, False] @@ -178,7 +173,7 @@ def test_doc_api_runtime_error(en_tokenizer): "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"] # fmt: on tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) nps = [] for np in doc.noun_chunks: while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): @@ -195,17 +190,19 @@ def test_doc_api_runtime_error(en_tokenizer): retokenizer.merge(np, attrs=attrs) -def test_doc_api_right_edge(en_tokenizer): +def test_doc_api_right_edge(en_vocab): """Test for bug occurring from Unshift action, causing incorrect right edge""" # fmt: off - text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue." 
- heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1, - -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26] + words = [ + "I", "have", "proposed", "to", "myself", ",", "for", "the", "sake", + "of", "such", "as", "live", "under", "the", "government", "of", "the", + "Romans", ",", "to", "translate", "those", "books", "into", "the", + "Greek", "tongue", "." + ] + heads = [2, 2, 2, 2, 3, 2, 21, 8, 6, 8, 11, 8, 11, 12, 15, 13, 15, 18, 16, 12, 21, 2, 23, 21, 21, 27, 27, 24, 2] deps = ["dep"] * len(heads) # fmt: on - - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert doc[6].text == "for" subtree = [w.text for w in doc[6].subtree] # fmt: off @@ -233,16 +230,16 @@ def test_doc_api_similarity_match(): @pytest.mark.parametrize( - "sentence,heads,lca_matrix", + "words,heads,lca_matrix", [ ( - "the lazy dog slept", - [2, 1, 1, 0], + ["the", "lazy", "dog", "slept"], + [2, 2, 3, 3], numpy.array([[0, 2, 2, 3], [2, 1, 2, 3], [2, 2, 2, 3], [3, 3, 3, 3]]), ), ( - "The lazy dog slept. The quick fox jumped", - [2, 1, 1, 0, -1, 2, 1, 1, 0], + ["The", "lazy", "dog", "slept", ".", "The", "quick", "fox", "jumped"], + [2, 2, 3, 3, 3, 7, 7, 8, 8], numpy.array( [ [0, 2, 2, 3, 3, -1, -1, -1, -1], @@ -259,11 +256,8 @@ def test_doc_api_similarity_match(): ), ], ) -def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix): - tokens = en_tokenizer(sentence) - doc = get_doc( - tokens.vocab, [t.text for t in tokens], heads=heads, deps=["dep"] * len(heads) - ) +def test_lowest_common_ancestor(en_vocab, words, heads, lca_matrix): + doc = Doc(en_vocab, words, heads=heads, deps=["dep"] * len(heads)) lca = doc.get_lca_matrix() assert (lca == lca_matrix).all() assert lca[1, 1] == 1 @@ -287,26 +281,23 @@ def test_doc_is_nered(en_vocab): def test_doc_from_array_sent_starts(en_vocab): - words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] - heads = [0, -1, -2, -3, -4, -5, 0, -1, -2, -3] # fmt: off + words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] + heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"] # fmt: on - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) # HEAD overrides SENT_START without warning attrs = [SENT_START, HEAD] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) new_doc.from_array(attrs, arr) - # no warning using default attrs attrs = doc._get_array_attrs() arr = doc.to_array(attrs) with pytest.warns(None) as record: new_doc.from_array(attrs, arr) assert len(record) == 0 - # only SENT_START uses SENT_START attrs = [SENT_START] arr = doc.to_array(attrs) @@ -314,7 +305,6 @@ def test_doc_from_array_sent_starts(en_vocab): new_doc.from_array(attrs, arr) assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc] assert not new_doc.has_annotation("DEP") - # only HEAD uses HEAD attrs = [HEAD, DEP] arr = doc.to_array(attrs) @@ -325,19 +315,17 @@ def test_doc_from_array_sent_starts(en_vocab): def test_doc_from_array_morph(en_vocab): - words = ["I", "live", "in", "New", "York", "."] # fmt: off + words = ["I", "live", "in", "New", "York", "."] morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"] # fmt: on doc = Doc(en_vocab, words=words) for i, morph in enumerate(morphs): doc[i].morph_ = morph - attrs = [MORPH] arr = 
doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) new_doc.from_array(attrs, arr) - assert [t.morph_ for t in new_doc] == morphs assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc] @@ -349,15 +337,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): en_docs = [en_tokenizer(text) for text in en_texts] docs_idx = en_texts[0].index("docs") de_doc = de_tokenizer(de_text) - en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = ( - True, - None, - None, - None, - ) - + expected = (True, None, None, None) + en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = expected assert Doc.from_docs([]) is None - assert de_doc is not Doc.from_docs([de_doc]) assert str(de_doc) == str(Doc.from_docs([de_doc])) diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index bc9567b2a..806c4b46f 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -3,8 +3,6 @@ from spacy.attrs import LEMMA from spacy.vocab import Vocab from spacy.tokens import Doc, Token -from ..util import get_doc - def test_doc_retokenize_merge(en_tokenizer): text = "WKRO played songs by the beach boys all night" @@ -88,9 +86,9 @@ def test_doc_retokenize_lex_attrs(en_tokenizer): def test_doc_retokenize_spans_merge_tokens(en_tokenizer): text = "Los Angeles start." - heads = [1, 1, 0, -1] + heads = [1, 2, 2, 2] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) assert len(doc) == 4 assert doc[0].head.text == "Angeles" assert doc[1].head.text == "start" @@ -103,17 +101,12 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer): assert doc[0].ent_type_ == "GPE" -def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer): - text = "The players start." - heads = [1, 1, 0, -1] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, - words=[t.text for t in tokens], - tags=["DT", "NN", "VBZ", "."], - pos=["DET", "NOUN", "VERB", "PUNCT"], - heads=heads, - ) +def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab): + words = ["The", "players", "start", "."] + heads = [1, 2, 2, 2] + tags = ["DT", "NN", "VBZ", "."] + pos = ["DET", "NOUN", "VERB", "PUNCT"] + doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads) assert len(doc) == 4 assert doc[0].text == "The" assert doc[0].tag_ == "DT" @@ -124,13 +117,7 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer): assert doc[0].text == "The players" assert doc[0].tag_ == "NN" assert doc[0].pos_ == "NOUN" - doc = get_doc( - tokens.vocab, - words=[t.text for t in tokens], - tags=["DT", "NN", "VBZ", "."], - pos=["DET", "NOUN", "VERB", "PUNCT"], - heads=heads, - ) + doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads) assert len(doc) == 4 assert doc[0].text == "The" assert doc[0].tag_ == "DT" @@ -147,11 +134,10 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer): assert doc[1].pos_ == "VERB" -def test_doc_retokenize_spans_merge_heads(en_tokenizer): - text = "I found a pilates class near work." 
- heads = [1, 0, 2, 1, -3, -1, -1, -6] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) +def test_doc_retokenize_spans_merge_heads(en_vocab): + words = ["I", "found", "a", "pilates", "class", "near", "work", "."] + heads = [1, 1, 4, 6, 1, 4, 5, 1] + doc = Doc(en_vocab, words=words, heads=heads) assert len(doc) == 8 with doc.retokenize() as retokenizer: attrs = {"tag": doc[4].tag_, "lemma": "pilates class", "ent_type": "O"} @@ -182,9 +168,9 @@ def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer): def test_doc_retokenize_span_np_merges(en_tokenizer): text = "displaCy is a parse tool built with Javascript" - heads = [1, 0, 2, 1, -3, -1, -1, -1] + heads = [1, 1, 4, 4, 1, 4, 5, 6] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) assert doc[4].head.i == 1 with doc.retokenize() as retokenizer: attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"} @@ -192,18 +178,18 @@ def test_doc_retokenize_span_np_merges(en_tokenizer): assert doc[2].head.i == 1 text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript." - heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15] + heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) with doc.retokenize() as retokenizer: for ent in doc.ents: attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_} retokenizer.merge(ent, attrs=attrs) text = "One test with entities like New York City so the ents list is not void" - heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2] + heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) with doc.retokenize() as retokenizer: for ent in doc.ents: retokenizer.merge(ent) @@ -212,12 +198,12 @@ def test_doc_retokenize_span_np_merges(en_tokenizer): def test_doc_retokenize_spans_entity_merge(en_tokenizer): # fmt: off text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n" - heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1] + heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15] tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"] - ents = [(0, 2, "PERSON"), (10, 11, "GPE"), (13, 15, "PERSON")] + ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)] # fmt: on tokens = en_tokenizer(text) - doc = get_doc( + doc = Doc( tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents ) assert len(doc) == 17 @@ -282,13 +268,13 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab): # if there is a parse, span.root provides default values words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] - heads = [0, -1, 1, -3, -4, -5, -1, -7, -8] - ents = [(3, 5, "ent-de"), (5, 7, "ent-fg")] + heads = [0, 0, 3, 0, 0, 0, 5, 0, 0] + ents = [("ent-de", 3, 5), ("ent-fg", 5, 7)] deps = ["dep"] * len(words) en_vocab.strings.add("ent-de") en_vocab.strings.add("ent-fg") en_vocab.strings.add("dep") - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, 
ents=ents) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents) assert doc[2:4].root == doc[3] # root of 'c d' is d assert doc[4:6].root == doc[4] # root is 'e f' is e with doc.retokenize() as retokenizer: @@ -305,10 +291,10 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab): # check that B is preserved if span[start] is B words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] - heads = [0, -1, 1, 1, -4, -5, -1, -7, -8] - ents = [(3, 5, "ent-de"), (5, 7, "ent-de")] + heads = [0, 0, 3, 4, 0, 0, 5, 0, 0] + ents = [("ent-de", 3, 5), ("ent-de", 5, 7)] deps = ["dep"] * len(words) - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents) with doc.retokenize() as retokenizer: retokenizer.merge(doc[3:5]) retokenizer.merge(doc[5:7]) @@ -322,13 +308,13 @@ def test_doc_retokenize_spans_entity_merge_iob(en_vocab): def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer): # fmt: off text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale." - heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7] + heads = [1, 2, 2, 4, 2, 4, 4, 2, 9, 9, 9, 10, 9, 9, 15, 13, 9] deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr', 'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj', 'compound', 'dobj', 'punct'] # fmt: on tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) sent1, sent2 = list(doc.sents) init_len = len(sent1) init_len2 = len(sent2) @@ -343,13 +329,13 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer): def test_doc_retokenize_spans_subtree_size_check(en_tokenizer): # fmt: off text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale" - heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2] + heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12] deps = ["compound", "nsubj", "ROOT", "det", "amod", "prt", "attr", "nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound", "dobj"] # fmt: on tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) sent1 = list(doc.sents)[0] init_len = len(list(sent1.root.subtree)) with doc.retokenize() as retokenizer: diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index 5f40da425..4d4b170f9 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -2,13 +2,11 @@ import pytest from spacy.vocab import Vocab from spacy.tokens import Doc, Token -from ..util import get_doc - def test_doc_retokenize_split(en_vocab): words = ["LosAngeles", "start", "."] - heads = [1, 1, 0] - doc = get_doc(en_vocab, words=words, heads=heads) + heads = [1, 2, 2] + doc = Doc(en_vocab, words=words, heads=heads) assert len(doc) == 3 assert len(str(doc)) == 19 assert doc[0].head.text == "start" @@ -88,11 +86,11 @@ def test_doc_retokenize_spans_sentence_update_after_split(en_vocab): # fmt: off words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He", "lives", "in", "England", "and", "loves", "JoePasquale", "."] - heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2] + heads = [1, 1, 3, 5, 3, 1, 1, 8, 8, 8, 9, 8, 8, 14, 12] deps = ["nsubj", "ROOT", "det", "amod", "prt", 
"attr", "punct", "nsubj", "ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"] # fmt: on - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) sent1, sent2 = list(doc.sents) init_len = len(sent1) init_len2 = len(sent2) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 0c538a0eb..2f562deb7 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -4,19 +4,17 @@ from spacy.tokens import Doc, Span from spacy.vocab import Vocab from spacy.util import filter_spans -from ..util import get_doc - @pytest.fixture def doc(en_tokenizer): # fmt: off text = "This is a sentence. This is another sentence. And a third." - heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1] + heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12] deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det", "attr", "punct", "ROOT", "det", "npadvmod", "punct"] # fmt: on tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) @pytest.fixture @@ -69,10 +67,10 @@ def test_spans_string_fn(doc): def test_spans_root2(en_tokenizer): text = "through North and South Carolina" - heads = [0, 3, -1, -2, -4] + heads = [0, 4, 1, 1, 0] deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert doc[-2:].root.text == "Carolina" @@ -92,10 +90,10 @@ def test_spans_span_sent(doc, doc_not_parsed): def test_spans_lca_matrix(en_tokenizer): """Test span's lca matrix generation""" tokens = en_tokenizer("the lazy dog slept") - doc = get_doc( + doc = Doc( tokens.vocab, words=[t.text for t in tokens], - heads=[2, 1, 1, 0], + heads=[2, 2, 3, 3], deps=["dep"] * 4, ) lca = doc[:2].get_lca_matrix() diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py index da3bc7dbb..c9bcafcfa 100644 --- a/spacy/tests/doc/test_to_json.py +++ b/spacy/tests/doc/test_to_json.py @@ -1,6 +1,5 @@ import pytest from spacy.tokens import Doc -from ..util import get_doc @pytest.fixture() @@ -8,10 +7,10 @@ def doc(en_vocab): words = ["c", "d", "e"] pos = ["VERB", "NOUN", "NOUN"] tags = ["VBP", "NN", "NN"] - heads = [0, -1, -2] + heads = [0, 0, 0] deps = ["ROOT", "dobj", "dobj"] - ents = [(1, 2, "ORG")] - return get_doc( + ents = [("ORG", 1, 2)] + return Doc( en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents ) diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 1308df67b..3c5c063bd 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -5,31 +5,24 @@ from spacy.symbols import VERB from spacy.vocab import Vocab from spacy.tokens import Doc -from ..util import get_doc - @pytest.fixture -def doc(en_tokenizer): +def doc(en_vocab): # fmt: off - text = "This is a sentence. This is another sentence. And a third." 
- heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1] + words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "sentence", ".", "And", "a", "third", "."] + heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 10, 12, 10, 12] deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det", "attr", "punct", "ROOT", "det", "npadvmod", "punct"] # fmt: on - tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + return Doc(en_vocab, words=words, heads=heads, deps=deps) -def test_doc_token_api_strings(en_tokenizer): - text = "Give it back! He pleaded." +def test_doc_token_api_strings(en_vocab): + words = ["Give", "it", "back", "!", "He", "pleaded", "."] pos = ["VERB", "PRON", "PART", "PUNCT", "PRON", "VERB", "PUNCT"] - heads = [0, -1, -2, -3, 1, 0, -1] + heads = [0, 0, 0, 0, 5, 5, 5] deps = ["ROOT", "dobj", "prt", "punct", "nsubj", "ROOT", "punct"] - - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps - ) + doc = Doc(en_vocab, words=words, pos=pos, heads=heads, deps=deps) assert doc[0].orth_ == "Give" assert doc[0].text == "Give" assert doc[0].text_with_ws == "Give " @@ -97,88 +90,70 @@ def test_doc_token_api_vectors(): assert doc[0].similarity(doc[1]) == cosine -def test_doc_token_api_ancestors(en_tokenizer): +def test_doc_token_api_ancestors(en_vocab): # the structure of this sentence depends on the English annotation scheme - text = "Yesterday I saw a dog that barked loudly." - heads = [2, 1, 0, 1, -2, 1, -2, -1, -6] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."] + heads = [2, 2, 2, 4, 2, 6, 4, 6, 2] + doc = Doc(en_vocab, words=words, heads=heads) assert [t.text for t in doc[6].ancestors] == ["dog", "saw"] assert [t.text for t in doc[1].ancestors] == ["saw"] assert [t.text for t in doc[2].ancestors] == [] - assert doc[2].is_ancestor(doc[7]) assert not doc[6].is_ancestor(doc[2]) -def test_doc_token_api_head_setter(en_tokenizer): - text = "Yesterday I saw a dog that barked loudly." 
- heads = [2, 1, 0, 1, -2, 1, -2, -1, -6] +def test_doc_token_api_head_setter(en_vocab): + words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."] + heads = [2, 2, 2, 4, 2, 6, 4, 6, 2] deps = ["dep"] * len(heads) - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) - + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert doc[6].n_lefts == 1 assert doc[6].n_rights == 1 assert doc[6].left_edge.i == 5 assert doc[6].right_edge.i == 7 - assert doc[4].n_lefts == 1 assert doc[4].n_rights == 1 assert doc[4].left_edge.i == 3 assert doc[4].right_edge.i == 7 - assert doc[3].n_lefts == 0 assert doc[3].n_rights == 0 assert doc[3].left_edge.i == 3 assert doc[3].right_edge.i == 3 - assert doc[2].left_edge.i == 0 assert doc[2].right_edge.i == 8 doc[6].head = doc[3] - assert doc[6].n_lefts == 1 assert doc[6].n_rights == 1 assert doc[6].left_edge.i == 5 assert doc[6].right_edge.i == 7 - assert doc[3].n_lefts == 0 assert doc[3].n_rights == 1 assert doc[3].left_edge.i == 3 assert doc[3].right_edge.i == 7 - assert doc[4].n_lefts == 1 assert doc[4].n_rights == 0 assert doc[4].left_edge.i == 3 assert doc[4].right_edge.i == 7 - assert doc[2].left_edge.i == 0 assert doc[2].right_edge.i == 8 doc[0].head = doc[5] - assert doc[5].left_edge.i == 0 assert doc[6].left_edge.i == 0 assert doc[3].left_edge.i == 0 assert doc[4].left_edge.i == 0 assert doc[2].left_edge.i == 0 - # head token must be from the same document - doc2 = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc2 = Doc(en_vocab, words=words, heads=heads) with pytest.raises(ValueError): doc[0].head = doc2[0] - # test sentence starts when two sentences are joined - text = "This is one sentence. This is another sentence." 
- heads = [0, -1, -2, -3, -4, 0, -1, -2, -3, -4] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, - words=[t.text for t in tokens], - heads=heads, - deps=["dep"] * len(heads), - ) + # fmt: off + words = ["This", "is", "one", "sentence", ".", "This", "is", "another", "sentence", "."] + heads = [0, 0, 0, 0, 0, 5, 5, 5, 5, 5] + # fmt: on + doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads)) # initially two sentences assert doc[0].is_sent_start assert doc[5].is_sent_start @@ -186,7 +161,6 @@ def test_doc_token_api_head_setter(en_tokenizer): assert doc[0].right_edge == doc[4] assert doc[5].left_edge == doc[5] assert doc[5].right_edge == doc[9] - # modifying with a sentence doesn't change sent starts doc[2].head = doc[3] assert doc[0].is_sent_start @@ -195,7 +169,6 @@ def test_doc_token_api_head_setter(en_tokenizer): assert doc[0].right_edge == doc[4] assert doc[5].left_edge == doc[5] assert doc[5].right_edge == doc[9] - # attach the second sentence to the first, resulting in one sentence doc[5].head = doc[0] assert doc[0].is_sent_start @@ -252,28 +225,28 @@ def test_tokenlast_has_sent_end_true(): def test_token_api_conjuncts_chain(en_vocab): - words = "The boy and the girl and the man went .".split() - heads = [1, 7, -1, 1, -3, -1, 1, -3, 0, -1] + words = ["The", "boy", "and", "the", "girl", "and", "the", "man", "went", "."] + heads = [1, 8, 1, 4, 1, 4, 7, 4, 8, 8] deps = ["det", "nsubj", "cc", "det", "conj", "cc", "det", "conj", "ROOT", "punct"] - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert [w.text for w in doc[1].conjuncts] == ["girl", "man"] assert [w.text for w in doc[4].conjuncts] == ["boy", "man"] assert [w.text for w in doc[7].conjuncts] == ["boy", "girl"] def test_token_api_conjuncts_simple(en_vocab): - words = "They came and went .".split() - heads = [1, 0, -1, -2, -1] + words = ["They", "came", "and", "went", "."] + heads = [1, 1, 1, 1, 3] deps = ["nsubj", "ROOT", "cc", "conj", "dep"] - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert [w.text for w in doc[1].conjuncts] == ["went"] assert [w.text for w in doc[3].conjuncts] == ["came"] def test_token_api_non_conjuncts(en_vocab): - words = "They came .".split() - heads = [1, 0, -1] + words = ["They", "came", "."] + heads = [1, 1, 1] deps = ["nsubj", "ROOT", "punct"] - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert [w.text for w in doc[0].conjuncts] == [] assert [w.text for w in doc[1].conjuncts] == [] diff --git a/spacy/tests/lang/de/test_parser.py b/spacy/tests/lang/de/test_parser.py index e2154b4c0..8c858a4cb 100644 --- a/spacy/tests/lang/de/test_parser.py +++ b/spacy/tests/lang/de/test_parser.py @@ -1,30 +1,26 @@ -from ...util import get_doc +from spacy.tokens import Doc -def test_de_parser_noun_chunks_standard_de(de_tokenizer): - text = "Eine Tasse steht auf dem Tisch." 
- heads = [1, 1, 0, -1, 1, -2, -4] +def test_de_parser_noun_chunks_standard_de(de_vocab): + words = ["Eine", "Tasse", "steht", "auf", "dem", "Tisch", "."] + heads = [1, 2, 2, 2, 5, 3, 2] pos = ["DET", "NOUN", "VERB", "ADP", "DET", "NOUN", "PUNCT"] deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "punct"] - tokens = de_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + doc = Doc(de_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 2 assert chunks[0].text_with_ws == "Eine Tasse " assert chunks[1].text_with_ws == "dem Tisch " -def test_de_extended_chunk(de_tokenizer): - text = "Die Sängerin singt mit einer Tasse Kaffee Arien." - heads = [1, 1, 0, -1, 1, -2, -1, -5, -6] +def test_de_extended_chunk(de_vocab): + # fmt: off + words = ["Die", "Sängerin", "singt", "mit", "einer", "Tasse", "Kaffee", "Arien", "."] + heads = [1, 2, 2, 2, 5, 3, 5, 2, 2] pos = ["DET", "NOUN", "VERB", "ADP", "DET", "NOUN", "NOUN", "NOUN", "PUNCT"] deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "nk", "oa", "punct"] - tokens = de_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + # fmt: on + doc = Doc(de_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 3 assert chunks[0].text_with_ws == "Die Sängerin " diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index fa3a134bd..0189a26d4 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -2,13 +2,10 @@ import numpy from spacy.attrs import HEAD, DEP from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root from spacy.lang.en.syntax_iterators import noun_chunks - +from spacy.tokens import Doc import pytest -from ...util import get_doc - - def test_noun_chunks_is_parsed(en_tokenizer): """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed. """ @@ -19,9 +16,9 @@ def test_noun_chunks_is_parsed(en_tokenizer): def test_en_noun_chunks_not_nested(en_vocab): words = ["Peter", "has", "chronic", "command", "and", "control", "issues"] - heads = [1, 0, 4, 3, -1, -2, -5] + heads = [1, 1, 6, 6, 3, 3, 1] deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"] - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) doc.from_array( [HEAD, DEP], numpy.asarray( diff --git a/spacy/tests/lang/en/test_parser.py b/spacy/tests/lang/en/test_parser.py index 4d06ff8ef..426605566 100644 --- a/spacy/tests/lang/en/test_parser.py +++ b/spacy/tests/lang/en/test_parser.py @@ -1,63 +1,51 @@ -from ...util import get_doc +from spacy.tokens import Doc -def test_en_parser_noun_chunks_standard(en_tokenizer): - text = "A base phrase should be recognized." 
- heads = [2, 1, 3, 2, 1, 0, -1] +def test_en_parser_noun_chunks_standard(en_vocab): + words = ["A", "base", "phrase", "should", "be", "recognized", "."] + heads = [2, 2, 5, 5, 5, 5, 5] pos = ["DET", "ADJ", "NOUN", "AUX", "VERB", "VERB", "PUNCT"] deps = ["det", "amod", "nsubjpass", "aux", "auxpass", "ROOT", "punct"] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 1 assert chunks[0].text_with_ws == "A base phrase " -def test_en_parser_noun_chunks_coordinated(en_tokenizer): +def test_en_parser_noun_chunks_coordinated(en_vocab): # fmt: off - text = "A base phrase and a good phrase are often the same." - heads = [2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4] + words = ["A", "base", "phrase", "and", "a", "good", "phrase", "are", "often", "the", "same", "."] + heads = [2, 2, 7, 2, 6, 6, 2, 7, 7, 10, 7, 7] pos = ["DET", "NOUN", "NOUN", "CCONJ", "DET", "ADJ", "NOUN", "VERB", "ADV", "DET", "ADJ", "PUNCT"] deps = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT", "advmod", "det", "attr", "punct"] # fmt: on - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 2 assert chunks[0].text_with_ws == "A base phrase " assert chunks[1].text_with_ws == "a good phrase " -def test_en_parser_noun_chunks_pp_chunks(en_tokenizer): - text = "A phrase with another phrase occurs." - heads = [1, 4, -1, 1, -2, 0, -1] +def test_en_parser_noun_chunks_pp_chunks(en_vocab): + words = ["A", "phrase", "with", "another", "phrase", "occurs", "."] + heads = [1, 5, 1, 4, 2, 5, 5] pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB", "PUNCT"] deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT", "punct"] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 2 assert chunks[0].text_with_ws == "A phrase " assert chunks[1].text_with_ws == "another phrase " -def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer): +def test_en_parser_noun_chunks_appositional_modifiers(en_vocab): # fmt: off - text = "Sam, my brother, arrived to the house." - heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4] + words = ["Sam", ",", "my", "brother", ",", "arrived", "to", "the", "house", "."] + heads = [5, 0, 3, 0, 0, 5, 5, 8, 6, 5] pos = ["PROPN", "PUNCT", "DET", "NOUN", "PUNCT", "VERB", "ADP", "DET", "NOUN", "PUNCT"] deps = ["nsubj", "punct", "poss", "appos", "punct", "ROOT", "prep", "det", "pobj", "punct"] # fmt: on - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 3 assert chunks[0].text_with_ws == "Sam " @@ -65,15 +53,12 @@ def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer): assert chunks[2].text_with_ws == "the house " -def test_en_parser_noun_chunks_dative(en_tokenizer): - text = "She gave Bob a raise." 
- heads = [1, 0, -1, 1, -3, -4] +def test_en_parser_noun_chunks_dative(en_vocab): + words = ["She", "gave", "Bob", "a", "raise", "."] + heads = [1, 1, 1, 4, 1, 1] pos = ["PRON", "VERB", "PROPN", "DET", "NOUN", "PUNCT"] deps = ["nsubj", "ROOT", "dative", "det", "dobj", "punct"] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads - ) + doc = Doc(en_vocab, words=words, pos=pos, deps=deps, heads=heads) chunks = list(doc.noun_chunks) assert len(chunks) == 3 assert chunks[0].text_with_ws == "She " diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py index ee1e6be17..39d8d3b59 100644 --- a/spacy/tests/lang/en/test_sbd.py +++ b/spacy/tests/lang/en/test_sbd.py @@ -1,15 +1,16 @@ import pytest +from spacy.tokens import Doc -from ...util import get_doc, apply_transition_sequence +from ...util import apply_transition_sequence -@pytest.mark.parametrize("text", ["A test sentence"]) +@pytest.mark.parametrize("words", [["A", "test", "sentence"]]) @pytest.mark.parametrize("punct", [".", "!", "?", ""]) -def test_en_sbd_single_punct(en_tokenizer, text, punct): - heads = [2, 1, 0, -1] if punct else [2, 1, 0] +def test_en_sbd_single_punct(en_vocab, words, punct): + heads = [2, 2, 2, 2] if punct else [2, 2, 2] deps = ["dep"] * len(heads) - tokens = en_tokenizer(text + punct) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + words = [*words, punct] if punct else words + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert len(doc) == 4 if punct else 3 assert len(list(doc.sents)) == 1 assert sum(len(sent) for sent in doc.sents) == len(doc) @@ -18,17 +19,16 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct): @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -def test_en_sentence_breaks(en_tokenizer, en_parser): +def test_en_sentence_breaks(en_vocab, en_parser): # fmt: off - text = "This is a sentence . This is another one ." 
- heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3] + words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "one", "."] + heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6] deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det", "attr", "punct"] transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct", "B-ROOT", "L-nsubj", "S", "L-attr", "R-attr", "D", "R-punct"] # fmt: on - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) apply_transition_sequence(en_parser, doc, transition) assert len(list(doc.sents)) == 2 for token in doc: diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index bcf103b65..3810323bf 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -1,6 +1,5 @@ import pytest - -from ...util import get_doc +from spacy.tokens import Doc def test_ru_doc_lemmatization(ru_lemmatizer): @@ -11,7 +10,7 @@ def test_ru_doc_lemmatization(ru_lemmatizer): "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing", ] - doc = get_doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs) + doc = Doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs) doc = ru_lemmatizer(doc) lemmas = [token.lemma_ for token in doc] assert lemmas == ["мама", "мыть", "рама"] @@ -28,7 +27,7 @@ def test_ru_doc_lemmatization(ru_lemmatizer): ], ) def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas): - doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"]) + doc = Doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"]) result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0]) assert sorted(result_lemmas) == lemmas @@ -51,7 +50,7 @@ def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas): def test_ru_lemmatizer_works_with_different_pos_homonyms( ru_lemmatizer, text, pos, morph, lemma ): - doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph]) + doc = Doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph]) result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0]) assert result_lemmas == [lemma] @@ -66,13 +65,13 @@ def test_ru_lemmatizer_works_with_different_pos_homonyms( ], ) def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma): - doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph]) + doc = Doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph]) result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0]) assert result_lemmas == [lemma] def test_ru_lemmatizer_punct(ru_lemmatizer): - doc = get_doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"]) + doc = Doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"]) assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"'] - doc = get_doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"]) + doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"]) assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"'] diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py index 458cdadd5..3791d8021 100644 --- a/spacy/tests/lang/sv/test_noun_chunks.py +++ b/spacy/tests/lang/sv/test_noun_chunks.py @@ -1,6 +1,5 @@ import pytest - -from ...util import get_doc +from spacy.tokens import Doc def test_noun_chunks_is_parsed_sv(sv_tokenizer): @@ -16,21 +15,21 @@ SV_NP_TEST_EXAMPLES = [ "En student läste en bok", # A student read a book ["DET", 
"NOUN", "VERB", "DET", "NOUN"], ["det", "nsubj", "ROOT", "det", "dobj"], - [1, 1, 0, 1, -2], + [1, 2, 2, 4, 2], ["En student", "en bok"], ), ( "Studenten läste den bästa boken.", # The student read the best book ["NOUN", "VERB", "DET", "ADJ", "NOUN", "PUNCT"], ["nsubj", "ROOT", "det", "amod", "dobj", "punct"], - [1, 0, 2, 1, -3, -4], + [1, 1, 4, 4, 1, 1], ["Studenten", "den bästa boken"], ), ( "De samvetslösa skurkarna hade stulit de största juvelerna på söndagen", # The remorseless crooks had stolen the largest jewels that sunday ["DET", "ADJ", "NOUN", "VERB", "VERB", "DET", "ADJ", "NOUN", "ADP", "NOUN"], ["det", "amod", "nsubj", "aux", "root", "det", "amod", "dobj", "case", "nmod"], - [2, 1, 2, 1, 0, 2, 1, -3, 1, -5], + [2, 2, 4, 4, 4, 7, 7, 4, 9, 4], ["De samvetslösa skurkarna", "de största juvelerna", "på söndagen"], ), ] @@ -41,12 +40,9 @@ SV_NP_TEST_EXAMPLES = [ ) def test_sv_noun_chunks(sv_tokenizer, text, pos, deps, heads, expected_noun_chunks): tokens = sv_tokenizer(text) - assert len(heads) == len(pos) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, pos=pos - ) - + words = [t.text for t in tokens] + doc = Doc(tokens.vocab, words=words, heads=heads, deps=deps, pos=pos) noun_chunks = list(doc.noun_chunks) assert len(noun_chunks) == len(expected_noun_chunks) for i, np in enumerate(noun_chunks): diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index 6361a10ce..e18a8f6d8 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -4,16 +4,15 @@ import re import copy from mock import Mock from spacy.matcher import DependencyMatcher -from ..util import get_doc +from spacy.tokens import Doc @pytest.fixture def doc(en_vocab): - text = "The quick brown fox jumped over the lazy fox" - heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] + words = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "fox"] + heads = [3, 3, 3, 4, 4, 4, 8, 8, 5] deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"] - doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) - return doc + return Doc(en_vocab, words=words, heads=heads, deps=deps) @pytest.fixture @@ -236,10 +235,10 @@ def test_dependency_matcher_callback(en_vocab, doc): @pytest.mark.parametrize("op,num_matches", [(".", 8), (".*", 20), (";", 8), (";*", 20)]) def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches): # two sentences to test that all matches are within the same sentence - doc = get_doc( + doc = Doc( en_vocab, words=["a", "b", "c", "d", "e"] * 2, - heads=[0, -1, -2, -3, -4] * 2, + heads=[0, 0, 0, 0, 0, 5, 5, 5, 5, 5], deps=["dep"] * 10, ) match_count = 0 diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 9caf284a3..522356ffc 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -3,7 +3,6 @@ import srsly from mock import Mock from spacy.matcher import PhraseMatcher from spacy.tokens import Doc, Span -from ..util import get_doc def test_matcher_phrase_matcher(en_vocab): @@ -140,10 +139,10 @@ def test_phrase_matcher_string_attrs(en_vocab): pos1 = ["PRON", "VERB", "NOUN"] words2 = ["Yes", ",", "you", "hate", "dogs", "very", "much"] pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"] - pattern = get_doc(en_vocab, words=words1, pos=pos1) + pattern = Doc(en_vocab, words=words1, pos=pos1) matcher = PhraseMatcher(en_vocab, 
attr="POS") matcher.add("TEST", [pattern]) - doc = get_doc(en_vocab, words=words2, pos=pos2) + doc = Doc(en_vocab, words=words2, pos=pos2) matches = matcher(doc) assert len(matches) == 1 match_id, start, end = matches[0] @@ -158,10 +157,10 @@ def test_phrase_matcher_string_attrs_negative(en_vocab): pos1 = ["PRON", "VERB", "NOUN"] words2 = ["matcher:POS-PRON", "matcher:POS-VERB", "matcher:POS-NOUN"] pos2 = ["X", "X", "X"] - pattern = get_doc(en_vocab, words=words1, pos=pos1) + pattern = Doc(en_vocab, words=words1, pos=pos1) matcher = PhraseMatcher(en_vocab, attr="POS") matcher.add("TEST", [pattern]) - doc = get_doc(en_vocab, words=words2, pos=pos2) + doc = Doc(en_vocab, words=words2, pos=pos2) matches = matcher(doc) assert len(matches) == 0 diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 41da7cf49..544701a4c 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -2,8 +2,7 @@ import pytest from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree, is_nonproj_arc from spacy.pipeline._parser_internals import nonproj - -from ..util import get_doc +from spacy.tokens import Doc @pytest.fixture @@ -74,16 +73,10 @@ def test_parser_is_nonproj_tree( assert is_nonproj_tree(multirooted_tree) is True -def test_parser_pseudoprojectivity(en_tokenizer): +def test_parser_pseudoprojectivity(en_vocab): def deprojectivize(proj_heads, deco_labels): - tokens = en_tokenizer("whatever " * len(proj_heads)) - rel_proj_heads = [head - i for i, head in enumerate(proj_heads)] - doc = get_doc( - tokens.vocab, - words=[t.text for t in tokens], - deps=deco_labels, - heads=rel_proj_heads, - ) + words = ["whatever "] * len(proj_heads) + doc = Doc(en_vocab, words=words, deps=deco_labels, heads=proj_heads) nonproj.deprojectivize(doc) return [t.head.i for t in doc], [token.dep_ for token in doc] @@ -94,49 +87,39 @@ def test_parser_pseudoprojectivity(en_tokenizer): labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"] labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"] # fmt: on - assert nonproj.decompose("X||Y") == ("X", "Y") assert nonproj.decompose("X") == ("X", "") assert nonproj.is_decorated("X||Y") is True assert nonproj.is_decorated("X") is False - nonproj._lift(0, tree) assert tree == [2, 2, 2] - assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7 assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10 - # fmt: off proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels) assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2] assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl||dobj", "punct"] - deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels) assert deproj_heads == nonproj_tree assert undeco_labels == labels - proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2) assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1] assert deco_labels == ["advmod||aux", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod||dobj", "advmod", "det", "amod", "punct"] - deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels) assert deproj_heads == nonproj_tree2 assert undeco_labels == labels2 - # if decoration is wrong such that there is no head with the desired label # the structure is kept and the label is undecorated 
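    # Editor's aside (not part of the original patch): a decorated label has
    # the form "dep||head_dep", so nonproj.decompose("acl||dobj") returns
    # ("acl", "dobj"). In the case below, "iobj" names a dependency that no
    # token in this tree carries, so the arc cannot be re-attached and only
    # the decoration is stripped.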
proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2] deco_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl||iobj", "punct"] - deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels) assert deproj_heads == proj_heads assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"] - # if there are two potential new heads, the first one is chosen even if # it"s wrong proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1] diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 9e760c1e7..8648f2018 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,9 +1,11 @@ import pytest - from spacy.lang.en import English -from ..util import get_doc, apply_transition_sequence, make_tempdir -from ... import util -from ...training import Example +from spacy.training import Example +from spacy.tokens import Doc +from spacy import util + +from ..util import apply_transition_sequence, make_tempdir + TRAIN_DATA = [ ( @@ -23,12 +25,11 @@ TRAIN_DATA = [ ] -def test_parser_root(en_tokenizer): - text = "i don't have other assistance" - heads = [3, 2, 1, 0, 1, -2] +def test_parser_root(en_vocab): + words = ["i", "do", "n't", "have", "other", "assistance"] + heads = [3, 3, 3, 3, 5, 3] deps = ["nsubj", "aux", "neg", "ROOT", "amod", "dobj"] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) for t in doc: assert t.dep != 0, t.text @@ -36,13 +37,9 @@ def test_parser_root(en_tokenizer): @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -@pytest.mark.parametrize("text", ["Hello"]) -def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"] - ) - +@pytest.mark.parametrize("words", [["Hello"]]) +def test_parser_parse_one_word_sentence(en_vocab, en_parser, words): + doc = Doc(en_vocab, words=words, heads=[0], deps=["ROOT"]) assert len(doc) == 1 with en_parser.step_through(doc) as _: # noqa: F841 pass @@ -52,24 +49,22 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -def test_parser_initial(en_tokenizer, en_parser): - text = "I ate the pizza with anchovies." 
- # heads = [1, 0, 1, -2, -3, -1, -5] +def test_parser_initial(en_vocab, en_parser): + words = ["I", "ate", "the", "pizza", "with", "anchovies", "."] transition = ["L-nsubj", "S", "L-det"] - tokens = en_tokenizer(text) - apply_transition_sequence(en_parser, tokens, transition) - assert tokens[0].head.i == 1 - assert tokens[1].head.i == 1 - assert tokens[2].head.i == 3 - assert tokens[3].head.i == 3 + doc = Doc(en_vocab, words=words) + apply_transition_sequence(en_parser, doc, transition) + assert doc[0].head.i == 1 + assert doc[1].head.i == 1 + assert doc[2].head.i == 3 + assert doc[3].head.i == 3 -def test_parser_parse_subtrees(en_tokenizer, en_parser): - text = "The four wheels on the bus turned quickly" - heads = [2, 1, 4, -1, 1, -2, 0, -1] +def test_parser_parse_subtrees(en_vocab, en_parser): + words = ["The", "four", "wheels", "on", "the", "bus", "turned", "quickly"] + heads = [2, 2, 6, 2, 5, 3, 6, 6] deps = ["dep"] * len(heads) - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert len(list(doc[2].lefts)) == 2 assert len(list(doc[2].rights)) == 1 assert len(list(doc[2].children)) == 3 @@ -79,15 +74,12 @@ def test_parser_parse_subtrees(en_tokenizer, en_parser): assert len(list(doc[2].subtree)) == 6 -def test_parser_merge_pp(en_tokenizer): - text = "A phrase with another phrase occurs" - heads = [1, 4, -1, 1, -2, 0] +def test_parser_merge_pp(en_vocab): + words = ["A", "phrase", "with", "another", "phrase", "occurs"] + heads = [1, 5, 1, 4, 2, 5] deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"] pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos - ) + doc = Doc(en_vocab, words=words, deps=deps, heads=heads, pos=pos) with doc.retokenize() as retokenizer: for np in doc.noun_chunks: retokenizer.merge(np, attrs={"lemma": np.lemma_}) @@ -100,12 +92,11 @@ def test_parser_merge_pp(en_tokenizer): @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): - text = "a b c d e" - +def test_parser_arc_eager_finalize_state(en_vocab, en_parser): + words = ["a", "b", "c", "d", "e"] # right branching transition = ["R-nsubj", "D", "R-nsubj", "R-nsubj", "D", "R-ROOT"] - tokens = en_tokenizer(text) + tokens = Doc(en_vocab, words=words) apply_transition_sequence(en_parser, tokens, transition) assert tokens[0].n_lefts == 0 @@ -140,7 +131,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): # left branching transition = ["S", "S", "S", "L-nsubj", "L-nsubj", "L-nsubj", "L-nsubj"] - tokens = en_tokenizer(text) + tokens = Doc(en_vocab, words=words) apply_transition_sequence(en_parser, tokens, transition) assert tokens[0].n_lefts == 0 @@ -177,10 +168,10 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): def test_parser_set_sent_starts(en_vocab): # fmt: off words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n'] - heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, 
-32, -1] + heads = [1, 1, 1, 30, 4, 4, 7, 4, 7, 17, 14, 14, 11, 14, 17, 16, 17, 6, 17, 20, 11, 20, 26, 22, 26, 26, 20, 26, 29, 31, 31, 25, 31, 32, 17, 4, 4, 36] deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', ''] # fmt: on - doc = get_doc(en_vocab, words=words, deps=deps, heads=heads) + doc = Doc(en_vocab, words=words, deps=deps, heads=heads) for i in range(len(words)): if i == 0 or i == 3: assert doc[i].is_sent_start is True @@ -201,24 +192,21 @@ def test_overfitting_IO(): for dep in annotations.get("deps", []): parser.add_label(dep) optimizer = nlp.begin_training() - for i in range(100): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["parser"] < 0.0001 - # test the trained model test_text = "I like securities." doc = nlp(test_text) - assert doc[0].dep_ is "nsubj" - assert doc[2].dep_ is "dobj" - assert doc[3].dep_ is "punct" - + assert doc[0].dep_ == "nsubj" + assert doc[2].dep_ == "dobj" + assert doc[3].dep_ == "punct" # Also test the results are still the same after IO with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) - assert doc2[0].dep_ is "nsubj" - assert doc2[2].dep_ is "dobj" - assert doc2[3].dep_ is "punct" + assert doc2[0].dep_ == "nsubj" + assert doc2[2].dep_ == "dobj" + assert doc2[3].dep_ == "punct" diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index f181a799a..8ca4039a2 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -1,59 +1,75 @@ import pytest - -from ..util import get_doc +from spacy.tokens import Doc @pytest.fixture -def text(): - return """ -It was a bright cold day in April, and the clocks were striking thirteen. -Winston Smith, his chin nuzzled into his breast in an effort to escape the -vile wind, slipped quickly through the glass doors of Victory Mansions, -though not quickly enough to prevent a swirl of gritty dust from entering -along with him. - -The hallway smelt of boiled cabbage and old rag mats. At one end of it a -coloured poster, too large for indoor display, had been tacked to the wall. -It depicted simply an enormous face, more than a metre wide: the face of a -man of about forty-five, with a heavy black moustache and ruggedly handsome -features. Winston made for the stairs. It was no use trying the lift. Even at -the best of times it was seldom working, and at present the electric current -was cut off during daylight hours. It was part of the economy drive in -preparation for Hate Week. The flat was seven flights up, and Winston, who -was thirty-nine and had a varicose ulcer above his right ankle, went slowly, -resting several times on the way. On each landing, opposite the lift-shaft, -the poster with the enormous face gazed from the wall. It was one of those -pictures which are so contrived that the eyes follow you about when you move. -BIG BROTHER IS WATCHING YOU, the caption beneath it ran. 
-""" +def words(): + # fmt: off + return [ + "\n", "It", "was", "a", "bright", "cold", "day", "in", "April", ",", + "and", "the", "clocks", "were", "striking", "thirteen", ".", "\n", + "Winston", "Smith", ",", "his", "chin", "nuzzled", "into", "his", + "breast", "in", "an", "effort", "to", "escape", "the", "\n", "vile", + "wind", ",", "slipped", "quickly", "through", "the", "glass", "doors", + "of", "Victory", "Mansions", ",", "\n", "though", "not", "quickly", + "enough", "to", "prevent", "a", "swirl", "of", "gritty", "dust", + "from", "entering", "\n", "along", "with", "him", ".", "\n\n", "The", + "hallway", "smelt", "of", "boiled", "cabbage", "and", "old", "rag", + "mats", ".", "At", "one", "end", "of", "it", "a", "\n", "coloured", + "poster", ",", "too", "large", "for", "indoor", "display", ",", "had", + "been", "tacked", "to", "the", "wall", ".", "\n", "It", "depicted", + "simply", "an", "enormous", "face", ",", "more", "than", "a", "metre", + "wide", ":", "the", "face", "of", "a", "\n", "man", "of", "about", + "forty", "-", "five", ",", "with", "a", "heavy", "black", "moustache", + "and", "ruggedly", "handsome", "\n", "features", ".", "Winston", "made", + "for", "the", "stairs", ".", "It", "was", "no", "use", "trying", "the", + "lift", ".", "Even", "at", "\n", "the", "best", "of", "times", "it", + "was", "seldom", "working", ",", "and", "at", "present", "the", + "electric", "current", "\n", "was", "cut", "off", "during", "daylight", + "hours", ".", "It", "was", "part", "of", "the", "economy", "drive", + "in", "\n", "preparation", "for", "Hate", "Week", ".", "The", "flat", + "was", "seven", "flights", "up", ",", "and", "Winston", ",", "who", + "\n", "was", "thirty", "-", "nine", "and", "had", "a", "varicose", + "ulcer", "above", "his", "right", "ankle", ",", "went", "slowly", ",", + "\n", "resting", "several", "times", "on", "the", "way", ".", "On", + "each", "landing", ",", "opposite", "the", "lift", "-", "shaft", ",", + "\n", "the", "poster", "with", "the", "enormous", "face", "gazed", + "from", "the", "wall", ".", "It", "was", "one", "of", "those", "\n", + "pictures", "which", "are", "so", "contrived", "that", "the", "eyes", + "follow", "you", "about", "when", "you", "move", ".", "\n", "BIG", + "BROTHER", "IS", "WATCHING", "YOU", ",", "the", "caption", "beneath", + "it", "ran", ".", "\n", ] + # fmt: on @pytest.fixture def heads(): # fmt: off - return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2, - -1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1, - -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14, - 1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1, - 0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10, - 9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1, - 2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1, - 3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0, - -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1, - -1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1, - -2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1, - 1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2, - 1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2, - -10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3, - 0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1, - 1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1, - -1, 0, -1, -1] + return [ + 1, 2, 2, 6, 6, 6, 2, 6, 7, 2, 2, 12, 14, 14, 2, 14, 14, 16, 19, 23, 23, + 22, 23, 
23, 23, 26, 24, 23, 29, 27, 31, 29, 35, 32, 35, 31, 23, 23, 37, + 37, 42, 42, 39, 42, 45, 43, 37, 46, 37, 50, 51, 37, 53, 51, 55, 53, 55, + 58, 56, 53, 59, 60, 60, 62, 63, 23, 65, 68, 69, 69, 69, 72, 70, 72, 76, + 76, 72, 69, 96, 80, 78, 80, 81, 86, 83, 86, 96, 96, 89, 96, 89, 92, 90, + 96, 96, 96, 96, 96, 99, 97, 96, 100, 103, 103, 103, 107, 107, 103, 107, + 111, 111, 112, 113, 107, 103, 116, 136, 116, 120, 118, 117, 120, 125, + 125, 125, 121, 116, 116, 131, 131, 131, 127, 131, 134, 131, 134, 136, + 136, 139, 139, 139, 142, 140, 139, 145, 145, 147, 145, 147, 150, 148, + 145, 153, 162, 153, 156, 162, 156, 157, 162, 162, 162, 162, 162, 162, + 172, 165, 169, 169, 172, 169, 172, 162, 172, 172, 176, 174, 172, 179, + 179, 179, 180, 183, 181, 179, 184, 185, 185, 187, 190, 188, 179, 193, + 194, 194, 196, 194, 196, 194, 194, 218, 200, 204, 202, 200, 207, 207, + 204, 204, 204, 212, 212, 209, 212, 216, 216, 213, 200, 194, 218, 218, + 220, 218, 224, 222, 222, 227, 225, 218, 246, 231, 229, 246, 246, 237, + 237, 237, 233, 246, 238, 241, 246, 241, 245, 245, 242, 246, 246, 249, + 247, 246, 252, 252, 252, 253, 257, 255, 254, 259, 257, 261, 259, 265, + 264, 265, 261, 265, 265, 270, 270, 267, 252, 271, 274, 275, 275, 276, + 283, 283, 280, 283, 280, 281, 283, 283, 284] # fmt: on -def test_parser_parse_navigate_consistency(en_tokenizer, text, heads): - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) +def test_parser_parse_navigate_consistency(en_vocab, words, heads): + doc = Doc(en_vocab, words=words, heads=heads) for head in doc: for child in head.lefts: assert child.head == head @@ -61,15 +77,8 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads): assert child.head == head -def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads): - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, - words=[t.text for t in tokens], - heads=heads, - deps=["dep"] * len(heads), - ) - +def test_parser_parse_navigate_child_consistency(en_vocab, words, heads): + doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads)) lefts = {} rights = {} for head in doc: @@ -99,9 +108,8 @@ def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads): assert not children -def test_parser_parse_navigate_edges(en_tokenizer, text, heads): - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) +def test_parser_parse_navigate_edges(en_vocab, words, heads): + doc = Doc(en_vocab, words=words, heads=heads) for token in doc: subtree = list(token.subtree) debug = "\t".join((token.text, token.left_edge.text, subtree[0].text)) diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py index 3672dabea..2b80272d6 100644 --- a/spacy/tests/parser/test_space_attachment.py +++ b/spacy/tests/parser/test_space_attachment.py @@ -1,42 +1,40 @@ import pytest +from spacy.tokens import Doc -from spacy.tokens.doc import Doc - -from ..util import get_doc, apply_transition_sequence +from ..util import apply_transition_sequence -def test_parser_space_attachment(en_tokenizer): - text = "This is a test.\nTo ensure spaces are attached well." 
- heads = [1, 0, 1, -2, -3, -1, 1, 4, -1, 2, 1, 0, -1, -2] +def test_parser_space_attachment(en_vocab): + # fmt: off + words = ["This", "is", "a", "test", ".", "\n", "To", "ensure", " ", "spaces", "are", "attached", "well", "."] + heads = [1, 1, 3, 1, 1, 4, 7, 11, 7, 11, 11, 11, 11, 11] + # fmt: on deps = ["dep"] * len(heads) - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) for sent in doc.sents: if len(sent) == 1: assert not sent[-1].is_space -def test_parser_sentence_space(en_tokenizer): +def test_parser_sentence_space(en_vocab): # fmt: off - text = "I look forward to using Thingamajig. I've been told it will make my life easier..." - heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7] + words = ["I", "look", "forward", "to", "using", "Thingamajig", ".", " ", "I", "'ve", "been", "told", "it", "will", "make", "my", "life", "easier", "..."] + heads = [1, 1, 1, 1, 3, 4, 1, 6, 11, 11, 11, 11, 14, 14, 11, 16, 17, 14, 11] deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "", "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp", "poss", "nsubj", "ccomp", "punct"] # fmt: on - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert len(list(doc.sents)) == 2 @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -def test_parser_space_attachment_leading(en_tokenizer, en_parser): - text = "\t \n This is a sentence ." - heads = [1, 1, 0, 1, -2, -3] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads) +def test_parser_space_attachment_leading(en_vocab, en_parser): + words = ["\t", "\n", "This", "is", "a", "sentence", "."] + heads = [1, 2, 2, 4, 2, 2] + doc = Doc(en_vocab, words=words, heads=heads) assert doc[0].is_space assert doc[1].is_space assert doc[2].text == "This" @@ -50,18 +48,16 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser): @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser): - text = "This is \t a \t\n \n sentence . 
\n\n \n" - heads = [1, 0, -1, 2, -1, -4, -5, -1] +def test_parser_space_attachment_intermediate_trailing(en_vocab, en_parser): + words = ["This", "is", "\t", "a", "\t\n", "\n", "sentence", ".", "\n\n", "\n"] + heads = [1, 1, 1, 5, 3, 1, 1, 6] transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct"] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads) + doc = Doc(en_vocab, words=words, heads=heads) assert doc[2].is_space assert doc[4].is_space assert doc[5].is_space assert doc[8].is_space assert doc[9].is_space - apply_transition_sequence(en_parser, doc, transition) for token in doc: assert token.dep != 0 or token.is_space @@ -72,7 +68,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser): @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) -def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length): +def test_parser_space_attachment_space(en_parser, text, length): doc = Doc(en_parser.vocab, words=text) assert len(doc) == length with en_parser.step_through(doc) as _: # noqa: F841 diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index a66b34bc0..b9e5894dd 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -4,8 +4,9 @@ from spacy.training import Example from spacy.lang.en import English from spacy.pipeline import AttributeRuler from spacy import util, registry +from spacy.tokens import Doc -from ..util import get_doc, make_tempdir +from ..util import make_tempdir @pytest.fixture @@ -66,7 +67,6 @@ def test_attributeruler_init(nlp, pattern_dicts): a = nlp.add_pipe("attribute_ruler") for p in pattern_dicts: a.add(**p) - doc = nlp("This is a test.") assert doc[2].lemma_ == "the" assert doc[2].morph_ == "Case=Nom|Number=Plur" @@ -129,7 +129,7 @@ def test_attributeruler_rule_order(nlp): {"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "NOUN"}}, ] a.add_patterns(patterns) - doc = get_doc( + doc = Doc( nlp.vocab, words=["This", "is", "a", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."], @@ -141,13 +141,12 @@ def test_attributeruler_rule_order(nlp): def test_attributeruler_tag_map(nlp, tag_map): a = AttributeRuler(nlp.vocab) a.load_from_tag_map(tag_map) - doc = get_doc( + doc = Doc( nlp.vocab, words=["This", "is", "a", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."], ) doc = a(doc) - for i in range(len(doc)): if i == 4: assert doc[i].pos_ == "PUNCT" @@ -160,13 +159,12 @@ def test_attributeruler_tag_map(nlp, tag_map): def test_attributeruler_morph_rules(nlp, morph_rules): a = AttributeRuler(nlp.vocab) a.load_from_morph_rules(morph_rules) - doc = get_doc( + doc = Doc( nlp.vocab, words=["This", "is", "the", "test", "."], tags=["DT", "VBZ", "DT", "NN", "."], ) doc = a(doc) - for i in range(len(doc)): if i != 2: assert doc[i].pos_ == "" @@ -193,7 +191,6 @@ def test_attributeruler_indices(nlp): text = "This is a test." 
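    # Editor's aside (not part of the original patch): each pattern added to
    # the AttributeRuler is a Matcher pattern plus an `index` into the matched
    # span, and the attrs are applied to that single token. The later calls in
    # this test pass index=2 and index=10 for a two-token pattern, so running
    # nlp(text) raises a ValueError because the index lies outside the match.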
doc = nlp(text) - for i in range(len(doc)): if i == 1: assert doc[i].lemma_ == "was" @@ -205,12 +202,10 @@ def test_attributeruler_indices(nlp): assert doc[i].lemma_ == "cat" else: assert doc[i].morph_ == "" - # raises an error when trying to modify a token outside of the match a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2) with pytest.raises(ValueError): doc = nlp(text) - # raises an error when trying to modify a token outside of the match a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=10) with pytest.raises(ValueError): @@ -220,7 +215,6 @@ def test_attributeruler_indices(nlp): def test_attributeruler_patterns_prop(nlp, pattern_dicts): a = nlp.add_pipe("attribute_ruler") a.add_patterns(pattern_dicts) - for p1, p2 in zip(pattern_dicts, a.patterns): assert p1["patterns"] == p2["patterns"] assert p1["attrs"] == p2["attrs"] @@ -231,18 +225,15 @@ def test_attributeruler_patterns_prop(nlp, pattern_dicts): def test_attributeruler_serialize(nlp, pattern_dicts): a = nlp.add_pipe("attribute_ruler") a.add_patterns(pattern_dicts) - text = "This is a test." attrs = ["ORTH", "LEMMA", "MORPH"] doc = nlp(text) - # bytes roundtrip a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes()) assert a.to_bytes() == a_reloaded.to_bytes() doc1 = a_reloaded(nlp.make_doc(text)) numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs)) assert a.patterns == a_reloaded.patterns - # disk roundtrip with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py index ee9e34df3..025ac04af 100644 --- a/spacy/tests/pipeline/test_functions.py +++ b/spacy/tests/pipeline/test_functions.py @@ -1,57 +1,38 @@ import pytest from spacy.pipeline.functions import merge_subtokens from spacy.language import Language -from spacy.tokens import Span - -from ..util import get_doc +from spacy.tokens import Span, Doc @pytest.fixture -def doc(en_tokenizer): +def doc(en_vocab): # fmt: off - text = "This is a sentence. This is another sentence. And a third." - heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 1, 1, 1, 0] + words = ["This", "is", "a", "sentence", ".", "This", "is", "another", "sentence", ".", "And", "a", "third", "."] + heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 11, 12, 13, 13] deps = ["nsubj", "ROOT", "subtok", "attr", "punct", "nsubj", "ROOT", "subtok", "attr", "punct", "subtok", "subtok", "subtok", "ROOT"] # fmt: on - tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + return Doc(en_vocab, words=words, heads=heads, deps=deps) @pytest.fixture -def doc2(en_tokenizer): - text = "I like New York in Autumn." - heads = [1, 0, 1, -2, -3, -1, -5] +def doc2(en_vocab): + words = ["I", "like", "New", "York", "in", "Autumn", "."] + heads = [1, 1, 3, 1, 1, 4, 1] tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."] pos = ["PRON", "VERB", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"] deps = ["ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"] - tokens = en_tokenizer(text) - doc = get_doc( - tokens.vocab, - words=[t.text for t in tokens], - heads=heads, - tags=tags, - pos=pos, - deps=deps, - ) - doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])] + doc = Doc(en_vocab, words=words, heads=heads, tags=tags, pos=pos, deps=deps) + doc.ents = [Span(doc, 2, 4, label="GPE")] return doc def test_merge_subtokens(doc): doc = merge_subtokens(doc) - # get_doc() doesn't set spaces, so the result is "And a third ." 
- assert [t.text for t in doc] == [ - "This", - "is", - "a sentence", - ".", - "This", - "is", - "another sentence", - ".", - "And a third .", - ] + # Doc doesn't have spaces, so the result is "And a third ." + # fmt: off + assert [t.text for t in doc] == ["This", "is", "a sentence", ".", "This", "is", "another sentence", ".", "And a third ."] + # fmt: on def test_factories_merge_noun_chunks(doc2): diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 30f66fb1d..d841ee24b 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -9,7 +9,7 @@ from spacy.lang.en import English from spacy.lookups import Lookups from spacy.tokens import Doc, Span -from ..util import get_doc, make_tempdir +from ..util import make_tempdir @pytest.mark.parametrize( @@ -88,12 +88,9 @@ def test_issue242(en_tokenizer): doc.ents += tuple(matches) -def test_issue309(en_tokenizer): +def test_issue309(en_vocab): """Test Issue #309: SBD fails on empty string""" - tokens = en_tokenizer(" ") - doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"] - ) + doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"]) assert len(doc) == 1 sents = list(doc.sents) assert len(sents) == 1 diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 71ed2ea03..dce3e8298 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -14,7 +14,7 @@ from spacy.tokens import Doc, Span, Token from spacy.attrs import HEAD, DEP from spacy.matcher import Matcher -from ..util import make_tempdir, get_doc +from ..util import make_tempdir def test_issue1506(): @@ -197,32 +197,21 @@ def test_issue1807(): def test_issue1834(): """Test that sentence boundaries & parse/tag flags are not lost during serialization.""" - string = "This is a first sentence . 
And another one"
-    words = string.split()
-    doc = get_doc(Vocab(), words=words)
+    words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"]
+    doc = Doc(Vocab(), words=words)
     doc[6].is_sent_start = True
     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
     assert new_doc[6].sent_start
     assert not new_doc.has_annotation("DEP")
     assert not new_doc.has_annotation("TAG")
-    doc = get_doc(
+    doc = Doc(
         Vocab(),
         words=words,
         tags=["TAG"] * len(words),
-        heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
+        heads=[0, 0, 0, 0, 0, 0, 6, 6, 6],
         deps=["dep"] * len(words),
     )
-    print(
-        doc.has_annotation("DEP"),
-        [t.head.i for t in doc],
-        [t.is_sent_start for t in doc],
-    )
     new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
-    print(
-        new_doc.has_annotation("DEP"),
-        [t.head.i for t in new_doc],
-        [t.is_sent_start for t in new_doc],
-    )
     assert new_doc[6].sent_start
     assert new_doc.has_annotation("DEP")
     assert new_doc.has_annotation("TAG")
diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py
index 3bea5d3f6..c4c755153 100644
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ b/spacy/tests/regression/test_issue2001-2500.py
@@ -7,7 +7,7 @@ from spacy.training import iob_to_biluo
 from spacy.lang.it import Italian
 from spacy.lang.en import English
 
-from ..util import add_vecs_to_vocab, get_doc
+from ..util import add_vecs_to_vocab
 
 
 @pytest.mark.skip(
@@ -69,9 +69,10 @@ def test_issue2219(en_vocab):
     assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
 
 
-def test_issue2361(de_tokenizer):
+def test_issue2361(de_vocab):
     chars = ("&lt;", "&gt;", "&amp;", "&quot;")
-    doc = de_tokenizer('< > & " ')
+    words = ["<", ">", "&", '"']
+    doc = Doc(de_vocab, words=words, deps=["dep"] * len(words))
     html = render(doc)
     for char in chars:
         assert char in html
@@ -105,7 +106,7 @@ def test_issue2385_biluo(tags):
 
 def test_issue2396(en_vocab):
     words = ["She", "created", "a", "test", "for", "spacy"]
-    heads = [1, 0, 1, -2, -1, -1]
+    heads = [1, 1, 3, 1, 3, 4]
     deps = ["dep"] * len(heads)
     matrix = numpy.array(
         [
             [0, 1, 1, 1, 1, 1],
             [1, 1, 1, 1, 1, 1],
             [1, 1, 2, 3, 3, 3],
             [1, 1, 3, 3, 3, 3],
             [1, 1, 3, 3, 4, 4],
             [1, 1, 3, 3, 4, 5],
         ],
         dtype=numpy.int32,
     )
-    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     span = doc[:]
     assert (doc.get_lca_matrix() == matrix).all()
     assert (span.get_lca_matrix() == matrix).all()
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index 9267a7346..5895b616e 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -12,8 +12,6 @@ from spacy.compat import pickle
 import numpy
 import random
 
-from ..util import get_doc
-
 
 def test_issue2564():
     """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
@@ -117,12 +115,14 @@ def test_issue2754(en_tokenizer):
 
 def test_issue2772(en_vocab):
     """Test that deprojectivization doesn't mess up sentence boundaries."""
-    words = "When we write or communicate virtually , we can hide our true feelings .".split()
+    # fmt: off
+    words = ["When", "we", "write", "or", "communicate", "virtually", ",", "we", "can", "hide", "our", "true", "feelings", "."]
+    # fmt: on
     # A tree with a non-projective (i.e. crossing) arc
     # The arcs (0, 4) and (2, 9) cross.
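    # Editor's note (not part of the original patch): the rewritten heads
    # below use absolute token indices instead of relative offsets, so
    # heads[0] == 4 attaches "When" to "communicate" and heads[2] == 9
    # attaches "write" to "hide", which are exactly the two crossing arcs
    # named above.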
- heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4] + heads = [4, 2, 9, 2, 2, 4, 9, 9, 9, 9, 12, 12, 9, 9] deps = ["dep"] * len(heads) - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) assert doc[1].is_sent_start is False diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index d848467dd..a64dc53e4 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -10,10 +10,8 @@ from spacy.vocab import Vocab from spacy.attrs import ENT_IOB, ENT_TYPE from spacy.compat import pickle from spacy import displacy -import numpy - from spacy.vectors import Vectors -from ..util import get_doc +import numpy def test_issue3002(): @@ -47,7 +45,7 @@ def test_issue3009(en_vocab): words = ["also", "has", "to", "do", "with"] tags = ["RB", "VBZ", "TO", "VB", "IN"] pos = ["ADV", "VERB", "ADP", "VERB", "ADP"] - doc = get_doc(en_vocab, words=words, tags=tags, pos=pos) + doc = Doc(en_vocab, words=words, tags=tags, pos=pos) matcher = Matcher(en_vocab) for i, pattern in enumerate(patterns): matcher.add(str(i), [pattern]) @@ -61,19 +59,15 @@ def test_issue3012(en_vocab): words = ["This", "is", "10", "%", "."] tags = ["DT", "VBZ", "CD", "NN", "."] pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] - ents = [(2, 4, "PERCENT")] - doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) + ents = [("PERCENT", 2, 4)] + doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) assert doc.has_annotation("TAG") - expected = ("10", "NUM", "CD", "PERCENT") assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - header = [ENT_IOB, ENT_TYPE] ent_array = doc.to_array(header) doc.from_array(header, ent_array) - assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - # Serializing then deserializing doc_bytes = doc.to_bytes() doc2 = Doc(en_vocab).from_bytes(doc_bytes) @@ -85,12 +79,8 @@ def test_issue3199(): is available. To make this test future-proof, we're constructing a Doc with a new Vocab here and a parse tree to make sure the noun chunks run. 
""" - doc = get_doc( - Vocab(), - words=["This", "is", "a", "sentence"], - heads=[0, -1, -2, -3], - deps=["dep"] * 4, - ) + words = ["This", "is", "a", "sentence"] + doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words)) assert list(doc[0:3].noun_chunks) == [] @@ -147,9 +137,9 @@ def test_issue3288(en_vocab): """Test that retokenization works correctly via displaCy when punctuation is merged onto the preceeding token and tensor is resized.""" words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"] - heads = [1, 0, -1, 1, 0, 1, -2, -3] + heads = [1, 1, 1, 4, 4, 6, 4, 4] deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"] - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) doc.tensor = numpy.zeros((len(words), 96), dtype="float32") displacy.render(doc) diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index 8c483d877..a79be6638 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -20,7 +20,7 @@ import spacy import srsly import numpy -from ..util import make_tempdir, get_doc +from ..util import make_tempdir @pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) @@ -355,7 +355,7 @@ def test_issue3882(en_vocab): """Test that displaCy doesn't serialize the doc.user_data when making a copy of the Doc. """ - doc = Doc(en_vocab, words=["Hello", "world"]) + doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"]) doc.user_data["test"] = set() parse_deps(doc) @@ -398,10 +398,10 @@ def test_issue3962(en_vocab): This is achieved by setting the head to itself if it would lie out of the span otherwise.""" # fmt: off words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] - heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3] + heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7] deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] # fmt: on - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) span2 = doc[1:5] # "jests at scars ," doc2 = span2.as_doc() doc2_json = doc2.to_json() @@ -436,10 +436,10 @@ def test_issue3962_long(en_vocab): This is achieved by setting the head to itself if it would lie out of the span otherwise.""" # fmt: off words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] - heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3] + heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7] deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] # fmt: on - two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps) span2 = two_sent_doc[1:7] # "jests at scars. 
They never" doc2 = span2.as_doc() doc2_json = doc2.to_json() diff --git a/spacy/tests/regression/test_issue5001-5500.py b/spacy/tests/regression/test_issue5001-5500.py new file mode 100644 index 000000000..dbfe78679 --- /dev/null +++ b/spacy/tests/regression/test_issue5001-5500.py @@ -0,0 +1,138 @@ +import numpy +from spacy.tokens import Doc, DocBin +from spacy.attrs import DEP, POS, TAG +from spacy.lang.en import English +from spacy.language import Language +from spacy.lang.en.syntax_iterators import noun_chunks +from spacy.vocab import Vocab +import spacy +import pytest + +from ...util import make_tempdir + + +def test_issue5048(en_vocab): + words = ["This", "is", "a", "sentence"] + pos_s = ["DET", "VERB", "DET", "NOUN"] + spaces = [" ", " ", " ", ""] + deps_s = ["dep", "adj", "nn", "atm"] + tags_s = ["DT", "VBZ", "DT", "NN"] + strings = en_vocab.strings + for w in words: + strings.add(w) + deps = [strings.add(d) for d in deps_s] + pos = [strings.add(p) for p in pos_s] + tags = [strings.add(t) for t in tags_s] + attrs = [POS, DEP, TAG] + array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64") + doc = Doc(en_vocab, words=words, spaces=spaces) + doc.from_array(attrs, array) + v1 = [(token.text, token.pos_, token.tag_) for token in doc] + doc2 = Doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s) + v2 = [(token.text, token.pos_, token.tag_) for token in doc2] + assert v1 == v2 + + +def test_issue5082(): + # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens + nlp = English() + vocab = nlp.vocab + array1 = numpy.asarray([0.1, 0.5, 0.8], dtype=numpy.float32) + array2 = numpy.asarray([-0.2, -0.6, -0.9], dtype=numpy.float32) + array3 = numpy.asarray([0.3, -0.1, 0.7], dtype=numpy.float32) + array4 = numpy.asarray([0.5, 0, 0.3], dtype=numpy.float32) + array34 = numpy.asarray([0.4, -0.05, 0.5], dtype=numpy.float32) + vocab.set_vector("I", array1) + vocab.set_vector("like", array2) + vocab.set_vector("David", array3) + vocab.set_vector("Bowie", array4) + text = "I like David Bowie" + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]} + ] + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + parsed_vectors_1 = [t.vector for t in nlp(text)] + assert len(parsed_vectors_1) == 4 + numpy.testing.assert_array_equal(parsed_vectors_1[0], array1) + numpy.testing.assert_array_equal(parsed_vectors_1[1], array2) + numpy.testing.assert_array_equal(parsed_vectors_1[2], array3) + numpy.testing.assert_array_equal(parsed_vectors_1[3], array4) + nlp.add_pipe("merge_entities") + parsed_vectors_2 = [t.vector for t in nlp(text)] + assert len(parsed_vectors_2) == 3 + numpy.testing.assert_array_equal(parsed_vectors_2[0], array1) + numpy.testing.assert_array_equal(parsed_vectors_2[1], array2) + numpy.testing.assert_array_equal(parsed_vectors_2[2], array34) + + +def test_issue5137(): + @Language.factory("my_component") + class MyComponent: + def __init__(self, nlp, name="my_component", categories="all_categories"): + self.nlp = nlp + self.categories = categories + self.name = name + + def __call__(self, doc): + pass + + def to_disk(self, path, **kwargs): + pass + + def from_disk(self, path, **cfg): + pass + + nlp = English() + my_component = nlp.add_pipe("my_component") + assert my_component.categories == "all_categories" + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + overrides = {"components": {"my_component": {"categories": "my_categories"}}} + nlp2 = spacy.load(tmpdir, config=overrides) + 
assert nlp2.get_pipe("my_component").categories == "my_categories" + + +def test_issue5141(en_vocab): + """ Ensure an empty DocBin does not crash on serialization """ + doc_bin = DocBin(attrs=["DEP", "HEAD"]) + assert list(doc_bin.get_docs(en_vocab)) == [] + doc_bin_bytes = doc_bin.to_bytes() + doc_bin_2 = DocBin().from_bytes(doc_bin_bytes) + assert list(doc_bin_2.get_docs(en_vocab)) == [] + + +def test_issue5152(): + # Test that the comparison between a Span and a Token, goes well + # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) + nlp = English() + text = nlp("Talk about being boring!") + text_var = nlp("Talk of being boring!") + y = nlp("Let") + span = text[0:3] # Talk about being + span_2 = text[0:3] # Talk about being + span_3 = text_var[0:3] # Talk of being + token = y[0] # Let + with pytest.warns(UserWarning): + assert span.similarity(token) == 0.0 + assert span.similarity(span_2) == 1.0 + with pytest.warns(UserWarning): + assert span_2.similarity(span_3) < 1.0 + + +def test_issue5458(): + # Test that the noun chuncker does not generate overlapping spans + # fmt: off + words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] + vocab = Vocab(strings=words) + deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] + pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] + heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0] + # fmt: on + en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps) + en_doc.noun_chunks_iterator = noun_chunks + + # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" + nlp = English() + merge_nps = nlp.create_pipe("merge_noun_chunks") + merge_nps(en_doc) diff --git a/spacy/tests/regression/test_issue5048.py b/spacy/tests/regression/test_issue5048.py deleted file mode 100644 index bc52ae82f..000000000 --- a/spacy/tests/regression/test_issue5048.py +++ /dev/null @@ -1,32 +0,0 @@ -import numpy -from spacy.tokens import Doc -from spacy.attrs import DEP, POS, TAG - -from ..util import get_doc - - -def test_issue5048(en_vocab): - words = ["This", "is", "a", "sentence"] - pos_s = ["DET", "VERB", "DET", "NOUN"] - spaces = [" ", " ", " ", ""] - deps_s = ["dep", "adj", "nn", "atm"] - tags_s = ["DT", "VBZ", "DT", "NN"] - - strings = en_vocab.strings - - for w in words: - strings.add(w) - deps = [strings.add(d) for d in deps_s] - pos = [strings.add(p) for p in pos_s] - tags = [strings.add(t) for t in tags_s] - - attrs = [POS, DEP, TAG] - array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64") - - doc = Doc(en_vocab, words=words, spaces=spaces) - doc.from_array(attrs, array) - v1 = [(token.text, token.pos_, token.tag_) for token in doc] - - doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s) - v2 = [(token.text, token.pos_, token.tag_) for token in doc2] - assert v1 == v2 diff --git a/spacy/tests/regression/test_issue5082.py b/spacy/tests/regression/test_issue5082.py deleted file mode 100644 index 76f3a552e..000000000 --- a/spacy/tests/regression/test_issue5082.py +++ /dev/null @@ -1,37 +0,0 @@ -import numpy as np -from spacy.lang.en import English - - -def test_issue5082(): - # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens - nlp = English() - vocab = nlp.vocab - array1 = np.asarray([0.1, 0.5, 0.8], dtype=np.float32) - array2 = np.asarray([-0.2, -0.6, -0.9], 
dtype=np.float32) - array3 = np.asarray([0.3, -0.1, 0.7], dtype=np.float32) - array4 = np.asarray([0.5, 0, 0.3], dtype=np.float32) - array34 = np.asarray([0.4, -0.05, 0.5], dtype=np.float32) - - vocab.set_vector("I", array1) - vocab.set_vector("like", array2) - vocab.set_vector("David", array3) - vocab.set_vector("Bowie", array4) - - text = "I like David Bowie" - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]} - ] - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) - parsed_vectors_1 = [t.vector for t in nlp(text)] - assert len(parsed_vectors_1) == 4 - np.testing.assert_array_equal(parsed_vectors_1[0], array1) - np.testing.assert_array_equal(parsed_vectors_1[1], array2) - np.testing.assert_array_equal(parsed_vectors_1[2], array3) - np.testing.assert_array_equal(parsed_vectors_1[3], array4) - nlp.add_pipe("merge_entities") - parsed_vectors_2 = [t.vector for t in nlp(text)] - assert len(parsed_vectors_2) == 3 - np.testing.assert_array_equal(parsed_vectors_2[0], array1) - np.testing.assert_array_equal(parsed_vectors_2[1], array2) - np.testing.assert_array_equal(parsed_vectors_2[2], array34) diff --git a/spacy/tests/regression/test_issue5137.py b/spacy/tests/regression/test_issue5137.py deleted file mode 100644 index cc7a9bd38..000000000 --- a/spacy/tests/regression/test_issue5137.py +++ /dev/null @@ -1,32 +0,0 @@ -import spacy -from spacy.language import Language -from spacy.lang.en import English -from spacy.tests.util import make_tempdir - - -def test_issue5137(): - @Language.factory("my_component") - class MyComponent: - def __init__(self, nlp, name="my_component", categories="all_categories"): - self.nlp = nlp - self.categories = categories - self.name = name - - def __call__(self, doc): - pass - - def to_disk(self, path, **kwargs): - pass - - def from_disk(self, path, **cfg): - pass - - nlp = English() - my_component = nlp.add_pipe("my_component") - assert my_component.categories == "all_categories" - - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir) - overrides = {"components": {"my_component": {"categories": "my_categories"}}} - nlp2 = spacy.load(tmpdir, config=overrides) - assert nlp2.get_pipe("my_component").categories == "my_categories" diff --git a/spacy/tests/regression/test_issue5141.py b/spacy/tests/regression/test_issue5141.py deleted file mode 100644 index 845454583..000000000 --- a/spacy/tests/regression/test_issue5141.py +++ /dev/null @@ -1,11 +0,0 @@ -from spacy.tokens import DocBin - - -def test_issue5141(en_vocab): - """ Ensure an empty DocBin does not crash on serialization """ - doc_bin = DocBin(attrs=["DEP", "HEAD"]) - assert list(doc_bin.get_docs(en_vocab)) == [] - doc_bin_bytes = doc_bin.to_bytes() - - doc_bin_2 = DocBin().from_bytes(doc_bin_bytes) - assert list(doc_bin_2.get_docs(en_vocab)) == [] diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py deleted file mode 100644 index c7a70a99c..000000000 --- a/spacy/tests/regression/test_issue5152.py +++ /dev/null @@ -1,20 +0,0 @@ -from spacy.lang.en import English -import pytest - - -def test_issue5152(): - # Test that the comparison between a Span and a Token, goes well - # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) 
- nlp = English() - text = nlp("Talk about being boring!") - text_var = nlp("Talk of being boring!") - y = nlp("Let") - span = text[0:3] # Talk about being - span_2 = text[0:3] # Talk about being - span_3 = text_var[0:3] # Talk of being - token = y[0] # Let - with pytest.warns(UserWarning): - assert span.similarity(token) == 0.0 - assert span.similarity(span_2) == 1.0 - with pytest.warns(UserWarning): - assert span_2.similarity(span_3) < 1.0 diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py deleted file mode 100644 index a7a2959df..000000000 --- a/spacy/tests/regression/test_issue5458.py +++ /dev/null @@ -1,23 +0,0 @@ -from spacy.lang.en import English -from spacy.lang.en.syntax_iterators import noun_chunks -from spacy.tests.util import get_doc -from spacy.vocab import Vocab - - -def test_issue5458(): - # Test that the noun chuncker does not generate overlapping spans - # fmt: off - words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] - vocab = Vocab(strings=words) - dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] - pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] - heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10] - # fmt: on - - en_doc = get_doc(vocab, words, pos_tags, heads, dependencies) - en_doc.noun_chunks_iterator = noun_chunks - - # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" - nlp = English() - merge_nps = nlp.create_pipe("merge_noun_chunks") - merge_nps(en_doc) diff --git a/spacy/tests/regression/test_issue5918.py b/spacy/tests/regression/test_issue5918.py index 66280f012..db957709c 100644 --- a/spacy/tests/regression/test_issue5918.py +++ b/spacy/tests/regression/test_issue5918.py @@ -1,5 +1,6 @@ from spacy.lang.en import English from spacy.pipeline import merge_entities +import pytest def test_issue5918(): @@ -22,6 +23,7 @@ def test_issue5918(): assert len(doc.ents) == 3 # make it so that the third span's head is within the entity (ent_iob=I) # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents. 
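    # Editor's note (not part of the original patch): this head re-assignment
    # now triggers a UserWarning, which is why the updated test wraps it in
    # pytest.warns below; the merge behaviour under test is unchanged.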
- doc[29].head = doc[33] + with pytest.warns(UserWarning): + doc[29].head = doc[33] doc = merge_entities(doc) assert len(doc.ents) == 3 diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 1fa0eeaa1..040dd657f 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -1,15 +1,13 @@ import pytest from spacy import displacy from spacy.displacy.render import DependencyRenderer, EntityRenderer -from spacy.tokens import Span +from spacy.tokens import Span, Doc from spacy.lang.fa import Persian -from .util import get_doc - def test_displacy_parse_ents(en_vocab): """Test that named entities on a Doc are converted into displaCy's format.""" - doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) + doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])] ents = displacy.parse_ents(doc) assert isinstance(ents, dict) @@ -20,11 +18,11 @@ def test_displacy_parse_ents(en_vocab): def test_displacy_parse_deps(en_vocab): """Test that deps and tags on a Doc are converted into displaCy's format.""" words = ["This", "is", "a", "sentence"] - heads = [1, 0, 1, -2] + heads = [1, 1, 3, 1] pos = ["DET", "VERB", "DET", "NOUN"] tags = ["DT", "VBZ", "DT", "NN"] deps = ["nsubj", "ROOT", "det", "attr"] - doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps) + doc = Doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps) deps = displacy.parse_deps(doc) assert isinstance(deps, dict) assert deps["words"] == [ @@ -53,7 +51,7 @@ def test_displacy_invalid_arcs(): def test_displacy_spans(en_vocab): """Test that displaCy can render Spans.""" - doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) + doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])] html = displacy.render(doc[1:4], style="ent") assert html.startswith(" > # Construction 2 > from spacy.tokens import Doc +> > words = ["hello", "world", "!"] > spaces = [True, False, False] > doc = Doc(nlp.vocab, words=words, spaces=spaces) > ``` -| Name | Description | -| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | A storage container for lexical types. ~~Vocab~~ | -| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | -| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | -| _keyword-only_ | | -| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | -| tags | A list of strings, of the same length as words, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| pos | A list of strings, of the same length as words, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| morphs | A list of strings, of the same length as words, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| lemmas | A list of strings, of the same length as words, to assign as `token.lemma` for each word. Defaults to `None`. 
~~Optional[List[str]]~~ | -| heads | A list of values, of the same length as words, to assign as the head for each word. Head indices are the absolute position of the head in the doc. Defaults to `None`. ~~Optional[List[int]]~~ | -| deps | A list of strings, of the same length as words, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | -| sent_starts | A list of values, of the same length as words, to assign as token.is_sent_start. Will be overridden by heads if heads is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ | -| ents | A list of spans to assign as doc.ents. Defaults to `None`. ~~Optional[List[Span]]~~ | +| Name | Description | +| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | A storage container for lexical types. ~~Vocab~~ | +| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | +| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | +| _keyword-only_ | | +| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | +| `tags` 3 | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `pos` 3 | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `morphs` 3 | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `lemmas` 3 | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `heads` 3 | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ | +| `deps` 3 | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | +| `sent_starts` 3 | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~ | +| `ents` 3 | A list of `(label, start, end)` tuples to assign as `doc.ents`. Note that the `start` and `end` indices here refer to the token indices. Defaults to `None`. ~~Optional[List[Tuple[Union[str, int], int, int]]]~~ | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} @@ -281,6 +282,19 @@ ancestor is found, e.g. if span excludes a necessary ancestor. Check whether the doc contains annotation on a token attribute. + + +This method replaces the previous boolean attributes like `Doc.is_tagged`, +`Doc.is_parsed` or `Doc.is_sentenced`. + +```diff +doc = nlp("This is a text") +- assert doc.is_parsed ++ assert doc.has_annotation("DEP") +``` + + + | Name | Description | | ------------------ | --------------------------------------------------------------------------------------------------- | | `attr` | The attribute string name or int ID. 
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 5abeb5707..406ba4b75 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -530,6 +530,8 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
   [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
   patterns as the second argument (instead of a variable number of arguments).
   The `on_match` callback becomes an optional keyword argument.
+- The `Doc` flags like `Doc.is_parsed` or `Doc.is_tagged` have been replaced by
+  [`Doc.has_annotation`](/api/doc#has_annotation).
 - The `spacy.gold` module has been renamed to
   [`spacy.training`](%%GITHUB_SPACY/spacy/training).
 - The `PRON_LEMMA` symbol and `-PRON-` as an indicator for pronoun lemmas has
@@ -807,10 +809,11 @@ nlp = spacy.blank("en")

 ### Migrating Doc flags {#migrating-doc-flags}

-The `Doc` flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and
-`Doc.is_sentenced` are deprecated in v3 and replaced by
+The [`Doc`](/api/doc) flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and
+`Doc.is_sentenced` are deprecated in v3.0 and replaced by the
 [`Doc.has_annotation`](/api/doc#has_annotation) method, which refers to the
-token attribute symbols (the same symbols used in `Matcher` patterns):
+token attribute symbols (the same symbols used in [`Matcher`](/api/matcher)
+patterns):

 ```diff
 doc = nlp(text)

From 3abc4a5adb9c29605de89ab984190f64d88190b4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 21 Sep 2020 22:58:03 +0200
Subject: [PATCH 19/30] Slightly tidy doc.ents.__set__

---
 spacy/tokens/doc.pyx | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index f81e4a96b..b82bab294 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -663,11 +663,14 @@ cdef class Doc:
         tokens_in_ents = {}
         cdef attr_t entity_type
         cdef attr_t kb_id
-        cdef int ent_start, ent_end
+        cdef int ent_start, ent_end, token_index
         for ent_info in ents:
-            entity_type, kb_id, ent_start, ent_end = get_entity_info(ent_info, self.vocab)
+            entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info)
+            if isinstance(entity_type_, str):
+                self.vocab.strings.add(entity_type_)
+            entity_type = self.vocab.strings.as_int(entity_type_)
             for token_index in range(ent_start, ent_end):
-                if token_index in tokens_in_ents.keys():
+                if token_index in tokens_in_ents:
                     raise ValueError(Errors.E103.format(
                         span1=(tokens_in_ents[token_index][0],
                                tokens_in_ents[token_index][1],
@@ -1583,7 +1586,7 @@ def fix_attributes(doc, attributes):
         attributes[ENT_TYPE] = attributes["ent_type"]


-def get_entity_info(ent_info, vocab):
+def get_entity_info(ent_info):
     if isinstance(ent_info, Span):
         ent_type = ent_info.label
         ent_kb_id = ent_info.kb_id
@@ -1596,6 +1599,4 @@ def get_entity_info(ent_info, vocab):
         ent_type, ent_kb_id, start, end = ent_info
     else:
         ent_id, ent_kb_id, ent_type, start, end = ent_info
-    if isinstance(ent_type, str):
-        ent_type = vocab.strings.add(ent_type)
     return ent_type, ent_kb_id, start, end
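
Editor's note: the refactor above moves string interning into the one place that needs it. For readers unfamiliar with the pattern, a rough, self-contained sketch of how spaCy's `StringStore` round-trips labels, using only calls that appear in this series (`add`, `as_int`); the variable names are invented:

```python
from spacy.strings import StringStore

strings = StringStore()
label_hash = strings.add("ORG")      # intern the label, get its 64-bit hash
assert strings.as_int("ORG") == label_hash
assert strings[label_hash] == "ORG"  # and back again
```
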
From fa5c416db646b919153a362c02f842c7a19dbb9e Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 21 Sep 2020 23:09:22 +0200
Subject: [PATCH 20/30] initialize through nlp object and with train_corpus

---
 spacy/cli/debug_model.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 3d76cdbde..017bcd239 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -1,5 +1,9 @@
+import warnings
 from typing import Dict, Any, Optional, Iterable
 from pathlib import Path
+
+from spacy.training import Example
+from spacy.util import dot_to_object
 from wasabi import msg
 from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation, set_gpu_allocator
@@ -71,12 +75,10 @@ def debug_model_cli(
             exits=1,
         )
     model = pipe.model
-    # call _link_components directly as we won't call nlp.begin_training
-    nlp._link_components()
-    debug_model(nlp, model, print_settings=print_settings)
+    debug_model(config, nlp, model, print_settings=print_settings)


-def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
+def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
     if not isinstance(model, Model):
         msg.fail(
             f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
@@ -93,10 +95,21 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] =

     # STEP 1: Initializing the model and printing again
     X = _get_docs()
-    _set_output_dim(nO=7, model=model)
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
-        model.initialize(X=X)
+        # msg.info(f"Could not initialize the model with dummy data - using the train_corpus.")
+        try:
+            train_corpus = dot_to_object(config, config["training"]["train_corpus"])
+            nlp.begin_training(lambda: train_corpus(nlp))
+            msg.info("Initialized the model with the training corpus.")
+        except ValueError:
+            try:
+                _set_output_dim(nO=7, model=model)
+                nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
+                msg.info("Initialized the model with dummy data.")
+            except:
+                msg.fail("Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.", exits=1)
+
     if print_settings.get("print_after_init"):
         msg.divider(f"STEP 1 - after initialization")
         _print_model(model, print_settings)
@@ -114,8 +127,7 @@ def debug_model(nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] =
         if tok2vec:
             tok2vec.predict(X)
         Y, get_dX = model.begin_update(X)
-        # simulate a goldY value
-        if not goldY:
+        if goldY is None:
             goldY = _simulate_gold(Y)
         dY = get_gradient(goldY, Y, model.ops)
         get_dX(dY)
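
Editor's note: the `dot_to_object` helper imported above resolves a dotted path like `"corpora.train"` against the config. A rough sketch of the behavior the patch relies on (the config keys here are made up for illustration):

```python
from spacy.util import dot_to_object

config = {
    "training": {"train_corpus": "corpora.train"},
    "corpora": {"train": {"@readers": "spacy.Corpus.v1"}},
}
# Resolve the dotted reference stored under training.train_corpus
section = dot_to_object(config, config["training"]["train_corpus"])
assert section == {"@readers": "spacy.Corpus.v1"}
# A path that doesn't exist raises a KeyError instead
```
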
From 45b29c4a5b926c8f85b0a2ed4a9b8be13c5bf7eb Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 21 Sep 2020 23:17:23 +0200
Subject: [PATCH 21/30] cleanup

---
 spacy/cli/debug_model.py | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 017bcd239..1d27c7c52 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -78,7 +78,9 @@ def debug_model_cli(
     debug_model(config, nlp, model, print_settings=print_settings)


-def debug_model(config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
+def debug_model(
+    config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None
+):
     if not isinstance(model, Model):
         msg.fail(
             f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
@@ -97,7 +99,6 @@ def debug_model(
     X = _get_docs()
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
-        # msg.info(f"Could not initialize the model with dummy data - using the train_corpus.")
         try:
             train_corpus = dot_to_object(config, config["training"]["train_corpus"])
             nlp.begin_training(lambda: train_corpus(nlp))
@@ -108,7 +109,10 @@ def debug_model(
                 nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
                 msg.info("Initialized the model with dummy data.")
             except:
-                msg.fail("Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.", exits=1)
+                msg.fail(
+                    "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
+                    exits=1,
+                )

     if print_settings.get("print_after_init"):
         msg.divider(f"STEP 1 - after initialization")
@@ -121,7 +125,6 @@ def debug_model(
     tok2vec = None
     if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
         tok2vec = nlp.get_pipe("tok2vec")
-        tok2vec.model.initialize(X=X)
     goldY = None
     for e in range(3):
         if tok2vec:
@@ -145,17 +148,17 @@ def debug_model(
     msg.good(f"Successfully ended analysis - model looks good.")

+def get_gradient(goldY, Y, ops):
+    return ops.asarray(Y) - ops.asarray(goldY)
+
+
 def _simulate_gold(element, counter=1):
     if isinstance(element, Iterable):
         for i in range(len(element)):
-            element[i] = _simulate_gold(element[i], counter+i)
+            element[i] = _simulate_gold(element[i], counter + i)
         return element
     else:
-        return 1/counter
-
-
-def get_gradient(goldY, Y, ops):
-    return ops.asarray(Y) - ops.asarray(goldY)
+        return 1 / counter


 def _sentences():
@@ -229,12 +232,3 @@ def _print_matrix(value):
         sample_matrix = sample_matrix[0:5]
     result = result + str(sample_matrix)
     return result
-
-
-def _set_output_dim(model, nO):
-    # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx
-    if model.has_dim("nO") is None:
-        model.set_dim("nO", nO)
-    if model.has_ref("output_layer"):
-        if model.get_ref("output_layer").has_dim("nO") is None:
-            model.get_ref("output_layer").set_dim("nO", nO)
\ No newline at end of file
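
Editor's note: to make the reordered `_simulate_gold` helper concrete, it walks any nested structure and replaces every scalar with `1 / counter`, offsetting the counter by position. A self-contained trace, with the helper body copied from the diff above:

```python
from typing import Iterable

def _simulate_gold(element, counter=1):
    # Copied from the patch: recursively fill a nested structure with 1/counter
    if isinstance(element, Iterable):
        for i in range(len(element)):
            element[i] = _simulate_gold(element[i], counter + i)
        return element
    return 1 / counter

# Counters: row 0 starts at 1 -> [1/1, 1/2]; row 1 starts at 2 -> [1/2]
assert _simulate_gold([[0.0, 0.0], [0.0]]) == [[1.0, 0.5], [0.5]]
```
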
From 69f7e52c26ef545fb9e39cd748666ae451318c77 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 22 Sep 2020 09:10:06 +0200
Subject: [PATCH 22/30] Update README.md

---
 spacy/tests/README.md | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/spacy/tests/README.md b/spacy/tests/README.md
index 86bbd52da..833dc9266 100644
--- a/spacy/tests/README.md
+++ b/spacy/tests/README.md
@@ -38,18 +38,17 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji #

 ## Dos and don'ts

-To keep the behaviour of the tests consistent and predictable, we try to follow a few basic conventions:
+To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions:

 - **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`.
 - If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory.
-- Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behaviour, use `assert not` in your test.
-- Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behaviour, consider adding an additional simpler version.
+- Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test.
+- Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behavior, consider adding an additional simpler version.
 - If tests require **loading the models**, they should be added to the [`spacy-models`](https://github.com/explosion/spacy-models) tests.
-- Before requiring the models, always make sure there is no other way to test the particular behaviour. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this.
+- Before requiring the models, always make sure there is no other way to test the particular behavior. In a lot of cases, it's sufficient to simply create a `Doc` object manually. See the section on [helpers and utility functions](#helpers-and-utilities) for more info on this.
 - **Avoid unnecessary imports.** There should never be a need to explicitly import spaCy at the top of a file, and many components are available as [fixtures](#fixtures). You should also avoid wildcard imports (`from module import *`).
 - If you're importing from spaCy, **always use absolute imports**. For example: `from spacy.language import Language`.
-- Don't forget the **unicode declarations** at the top of each file. This way, unicode strings won't have to be prefixed with `u`.
-- Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behaviour at a time.
+- Try to keep the tests **readable and concise**. Use clear and descriptive variable names (`doc`, `tokens` and `text` are great), keep it short and only test for one behavior at a time.

 ## Parameters

@@ -77,7 +76,7 @@ To test for combinations of parameters, you can add several `parametrize` marker
 @pytest.mark.parametrize('punct', ['.', '!', '?'])
 ```

-This will run the test with all combinations of the two parameters `text` and `punct`. **Use this feature sparingly**, though, as it can easily cause unneccessary or undesired test bloat.
+This will run the test with all combinations of the two parameters `text` and `punct`. **Use this feature sparingly**, though, as it can easily cause unnecessary or undesired test bloat.

 ## Fixtures

@@ -104,9 +103,9 @@ If all tests in a file require a specific configuration, or use the same complex

 Our new test setup comes with a few handy utility functions that can be imported from [`util.py`](util.py).

-### Constructing a `Doc` object manually with
+### Constructing a `Doc` object manually

 Loading the models is expensive and not necessary if you're not actually testing the model performance. If all you need is a `Doc` object with annotations like heads, POS tags or the dependency parse, you can construct it manually.
 ```python
 def test_doc_token_api_strings(en_vocab):
     ...
 ```

From beb766d0a09509a7d91518e60c990489789978e0 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 22 Sep 2020 09:15:57 +0200
Subject: [PATCH 23/30] Add test

---
 spacy/tests/doc/test_doc_api.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 2c22926e9..163de5ab0 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -455,3 +455,16 @@ def test_is_flags_deprecated(en_tokenizer):
         doc.is_nered
     with pytest.deprecated_call():
         doc.is_sentenced
+
+
+def test_doc_set_ents():
+    """Test that both strings and integers can be used to set entities in
+    tuple format via doc.ents."""
+    words = ["a", "b", "c", "d", "e"]
+    doc = Doc(Vocab(), words=words)
+    doc.ents = [("HELLO", 0, 2), (doc.vocab.strings.add("WORLD"), 3, 5)]
+    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
+    vocab = Vocab()
+    ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
+    doc = Doc(vocab, words=words, ents=ents)
+    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
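
Editor's note: taken together with the `doc.ents` refactor in the earlier patch, the setter now normalizes whatever it is given, entity by entity. A quick hypothetical sketch mixing the two accepted forms, `Span` objects and `(label, start, end)` tuples:

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc, Span

words = ["a", "b", "c", "d", "e"]
doc = Doc(Vocab(), words=words)
# Spans and tuples can be mixed; string labels are interned on assignment
doc.ents = [Span(doc, 0, 2, label="HELLO"), ("WORLD", 3, 5)]
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
```
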
From fc9c78da25202322c9ec042b529a6a3f91d48e4d Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 22 Sep 2020 09:23:47 +0200
Subject: [PATCH 24/30] Add MorphAnalysis to API sidebar

---
 website/meta/sidebars.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index e27817c92..28915ebb7 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -119,6 +119,7 @@
         { "text": "Corpus", "url": "/api/corpus" },
         { "text": "KnowledgeBase", "url": "/api/kb" },
         { "text": "Lookups", "url": "/api/lookups" },
+        { "text": "MorphAnalysis", "url": "/api/morphanalysis" },
         { "text": "Morphology", "url": "/api/morphology" },
         { "text": "Scorer", "url": "/api/scorer" },
         { "text": "StringStore", "url": "/api/stringstore" },

From 844db6ff12441f63f51d4d9921cdaf4e6af61a04 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 22 Sep 2020 09:31:47 +0200
Subject: [PATCH 25/30] Update architecture overview

---
 website/docs/usage/101/_architecture.md | 32 ++++++++++++------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md
index 98011f173..6e9120022 100644
--- a/website/docs/usage/101/_architecture.md
+++ b/website/docs/usage/101/_architecture.md
@@ -65,22 +65,22 @@ Matchers help you find and extract information from [`Doc`](/api/doc) objects
 based on match patterns describing the sequences you're looking for. A matcher
 operates on a `Doc` and gives you access to the matched tokens **in context**.

-| Name                                          | Description |
-| --------------------------------------------- | ----------- |
-| [`Matcher`](/api/matcher)                     | Match sequences of tokens, based on pattern rules, similar to regular expressions. |
-| [`PhraseMatcher`](/api/phrasematcher)         | Match sequences of tokens based on phrases. |
-| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using the [Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |
+| Name                                          | Description |
+| --------------------------------------------- | ----------- |
+| [`Matcher`](/api/matcher)                     | Match sequences of tokens, based on pattern rules, similar to regular expressions. |
+| [`PhraseMatcher`](/api/phrasematcher)         | Match sequences of tokens based on phrases. |
+| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using [Semgrex operators](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |

 ### Other classes {#architecture-other}

-| Name                                              | Description |
-| ------------------------------------------------- | ----------- |
-| [`Vocab`](/api/vocab)                             | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. |
-| [`StringStore`](/api/stringstore)                 | Map strings to and from hash values. |
-| [`Vectors`](/api/vectors)                         | Container class for vector data keyed by string. |
-| [`Lookups`](/api/lookups)                         | Container for convenient access to large lookup tables and dictionaries. |
-| [`Morphology`](/api/morphology)                   | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
-| [`MorphAnalysis`](/api/morphology#morphanalysis)  | A morphological analysis. |
-| [`KnowledgeBase`](/api/kb)                        | Storage for entities and aliases of a knowledge base for entity linking. |
-| [`Scorer`](/api/scorer)                           | Compute evaluation scores. |
-| [`Corpus`](/api/corpus)                           | Class for managing annotated corpora for training and evaluation data. |
+| Name                                              | Description |
+| ------------------------------------------------- | ----------- |
+| [`Vocab`](/api/vocab)                             | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. |
+| [`StringStore`](/api/stringstore)                 | Map strings to and from hash values. |
+| [`Vectors`](/api/vectors)                         | Container class for vector data keyed by string. |
+| [`Lookups`](/api/lookups)                         | Container for convenient access to large lookup tables and dictionaries. |
+| [`Morphology`](/api/morphology)                   | Store morphological analyses and map them to and from hash values. |
+| [`MorphAnalysis`](/api/morphology#morphanalysis)  | A morphological analysis. |
+| [`KnowledgeBase`](/api/kb)                        | Storage for entities and aliases of a knowledge base for entity linking. |
+| [`Scorer`](/api/scorer)                           | Compute evaluation scores. |
+| [`Corpus`](/api/corpus)                           | Class for managing annotated corpora for training and evaluation data. |
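
Editor's note: as a concrete companion to the matcher table above, a small hedged sketch of the `Matcher`/`PhraseMatcher` split. The pattern names and text are made up; `Matcher.add` takes a list of patterns, per the v3 API change noted earlier in this series:

```python
import spacy
from spacy.matcher import Matcher, PhraseMatcher

nlp = spacy.blank("en")

# Token-pattern matching: rules over token attributes
matcher = Matcher(nlp.vocab)
matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])

# Phrase matching: the patterns are Doc objects, matched here on LOWER
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
phrase_matcher.add("GREETING", [nlp("hello world")])

doc = nlp("Hello world!")
assert len(matcher(doc)) == 1
assert len(phrase_matcher(doc)) == 1
```
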
From e05d6d358d04166779093d2acff0e2c3bb95fe04 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 22 Sep 2020 09:36:37 +0200
Subject: [PATCH 26/30] Update API sidebar MorphAnalysis link

---
 website/meta/sidebars.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 28915ebb7..c5404b68e 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -119,7 +119,7 @@
         { "text": "Corpus", "url": "/api/corpus" },
         { "text": "KnowledgeBase", "url": "/api/kb" },
         { "text": "Lookups", "url": "/api/lookups" },
-        { "text": "MorphAnalysis", "url": "/api/morphanalysis" },
+        { "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" },
         { "text": "Morphology", "url": "/api/morphology" },
         { "text": "Scorer", "url": "/api/scorer" },
         { "text": "StringStore", "url": "/api/stringstore" },

From 6316d5f3989a53e4868cd346256fa614bd49e711 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 22 Sep 2020 09:45:34 +0200
Subject: [PATCH 27/30] Improve messages in project CLI [ci skip]

---
 spacy/cli/project/assets.py | 1 +
 spacy/cli/project/run.py    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py
index 8a3aaff25..58f59a3f9 100644
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@@ -66,6 +66,7 @@ def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
                 branch=asset["git"].get("branch"),
                 sparse=sparse_checkout,
             )
+            msg.good(f"Downloaded asset {dest}")
         else:
             url = asset.get("url")
             if not url:
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index d7e1075f3..69c49fba7 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -59,7 +59,7 @@ def project_run(
     for dep in cmd.get("deps", []):
         if not (project_dir / dep).exists():
             err = f"Missing dependency specified by command '{subcommand}': {dep}"
-            err_help = "Maybe you forgot to run the 'project assets' command?"
+            err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
             err_kwargs = {"exits": 1} if not dry else {}
             msg.fail(err, err_help, **err_kwargs)
     with working_dir(project_dir) as current_dir:

From f9af7d365c228a8113e6db66d5bc4941c2546d88 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 22 Sep 2020 09:45:41 +0200
Subject: [PATCH 28/30] Update docs [ci skip]

---
 website/docs/api/language.md              |  2 +-
 website/docs/usage/linguistic-features.md | 14 ++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index ffdae9ec6..a7b9c0d88 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -187,7 +187,7 @@ more efficient than processing texts one-by-one.
 > ```python
 > texts = ["One document.", "...", "Lots of documents"]
 > for doc in nlp.pipe(texts, batch_size=50):
->     assert doc.is_parsed
+>     assert doc.has_annotation("DEP")
 > ```

 | Name | Description |
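
Editor's note: a runnable variant of the `nlp.pipe` example above that needs no trained pipeline. A blank pipeline with a sentencizer sets sentence boundaries, which `has_annotation` can check via the `SENT_START` attribute (checking `"DEP"`, as in the doc example, would require a pipeline with a parser):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
texts = ["One document.", "...", "Lots of documents"]
for doc in nlp.pipe(texts, batch_size=50):
    # v3 check replacing the old doc.is_sentenced / doc.is_parsed flags
    assert doc.has_annotation("SENT_START")
```
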
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index a229c18e9..914e18acb 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -205,9 +205,10 @@ acquired from [WordNet](https://wordnet.princeton.edu/).
 spaCy features a fast and accurate syntactic dependency parser, and has a rich
 API for navigating the tree. The parser also powers the sentence boundary
 detection, and lets you iterate over base noun phrases, or "chunks". You can
-check whether a [`Doc`](/api/doc) object has been parsed with the
-`doc.is_parsed` attribute, which returns a boolean value. If this attribute is
-`False`, the default sentence iterator will raise an exception.
+check whether a [`Doc`](/api/doc) object has been parsed by calling
+`doc.has_annotation("DEP")`, which checks whether the attribute `Token.dep` has
+been set, and returns a boolean value. If the result is `False`, the default
+sentence iterator will raise an exception.

@@ -1705,9 +1706,10 @@ and can still be overwritten by the parser.

 To prevent inconsistent state, you can only set boundaries **before** a document
-is parsed (and `doc.is_parsed` is `False`). To ensure that your component is
-added in the right place, you can set `before='parser'` or `first=True` when
-adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
+is parsed (and `doc.has_annotation("DEP")` is `False`). To ensure that your
+component is added in the right place, you can set `before='parser'` or
+`first=True` when adding it to the pipeline using
+[`nlp.add_pipe`](/api/language#add_pipe).

From db7126ead9675d70212c33ab9f09d2f67d72cf77 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 22 Sep 2020 10:31:26 +0200
Subject: [PATCH 29/30] Increment version

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index ec3c168a5..b57bbeda2 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a20"
+__version__ = "3.0.0a21"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From 5e3b796b122fc9b1125f350b5dcda625fd9740f0 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 22 Sep 2020 12:24:39 +0200
Subject: [PATCH 30/30] Validate section refs in debug config

---
 spacy/cli/debug_config.py | 27 +++++++++++++++++++++++++--
 spacy/tests/test_cli.py   | 15 ++++++++++++++-
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index 7930d0674..d07a0bb2d 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -2,7 +2,7 @@ from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
 from wasabi import msg, table
 from thinc.api import Config
-from thinc.config import VARIABLE_RE
+from thinc.config import VARIABLE_RE, ConfigValidationError
 import typer

 from ._util import Arg, Opt, show_validation_error, parse_config_overrides
@@ -51,7 +51,10 @@ def debug_config(
     msg.divider("Config validation")
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
-        nlp, _ = util.load_model_from_config(config)
+        nlp, resolved = util.load_model_from_config(config)
+        # Use the resolved config here in case user has one function returning
+        # a dict of corpora etc.
+        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
     msg.good("Config is valid")
     if show_vars:
         variables = get_variables(config)
@@ -93,3 +96,23 @@ def get_variables(config: Config) -> Dict[str, Any]:
         value = util.dot_to_object(config, path)
         result[variable] = repr(value)
     return result
+
+
+def check_section_refs(config: Config, fields: List[str]) -> None:
+    """Validate fields in the config that refer to other sections or values
+    (e.g. in the corpora) and make sure that those references exist.
+ """ + errors = [] + for field in fields: + # If the field doesn't exist in the config, we ignore it + try: + value = util.dot_to_object(config, field) + except KeyError: + continue + try: + util.dot_to_object(config, value) + except KeyError: + msg = f"not a valid section reference: {value}" + errors.append({"loc": field.split("."), "msg": msg}) + if errors: + raise ConfigValidationError(config, errors) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index a9c9d8ca5..1bc246fef 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -7,7 +7,8 @@ from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR -from thinc.config import ConfigValidationError +from spacy.cli.debug_config import check_section_refs +from thinc.config import ConfigValidationError, Config import srsly import os @@ -413,3 +414,15 @@ def test_string_to_list(value): def test_string_to_list_intify(value): assert string_to_list(value, intify=False) == ["1", "2", "3"] assert string_to_list(value, intify=True) == [1, 2, 3] + + +def test_check_section_refs(): + config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}} + config = Config(config) + # Valid section reference + check_section_refs(config, ["a.b.c"]) + # Section that doesn't exist in this config + check_section_refs(config, ["x.y.z"]) + # Invalid section reference + with pytest.raises(ConfigValidationError): + check_section_refs(config, ["a.b.c", "f.g"])