Merge branch 'develop' into master-tmp

2025-09-04 03:15:00 +03:00 · 2020-07-20 14:58:04 +02:00 · 2020-07-20 14:58:04 +02:00 · 644074b954
commit 644074b954
parent a8978ca285 cb65b36839
859 changed files with 50848 additions and 66758 deletions
--- a/.github/contributors/tiangolo.md
+++ b/.github/contributors/tiangolo.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [ ] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Sebastián Ramírez    |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2020-07-01           |
+| GitHub username                | tiangolo             |
+| Website (optional)             |                      |
--- a/.gitignore
+++ b/.gitignore
@ -44,6 +44,7 @@ __pycache__/
 .env*
 .~env/
 .venv
+env3.6/
 venv/
 env3.*/
 .dev
@ -119,3 +120,6 @@ Desktop.ini

 # Pycharm project files
 *.idea
+
+# IPython
+.ipynb_checkpoints/
--- a/.travis.yml
+++ b/.travis.yml
@ -1,23 +0,0 @@
-language: python
-sudo: false
-cache: pip
-dist: trusty
-group: edge
-python:
-   - "2.7"
-os:
-  - linux
-install:
-  - "pip install -r requirements.txt"
-  - "python setup.py build_ext --inplace"
-  - "pip install -e ."
-script:
-  - "cat /proc/cpuinfo | grep flags | head -n 1"
-  - "python -m pytest --tb=native spacy"
-branches:
-  except:
-    - spacy.io
-notifications:
-  slack:
-    secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
-  email: false
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -280,23 +280,7 @@ except:  # noqa: E722

 ### Python conventions

-All Python code must be written in an **intersection of Python 2 and Python 3**.
-This is easy in Cython, but somewhat ugly in Python. Logic that deals with
-Python or platform compatibility should only live in
-[`spacy.compat`](spacy/compat.py). To distinguish them from the builtin
-functions, replacement functions are suffixed with an underscore, for example
-`unicode_`. If you need to access the user's version or platform information,
-for example to show more specific error messages, you can use the `is_config()`
-helper function.
-
-```python
-from .compat import unicode_, is_config
-
-compatible_unicode = unicode_('hello world')
-if is_config(windows=True, python2=True):
-    print("You are using Python 2 on Windows.")
-```
-
+All Python code must be **compatible with Python 3.6+**.
 Code that interacts with the file-system should accept objects that follow the
 `pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`.
 If the function is user-facing and takes a path as an argument, it should check
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,8 +1,7 @@
 recursive-include include *.h
-recursive-include spacy *.txt *.pyx *.pxd
+recursive-include spacy *.pyx *.pxd *.txt *.cfg
 include LICENSE
 include README.md
-include bin/spacy
 include pyproject.toml
 recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
--- a/Makefile
+++ b/Makefile
@ -5,7 +5,7 @@ VENV := ./env$(PYVER)
 version := $(shell "bin/get-version.sh")

 dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
-	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core
+	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core
 	chmod a+rx $@
 	cp $@ dist/spacy.pex

@ -15,7 +15,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl

 wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
 	$(VENV)/bin/pip wheel . -w ./wheelhouse
-	$(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core -w ./wheelhouse
+	$(VENV)/bin/pip wheel spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core -w ./wheelhouse
 	touch $@

 wheelhouse/pytest-%.whl : $(VENV)/bin/pex
--- a/README.md
+++ b/README.md
@ -15,7 +15,6 @@ It's commercial open-source software, released under the MIT license.
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

 [![Azure Pipelines](<https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build+(3.x)>)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
-[![Travis Build Status](<https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis-ci&logoColor=white&label=build+(2.7)>)](https://travis-ci.org/explosion/spaCy)
 [![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases)
 [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
 [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
@ -98,12 +97,19 @@ For detailed installation instructions, see the

 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
  Studio)
- **Python version**: Python 2.7, 3.5+ (only 64 bit)
+- **Python version**: Python 3.6+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)

 [pip]: https://pypi.org/project/spacy/
 [conda]: https://anaconda.org/conda-forge/spacy

+> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary
+> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI
+> providers and other tooling to support it. This means that in order to run
+> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile
+> the library and its Cython dependencies locally. If this is causing problems
+> for you, the easiest solution is to **use Python 3.7** in the meantime.
+
 ### pip

 Using pip, spaCy releases are available as source packages and binary wheels (as
@ -188,7 +194,7 @@ pip install https://github.com/explosion/spacy-models/releases/download/en_core_

 ### Loading and using models

-To load a model, use `spacy.load()` with the model name, a shortcut link or a
+To load a model, use `spacy.load()` with the model name or a
 path to the model data directory.

 ```python
@ -263,9 +269,7 @@ and git preinstalled.
 Install a version of the
 [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
 or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that
-matches the version that was used to compile your Python interpreter. For
-official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and
-VS 2015 (Python 3.5).
+matches the version that was used to compile your Python interpreter.

 ## Run tests

--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -27,7 +27,7 @@ jobs:
    inputs:
      versionSpec: '3.7'
  - script: |
-      pip install flake8
+      pip install flake8==3.5.0
      python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
    displayName: 'flake8'

@ -35,12 +35,6 @@ jobs:
  dependsOn: 'Validate'
  strategy:
    matrix:
-      Python35Linux:
-        imageName: 'ubuntu-16.04'
-        python.version: '3.5'
-      Python35Windows:
-        imageName: 'vs2017-win2016'
-        python.version: '3.5'
      Python36Linux:
        imageName: 'ubuntu-16.04'
        python.version: '3.6'
@ -58,7 +52,7 @@ jobs:
      #   imageName: 'vs2017-win2016'
      #   python.version: '3.7'
      # Python37Mac:
-      #   imageName: 'macos-10.13'
+      #   imageName: 'macos-10.14'
      #   python.version: '3.7'
      Python38Linux:
        imageName: 'ubuntu-16.04'
--- a/bin/cythonize.py
+++ b/bin/cythonize.py
@ -1,169 +0,0 @@
-#!/usr/bin/env python
-""" cythonize.py
-
-Cythonize pyx files into C++ files as needed.
-
-Usage: cythonize.py [root]
-
-Checks pyx files to see if they have been changed relative to their
-corresponding C++ files. If they have, then runs cython on these files to
-recreate the C++ files.
-
-Additionally, checks pxd files and setup.py if they have been changed. If
-they have, rebuilds everything.
-
-Change detection based on file hashes stored in JSON format.
-
-For now, this script should be run by developers when changing Cython files
-and the resulting C++ files checked in, so that end-users (and Python-only
-developers) do not get the Cython dependencies.
-
-Based upon:
-
-https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py
-https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py
-
-Note: this script does not check any of the dependent C++ libraries.
-"""
-from __future__ import print_function
-
-import os
-import sys
-import json
-import hashlib
-import subprocess
-import argparse
-
-
-HASH_FILE = "cythonize.json"
-
-
-def process_pyx(fromfile, tofile, language_level="-2"):
-    print("Processing %s" % fromfile)
-    try:
-        from Cython.Compiler.Version import version as cython_version
-        from distutils.version import LooseVersion
-
-        if LooseVersion(cython_version) < LooseVersion("0.19"):
-            raise Exception("Require Cython >= 0.19")
-
-    except ImportError:
-        pass
-
-    flags = ["--fast-fail", language_level]
-    if tofile.endswith(".cpp"):
-        flags += ["--cplus"]
-
-    try:
-        try:
-            r = subprocess.call(
-                ["cython"] + flags + ["-o", tofile, fromfile], env=os.environ
-            )  # See Issue #791
-            if r != 0:
-                raise Exception("Cython failed")
-        except OSError:
-            # There are ways of installing Cython that don't result in a cython
-            # executable on the path, see gh-2397.
-            r = subprocess.call(
-                [
-                    sys.executable,
-                    "-c",
-                    "import sys; from Cython.Compiler.Main import "
-                    "setuptools_main as main; sys.exit(main())",
-                ]
-                + flags
-                + ["-o", tofile, fromfile]
-            )
-            if r != 0:
-                raise Exception("Cython failed")
-    except OSError:
-        raise OSError("Cython needs to be installed")
-
-
-def preserve_cwd(path, func, *args):
-    orig_cwd = os.getcwd()
-    try:
-        os.chdir(path)
-        func(*args)
-    finally:
-        os.chdir(orig_cwd)
-
-
-def load_hashes(filename):
-    try:
-        return json.load(open(filename))
-    except (ValueError, IOError):
-        return {}
-
-
-def save_hashes(hash_db, filename):
-    with open(filename, "w") as f:
-        f.write(json.dumps(hash_db))
-
-
-def get_hash(path):
-    return hashlib.md5(open(path, "rb").read()).hexdigest()
-
-
-def hash_changed(base, path, db):
-    full_path = os.path.normpath(os.path.join(base, path))
-    return not get_hash(full_path) == db.get(full_path)
-
-
-def hash_add(base, path, db):
-    full_path = os.path.normpath(os.path.join(base, path))
-    db[full_path] = get_hash(full_path)
-
-
-def process(base, filename, db):
-    root, ext = os.path.splitext(filename)
-    if ext in [".pyx", ".cpp"]:
-        if hash_changed(base, filename, db) or not os.path.isfile(
-            os.path.join(base, root + ".cpp")
-        ):
-            preserve_cwd(base, process_pyx, root + ".pyx", root + ".cpp")
-            hash_add(base, root + ".cpp", db)
-            hash_add(base, root + ".pyx", db)
-
-
-def check_changes(root, db):
-    res = False
-    new_db = {}
-
-    setup_filename = "setup.py"
-    hash_add(".", setup_filename, new_db)
-    if hash_changed(".", setup_filename, db):
-        res = True
-
-    for base, _, files in os.walk(root):
-        for filename in files:
-            if filename.endswith(".pxd"):
-                hash_add(base, filename, new_db)
-                if hash_changed(base, filename, db):
-                    res = True
-
-    if res:
-        db.clear()
-        db.update(new_db)
-    return res
-
-
-def run(root):
-    db = load_hashes(HASH_FILE)
-
-    try:
-        check_changes(root, db)
-        for base, _, files in os.walk(root):
-            for filename in files:
-                process(base, filename, db)
-    finally:
-        save_hashes(db, HASH_FILE)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Cythonize pyx files into C++ files as needed"
-    )
-    parser.add_argument("root", help="root directory")
-    args = parser.parse_args()
-    run(args.root)
--- a/bin/spacy
+++ b/bin/spacy
@ -1,2 +0,0 @@
-#! /bin/sh
-python -m spacy "$@"
--- a/bin/ud/run_eval.py
+++ b/bin/ud/run_eval.py
@ -12,11 +12,11 @@ from ud_train import write_conllu
 from spacy.lang.lex_attrs import word_shape
 from spacy.util import get_lang_class

-# All languages in spaCy - in UD format (note that Norwegian is 'no' instead of 'nb')
-ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, fa, fi, fr,"
-                 "ga, he, hi, hr, hu, id, is, it, ja, kn, ko, lt, lv, mr, no,"
+# All languages in spaCy format (note that Norwegian is 'no' in UD - gets remapped later)
+ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, eu, fa, fi, fr,"
+                 "ga, gu, he, hi, hr, hu, hy, id, is, it, ja, kn, ko, lb, lij, lt, lv, ml, mr, nb,"
                 "nl, pl, pt, ro, ru, si, sk, sl, sq, sr, sv, ta, te, th, tl,"
-                 "tr, tt, uk, ur, vi, zh")
+                 "tr, tt, uk, ur, vi, yo, zh")

 # Non-parsing tasks that will be evaluated (works for default models)
 EVAL_NO_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats']
@ -251,39 +251,43 @@ def main(out_path, ud_dir, check_parse=False, langs=ALL_LANGUAGES, exclude_train

    # initialize all models with the multi-lang model
    for lang in languages:
-        models[lang] = [multi] if multi else []
+        UD_lang = lang
+        # Norwegian is 'nb' in spaCy but 'no' in the UD corpora
+        if lang == "nb":
+            UD_lang = "no"
+        try:
+            models[UD_lang] = [multi] if multi else []
            # add default models if we don't want to evaluate parsing info
            if not check_parse:
-            # Norwegian is 'nb' in spaCy but 'no' in the UD corpora
-            if lang == 'no':
-                models['no'].append(load_default_model_sentencizer('nb'))
-            else:
-                models[lang].append(load_default_model_sentencizer(lang))
+                models[UD_lang].append(load_default_model_sentencizer(lang))
+        except:
+            print(f"Exception initializing lang {lang} - skipping")

    # language-specific trained models
    if not exclude_trained_models:
-        if 'de' in models:
-            models['de'].append(load_model('de_core_news_sm'))
-            models['de'].append(load_model('de_core_news_md'))
-        if 'el' in models:
-            models['el'].append(load_model('el_core_news_sm'))
-            models['el'].append(load_model('el_core_news_md'))
-        if 'en' in models:
-            models['en'].append(load_model('en_core_web_sm'))
-            models['en'].append(load_model('en_core_web_md'))
-            models['en'].append(load_model('en_core_web_lg'))
-        if 'es' in models:
-            models['es'].append(load_model('es_core_news_sm'))
-            models['es'].append(load_model('es_core_news_md'))
-        if 'fr' in models:
-            models['fr'].append(load_model('fr_core_news_sm'))
-            models['fr'].append(load_model('fr_core_news_md'))
-        if 'it' in models:
-            models['it'].append(load_model('it_core_news_sm'))
-        if 'nl' in models:
-            models['nl'].append(load_model('nl_core_news_sm'))
-        if 'pt' in models:
-            models['pt'].append(load_model('pt_core_news_sm'))
+        news_languages = ["da", "de", "el", "es", "fr", "it", "ja", "lt", "nb", "nl", "pl", "pt", "ro"]
+        news_languages = ["nb"]
+        web_languages = ["en", "zh"]
+        sizes = ["sm", "md", "lg"]
+        for lang in web_languages:
+            UD_lang = lang
+            for size in sizes:
+                model_name = f'{lang}_core_web_{size}'
+                try:
+                    models[UD_lang].append(load_model(model_name))
+                except Exception as e:
+                    print(f"Error loading {model_name}: {e}")
+
+        for lang in news_languages:
+            UD_lang = lang
+            if lang == "nb":
+                UD_lang = "no"
+            for size in sizes:
+                model_name = f'{lang}_core_news_{size}'
+                try:
+                    models[UD_lang].append(load_model(model_name))
+                except Exception as e:
+                    print(f"Error loading {model_name}: {e}")

    with out_path.open(mode='w', encoding='utf-8') as out_file:
        run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks)
--- a/bin/ud/ud_run_test.py
+++ b/bin/ud/ud_run_test.py
@ -13,23 +13,12 @@ import srsly
 import spacy
 import spacy.util
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse
-from spacy.util import compounding, minibatch_by_words
-from spacy.syntax.nonproj import projectivize
 from spacy.matcher import Matcher

-# from spacy.morphology import Fused_begin, Fused_inside
-from spacy import displacy
-from collections import defaultdict, Counter
-from timeit import default_timer as timer

 Fused_begin = None
 Fused_inside = None

-import itertools
-import random
-import numpy.random
-
 from . import conll17_ud_eval

 from spacy import lang
@ -268,7 +257,7 @@ def load_nlp(experiments_dir, corpus):
    return nlp


-def initialize_pipeline(nlp, docs, golds, config, device):
+def initialize_pipeline(nlp, examples, config, device):
    nlp.add_pipe(nlp.create_pipe("parser"))
    return nlp

--- a/bin/ud/ud_train.py
+++ b/bin/ud/ud_train.py
@ -14,7 +14,7 @@ import spacy
 import spacy.util
 from bin.ud import conll17_ud_eval
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse
+from spacy.gold import Example
 from spacy.util import compounding, minibatch, minibatch_by_words
 from spacy.syntax.nonproj import projectivize
 from spacy.matcher import Matcher
@ -53,7 +53,7 @@ def read_data(
    max_doc_length=None,
    limit=None,
 ):
-    """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
+    """Read the CONLLU format into Example objects. If raw_text=True,
    include Doc objects created using nlp.make_doc and then aligned against
    the gold-standard sequences. If oracle_segments=True, include Doc objects
    created from the gold-standard segments. At least one must be True."""
@ -78,47 +78,41 @@ def read_data(
                head = int(head) - 1 if head != "0" else id_
                sent["words"].append(word)
                sent["tags"].append(tag)
-                sent["morphology"].append(_parse_morph_string(morph))
-                sent["morphology"][-1].add("POS_%s" % pos)
+                sent["morphs"].append(_compile_morph_string(morph, pos))
                sent["heads"].append(head)
                sent["deps"].append("ROOT" if dep == "root" else dep)
                sent["spaces"].append(space_after == "_")
-            sent["entities"] = ["-"] * len(sent["words"])
+            sent["entities"] = ["-"] * len(sent["words"])    # TODO: doc-level format
            sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
            if oracle_segments:
                docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
-                golds.append(GoldParse(docs[-1], **sent))
-                assert golds[-1].morphology is not None
+                golds.append(sent)
+                assert golds[-1]["morphs"] is not None

            sent_annots.append(sent)
            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
                doc, gold = _make_gold(nlp, None, sent_annots)
-                assert gold.morphology is not None
+                assert gold["morphs"] is not None
                sent_annots = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
-                    return docs, golds
+                    return golds_to_gold_data(docs, golds)

        if raw_text and sent_annots:
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
        if limit and len(docs) >= limit:
-            return docs, golds
-    return docs, golds
+            return golds_to_gold_data(docs, golds)
+    return golds_to_gold_data(docs, golds)

-def _parse_morph_string(morph_string):
+
+def _compile_morph_string(morph_string, pos):
    if morph_string == '_':
-        return set()
-    output = []
-    replacements = {'1': 'one', '2': 'two', '3': 'three'}
-    for feature in morph_string.split('|'):
-        key, value = feature.split('=')
-        value = replacements.get(value, value)
-        value = value.split(',')[0]
-        output.append('%s_%s' % (key, value.lower()))
-    return set(output)
+        return f"POS={pos}"
+    return morph_string + f"|POS={pos}"
+

 def read_conllu(file_):
    docs = []
@ -149,28 +143,27 @@ def read_conllu(file_):

 def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
    # Flatten the conll annotations, and adjust the head indices
-    flat = defaultdict(list)
+    gold = defaultdict(list)
    sent_starts = []
    for sent in sent_annots:
-        flat["heads"].extend(len(flat["words"])+head for head in sent["heads"])
-        for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
-            flat[field].extend(sent[field])
+        gold["heads"].extend(len(gold["words"])+head for head in sent["heads"])
+        for field in ["words", "tags", "deps", "morphs", "entities", "spaces"]:
+            gold[field].extend(sent[field])
        sent_starts.append(True)
        sent_starts.extend([False] * (len(sent["words"]) - 1))
    # Construct text if necessary
-    assert len(flat["words"]) == len(flat["spaces"])
+    assert len(gold["words"]) == len(gold["spaces"])
    if text is None:
        text = "".join(
-            word + " " * space for word, space in zip(flat["words"], flat["spaces"])
+            word + " " * space for word, space in zip(gold["words"], gold["spaces"])
        )
    doc = nlp.make_doc(text)
-    flat.pop("spaces")
-    gold = GoldParse(doc, **flat)
-    gold.sent_starts = sent_starts
-    for i in range(len(gold.heads)):
+    gold.pop("spaces")
+    gold["sent_starts"] = sent_starts
+    for i in range(len(gold["heads"])):
        if random.random() < drop_deps:
-            gold.heads[i] = None
-            gold.labels[i] = None
+            gold["heads"][i] = None
+            gold["labels"][i] = None

    return doc, gold

@ -180,16 +173,13 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
 #############################


-def golds_to_gold_tuples(docs, golds):
-    """Get out the annoying 'tuples' format used by begin_training, given the
-    GoldParse objects."""
-    tuples = []
+def golds_to_gold_data(docs, golds):
+    """Get out the training data format used by begin_training"""
+    data = []
    for doc, gold in zip(docs, golds):
-        text = doc.text
-        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
-        sents = [((ids, words, tags, heads, labels, iob), [])]
-        tuples.append((text, sents))
-    return tuples
+        example = Example.from_dict(doc, dict(gold))
+        data.append(example)
+    return data


 ##############
@ -313,7 +303,9 @@ def get_token_conllu(token, i):
    feat_str = []
    replacements = {"one": "1", "two": "2", "three": "3"}
    for feat in features:
-        if not feat.startswith("begin") and not feat.startswith("end"):
+        if "=" in feat:
+            feat_str.append(feat)
+        elif not feat.startswith("begin") and not feat.startswith("end"):
            key, value = feat.split("_", 1)
            value = replacements.get(value, value)
            feat_str.append("%s=%s" % (key, value.title()))
@ -327,7 +319,6 @@ def get_token_conllu(token, i):
    return "\n".join(lines)


-
 ##################
 # Initialization #
 ##################
@ -348,7 +339,7 @@ def load_nlp(corpus, config, vectors=None):
    return nlp


-def initialize_pipeline(nlp, docs, golds, config, device):
+def initialize_pipeline(nlp, examples, config, device):
    nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False}))
    nlp.add_pipe(nlp.create_pipe("morphologizer"))
    nlp.add_pipe(nlp.create_pipe("parser"))
@ -356,14 +347,14 @@ def initialize_pipeline(nlp, docs, golds, config, device):
        nlp.parser.add_multitask_objective("tag")
    if config.multitask_sent:
        nlp.parser.add_multitask_objective("sent_start")
-    for gold in golds:
-        for tag in gold.tags:
+    for eg in examples:
+        for tag in eg.get_aligned("TAG", as_string=True):
            if tag is not None:
                nlp.tagger.add_label(tag)
    if torch is not None and device != -1:
        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    optimizer = nlp.begin_training(
-        lambda: golds_to_gold_tuples(docs, golds),
+        lambda: examples,
        device=device,
        subword_features=config.subword_features,
        conv_depth=config.conv_depth,
@ -382,8 +373,8 @@ def _load_pretrained_tok2vec(nlp, loc):
        weights_data = file_.read()
    loaded = []
    for name, component in nlp.pipeline:
-        if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
-            component.tok2vec.from_bytes(weights_data)
+        if hasattr(component, "model") and component.model.has_ref("tok2vec"):
+            component.get_ref("tok2vec").from_bytes(weights_data)
            loaded.append(name)
    return loaded

@ -505,7 +496,7 @@ def main(
    print("Train and evaluate", corpus, "using lang", paths.lang)
    nlp = load_nlp(paths.lang, config, vectors=vectors_dir)

-    docs, golds = read_data(
+    examples = read_data(
        nlp,
        paths.train.conllu.open(encoding="utf8"),
        paths.train.text.open(encoding="utf8"),
@ -513,12 +504,12 @@ def main(
        limit=limit,
    )

-    optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
+    optimizer = initialize_pipeline(nlp, examples, config, gpu_device)

    batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
    beam_prob = compounding(0.2, 0.8, 1.001)
    for i in range(config.nr_epoch):
-        docs, golds = read_data(
+        examples = read_data(
            nlp,
            paths.train.conllu.open(encoding="utf8"),
            paths.train.text.open(encoding="utf8"),
@ -527,22 +518,19 @@ def main(
            oracle_segments=use_oracle_segments,
            raw_text=not use_oracle_segments,
        )
-        Xs = list(zip(docs, golds))
-        random.shuffle(Xs)
+        random.shuffle(examples)
        if config.batch_by_words:
-            batches = minibatch_by_words(Xs, size=batch_sizes)
+            batches = minibatch_by_words(examples, size=batch_sizes)
        else:
-            batches = minibatch(Xs, size=batch_sizes)
+            batches = minibatch(examples, size=batch_sizes)
        losses = {}
-        n_train_words = sum(len(doc) for doc in docs)
+        n_train_words = sum(len(eg.predicted) for eg in examples)
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            for batch in batches:
-                batch_docs, batch_gold = zip(*batch)
-                pbar.update(sum(len(doc) for doc in batch_docs))
+                pbar.update(sum(len(ex.predicted) for ex in batch))
                nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
                nlp.update(
-                    batch_docs,
-                    batch_gold,
+                    batch,
                    sgd=optimizer,
                    drop=config.dropout,
                    losses=losses,
--- a/examples/deep_learning_keras.py
+++ b/examples/deep_learning_keras.py
@ -14,7 +14,7 @@ pip install keras==2.0.9

 Compatible with: spaCy v2.0.0+
 """
-
+import ml_datasets
 import plac
 import random
 import pathlib
@ -24,7 +24,6 @@ from keras.models import Sequential, model_from_json
 from keras.layers import LSTM, Dense, Embedding, Bidirectional
 from keras.layers import TimeDistributed
 from keras.optimizers import Adam
-import thinc.extra.datasets
 from spacy.compat import pickle
 import spacy

@ -224,7 +223,7 @@ def main(
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
    if train_dir is None or dev_dir is None:
-        imdb_data = thinc.extra.datasets.imdb()
+        imdb_data = ml_datasets.imdb()
    if is_runtime:
        if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
--- a/examples/experiments/onto-joint/defaults.cfg
+++ b/examples/experiments/onto-joint/defaults.cfg
@ -0,0 +1,119 @@
+# Training hyper-parameters and additional features.
+[training]
+# Whether to train on sequences with 'gold standard' sentence boundaries
+# and tokens. If you set this to true, take care to ensure your run-time
+# data is passed in sentence-by-sentence via some prior preprocessing.
+gold_preproc = false
+# Limitations on training document length or number of examples.
+max_length = 5000
+limit = 0
+# Data augmentation
+orth_variant_level = 0.0
+dropout = 0.1
+# Controls early stopping. A value of 0 or -1 means unlimited.
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 200
+# Other settings
+seed = 0
+accumulate_gradient = 1
+use_pytorch_for_gpu_memory = false
+# Control how scores are printed and checkpoints are evaluated.
+scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
+score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
+# These settings are invalid for the transformer models.
+init_tok2vec = null
+discard_oversize = false
+omit_extra_lookups = false
+batch_by = "words"
+use_gpu = -1
+raw_text = null
+tag_map = null
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 1000
+stop = 1000
+compound = 1.001
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 1e-8
+# learn_rate = 0.001  (replaced by the warmup_linear schedule defined below)
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 0.001
+
+[nlp]
+lang = "en"
+base_model = null
+vectors = null
+
+[nlp.pipeline]
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+
+[nlp.pipeline.ner]
+factory = "ner"
+learn_tokens = false
+min_action_freq = 1
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.parser]
+factory = "parser"
+learn_tokens = false
+min_action_freq = 30
+
+[nlp.pipeline.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 128
+maxout_pieces = 2
+use_upper = true
+
+[nlp.pipeline.parser.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 3
+hidden_width = 128
+maxout_pieces = 2
+use_upper = true
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 128
+depth = 4
+window_size = 1
+embed_size = 7000
+maxout_pieces = 3
+subword_features = true
+dropout = ${training:dropout}
--- a/examples/experiments/onto-joint/pretrain.cfg
+++ b/examples/experiments/onto-joint/pretrain.cfg
@ -0,0 +1,152 @@
+# Training hyper-parameters and additional features.
+[training]
+# Whether to train on sequences with 'gold standard' sentence boundaries
+# and tokens. If you set this to true, take care to ensure your run-time
+# data is passed in sentence-by-sentence via some prior preprocessing.
+gold_preproc = false
+# Limitations on training document length or number of examples.
+max_length = 0
+limit = 0
+# Data augmentation
+orth_variant_level = 0.0
+dropout = 0.1
+# Controls early stopping. A value of 0 or -1 means unlimited.
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 400
+# Other settings
+seed = 0
+accumulate_gradient = 1
+use_pytorch_for_gpu_memory = false
+# Control how scores are printed and checkpoints are evaluated.
+scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
+score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
+# These settings are invalid for the transformer models.
+init_tok2vec = null
+discard_oversize = false
+omit_extra_lookups = false
+batch_by = "words"
+use_gpu = -1
+raw_text = null
+tag_map = null
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 1000
+stop = 1000
+compound = 1.001
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 0.001
+
+[pretraining]
+max_epochs = 1000
+min_length = 5
+max_length = 500
+dropout = 0.2
+n_save_every = null
+batch_size = 3000
+seed = ${training:seed}
+use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
+tok2vec_model = "nlp.pipeline.tok2vec.model"
+
+[pretraining.objective]
+type = "characters"
+n_characters = 4
+
+[pretraining.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 0.001
+
+[nlp]
+lang = "en"
+vectors = null
+base_model = null
+
+[nlp.pipeline]
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.senter]
+factory = "senter"
+
+[nlp.pipeline.ner]
+factory = "ner"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.parser]
+factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0
+
+[nlp.pipeline.senter.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.senter.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+
+[nlp.pipeline.parser.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 3
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 256
+depth = 6
+window_size = 1
+embed_size = 10000
+maxout_pieces = 3
+subword_features = true
+dropout = null
--- a/examples/experiments/onto-ner.cfg
+++ b/examples/experiments/onto-ner.cfg
@ -0,0 +1,73 @@
+# Training hyper-parameters and additional features.
+[training]
+# Whether to train on sequences with 'gold standard' sentence boundaries
+# and tokens. If you set this to true, take care to ensure your run-time
+# data is passed in sentence-by-sentence via some prior preprocessing.
+gold_preproc = false
+# Limitations on training document length or number of examples.
+max_length = 3000
+limit = 0
+# Data augmentation
+orth_variant_level = 0.0
+dropout = 0.1
+# Controls early stopping. A value of 0 or -1 means unlimited.
+patience = 100000
+max_epochs = 0
+max_steps = 0
+eval_frequency = 1000
+# Other settings
+seed = 0
+accumulate_gradient = 1
+use_pytorch_for_gpu_memory = false
+# Control how scores are printed and checkpoints are evaluated.
+scores = ["speed", "ents_p", "ents_r", "ents_f"]
+score_weights = {"ents_f": 1.0}
+# These settings are invalid for the transformer models.
+init_tok2vec = null
+discard_oversize = false
+omit_extra_lookups = false
+batch_by = "words"
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 0.001
+
+[nlp]
+lang = "en"
+vectors = null
+
+[nlp.pipeline.ner]
+factory = "ner"
+learn_tokens = false
+min_action_freq = 1
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 3
+hidden_width = 64
+maxout_pieces = 2
+use_upper = true
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 96
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
+subword_features = true
+dropout = ${training:dropout}
--- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
@ -0,0 +1,73 @@
+[training]
+patience = 10000
+eval_frequency = 200
+dropout = 0.2
+init_tok2vec = null
+vectors = null
+max_epochs = 100
+orth_variant_level = 0.0
+gold_preproc = true
+max_length = 0
+use_gpu = 0
+scores = ["tags_acc", "uas", "las"]
+score_weights = {"las": 0.8, "tags_acc": 0.2}
+limit = 0
+seed = 0
+accumulate_gradient = 2
+discard_oversize = false
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = ${training:vectors}
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.parser]
+factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0
+
+[nlp.pipeline.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 64
+maxout_pieces = 3
+
+[nlp.pipeline.parser.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.HashEmbedBiLSTM.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 96
+depth = 4
+embed_size = 2000
+subword_features = true
+maxout_pieces = 3
+dropout = null
--- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
@ -0,0 +1,74 @@
+[training]
+patience = 10000
+eval_frequency = 200
+dropout = 0.2
+init_tok2vec = null
+vectors = null
+max_epochs = 100
+orth_variant_level = 0.0
+gold_preproc = true
+max_length = 0
+use_gpu = -1
+scores = ["tags_acc", "uas", "las"]
+score_weights = {"las": 0.8, "tags_acc": 0.2}
+limit = 0
+seed = 0
+accumulate_gradient = 2
+discard_oversize = false
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = ${training:vectors}
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.parser]
+factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0
+
+[nlp.pipeline.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 64
+maxout_pieces = 3
+
+[nlp.pipeline.parser.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 96
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
+subword_features = true
+dropout = null
--- a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
+++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
@ -0,0 +1,69 @@
+[training]
+use_gpu = -1
+limit = 0
+dropout = 0.2
+patience = 10000
+eval_frequency = 200
+scores = ["ents_f"]
+score_weights = {"ents_f": 1}
+orth_variant_level = 0.0
+gold_preproc = true
+max_length = 0
+batch_size = 25
+seed = 0
+accumulate_gradient = 2
+discard_oversize = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = null
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v1"
+
+[nlp.pipeline.tok2vec.model.extract]
+@architectures = "spacy.CharacterEmbed.v1"
+width = 96
+nM = 64
+nC = 8
+rows = 2000
+columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
+dropout = null
+
+[nlp.pipeline.tok2vec.model.extract.features]
+@architectures = "spacy.Doc2Feats.v1"
+columns = ${nlp.pipeline.tok2vec.model.extract:columns}
+
+[nlp.pipeline.tok2vec.model.embed]
+@architectures = "spacy.LayerNormalizedMaxout.v1"
+width = ${nlp.pipeline.tok2vec.model.extract:width}
+maxout_pieces = 4
+
+[nlp.pipeline.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${nlp.pipeline.tok2vec.model.extract:width}
+window_size = 1
+maxout_pieces = 2
+depth = 2
+
+[nlp.pipeline.ner]
+factory = "ner"
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 6
+hidden_width = 64
+maxout_pieces = 2
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model.extract:width}
--- a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
+++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
@ -0,0 +1,48 @@
+[training]
+use_gpu = -1
+limit = 0
+dropout = 0.2
+patience = 10000
+eval_frequency = 200
+scores = ["ents_p", "ents_r", "ents_f"]
+score_weights = {"ents_f": 1}
+orth_variant_level = 0.0
+gold_preproc = true
+max_length = 0
+seed = 0
+accumulate_gradient = 2
+discard_oversize = false
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 3000
+stop = 3000
+compound = 1.001
+
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = null
+
+[nlp.pipeline.ner]
+factory = "simple_ner"
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.BiluoTagger.v1"
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+width = 128
+depth = 4
+embed_size = 7000
+maxout_pieces = 3
+window_size = 1
+subword_features = true
+pretrained_vectors = null
+dropout = null
--- a/examples/pipeline/multi_processing.py
+++ b/examples/pipeline/multi_processing.py
@ -13,9 +13,10 @@ Prerequisites: pip install joblib
 from __future__ import print_function, unicode_literals

 from pathlib import Path
+
+import ml_datasets
 from joblib import Parallel, delayed
 from functools import partial
-import thinc.extra.datasets
 import plac
 import spacy
 from spacy.util import minibatch
@ -35,7 +36,7 @@ def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10
        output_dir.mkdir()
    # load and pre-process the IMBD dataset
    print("Loading IMDB data...")
-    data, _ = thinc.extra.datasets.imdb()
+    data, _ = ml_datasets.imdb()
    texts, _ = zip(*data[-limit:])
    print("Processing texts...")
    partitions = minibatch(texts, size=batch_size)
--- a/examples/streamlit_spacy.py
+++ b/examples/streamlit_spacy.py
@ -1,7 +1,7 @@
 # coding: utf-8
 """
 Example of a Streamlit app for an interactive spaCy model visualizer. You can
-either download the script, or point streamlit run to the raw URL of this
+either download the script, or point `streamlit run` to the raw URL of this
 file. For more details, see https://streamlit.io.

 Installation:
@ -15,6 +15,8 @@ streamlit run streamlit_spacy.py
 """
 from __future__ import unicode_literals

+import base64
+
 import streamlit as st
 import spacy
 from spacy import displacy
@ -54,6 +56,14 @@ model_load_state.empty()
 text = st.text_area("Text to analyze", DEFAULT_TEXT)
 doc = process_text(spacy_model, text)

+
+def render_svg(svg):
+    """Render the given SVG string as a base64-encoded <img> tag via st.write."""
+    b64 = base64.b64encode(svg.encode('utf-8')).decode("utf-8")
+    html = r'<img src="data:image/svg+xml;base64,%s"/>' % b64
+    st.write(html, unsafe_allow_html=True)
+
+
 if "parser" in nlp.pipe_names:
    st.header("Dependency Parse & Part-of-speech tags")
    st.sidebar.header("Dependency Parse")
@ -68,12 +78,14 @@ if "parser" in nlp.pipe_names:
    }
    docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
    for sent in docs:
-        html = displacy.render(sent, options=options)
+        html = displacy.render(sent, options=options, style="dep")
        # Double newlines seem to mess with the rendering
        html = html.replace("\n\n", "\n")
        if split_sents and len(docs) > 1:
            st.markdown(f"> {sent.text}")
-        st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
+        render_svg(html)
+        # The old rendering (kept below) didn't show the dep arc labels properly, cf. #5089
+        # st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)

 if "ner" in nlp.pipe_names:
    st.header("Named Entities")
--- a/examples/training/conllu.py
+++ b/examples/training/conllu.py
@ -12,7 +12,7 @@ import tqdm
 import spacy
 import spacy.util
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse
+from spacy.gold import Example
 from spacy.syntax.nonproj import projectivize
 from collections import defaultdict
 from spacy.matcher import Matcher
@ -33,31 +33,6 @@ random.seed(0)
 numpy.random.seed(0)


-def minibatch_by_words(items, size=5000):
-    random.shuffle(items)
-    if isinstance(size, int):
-        size_ = itertools.repeat(size)
-    else:
-        size_ = size
-    items = iter(items)
-    while True:
-        batch_size = next(size_)
-        batch = []
-        while batch_size >= 0:
-            try:
-                doc, gold = next(items)
-            except StopIteration:
-                if batch:
-                    yield batch
-                return
-            batch_size -= len(doc)
-            batch.append((doc, gold))
-        if batch:
-            yield batch
-        else:
-            break
-
-
 ################
 # Data reading #
 ################
@ -78,7 +53,7 @@ def read_data(
    max_doc_length=None,
    limit=None,
 ):
-    """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
+    """Read the CONLLU format into Example objects. If raw_text=True,
    include Doc objects created using nlp.make_doc and then aligned against
    the gold-standard sequences. If oracle_segments=True, include Doc objects
    created from the gold-standard segments. At least one must be True."""
@ -110,7 +85,7 @@ def read_data(
            sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
            if oracle_segments:
                docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
-                golds.append(GoldParse(docs[-1], **sent))
+                golds.append(sent)

            sent_annots.append(sent)
            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
@ -119,15 +94,15 @@ def read_data(
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
-                    return docs, golds
+                    return golds_to_gold_data(docs, golds)

        if raw_text and sent_annots:
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
        if limit and len(docs) >= limit:
-            return docs, golds
-    return docs, golds
+            return golds_to_gold_data(docs, golds)
+    return golds_to_gold_data(docs, golds)


 def read_conllu(file_):
@ -159,20 +134,19 @@ def read_conllu(file_):

 def _make_gold(nlp, text, sent_annots):
    # Flatten the conll annotations, and adjust the head indices
-    flat = defaultdict(list)
+    gold = defaultdict(list)
    for sent in sent_annots:
-        flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
+        gold["heads"].extend(len(gold["words"]) + head for head in sent["heads"])
        for field in ["words", "tags", "deps", "entities", "spaces"]:
-            flat[field].extend(sent[field])
+            gold[field].extend(sent[field])
    # Construct text if necessary
-    assert len(flat["words"]) == len(flat["spaces"])
+    assert len(gold["words"]) == len(gold["spaces"])
    if text is None:
        text = "".join(
-            word + " " * space for word, space in zip(flat["words"], flat["spaces"])
+            word + " " * space for word, space in zip(gold["words"], gold["spaces"])
        )
    doc = nlp.make_doc(text)
-    flat.pop("spaces")
-    gold = GoldParse(doc, **flat)
+    gold.pop("spaces")
    return doc, gold


@ -181,16 +155,13 @@ def _make_gold(nlp, text, sent_annots):
 #############################


-def golds_to_gold_tuples(docs, golds):
-    """Get out the annoying 'tuples' format used by begin_training, given the
-    GoldParse objects."""
-    tuples = []
+def golds_to_gold_data(docs, golds):
+    """Pair each doc with its gold annotations as an Example, for begin_training."""
+    data = []
    for doc, gold in zip(docs, golds):
-        text = doc.text
-        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
-        sents = [((ids, words, tags, heads, labels, iob), [])]
-        tuples.append((text, sents))
-    return tuples
+        example = Example.from_dict(doc, gold)
+        data.append(example)
+    return data


 ##############
@ -303,7 +274,7 @@ def load_nlp(corpus, config):
    return nlp


-def initialize_pipeline(nlp, docs, golds, config):
+def initialize_pipeline(nlp, examples, config):
    nlp.add_pipe(nlp.create_pipe("parser"))
    if config.multitask_tag:
        nlp.parser.add_multitask_objective("tag")
@ -311,18 +282,19 @@ def initialize_pipeline(nlp, docs, golds, config):
        nlp.parser.add_multitask_objective("sent_start")
    nlp.parser.moves.add_action(2, "subtok")
    nlp.add_pipe(nlp.create_pipe("tagger"))
-    for gold in golds:
-        for tag in gold.tags:
+    for eg in examples:
+        for tag in eg.get_aligned("TAG", as_string=True):
            if tag is not None:
                nlp.tagger.add_label(tag)
    # Replace labels that didn't make the frequency cutoff
    actions = set(nlp.parser.labels)
    label_set = set([act.split("-")[1] for act in actions if "-" in act])
-    for gold in golds:
+    for eg in examples:
+        gold = eg.gold
        for i, label in enumerate(gold.labels):
            if label is not None and label not in label_set:
                gold.labels[i] = label.split("||")[0]
-    return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
+    return nlp.begin_training(lambda: examples)


 ########################
@ -391,13 +363,17 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
    Token.set_extension("begins_fused", default=False)
    Token.set_extension("inside_fused", default=False)

+    Token.set_extension("get_conllu_lines", method=get_token_conllu)
+    Token.set_extension("begins_fused", default=False)
+    Token.set_extension("inside_fused", default=False)
+
    paths = TreebankPaths(ud_dir, corpus)
    if not (parses_dir / corpus).exists():
        (parses_dir / corpus).mkdir()
    print("Train and evaluate", corpus, "using lang", paths.lang)
    nlp = load_nlp(paths.lang, config)

-    docs, golds = read_data(
+    examples = read_data(
        nlp,
        paths.train.conllu.open(encoding="utf8"),
        paths.train.text.open(encoding="utf8"),
@ -405,23 +381,17 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
        limit=limit,
    )

-    optimizer = initialize_pipeline(nlp, docs, golds, config)
+    optimizer = initialize_pipeline(nlp, examples, config)

    for i in range(config.nr_epoch):
-        docs = [nlp.make_doc(doc.text) for doc in docs]
-        batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
+        batches = spacy.minibatch_by_words(examples, size=config.batch_size)
        losses = {}
-        n_train_words = sum(len(doc) for doc in docs)
+        n_train_words = sum(len(eg.reference.doc) for eg in examples)
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            for batch in batches:
-                batch_docs, batch_gold = zip(*batch)
-                pbar.update(sum(len(doc) for doc in batch_docs))
+                pbar.update(sum(len(eg.reference.doc) for eg in batch))
                nlp.update(
-                    batch_docs,
-                    batch_gold,
-                    sgd=optimizer,
-                    drop=config.dropout,
-                    losses=losses,
+                    examples=batch, sgd=optimizer, drop=config.dropout, losses=losses,
                )

        out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
--- a/examples/training/create_kb.py
+++ b/examples/training/create_kb.py
@ -30,7 +30,7 @@ ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
    model=("Model name, should have pretrained word embeddings", "positional", None, str),
    output_dir=("Optional output directory", "option", "o", Path),
 )
-def main(model=None, output_dir=None):
+def main(model, output_dir=None):
    """Load the model and create the KB with pre-defined entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""
--- a/examples/training/ner_multitask_objective.py
+++ b/examples/training/ner_multitask_objective.py
@ -24,21 +24,22 @@ import random
 import plac
 import spacy
 import os.path
+
+from spacy.gold.example import Example
 from spacy.tokens import Doc
-from spacy.gold import read_json_file, GoldParse
+from spacy.gold import read_json_file

 random.seed(0)

 PWD = os.path.dirname(__file__)

-TRAIN_DATA = list(read_json_file(
-    os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json")))
+TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json")))


-def get_position_label(i, words, tags, heads, labels, ents):
+def get_position_label(i, token_annotation):
    """Return labels indicating the position of the word in the document.
    """
-    if len(words) < 20:
+    if len(token_annotation.words) < 20:
        return "short-doc"
    elif i == 0:
        return "first-word"
@ -46,7 +47,7 @@ def get_position_label(i, words, tags, heads, labels, ents):
        return "early-word"
    elif i < 20:
        return "mid-word"
-    elif i == len(words) - 1:
+    elif i == len(token_annotation.words) - 1:
        return "last-word"
    else:
        return "late-word"
@ -60,17 +61,15 @@ def main(n_iter=10):
    print(nlp.pipeline)

    print("Create data", len(TRAIN_DATA))
-    optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
+    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
-        for text, annot_brackets in TRAIN_DATA:
-            for annotations, _ in annot_brackets:
-                doc = Doc(nlp.vocab, words=annotations[1])
-                gold = GoldParse.from_annot_tuples(doc, annotations)
+        for example_dict in TRAIN_DATA:
+            doc = Doc(nlp.vocab, words=example_dict["words"])
+            example = Example.from_dict(doc, example_dict)
            nlp.update(
-                    [doc],  # batch of texts
-                    [gold],  # batch of annotations
+                examples=[example],  # 1 example
                drop=0.2,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses,
@ -78,9 +77,9 @@ def main(n_iter=10):
        print(losses.get("nn_labeller", 0.0), losses["ner"])

    # test the trained model
-    for text, _ in TRAIN_DATA:
-        if text is not None:
-            doc = nlp(text)
+    for example_dict in TRAIN_DATA:
+        if "text" in example_dict:
+            doc = nlp(example_dict["text"])
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

--- a/examples/training/pretrain_textcat.py
+++ b/examples/training/pretrain_textcat.py
@ -1,217 +0,0 @@
-"""This script is experimental.
-
-Try pre-training the CNN component of the text categorizer using a cheap
-language modelling-like objective. Specifically, we load pretrained vectors
-(from something like word2vec, GloVe, FastText etc), and use the CNN to
-predict the tokens' pretrained vectors. This isn't as easy as it sounds:
-we're not merely doing compression here, because heavy dropout is applied,
-including over the input words. This means the model must often (50% of the time)
-use the context in order to predict the word.
-
-To evaluate the technique, we're pre-training with the 50k texts from the IMDB
-corpus, and then training with only 100 labels. Note that it's a bit dirty to
-pre-train with the development data, but also not *so* terrible: we're not using
-the development labels, after all --- only the unlabelled text.
-"""
-import plac
-import tqdm
-import random
-import spacy
-import thinc.extra.datasets
-from spacy.util import minibatch, use_gpu, compounding
-from spacy._ml import Tok2Vec
-from spacy.pipeline import TextCategorizer
-import numpy
-
-
-def load_texts(limit=0):
-    train, dev = thinc.extra.datasets.imdb()
-    train_texts, train_labels = zip(*train)
-    dev_texts, dev_labels = zip(*train)
-    train_texts = list(train_texts)
-    dev_texts = list(dev_texts)
-    random.shuffle(train_texts)
-    random.shuffle(dev_texts)
-    if limit >= 1:
-        return train_texts[:limit]
-    else:
-        return list(train_texts) + list(dev_texts)
-
-
-def load_textcat_data(limit=0):
-    """Load data from the IMDB dataset."""
-    # Partition off part of the train data for evaluation
-    train_data, eval_data = thinc.extra.datasets.imdb()
-    random.shuffle(train_data)
-    train_data = train_data[-limit:]
-    texts, labels = zip(*train_data)
-    eval_texts, eval_labels = zip(*eval_data)
-    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
-    eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
-    return (texts, cats), (eval_texts, eval_cats)
-
-
-def prefer_gpu():
-    used = spacy.util.use_gpu(0)
-    if used is None:
-        return False
-    else:
-        import cupy.random
-
-        cupy.random.seed(0)
-        return True
-
-
-def build_textcat_model(tok2vec, nr_class, width):
-    from thinc.v2v import Model, Softmax, Maxout
-    from thinc.api import flatten_add_lengths, chain
-    from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool
-    from thinc.misc import Residual, LayerNorm
-    from spacy._ml import logistic, zero_init
-
-    with Model.define_operators({">>": chain}):
-        model = (
-            tok2vec
-            >> flatten_add_lengths
-            >> Pooling(mean_pool)
-            >> Softmax(nr_class, width)
-        )
-    model.tok2vec = tok2vec
-    return model
-
-
-def block_gradients(model):
-    from thinc.api import wrap
-
-    def forward(X, drop=0.0):
-        Y, _ = model.begin_update(X, drop=drop)
-        return Y, None
-
-    return wrap(forward, model)
-
-
-def create_pipeline(width, embed_size, vectors_model):
-    print("Load vectors")
-    nlp = spacy.load(vectors_model)
-    print("Start training")
-    textcat = TextCategorizer(
-        nlp.vocab,
-        labels=["POSITIVE", "NEGATIVE"],
-        model=build_textcat_model(
-            Tok2Vec(width=width, embed_size=embed_size), 2, width
-        ),
-    )
-
-    nlp.add_pipe(textcat)
-    return nlp
-
-
-def train_tensorizer(nlp, texts, dropout, n_iter):
-    tensorizer = nlp.create_pipe("tensorizer")
-    nlp.add_pipe(tensorizer)
-    optimizer = nlp.begin_training()
-    for i in range(n_iter):
-        losses = {}
-        for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
-            docs = [nlp.make_doc(text) for text in batch]
-            tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout)
-        print(losses)
-    return optimizer
-
-
-def train_textcat(nlp, n_texts, n_iter=10):
-    textcat = nlp.get_pipe("textcat")
-    tok2vec_weights = textcat.model.tok2vec.to_bytes()
-    (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
-    print(
-        "Using {} examples ({} training, {} evaluation)".format(
-            n_texts, len(train_texts), len(dev_texts)
-        )
-    )
-    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
-
-    # get names of other pipes to disable them during training
-    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
-    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
-    with nlp.disable_pipes(*other_pipes):  # only train textcat
-        optimizer = nlp.begin_training()
-        textcat.model.tok2vec.from_bytes(tok2vec_weights)
-        print("Training the model...")
-        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
-        for i in range(n_iter):
-            losses = {"textcat": 0.0}
-            # batch up the examples using spaCy's minibatch
-            batches = minibatch(tqdm.tqdm(train_data), size=2)
-            for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
-            with textcat.model.use_params(optimizer.averages):
-                # evaluate on the dev data split off in load_data()
-                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
-            print(
-                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
-                    losses["textcat"],
-                    scores["textcat_p"],
-                    scores["textcat_r"],
-                    scores["textcat_f"],
-                )
-            )
-
-
-def evaluate_textcat(tokenizer, textcat, texts, cats):
-    docs = (tokenizer(text) for text in texts)
-    tp = 1e-8
-    fp = 1e-8
-    tn = 1e-8
-    fn = 1e-8
-    for i, doc in enumerate(textcat.pipe(docs)):
-        gold = cats[i]
-        for label, score in doc.cats.items():
-            if label not in gold:
-                continue
-            if score >= 0.5 and gold[label] >= 0.5:
-                tp += 1.0
-            elif score >= 0.5 and gold[label] < 0.5:
-                fp += 1.0
-            elif score < 0.5 and gold[label] < 0.5:
-                tn += 1
-            elif score < 0.5 and gold[label] >= 0.5:
-                fn += 1
-    precision = tp / (tp + fp)
-    recall = tp / (tp + fn)
-    f_score = 2 * (precision * recall) / (precision + recall)
-    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
-
-
-@plac.annotations(
-    width=("Width of CNN layers", "positional", None, int),
-    embed_size=("Embedding rows", "positional", None, int),
-    pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
-    train_iters=("Number of iterations to train", "option", "tn", int),
-    train_examples=("Number of labelled examples", "option", "eg", int),
-    vectors_model=("Name or path to vectors model to learn from"),
-)
-def main(
-    width,
-    embed_size,
-    vectors_model,
-    pretrain_iters=30,
-    train_iters=30,
-    train_examples=1000,
-):
-    random.seed(0)
-    numpy.random.seed(0)
-    use_gpu = prefer_gpu()
-    print("Using GPU?", use_gpu)
-
-    nlp = create_pipeline(width, embed_size, vectors_model)
-    print("Load data")
-    texts = load_texts(limit=0)
-    print("Train tensorizer")
-    optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)
-    print("Train textcat")
-    train_textcat(nlp, train_examples, n_iter=train_iters)
-
-
-if __name__ == "__main__":
-    plac.call(main)
--- a/examples/training/rehearsal.py
+++ b/examples/training/rehearsal.py
@ -4,9 +4,10 @@ import random
 import warnings
 import srsly
 import spacy
-from spacy.gold import GoldParse
+from spacy.gold import Example
 from spacy.util import minibatch, compounding

+# TODO: further fix & test this script for v.3 ? (read_gold_data is never called)

 LABEL = "ANIMAL"
 TRAIN_DATA = [
@ -32,19 +33,17 @@ def read_raw_data(nlp, jsonl_loc):
    for json_obj in srsly.read_jsonl(jsonl_loc):
        if json_obj["text"].strip():
            doc = nlp.make_doc(json_obj["text"])
-            yield doc
+            yield Example.from_dict(doc, {})


 def read_gold_data(nlp, gold_loc):
-    docs = []
-    golds = []
+    examples = []
    for json_obj in srsly.read_jsonl(gold_loc):
        doc = nlp.make_doc(json_obj["text"])
        ents = [(ent["start"], ent["end"], ent["label"]) for ent in json_obj["spans"]]
-        gold = GoldParse(doc, entities=ents)
-        docs.append(doc)
-        golds.append(gold)
-    return list(zip(docs, golds))
+        example = Example.from_dict(doc, {"entities": ents})
+        examples.append(example)
+    return examples


 def main(model_name, unlabelled_loc):
@ -53,34 +52,34 @@ def main(model_name, unlabelled_loc):
    batch_size = 4
    nlp = spacy.load(model_name)
    nlp.get_pipe("ner").add_label(LABEL)
-    raw_docs = list(read_raw_data(nlp, unlabelled_loc))
+    raw_examples = list(read_raw_data(nlp, unlabelled_loc))
    optimizer = nlp.resume_training()
    # Avoid use of Adam when resuming training. I don't understand this well
    # yet, but I'm getting weird results from Adam. Try commenting out the
    # nlp.update(), and using Adam -- you'll find the models drift apart.
    # I guess Adam is losing precision, introducing gradient noise?
-    optimizer.alpha = 0.1
+    optimizer.learn_rate = 0.1
    optimizer.b1 = 0.0
    optimizer.b2 = 0.0
-
-    # get names of other pipes to disable them during training
-    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
-    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    sizes = compounding(1.0, 4.0, 1.001)
-    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+
+    with nlp.select_pipes(enable="ner"), warnings.catch_warnings():  # only train NER
        # show warnings for misaligned entity spans once
-        warnings.filterwarnings("once", category=UserWarning, module='spacy')
+        warnings.filterwarnings("once", category=UserWarning, module="spacy")

        for itn in range(n_iter):
-            random.shuffle(TRAIN_DATA)
-            random.shuffle(raw_docs)
+            random.shuffle(train_examples)
+            random.shuffle(raw_examples)
            losses = {}
            r_losses = {}
            # batch up the examples using spaCy's minibatch
-            raw_batches = minibatch(raw_docs, size=4)
-            for batch in minibatch(TRAIN_DATA, size=sizes):
-                docs, golds = zip(*batch)
-                nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
+            raw_batches = minibatch(raw_examples, size=4)
+            for batch in minibatch(train_examples, size=sizes):
+                nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses)
                raw_batch = list(next(raw_batches))
                nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
            print("Losses", losses)
--- a/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py
+++ b/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py
@ -5,16 +5,17 @@ from spacy.gold import docs_to_json
 import srsly
 import sys

+
@plac.annotations(
    model=("Model name. Defaults to 'en'.", "option", "m", str),
    input_file=("Input file (jsonl)", "positional", None, Path),
    output_dir=("Output directory", "positional", None, Path),
    n_texts=("Number of texts to convert", "option", "t", int),
 )
-def convert(model='en', input_file=None, output_dir=None, n_texts=0):
+def convert(model="en", input_file=None, output_dir=None, n_texts=0):
    # Load model with tokenizer + sentencizer only
    nlp = spacy.load(model)
-    nlp.disable_pipes(*nlp.pipe_names)
+    nlp.select_pipes(disable=nlp.pipe_names)
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer, first=True)

@ -49,5 +50,6 @@ def convert(model='en', input_file=None, output_dir=None, n_texts=0):

    srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)])

+
 if __name__ == "__main__":
    plac.call(convert)
--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@ -16,11 +16,10 @@ from __future__ import unicode_literals, print_function
 import plac
 import random
 from pathlib import Path
-
-from spacy.vocab import Vocab
-
 import spacy
 from spacy.kb import KnowledgeBase
+
+from spacy.gold import Example
 from spacy.pipeline import EntityRuler
 from spacy.util import minibatch, compounding

@ -70,32 +69,34 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
-    nlp.add_pipe(nlp.create_pipe('sentencizer'))
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))

    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
    # Note that in a realistic application, an actual NER algorithm should be used instead.
    ruler = EntityRuler(nlp)
-    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
+    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    # Create the Entity Linker component and add it to the pipeline.
    if "entity_linker" not in nlp.pipe_names:
-        # use only the predicted EL score and not the prior probability (for demo purposes)
-        cfg = {"incl_prior": False}
-        entity_linker = nlp.create_pipe("entity_linker", cfg)
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
-        entity_linker.set_kb(kb)
+
+        # use only the predicted EL score and not the prior probability (for demo purposes)
+        cfg = {"kb": kb, "incl_prior": False}
+        entity_linker = nlp.create_pipe("entity_linker", cfg)
        nlp.add_pipe(entity_linker, last=True)

    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
-    # Also ensure that the annotated examples correspond to known identifiers in the knowlege base.
+    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
    kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
-    TRAIN_DOCS = []
+    train_examples = []
    for text, annotation in TRAIN_DATA:
-        with nlp.disable_pipes("entity_linker"):
+        with nlp.select_pipes(disable="entity_linker"):
            doc = nlp(text)
        annotation_clean = annotation
        for offset, kb_id_dict in annotation["links"].items():
@ -108,24 +109,20 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):
                        "Removed", kb_id, "from training because it is not in the KB."
                    )
            annotation_clean["links"][offset] = new_dict
-        TRAIN_DOCS.append((doc, annotation_clean))
+        train_examples.append(Example.from_dict(doc, annotation_clean))

-    # get names of other pipes to disable them during training
-    pipe_exceptions = ["entity_linker", "trf_wordpiecer", "trf_tok2vec"]
-    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
-    with nlp.disable_pipes(*other_pipes):  # only train entity linker
+    with nlp.select_pipes(enable="entity_linker"):  # only train entity linker
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
+
        for itn in range(n_iter):
-            random.shuffle(TRAIN_DOCS)
+            random.shuffle(train_examples)
            losses = {}
            # batch up the examples using spaCy's minibatch
-            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
+            batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
-                texts, annotations = zip(*batch)
                nlp.update(
-                    texts,  # batch of texts
-                    annotations,  # batch of annotations
+                    batch,
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
--- a/examples/training/train_intent_parser.py
+++ b/examples/training/train_intent_parser.py
@ -23,6 +23,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.gold import Example
 from spacy.util import minibatch, compounding


@ -120,22 +121,21 @@ def main(model=None, output_dir=None, n_iter=15):
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

+    train_examples = []
    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

-    pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
-    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
-    with nlp.disable_pipes(*other_pipes):  # only train parser
+    with nlp.select_pipes(enable="parser"):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
-            random.shuffle(TRAIN_DATA)
+            random.shuffle(train_examples)
            losses = {}
            # batch up the examples using spaCy's minibatch
-            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+            batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+                nlp.update(batch, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
--- a/examples/training/train_morphologizer.py
+++ b/examples/training/train_morphologizer.py
@ -0,0 +1,136 @@
+#!/usr/bin/env python
+# coding: utf8
+"""
+A simple example for training a morphologizer. For more details, see
+the documentation:
+* Training: https://spacy.io/usage/training
+
+Compatible with: spaCy v3.0.0+
+Last tested with: v3.0.0
+"""
+from __future__ import unicode_literals, print_function
+
+import plac
+import random
+from pathlib import Path
+import spacy
+from spacy.gold import Example
+from spacy.util import minibatch, compounding
+from spacy.morphology import Morphology
+
+
+# Usually you'll read this in, of course. Data formats vary. Ensure your
+# strings are unicode and that the number of 'morphs' (and 'pos') values
+# matches spaCy's tokenization. If not, you can always add a 'words' key to
+# the annotations that specifies the gold-standard tokenization, e.g.:
+# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'pos': ['VERB', 'ADJ', 'NOUN']})
+TRAIN_DATA = [
+    (
+        "I like green eggs",
+        {
+            "morphs": [
+                "PronType=Prs|Person=1",
+                "VerbForm=Fin",
+                "Degree=Pos",
+                "Number=Plur",
+            ],
+            "pos": ["PRON", "VERB", "ADJ", "NOUN"],
+        },
+    ),
+    (
+        "Eat blue ham",
+        {
+            "morphs": ["VerbForm=Inf", "Degree=Pos", "Number=Sing"],
+            "pos": ["VERB", "ADJ", "NOUN"],
+        },
+    ),
+    (
+        "She was blue",
+        {
+            "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos"],
+            "pos": ["PRON", "VERB", "ADJ"],
+        },
+    ),
+    (
+        "He was blue today",
+        {
+            "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos", ""],
+            "pos": ["PRON", "VERB", "ADJ", "ADV"],
+        },
+    ),
+]
+
+# The POS tags are optional, set `with_pos_tags = False` to omit them for
+# this example:
+with_pos_tags = True
+
+if not with_pos_tags:
+    for _, annotations in TRAIN_DATA:
+        annotations.pop("pos", None)  # tolerate entries without a "pos" key
+
+
+@plac.annotations(
+    lang=("ISO Code of language to use", "option", "l", str),
+    output_dir=("Optional output directory", "option", "o", Path),
+    n_iter=("Number of training iterations", "option", "n", int),
+)
+def main(lang="en", output_dir=None, n_iter=25):
+    """Create a blank model for `lang`, add a morphologizer to the pipeline
+    and train it on TRAIN_DATA for `n_iter` iterations. If `output_dir` is
+    given, the trained model is saved there and loaded back as a check.
+    """
+    nlp = spacy.blank(lang)
+    # add the morphologizer to the pipeline
+    # nlp.create_pipe works for built-ins that are registered with spaCy
+    morphologizer = nlp.create_pipe("morphologizer")
+    nlp.add_pipe(morphologizer)
+
+    # add labels and create the Example instances
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        morph_labels = annotations.get("morphs", [])
+        pos_labels = annotations.get("pos", [""] * len(morph_labels))
+        if len(morph_labels) != len(pos_labels):
+            raise ValueError("'morphs'/'pos' length mismatch: {!r}".format(text))
+        # each label is the morph features plus, if present, a POS feature
+        for morph, pos in zip(morph_labels, pos_labels):
+            morph_dict = Morphology.feats_to_dict(morph)
+            if pos:
+                morph_dict["POS"] = pos
+            morphologizer.add_label(Morphology.dict_to_feats(morph_dict))
+
+    optimizer = nlp.begin_training()
+    for _ in range(n_iter):
+        random.shuffle(train_examples)
+        losses = {}
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
+        for batch in batches:
+            nlp.update(batch, sgd=optimizer, losses=losses)
+        print("Losses", losses)
+
+    # test the trained model
+    test_text = "I like blue eggs"
+    doc = nlp(test_text)
+    print("Morphs", [(t.text, t.morph) for t in doc])
+
+    # save model to output directory
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        nlp.to_disk(output_dir)
+        print("Saved model to", output_dir)
+
+        # test the saved model
+        print("Loading from", output_dir)
+        nlp2 = spacy.load(output_dir)
+        doc = nlp2(test_text)
+        print("Morphs", [(t.text, t.morph) for t in doc])
+
+
+if __name__ == "__main__":
+    plac.call(main)
+
+# Expected output:
+# Morphs [('I', POS=PRON|Person=1|PronType=Prs), ('like', POS=VERB|VerbForm=Fin), ('blue', Degree=Pos|POS=ADJ), ('eggs', Number=Plur|POS=NOUN)]
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@ -17,6 +17,7 @@ import random
 import warnings
 from pathlib import Path
 import spacy
+from spacy.gold import Example
 from spacy.util import minibatch, compounding


@ -43,41 +44,41 @@ def main(model=None, output_dir=None, n_iter=100):

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
-    if "ner" not in nlp.pipe_names:
-        ner = nlp.create_pipe("ner")
+    if "simple_ner" not in nlp.pipe_names:
+        ner = nlp.create_pipe("simple_ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
-        ner = nlp.get_pipe("ner")
+        ner = nlp.get_pipe("simple_ner")

-    # add labels
-    for _, annotations in TRAIN_DATA:
+    # add labels and create Example objects
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
+            print("Add label", ent[2])
            ner.add_label(ent[2])

-    # get names of other pipes to disable them during training
-    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
-    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
-    # only train NER
-    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
+    with nlp.select_pipes(enable="simple_ner"), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
-        warnings.filterwarnings("once", category=UserWarning, module='spacy')
+        warnings.filterwarnings("once", category=UserWarning, module="spacy")

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
+        print(
+            "Transitions", list(enumerate(nlp.get_pipe("simple_ner").get_tag_names()))
+        )
        for itn in range(n_iter):
-            random.shuffle(TRAIN_DATA)
+            random.shuffle(train_examples)
            losses = {}
            # batch up the examples using spaCy's minibatch
-            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+            batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
-                texts, annotations = zip(*batch)
                nlp.update(
-                    texts,  # batch of texts
-                    annotations,  # batch of annotations
-                    drop=0.5,  # dropout - make it harder to memorise data
+                    batch,
+                    drop=0.0,  # dropout rate (0.0 = no dropout is applied)
                    losses=losses,
                )
            print("Losses", losses)
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@ -80,6 +80,10 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
+    from spacy.gold import Example  # this patch adds no module-level import
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotation))
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
@ -95,23 +99,18 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)
-    # get names of other pipes to disable them during training
-    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
-    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
-    # only train NER
-    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
+    with nlp.select_pipes(enable="ner"), warnings.catch_warnings():  # only train NER
        # show warnings for misaligned entity spans once
-        warnings.filterwarnings("once", category=UserWarning, module='spacy')
+        warnings.filterwarnings("once", category=UserWarning, module="spacy")

        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
-            random.shuffle(TRAIN_DATA)
-            batches = minibatch(TRAIN_DATA, size=sizes)
+            random.shuffle(train_examples)
+            batches = minibatch(train_examples, size=sizes)
            losses = {}
            for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
+                nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

    # test the trained model
--- a/examples/training/train_parser.py
+++ b/examples/training/train_parser.py
@ -14,6 +14,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.gold import Example
 from spacy.util import minibatch, compounding


@ -59,24 +60,22 @@ def main(model=None, output_dir=None, n_iter=15):
    else:
        parser = nlp.get_pipe("parser")

-    # add labels to the parser
-    for _, annotations in TRAIN_DATA:
+    # add labels to the parser and create the Example objects
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

-    # get names of other pipes to disable them during training
-    pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
-    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
-    with nlp.disable_pipes(*other_pipes):  # only train parser
+    with nlp.select_pipes(enable="parser"):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
-            random.shuffle(TRAIN_DATA)
+            random.shuffle(train_examples)
            losses = {}
            # batch up the examples using spaCy's minibatch
-            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+            batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+                nlp.update(batch, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@ -17,6 +17,7 @@ import plac
 import random
 from pathlib import Path
 import spacy
+from spacy.gold import Example
 from spacy.util import minibatch, compounding


@ -58,15 +59,18 @@ def main(lang="en", output_dir=None, n_iter=25):
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)

+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+
    optimizer = nlp.begin_training()
    for i in range(n_iter):
-        random.shuffle(TRAIN_DATA)
+        random.shuffle(train_examples)
        losses = {}
        # batch up the examples using spaCy's minibatch
-        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
-            texts, annotations = zip(*batch)
-            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+            nlp.update(batch, sgd=optimizer, losses=losses)
        print("Losses", losses)

    # test the trained model
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@ -2,89 +2,86 @@
 # coding: utf8
 """Train a convolutional neural network text classifier on the
 IMDB dataset, using the TextCategorizer component. The dataset will be loaded
-automatically via Thinc's built-in dataset loader. The model is added to
+automatically via the package `ml_datasets`. The model is added to
 spacy.pipeline, and predictions are available via `doc.cats`. For more details,
 see the documentation:
 * Training: https://spacy.io/usage/training

-Compatible with: spaCy v2.0.0+
+Compatible with: spaCy v3.0.0+
 """
 from __future__ import unicode_literals, print_function
+
 import plac
 import random
 from pathlib import Path
-import thinc.extra.datasets
+from ml_datasets import loaders

 import spacy
+from spacy import util
 from spacy.util import minibatch, compounding
+from spacy.gold import Example


@plac.annotations(
-    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
+    config_path=("Path to config file", "positional", None, Path),
    output_dir=("Optional output directory", "option", "o", Path),
    n_texts=("Number of texts to train from", "option", "t", int),
    n_iter=("Number of training iterations", "option", "n", int),
    init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path),
+    dataset=("Dataset to train on (default: imdb)", "option", "d", str),
+    threshold=("Min. number of instances for a given label (default 20)", "option", "m", int)
 )
-def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
+def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None, dataset="imdb", threshold=20):
+    if not config_path or not config_path.exists():
+        raise ValueError(f"Config file not found at {config_path}")
+
+    spacy.util.fix_random_seed()
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

-    if model is not None:
-        nlp = spacy.load(model)  # load existing spaCy model
-        print("Loaded model '%s'" % model)
-    else:
-        nlp = spacy.blank("en")  # create blank Language class
-        print("Created blank 'en' model")
+    print(f"Loading nlp model from {config_path}")
+    nlp_config = util.load_config(config_path, create_objects=False)["nlp"]
+    nlp = util.load_model_from_config(nlp_config)

-    # add the text classifier to the pipeline if it doesn't exist
-    # nlp.create_pipe works for built-ins that are registered with spaCy
+    # ensure the nlp object was defined with a textcat component
    if "textcat" not in nlp.pipe_names:
-        textcat = nlp.create_pipe(
-            "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
-        )
-        nlp.add_pipe(textcat, last=True)
-    # otherwise, get it, so we can add labels to it
-    else:
+        raise ValueError(f"The nlp definition in the config does not contain a textcat component")
+
    textcat = nlp.get_pipe("textcat")

-    # add label to text classifier
-    textcat.add_label("POSITIVE")
-    textcat.add_label("NEGATIVE")
-
-    # load the IMDB dataset
-    print("Loading IMDB data...")
-    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
-    train_texts = train_texts[:n_texts]
-    train_cats = train_cats[:n_texts]
+    # load the dataset
+    print(f"Loading dataset {dataset} ...")
+    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(dataset=dataset, threshold=threshold, limit=n_texts)
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
-    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
+    train_examples = []
+    for text, cats in zip(train_texts, train_cats):
+        doc = nlp.make_doc(text)
+        example = Example.from_dict(doc, {"cats": cats})
+        for cat in cats:
+            textcat.add_label(cat)
+        train_examples.append(example)

-    # get names of other pipes to disable them during training
-    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
-    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
-    with nlp.disable_pipes(*other_pipes):  # only train textcat
+    with nlp.select_pipes(enable="textcat"):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
-                textcat.model.tok2vec.from_bytes(file_.read())
+                textcat.model.get_ref("tok2vec").from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
-            random.shuffle(train_data)
-            batches = minibatch(train_data, size=batch_sizes)
+            random.shuffle(train_examples)
+            batches = minibatch(train_examples, size=batch_sizes)
            for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
+                nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
@ -97,7 +94,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
                )
            )

-    # test the trained model
+    # test the trained model (only makes sense for sentiment analysis)
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)
@ -114,14 +111,48 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
        print(test_text, doc2.cats)


-def load_data(limit=0, split=0.8):
-    """Load data from the IMDB dataset."""
+def load_data(dataset, threshold, limit=0, split=0.8):
+    """Load `dataset`, drop labels with fewer than `threshold` instances, and return ((texts, cats) for training, (texts, cats) for evaluation)."""
    # Partition off part of the train data for evaluation
-    train_data, _ = thinc.extra.datasets.imdb()
+    data_loader = loaders.get(dataset)
+    train_data, _ = data_loader(limit=int(limit/split))
    random.shuffle(train_data)
-    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
+    # Collect all label values: each example has one int/str label or a list/set of labels
+    unique_labels = set()
+    for label_set in labels:
+        if isinstance(label_set, int) or isinstance(label_set, str):
+            unique_labels.add(label_set)
+        elif isinstance(label_set, list) or isinstance(label_set, set):
+            unique_labels.update(label_set)
+    unique_labels = sorted(unique_labels)
+    print(f"# of unique_labels: {len(unique_labels)}")
+    # Count how often each label occurs so that rare labels can be dropped below
+    count_values_train = dict()
+    for text, annot_list in train_data:
+        if isinstance(annot_list, int) or isinstance(annot_list, str):
+            count_values_train[annot_list] = count_values_train.get(annot_list, 0) + 1
+        else:
+            for annot in annot_list:
+                count_values_train[annot] = count_values_train.get(annot, 0) + 1
+    for value, count in sorted(count_values_train.items(), key=lambda item: item[1]):
+        if count < threshold:
+            unique_labels.remove(value)
+
+    print(f"# of unique_labels after filtering with threshold {threshold}: {len(unique_labels)}")
+    # `unique_labels` is a sorted list here, so compare it as a set to detect binary 0/1 labels
+    if set(unique_labels) == {0, 1}:
        cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
+    else:
+        cats = []
+        for y in labels:
+            if isinstance(y, str) or isinstance(y, int):
+                cats.append({str(label): (label == y) for label in unique_labels})
+            elif isinstance(y, (list, set)):  # accept the same multi-label containers that were collected above
+                cats.append({str(label): (label in y) for label in unique_labels})
+            else:
+                raise ValueError(f"Unrecognised type of labels: {type(y)}")
+
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

--- a/examples/training/train_textcat_config.cfg
+++ b/examples/training/train_textcat_config.cfg
@ -0,0 +1,19 @@
+[nlp]
+lang = "en"
+
+[nlp.pipeline.textcat]
+factory = "textcat"
+
+[nlp.pipeline.textcat.model]
+@architectures = "spacy.TextCatCNN.v1"
+exclusive_classes = false
+
+[nlp.pipeline.textcat.model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 96
+depth = 4
+embed_size = 2000
+window_size = 1
+maxout_pieces = 3
+subword_features = true
--- a/fabfile.py
+++ b/fabfile.py
@ -1,9 +1,6 @@
-# coding: utf-8
-from __future__ import unicode_literals, print_function
-
 import contextlib
 from pathlib import Path
-from fabric.api import local, lcd, env, settings, prefix
+from fabric.api import local, lcd
 from os import path, environ
 import shutil
 import sys
@ -82,9 +79,7 @@ def pex():
    with virtualenv(VENV_DIR) as venv_local:
        with lcd(path.dirname(__file__)):
            sha = local("git rev-parse --short HEAD", capture=True)
-            venv_local(
-                "pex dist/*.whl -e spacy -o dist/spacy-%s.pex" % sha, direct=True
-            )
+            venv_local(f"pex dist/*.whl -e spacy -o dist/spacy-{sha}.pex", direct=True)


 def clean():
--- a/netlify.toml
+++ b/netlify.toml
@ -38,6 +38,13 @@ redirects = [
    {from = "/docs/usage/showcase", to = "/universe", force = true},
    {from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true},
    {from = "/tutorials", to = "/usage/examples", force = true},
+    # Old documentation pages (v2.x)
+    {from = "/usage/adding-languages", to = "/usage/linguistic-features", force = true},
+    {from = "/usage/vectors-similarity", to = "/usage/vectors-embeddings", force = true},
+    {from = "/api/goldparse", to = "/api/top-level", force = true},
+    {from = "/api/goldcorpus", to = "/api/corpus", force = true},
+    {from = "/api/annotation", to = "/api/data-formats", force = true},
+    {from = "/usage/examples", to = "/usage/projects", force = true},
    # Rewrite all other docs pages to /
    {from = "/docs/*", to = "/:splat"},
    # Updated documentation pages
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,6 +6,8 @@ requires = [
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
-    "thinc==7.4.1",
+    "thinc>=8.0.0a17,<8.0.0a20",
+    "blis>=0.4.0,<0.5.0",
+    "pytokenizations"
 ]
 build-backend = "setuptools.build_meta"
--- a/requirements.txt
+++ b/requirements.txt
@ -1,20 +1,24 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc==7.4.1
+thinc>=8.0.0a17,<8.0.0a20
 blis>=0.4.0,<0.5.0
+ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.4.0,<1.1.0
-srsly>=1.0.2,<1.1.0
+wasabi>=0.7.0,<1.1.0
+srsly>=2.1.0,<3.0.0
 catalogue>=0.0.7,<1.1.0
+typer>=0.3.0,<0.4.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
-plac>=0.9.6,<1.2.0
-pathlib==1.0.1; python_version < "3.4"
 tqdm>=4.38.0,<5.0.0
-# Optional dependencies
-jsonschema>=2.6.0,<3.1.0
+pydantic>=1.3.0,<2.0.0
+pytokenizations
+# Official Python utilities
+setuptools
+packaging
+importlib_metadata>=0.20; python_version < "3.8"
 # Development dependencies
 cython>=0.25
 pytest>=4.6.5
--- a/setup.cfg
+++ b/setup.cfg
@ -16,10 +16,7 @@ classifiers =
    Operating System :: MacOS :: MacOS X
    Operating System :: Microsoft :: Windows
    Programming Language :: Cython
-    Programming Language :: Python :: 2
-    Programming Language :: Python :: 2.7
    Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.5
    Programming Language :: Python :: 3.6
    Programming Language :: Python :: 3.7
    Programming Language :: Python :: 3.8
@ -28,34 +25,41 @@ classifiers =
 [options]
 zip_safe = false
 include_package_data = true
-scripts =
-    bin/spacy
-python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*
+python_requires = >=3.6
 setup_requires =
    wheel
    cython>=0.25
+    numpy>=1.15.0
    # We also need our Cython packages here to compile against
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
-    thinc==7.4.1
+    thinc>=8.0.0a17,<8.0.0a20
 install_requires =
    # Our libraries
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
-    thinc==7.4.1
+    thinc>=8.0.0a17,<8.0.0a20
    blis>=0.4.0,<0.5.0
-    wasabi>=0.4.0,<1.1.0
-    srsly>=1.0.2,<1.1.0
+    wasabi>=0.7.0,<1.1.0
+    srsly>=2.1.0,<3.0.0
    catalogue>=0.0.7,<1.1.0
+    typer>=0.3.0,<0.4.0
    # Third-party dependencies
    tqdm>=4.38.0,<5.0.0
-    setuptools
    numpy>=1.15.0
-    plac>=0.9.6,<1.2.0
    requests>=2.13.0,<3.0.0
-    pathlib==1.0.1; python_version < "3.4"
+    pydantic>=1.3.0,<2.0.0
+    pytokenizations
+    # Official Python utilities
+    setuptools
+    packaging
+    importlib_metadata>=0.20; python_version < "3.8"
+
+[options.entry_points]
+console_scripts =
+    spacy = spacy.cli:app

 [options.extras_require]
 lookups =
--- a/setup.py
+++ b/setup.py
@ -1,36 +1,30 @@
 #!/usr/bin/env python
-from __future__ import print_function
-import io
-import os
-import subprocess
+from setuptools import Extension, setup, find_packages
 import sys
-import contextlib
+import platform
 from distutils.command.build_ext import build_ext
 from distutils.sysconfig import get_python_inc
-import distutils.util
 from distutils import ccompiler, msvccompiler
-from setuptools import Extension, setup, find_packages
+import numpy
+from pathlib import Path
+import shutil
+from Cython.Build import cythonize
+from Cython.Compiler import Options
+import os
+import subprocess


-def is_new_osx():
-    """Check whether we're on OSX >= 10.10"""
-    name = distutils.util.get_platform()
-    if sys.platform != "darwin":
-        return False
-    elif name.startswith("macosx-10"):
-        minor_version = int(name.split("-")[1].split(".")[1])
-        if minor_version >= 7:
-            return True
-        else:
-            return False
-    else:
-        return False
+ROOT = Path(__file__).parent
+PACKAGE_ROOT = ROOT / "spacy"


+# Preserve `__doc__` on functions and classes
+# http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options
+Options.docstrings = True
+
 PACKAGES = find_packages()
-
-
 MOD_NAMES = [
+    "spacy.gold.example",
    "spacy.parts_of_speech",
    "spacy.strings",
    "spacy.lexeme",
@ -45,11 +39,10 @@ MOD_NAMES = [
    "spacy.tokenizer",
    "spacy.syntax.nn_parser",
    "spacy.syntax._parser_model",
-    "spacy.syntax._beam_utils",
    "spacy.syntax.nonproj",
    "spacy.syntax.transition_system",
    "spacy.syntax.arc_eager",
-    "spacy.gold",
+    "spacy.gold.gold_io",
    "spacy.tokens.doc",
    "spacy.tokens.span",
    "spacy.tokens.token",
@ -62,16 +55,37 @@ MOD_NAMES = [
    "spacy.symbols",
    "spacy.vectors",
 ]
-
-
 COMPILE_OPTIONS = {
    "msvc": ["/Ox", "/EHsc"],
    "mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
    "other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
 }
-
-
 LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}
+COMPILER_DIRECTIVES = {
+    "language_level": -3,
+    "embedsignature": True,
+    "annotation_typing": False,
+}
+# Files to copy into the package that are otherwise not included
+COPY_FILES = {
+    ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
+    ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
+    ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
+}
+
+
+def is_new_osx():
+    """Return True on macOS whose version string starts with "10" and whose minor version is >= 7; False otherwise."""
+    if sys.platform != "darwin":
+        return False
+    mac_ver = platform.mac_ver()[0]
+    if mac_ver.startswith("10"):
+        minor_version = int(mac_ver.split(".")[1])
+        if minor_version >= 7:
+            return True
+        else:
+            return False
+    return False  # NOTE(review): also reached for versions not starting with "10" (e.g. "11.x") - confirm this is intended


 if is_new_osx():
@ -104,20 +118,6 @@ class build_ext_subclass(build_ext, build_ext_options):
        build_ext.build_extensions(self)


-def generate_cython(root, source):
-    print("Cythonizing sources")
-    p = subprocess.call(
-        [sys.executable, os.path.join(root, "bin", "cythonize.py"), source],
-        env=os.environ,
-    )
-    if p != 0:
-        raise RuntimeError("Running cythonize failed")
-
-
-def is_source_release(path):
-    return os.path.exists(os.path.join(path, "PKG-INFO"))
-
-
 # Include the git version in the build (adapted from NumPy)
 # Copyright (c) 2005-2020, NumPy Developers.
 # BSD 3-Clause license, see licenses/3rd_party_licenses.txt
@ -137,19 +137,19 @@ def write_git_info_py(filename="spacy/git_info.py"):
        return out

    git_version = "Unknown"
-    if os.path.exists(".git"):
+    if Path(".git").exists():
        try:
            out = _minimal_ext_cmd(["git", "rev-parse", "--short", "HEAD"])
            git_version = out.strip().decode("ascii")
-        except:
+        except Exception:
            pass
-    elif os.path.exists(filename):
+    elif Path(filename).exists():
        # must be a source distribution, use existing version file
        try:
            a = open(filename, "r")
            lines = a.readlines()
            git_version = lines[-1].split('"')[1]
-        except:
+        except Exception:
            pass
        finally:
            a.close()
@ -160,89 +160,58 @@ GIT_VERSION = "%(git_version)s"
 """
    a = open(filename, "w")
    try:
-        a.write(
-            text % {"git_version": git_version,}
-        )
+        a.write(text % {"git_version": git_version})
    finally:
        a.close()


 def clean(path):
-    for name in MOD_NAMES:
-        name = name.replace(".", "/")
-        for ext in [".so", ".html", ".cpp", ".c"]:
-            file_path = os.path.join(path, name + ext)
-            if os.path.exists(file_path):
-                os.unlink(file_path)
-
-
-@contextlib.contextmanager
-def chdir(new_dir):
-    old_dir = os.getcwd()
-    try:
-        os.chdir(new_dir)
-        sys.path.insert(0, new_dir)
-        yield
-    finally:
-        del sys.path[0]
-        os.chdir(old_dir)
+    for file_path in path.glob("**/*"):  # separate loop name so the `path` argument is not rebound while iterating
+        if file_path.is_file() and file_path.suffix in (".so", ".cpp", ".html"):
+            print(f"Deleting {file_path.name}")
+            file_path.unlink()


 def setup_package():
    write_git_info_py()
-
-    root = os.path.abspath(os.path.dirname(__file__))
-
    if len(sys.argv) > 1 and sys.argv[1] == "clean":
-        return clean(root)
+        return clean(PACKAGE_ROOT)

-    with chdir(root):
-        with io.open(os.path.join(root, "spacy", "about.py"), encoding="utf8") as f:
+    with (PACKAGE_ROOT / "about.py").open("r", encoding="utf8") as f:  # explicit encoding, as in the replaced io.open() call
        about = {}
        exec(f.read(), about)

+    for copy_file, target_dir in COPY_FILES.items():
+        if copy_file.exists():
+            shutil.copy(str(copy_file), str(target_dir))
+            print(f"Copied {copy_file} -> {target_dir}")
+
    include_dirs = [
        get_python_inc(plat_specific=True),
-            os.path.join(root, "include"),
+        numpy.get_include(),
+        str(ROOT / "include"),
    ]
-
    if (
        ccompiler.new_compiler().compiler_type == "msvc"
        and msvccompiler.get_build_version() == 9
    ):
-            include_dirs.append(os.path.join(root, "include", "msvc9"))
-
+        include_dirs.append(str(ROOT / "include" / "msvc9"))
    ext_modules = []
-        for mod_name in MOD_NAMES:
-            mod_path = mod_name.replace(".", "/") + ".cpp"
-            extra_link_args = []
-            # ???
-            # Imported from patch from @mikepb
-            # See Issue #267. Running blind here...
-            if sys.platform == "darwin":
-                dylib_path = [".." for _ in range(mod_name.count("."))]
-                dylib_path = "/".join(dylib_path)
-                dylib_path = "@loader_path/%s/spacy/platform/darwin/lib" % dylib_path
-                extra_link_args.append("-Wl,-rpath,%s" % dylib_path)
-            ext_modules.append(
-                Extension(
-                    mod_name,
-                    [mod_path],
-                    language="c++",
-                    include_dirs=include_dirs,
-                    extra_link_args=extra_link_args,
-                )
-            )
-
-        if not is_source_release(root):
-            generate_cython(root, "spacy")
+    for name in MOD_NAMES:
+        mod_path = name.replace(".", "/") + ".pyx"
+        ext = Extension(name, [mod_path], language="c++")
+        ext_modules.append(ext)
+    print("Cythonizing sources")
+    ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)

    setup(
-            name="spacy",
+        name="spacy-nightly",
        packages=PACKAGES,
        version=about["__version__"],
        ext_modules=ext_modules,
        cmdclass={"build_ext": build_ext_subclass},
+        include_dirs=include_dirs,
+        package_data={"": ["*.pyx", "*.pxd", "*.pxi", "*.cpp"]},
    )


--- a/spacy/init.py
+++ b/spacy/init.py
@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
 import warnings
 import sys

@ -7,10 +5,10 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")
 warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

 # These are imported as part of the API
-from thinc.neural.util import prefer_gpu, require_gpu
+from thinc.api import prefer_gpu, require_gpu

 from . import pipeline
-from .cli.info import info as cli_info
+from .cli.info import info
 from .glossary import explain
 from .about import __version__
 from .errors import Errors, Warnings
@ -23,17 +21,13 @@ if sys.maxunicode == 65535:
    raise SystemError(Errors.E130)


+config = registry
+
+
 def load(name, **overrides):
-    depr_path = overrides.get("path")
-    if depr_path not in (True, False, None):
-        warnings.warn(Warnings.W001.format(path=depr_path), DeprecationWarning)
    return util.load_model(name, **overrides)


 def blank(name, **kwargs):
    LangClass = util.get_lang_class(name)
    return LangClass(**kwargs)
-
-
-def info(model=None, markdown=False, silent=False):
-    return cli_info(model, markdown, silent)
--- a/spacy/main.py
+++ b/spacy/main.py
@ -1,36 +1,4 @@
-# coding: utf8
-from __future__ import print_function
-
-# NB! This breaks in plac on Python 2!!
-# from __future__ import unicode_literals
-
 if __name__ == "__main__":
-    import plac
-    import sys
-    from wasabi import msg
-    from spacy.cli import download, link, info, package, train, pretrain, convert
-    from spacy.cli import init_model, profile, evaluate, validate, debug_data
+    from spacy.cli import setup_cli

-    commands = {
-        "download": download,
-        "link": link,
-        "info": info,
-        "train": train,
-        "pretrain": pretrain,
-        "debug-data": debug_data,
-        "evaluate": evaluate,
-        "convert": convert,
-        "package": package,
-        "init-model": init_model,
-        "profile": profile,
-        "validate": validate,
-    }
-    if len(sys.argv) == 1:
-        msg.info("Available commands", ", ".join(commands), exits=1)
-    command = sys.argv.pop(1)
-    sys.argv[0] = "spacy %s" % command
-    if command in commands:
-        plac.call(commands[command], sys.argv[1:])
-    else:
-        available = "Available: {}".format(", ".join(commands))
-        msg.fail("Unknown command: {}".format(command), available, exits=1)
+    setup_cli()
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,7 +1,7 @@
 # fmt: off
-__title__ = "spacy"
-__version__ = "2.3.2"
+__title__ = "spacy-nightly"
+__version__ = "3.0.0a4"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
-__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
+__projects__ = "https://github.com/explosion/spacy-boilerplates"
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@ -91,6 +91,7 @@ cdef enum attr_id_t:

    LANG
    ENT_KB_ID = symbols.ENT_KB_ID
+    MORPH
    ENT_ID = symbols.ENT_ID

    IDX
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 IDS = {
    "": NULL_ATTR,
@ -92,6 +89,7 @@ IDS = {
    "SPACY": SPACY,
    "PROB": PROB,
    "LANG": LANG,
+    "MORPH": MORPH,
    "IDX": IDX
 }

--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -1,12 +1,32 @@
+from wasabi import msg
+
+from ._util import app, setup_cli  # noqa: F401
+
+# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
+# are registered automatically and won't have to be imported here.
 from .download import download  # noqa: F401
 from .info import info  # noqa: F401
-from .link import link  # noqa: F401
 from .package import package  # noqa: F401
 from .profile import profile  # noqa: F401
-from .train import train  # noqa: F401
+from .train import train_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
+from .debug_model import debug_model  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_model import init_model  # noqa: F401
 from .validate import validate  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401
+
+
+@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
+def link(*args, **kwargs):  # arguments are accepted but not used; the body only prints the warning below
+    """As of spaCy v3.0, model symlinks are deprecated. You can load models
+    using their full names or from a directory path."""
+    msg.warn(
+        "As of spaCy v3.0, model symlinks are deprecated. You can load models "
+        "using their full names or from a directory path."
+    )
--- a/spacy/cli/_schemas.py
+++ b/spacy/cli/_schemas.py
@ -1,220 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-
-# NB: This schema describes the new format of the training data, see #2928
-TRAINING_SCHEMA = {
-    "$schema": "http://json-schema.org/draft-06/schema",
-    "title": "Training data for spaCy models",
-    "type": "array",
-    "items": {
-        "type": "object",
-        "properties": {
-            "text": {
-                "title": "The text of the training example",
-                "type": "string",
-                "minLength": 1,
-            },
-            "ents": {
-                "title": "Named entity spans in the text",
-                "type": "array",
-                "items": {
-                    "type": "object",
-                    "properties": {
-                        "start": {
-                            "title": "Start character offset of the span",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                        "end": {
-                            "title": "End character offset of the span",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                        "label": {
-                            "title": "Entity label",
-                            "type": "string",
-                            "minLength": 1,
-                            "pattern": "^[A-Z0-9]*$",
-                        },
-                    },
-                    "required": ["start", "end", "label"],
-                },
-            },
-            "sents": {
-                "title": "Sentence spans in the text",
-                "type": "array",
-                "items": {
-                    "type": "object",
-                    "properties": {
-                        "start": {
-                            "title": "Start character offset of the span",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                        "end": {
-                            "title": "End character offset of the span",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                    },
-                    "required": ["start", "end"],
-                },
-            },
-            "cats": {
-                "title": "Text categories for the text classifier",
-                "type": "object",
-                "patternProperties": {
-                    "*": {
-                        "title": "A text category",
-                        "oneOf": [
-                            {"type": "boolean"},
-                            {"type": "number", "minimum": 0},
-                        ],
-                    }
-                },
-                "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1},
-            },
-            "tokens": {
-                "title": "The tokens in the text",
-                "type": "array",
-                "items": {
-                    "type": "object",
-                    "minProperties": 1,
-                    "properties": {
-                        "id": {
-                            "title": "Token ID, usually token index",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                        "start": {
-                            "title": "Start character offset of the token",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                        "end": {
-                            "title": "End character offset of the token",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                        "pos": {
-                            "title": "Coarse-grained part-of-speech tag",
-                            "type": "string",
-                            "minLength": 1,
-                        },
-                        "tag": {
-                            "title": "Fine-grained part-of-speech tag",
-                            "type": "string",
-                            "minLength": 1,
-                        },
-                        "dep": {
-                            "title": "Dependency label",
-                            "type": "string",
-                            "minLength": 1,
-                        },
-                        "head": {
-                            "title": "Index of the token's head",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                    },
-                    "required": ["start", "end"],
-                },
-            },
-            "_": {"title": "Custom user space", "type": "object"},
-        },
-        "required": ["text"],
-    },
-}
-
-META_SCHEMA = {
-    "$schema": "http://json-schema.org/draft-06/schema",
-    "type": "object",
-    "properties": {
-        "lang": {
-            "title": "Two-letter language code, e.g. 'en'",
-            "type": "string",
-            "minLength": 2,
-            "maxLength": 2,
-            "pattern": "^[a-z]*$",
-        },
-        "name": {
-            "title": "Model name",
-            "type": "string",
-            "minLength": 1,
-            "pattern": "^[a-z_]*$",
-        },
-        "version": {
-            "title": "Model version",
-            "type": "string",
-            "minLength": 1,
-            "pattern": "^[0-9a-z.-]*$",
-        },
-        "spacy_version": {
-            "title": "Compatible spaCy version identifier",
-            "type": "string",
-            "minLength": 1,
-            "pattern": "^[0-9a-z.-><=]*$",
-        },
-        "parent_package": {
-            "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
-            "type": "string",
-            "minLength": 1,
-            "default": "spacy",
-        },
-        "pipeline": {
-            "title": "Names of pipeline components",
-            "type": "array",
-            "items": {"type": "string", "minLength": 1},
-        },
-        "description": {"title": "Model description", "type": "string"},
-        "license": {"title": "Model license", "type": "string"},
-        "author": {"title": "Model author name", "type": "string"},
-        "email": {"title": "Model author email", "type": "string", "format": "email"},
-        "url": {"title": "Model author URL", "type": "string", "format": "uri"},
-        "sources": {
-            "title": "Training data sources",
-            "type": "array",
-            "items": {"type": "string"},
-        },
-        "vectors": {
-            "title": "Included word vectors",
-            "type": "object",
-            "properties": {
-                "keys": {
-                    "title": "Number of unique keys",
-                    "type": "integer",
-                    "minimum": 0,
-                },
-                "vectors": {
-                    "title": "Number of unique vectors",
-                    "type": "integer",
-                    "minimum": 0,
-                },
-                "width": {
-                    "title": "Number of dimensions",
-                    "type": "integer",
-                    "minimum": 0,
-                },
-            },
-        },
-        "accuracy": {
-            "title": "Accuracy numbers",
-            "type": "object",
-            "patternProperties": {"*": {"type": "number", "minimum": 0.0}},
-        },
-        "speed": {
-            "title": "Speed evaluation numbers",
-            "type": "object",
-            "patternProperties": {
-                "*": {
-                    "oneOf": [
-                        {"type": "number", "minimum": 0.0},
-                        {"type": "integer", "minimum": 0},
-                    ]
-                }
-            },
-        },
-    },
-    "required": ["lang", "name", "version"],
-}
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@ -0,0 +1,195 @@
+from typing import Dict, Any, Union, List, Optional
+from pathlib import Path
+from wasabi import msg
+import srsly
+import hashlib
+import typer
+from typer.main import get_command
+from contextlib import contextmanager
+from thinc.config import ConfigValidationError
+from configparser import InterpolationError
+import sys
+
+from ..schemas import ProjectConfigSchema, validate
+from ..util import import_file
+
+
+# Name of the project config file that load_project_config() looks for inside
+# a project directory.
+PROJECT_FILE = "project.yml"
+# NOTE(review): presumably the name of the project lock file; it is not used
+# in this part of the module, confirm against the project commands.
+PROJECT_LOCK = "project.lock"
+# Program name handed to the CLI in setup_cli() so that help output shows the
+# invocation users actually type.
+COMMAND = "python -m spacy"
+# Name of the top-level Typer app.
+NAME = "spacy"
+# Help texts for the top-level app and the "project" / "debug" sub-apps.
+HELP = """spaCy Command-line Interface
+
+DOCS: https://spacy.io/api/cli
+"""
+PROJECT_HELP = f"""Command-line interface for spaCy projects and templates.
+You'd typically start by cloning a project template to a local directory and
+fetching its assets like datasets etc. See the project's {PROJECT_FILE} for the
+available commands.
+"""
+DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
+commands to check and validate your config files, training and evaluation data,
+and custom model implementations.
+"""
+
+# Wrappers for Typer's annotations. Initially created to set defaults and to
+# keep the names short, but not needed at the moment.
+Arg = typer.Argument
+Opt = typer.Option
+
+# The main CLI app plus one sub-app per command group. The sub-apps print
+# their help text when called without arguments (no_args_is_help=True).
+app = typer.Typer(name=NAME, help=HELP)
+project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
+debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
+
+# Register the command groups on the main app.
+app.add_typer(project_cli)
+app.add_typer(debug_cli)
+
+
+def setup_cli() -> None:
+    """Resolve the command object for the main Typer app and run it.
+
+    The program name is passed explicitly to ensure that the help messages
+    always display the correct prompt.
+    """
+    get_command(app)(prog_name=COMMAND)
+
+
+def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
+    """Generate a dictionary of config overrides based on the extra arguments
+    provided on the CLI, e.g. --training.batch_size to override
+    "training.batch_size". Arguments without a "." are considered invalid,
+    since the config only allows top-level sections to exist. The list that
+    is passed in is only read and never modified.
+
+    args (List[str]): The extra arguments from the command line.
+    RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
+    """
+    result = {}
+    n_args = len(args)
+    # Walk the arguments by index instead of popping from the front: this
+    # leaves the caller's list intact and avoids the O(n) cost of pop(0).
+    i = 0
+    while i < n_args:
+        opt = args[i]
+        i += 1
+        err = f"Invalid config override '{opt}'"
+        if opt.startswith("--"):  # new argument
+            opt = opt.replace("--", "").replace("-", "_")
+            if "." not in opt:
+                msg.fail(f"{err}: can't override top-level section", exits=1)
+            if i >= n_args or args[i].startswith("--"):  # flag with no value
+                value = "true"
+            else:
+                # The next argument is this option's value, so consume it too
+                value = args[i]
+                i += 1
+            # Just like we do in the config, we're calling json.loads on the
+            # values. But since they come from the CLI, it'd be unintuitive to
+            # explicitly mark strings with escaped quotes. So we're working
+            # around that here by falling back to a string if parsing fails.
+            # TODO: improve logic to handle simple types like list of strings?
+            try:
+                result[opt] = srsly.json_loads(value)
+            except ValueError:
+                result[opt] = str(value)
+        else:
+            msg.fail(f"{err}: options need to start with --", exits=1)
+    return result
+
+
+def load_project_config(path: Path) -> Dict[str, Any]:
+    """Read and validate the project.yml of a project directory and create
+    any directories listed in the config that don't exist yet.
+
+    path (Path): The path to the project directory.
+    RETURNS (Dict[str, Any]): The loaded project.yml.
+    """
+    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
+    project_file = path / PROJECT_FILE
+    if not project_file.exists():
+        msg.fail(f"Can't find {PROJECT_FILE}", project_file, exits=1)
+    try:
+        config = srsly.read_yaml(project_file)
+    except ValueError as e:
+        msg.fail(invalid_err, e, exits=1)
+    # Schema problems are reported together, one per line
+    problems = validate(ProjectConfigSchema, config)
+    if problems:
+        msg.fail(invalid_err, "\n".join(problems), exits=1)
+    validate_project_commands(config)
+    # Create the directories from the config that are still missing
+    directories = (path / subdir for subdir in config.get("directories", []))
+    for directory in directories:
+        if directory.exists():
+            continue
+        directory.mkdir(parents=True)
+    return config
+
+
+def validate_project_commands(config: Dict[str, Any]) -> None:
+    """Check that project commands and workflows are valid, don't contain
+    duplicates, don't clash and only refer to commands that exist. Every
+    problem is reported via msg.fail with exits=1.
+
+    config (Dict[str, Any]): The loaded config.
+    """
+    command_names = [cmd["name"] for cmd in config.get("commands", [])]
+    workflows = config.get("workflows", {})
+    # Find duplicates in a single pass instead of calling list.count() for
+    # every name. The resulting set of known names also gives constant-time
+    # lookups for the workflow checks below.
+    # NOTE(review): assumes command names and workflow steps are strings
+    # (hashable), as expected after schema validation - confirm.
+    known_commands = set()
+    duplicates = set()
+    for name in command_names:
+        if name in known_commands:
+            duplicates.add(name)
+        known_commands.add(name)
+    if duplicates:
+        # Sorted so the error message is the same on every run
+        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(sorted(duplicates))}"
+        msg.fail(err, exits=1)
+    for workflow_name, workflow_steps in workflows.items():
+        if workflow_name in known_commands:
+            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
+            msg.fail(err, exits=1)
+        for step in workflow_steps:
+            if step not in known_commands:
+                msg.fail(
+                    f"Unknown command specified in workflow '{workflow_name}': {step}",
+                    f"Workflows can only refer to commands defined in the 'commands' "
+                    f"section of the {PROJECT_FILE}.",
+                    exits=1,
+                )
+
+
+def get_hash(data) -> str:
+    """Compute the MD5 hex digest of a JSON-serializable object. The object
+    is dumped to JSON with sorted keys and encoded as UTF-8 before hashing.
+
+    data: The data to hash.
+    RETURNS (str): The hash.
+    """
+    serialized = srsly.json_dumps(data, sort_keys=True)
+    digest = hashlib.md5(serialized.encode("utf8"))
+    return digest.hexdigest()
+
+
+def get_checksum(path: Union[Path, str]) -> str:
+    """Get the checksum for a file or directory given its file path. If a
+    directory path is provided, this uses the contents of all files in that
+    directory (recursively, in sorted path order). Files are read in chunks,
+    so large files don't have to fit into memory.
+
+    path (Union[Path, str]): The file or directory path.
+    RETURNS (str): The MD5 checksum as a hex string.
+    RAISES (ValueError): If the path is neither a file nor a directory.
+    """
+    path = Path(path)
+    if path.is_file():
+        file_paths = [path]
+    elif path.is_dir():
+        # Sorted so the checksum doesn't depend on the file system's ordering
+        file_paths = sorted(fp for fp in path.rglob("*") if fp.is_file())
+    else:
+        raise ValueError(f"Can't get checksum for {path}: not a file or directory")
+    chunk_size = 1024 * 1024
+    checksum = hashlib.md5()
+    for file_path in file_paths:
+        with file_path.open("rb") as file_:
+            # Feeding the hash chunk by chunk gives the same digest as hashing
+            # the whole content at once, without loading it all into memory.
+            for chunk in iter(lambda: file_.read(chunk_size), b""):
+                checksum.update(chunk)
+    return checksum.hexdigest()
+
+
+@contextmanager
+def show_validation_error(title: str = "Config validation error"):
+    """Context manager that reports config validation and interpolation
+    errors raised in its body on the CLI and then exits with status 1.
+
+    title (str): Title of the custom formatted error.
+    """
+    try:
+        yield
+    except (ConfigValidationError, InterpolationError) as e:
+        # The title is printed separately, so drop the default one from the
+        # error text to avoid showing it twice.
+        details = str(e).replace("Config validation error", "")
+        msg.fail(title, spaced=True)
+        print(details.strip())
+        sys.exit(1)
+
+
+def import_code(code_path: Optional[Union[Path, str]]) -> None:
+    """Import the Python file passed to training commands / commands using
+    the config, so that custom registered functions become available. Does
+    nothing if no path is given.
+
+    code_path (Optional[Union[Path, str]]): Path to the Python file, or None.
+    """
+    if code_path is None:
+        return
+    if not Path(code_path).exists():
+        msg.fail("Path to Python code not found", code_path, exits=1)
+    try:
+        import_file("python_code", code_path)
+    except Exception as e:
+        # Whatever goes wrong while importing user code is shown as a CLI error
+        msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@ -1,132 +1,163 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import plac
+from typing import Optional
+from enum import Enum
 from pathlib import Path
 from wasabi import Printer
 import srsly
 import re
+import sys

-from .converters import conllu2json, iob2json, conll_ner2json
-from .converters import ner_jsonl2json
+from ._util import app, Arg, Opt
+from ..gold import docs_to_json
+from ..tokens import DocBin
+from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs


 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
 # entry to this dict with the file extension mapped to the converter function
 # imported from /converters.
+
 CONVERTERS = {
-    "conllubio": conllu2json,
-    "conllu": conllu2json,
-    "conll": conllu2json,
-    "ner": conll_ner2json,
-    "iob": iob2json,
-    "jsonl": ner_jsonl2json,
+    "conllubio": conllu2docs,
+    "conllu": conllu2docs,
+    "conll": conllu2docs,
+    "ner": conll_ner2docs,
+    "iob": iob2docs,
+    "json": json2docs,
 }

-# File types
-FILE_TYPES = ("json", "jsonl", "msg")
-FILE_TYPES_STDOUT = ("json", "jsonl")
+
+# File types that can be written to stdout
+FILE_TYPES_STDOUT = ("json",)


-@plac.annotations(
-    input_file=("Input file", "positional", None, str),
-    output_dir=("Output directory. '-' for stdout.", "positional", None, str),
-    file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
-    n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
-    seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
-    model=("Model for sentence segmentation (for -s)", "option", "b", str),
-    converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
-    lang=("Language (if tokenizer required)", "option", "l", str),
-    morphology=("Enable appending morphology to tags", "flag", "m", bool),
-)
-def convert(
-    input_file,
-    output_dir="-",
-    file_type="json",
-    n_sents=1,
-    seg_sents=False,
-    model=None,
-    morphology=False,
-    converter="auto",
-    lang=None,
+class FileTypes(str, Enum):
+    """Output formats that can be selected with the convert command's
+    --file-type option. Members are also strings, so their values can be
+    compared against plain str file types.
+    """
+
+    # JSON output, written with srsly.write_json
+    json = "json"
+    # Serialized DocBin bytes, written to a binary .spacy file
+    spacy = "spacy"
+
+
+@app.command("convert")
+def convert_cli(
+    # fmt: off
+    input_path: str = Arg(..., help="Input file or directory", exists=True),
+    output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
+    file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
+    n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
+    seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
+    model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"),
+    morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
+    merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
+    converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
+    ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
+    lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
+    # fmt: on
 ):
    """
-    Convert files into JSON format for use with train command and other
-    experiment management functions. If no output_dir is specified, the data
+    Convert files into json or DocBin format for training. The resulting .spacy
+    file can be used with the train command and other experiment management
+    functions.
+
+    If no output_dir is specified and the output format is JSON, the data
    is written to stdout, so you can pipe them forward to a JSON file:
-    $ spacy convert some_file.conllu > some_file.json
+    $ spacy convert some_file.conllu --file-type json > some_file.json
    """
-    no_print = output_dir == "-"
-    msg = Printer(no_print=no_print)
-    input_path = Path(input_file)
-    if file_type not in FILE_TYPES:
-        msg.fail(
-            "Unknown file type: '{}'".format(file_type),
-            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
-            exits=1,
+    if isinstance(file_type, FileTypes):
+        # We get an instance of the FileTypes from the CLI so we need its string value
+        file_type = file_type.value
+    input_path = Path(input_path)
+    output_dir = "-" if output_dir == Path("-") else output_dir
+    cli_args = locals()
+    silent = output_dir == "-"
+    msg = Printer(no_print=silent)
+    verify_cli_args(msg, **cli_args)
+    converter = _get_converter(msg, converter, input_path)
+    convert(
+        input_path,
+        output_dir,
+        file_type=file_type,
+        n_sents=n_sents,
+        seg_sents=seg_sents,
+        model=model,
+        morphology=morphology,
+        merge_subtokens=merge_subtokens,
+        converter=converter,
+        ner_map=ner_map,
+        lang=lang,
+        silent=silent,
+        msg=msg,
    )
-    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
-        # TODO: support msgpack via stdout in srsly?
-        msg.fail(
-            "Can't write .{} data to stdout.".format(file_type),
-            "Please specify an output directory.",
-            exits=1,
-        )
-    if not input_path.exists():
-        msg.fail("Input file not found", input_path, exits=1)
-    if output_dir != "-" and not Path(output_dir).exists():
-        msg.fail("Output directory not found", output_dir, exits=1)
-    input_data = input_path.open("r", encoding="utf-8").read()
-    if converter == "auto":
-        converter = input_path.suffix[1:]
-    if converter == "ner" or converter == "iob":
-        converter_autodetect = autodetect_ner_format(input_data)
-        if converter_autodetect == "ner":
-            msg.info("Auto-detected token-per-line NER format")
-            converter = converter_autodetect
-        elif converter_autodetect == "iob":
-            msg.info("Auto-detected sentence-per-line NER format")
-            converter = converter_autodetect
-        else:
-            msg.warn(
-                "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
-            )
-    if converter not in CONVERTERS:
-        msg.fail("Can't find converter for {}".format(converter), exits=1)
+
+
+def convert(
+    input_path: Path,
+    output_dir: Path,
+    *,
+    file_type: str = "json",
+    n_sents: int = 1,
+    seg_sents: bool = False,
+    model: Optional[str] = None,
+    morphology: bool = False,
+    merge_subtokens: bool = False,
+    converter: str = "auto",
+    ner_map: Optional[Path] = None,
+    lang: Optional[str] = None,
+    silent: bool = True,
+    msg: Optional[Path] = None,
+) -> None:
+    if not msg:
+        msg = Printer(no_print=silent)
+    ner_map = srsly.read_json(ner_map) if ner_map is not None else None
+
+    for input_loc in walk_directory(input_path):
+        input_data = input_loc.open("r", encoding="utf-8").read()
        # Use converter function to convert data
        func = CONVERTERS[converter]
-    data = func(
+        docs = func(
            input_data,
            n_sents=n_sents,
            seg_sents=seg_sents,
-        use_morphology=morphology,
+            append_morphology=morphology,
+            merge_subtokens=merge_subtokens,
            lang=lang,
            model=model,
-        no_print=no_print,
+            no_print=silent,
+            ner_map=ner_map,
        )
-    if output_dir != "-":
-        # Export data to a file
-        suffix = ".{}".format(file_type)
-        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
        if file_type == "json":
-            srsly.write_json(output_file, data)
-        elif file_type == "jsonl":
-            srsly.write_jsonl(output_file, data)
-        elif file_type == "msg":
-            srsly.write_msgpack(output_file, data)
-        msg.good(
-            "Generated output file ({} documents): {}".format(len(data), output_file)
-        )
+            data = [docs_to_json(docs)]
        else:
-        # Print to stdout
-        if file_type == "json":
+            data = DocBin(docs=docs, store_user_data=True).to_bytes()
+        if output_dir == "-":
+            _print_docs_to_stdout(data, file_type)
+        else:
+            if input_loc != input_path:
+                subpath = input_loc.relative_to(input_path)
+                output_file = Path(output_dir) / subpath.with_suffix(f".{file_type}")
+            else:
+                output_file = Path(output_dir) / input_loc.parts[-1]
+                output_file = output_file.with_suffix(f".{file_type}")
+            _write_docs_to_file(data, output_file, file_type)
+            msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
+
+
+def _print_docs_to_stdout(data, output_type):
+    if output_type == "json":
        srsly.write_json("-", data)
-        elif file_type == "jsonl":
-            srsly.write_jsonl("-", data)
+    else:
+        sys.stdout.buffer.write(data)


-def autodetect_ner_format(input_data):
+def _write_docs_to_file(data, output_file, output_type):
+    """Write converted data to a file, creating missing parent directories.
+
+    data: The JSON-serializable data if output_type is "json", otherwise the
+        bytes to write.
+    output_file (Path): The file to write to.
+    output_type (str): "json" to write JSON, anything else to write raw bytes.
+    """
+    target_dir = output_file.parent
+    if not target_dir.exists():
+        target_dir.mkdir(parents=True)
+    if output_type != "json":
+        with output_file.open("wb") as file_:
+            file_.write(data)
+        return
+    srsly.write_json(output_file, data)
+
+
+def autodetect_ner_format(input_data: str) -> str:
    # guess format from the first 20 lines
    lines = input_data.split("\n")[:20]
    format_guesses = {"ner": 0, "iob": 0}
@ -143,3 +174,86 @@ def autodetect_ner_format(input_data):
    if format_guesses["ner"] == 0 and format_guesses["iob"] > 0:
        return "iob"
    return None
+
+
+def walk_directory(path: Path) -> list:
+    """Collect all files below a directory, skipping hidden files and
+    directories (names starting with "."). If the path is not a directory,
+    it is returned as the only entry.
+
+    path (Path): The file or directory path.
+    RETURNS (list): The paths of the files that were found.
+    """
+    if not path.is_dir():
+        return [path]
+    # Work list that grows while it's being iterated over: directories add
+    # their entries to the end, so nested directories are visited as well.
+    queue = [path]
+    locs = []
+    seen = set()
+    for current in queue:
+        key = str(current)
+        if key in seen:
+            continue
+        seen.add(key)
+        # Check the name rather than parts[-1]: Path(".") has no parts, so
+        # indexing them raised an IndexError for the current directory.
+        if current.name.startswith("."):
+            continue
+        elif current.is_dir():
+            queue.extend(current.iterdir())
+        else:
+            locs.append(current)
+    return locs
+
+
+def verify_cli_args(
+    msg,
+    input_path,
+    output_dir,
+    file_type,
+    n_sents,
+    seg_sents,
+    model,
+    morphology,
+    merge_subtokens,
+    converter,
+    ner_map,
+    lang,
+):
+    """Validate the arguments of the convert command and report problems via
+    msg.fail with exits=1. Only input_path, output_dir, file_type and
+    converter are inspected; the other arguments are accepted so the full
+    set of CLI arguments can be passed in as keyword arguments.
+
+    RETURNS: The name of the converter resolved for the input.
+    """
+    input_path = Path(input_path)
+    to_stdout = output_dir == "-"
+    if to_stdout and file_type not in FILE_TYPES_STDOUT:
+        # Only the file types in FILE_TYPES_STDOUT can be printed
+        msg.fail(
+            f"Can't write .{file_type} data to stdout",
+            "Please specify an output directory.",
+            exits=1,
+        )
+    if not input_path.exists():
+        msg.fail("Input file not found", input_path, exits=1)
+    if not to_stdout and not Path(output_dir).exists():
+        msg.fail("Output directory not found", output_dir, exits=1)
+    if input_path.is_dir():
+        input_locs = walk_directory(input_path)
+        if not input_locs:
+            msg.fail("No input files in directory", input_path, exits=1)
+        # A directory can only be converted if all files share one extension
+        suffixes = list({loc.suffix[1:] for loc in input_locs})
+        if len(suffixes) > 1:
+            msg.fail("All input files must be same type", ",".join(suffixes), exits=1)
+    converter = _get_converter(msg, converter, input_path)
+    if converter not in CONVERTERS:
+        msg.fail(f"Can't find converter for {converter}", exits=1)
+    return converter
+
+
+def _get_converter(msg, converter, input_path):
+    """Resolve the converter name to use for an input file or directory.
+
+    For a directory, the first file returned by walk_directory is used as
+    the sample. "auto" is replaced by the file extension, and for "ner" /
+    "iob" the file content is inspected to tell the two formats apart.
+
+    msg: The printer used for info messages and warnings.
+    converter (str): The requested converter name or "auto".
+    input_path (Path): The input file or directory.
+    RETURNS (str): The resolved converter name.
+    """
+    if input_path.is_dir():
+        input_path = walk_directory(input_path)[0]
+    if converter == "auto":
+        converter = input_path.suffix[1:]
+    if converter in ("ner", "iob"):
+        # Read as UTF-8 like convert() does for the same files, instead of
+        # relying on the platform's default encoding.
+        with input_path.open("r", encoding="utf-8") as file_:
+            input_data = file_.read()
+        converter_autodetect = autodetect_ner_format(input_data)
+        if converter_autodetect == "ner":
+            msg.info("Auto-detected token-per-line NER format")
+            converter = converter_autodetect
+        elif converter_autodetect == "iob":
+            msg.info("Auto-detected sentence-per-line NER format")
+            converter = converter_autodetect
+        else:
+            msg.warn(
+                "Can't automatically detect NER format. "
+                "Conversion may not succeed. "
+                "See https://spacy.io/api/cli#convert"
+            )
+    return converter
--- a/spacy/cli/converters/init.py
+++ b/spacy/cli/converters/init.py
@ -1,4 +0,0 @@
-from .conllu2json import conllu2json  # noqa: F401
-from .iob2json import iob2json  # noqa: F401
-from .conll_ner2json import conll_ner2json  # noqa: F401
-from .jsonl2json import ner_jsonl2json  # noqa: F401
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@ -1,141 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import re
-
-from ...gold import iob_to_biluo
-
-
-def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
-    """
-    Convert conllu files into JSON format for use with train cli.
-    use_morphology parameter enables appending morphology to tags, which is
-    useful for languages such as Spanish, where UD tags are not so rich.
-
-    Extract NER tags if available and convert them so that they follow
-    BILUO and the Wikipedia scheme
-    """
-    # by @dvsrepo, via #11 explosion/spacy-dev-resources
-    # by @katarkor
-    docs = []
-    sentences = []
-    conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
-    checked_for_ner = False
-    has_ner_tags = False
-    for i, (raw_text, tokens) in enumerate(conll_tuples):
-        sentence, brackets = tokens[0]
-        if not checked_for_ner:
-            has_ner_tags = is_ner(sentence[5][0])
-            checked_for_ner = True
-        sentences.append(generate_sentence(sentence, has_ner_tags))
-        # Real-sized documents could be extracted using the comments on the
-        # conluu document
-        if len(sentences) % n_sents == 0:
-            doc = create_doc(sentences, i)
-            docs.append(doc)
-            sentences = []
-    if sentences:
-        doc = create_doc(sentences, i)
-        docs.append(doc)
-    return docs
-
-
-def is_ner(tag):
-    """
-    Check the 10th column of the first token to determine if the file contains
-    NER tags
-    """
-    tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
-    if tag_match:
-        return True
-    elif tag == "O":
-        return True
-    else:
-        return False
-
-
-def read_conllx(input_data, use_morphology=False, n=0):
-    i = 0
-    for sent in input_data.strip().split("\n\n"):
-        lines = sent.strip().split("\n")
-        if lines:
-            while lines[0].startswith("#"):
-                lines.pop(0)
-            tokens = []
-            for line in lines:
-
-                parts = line.split("\t")
-                id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
-                if "-" in id_ or "." in id_:
-                    continue
-                try:
-                    id_ = int(id_) - 1
-                    head = (int(head) - 1) if head not in ["0", "_"] else id_
-                    dep = "ROOT" if dep == "root" else dep
-                    tag = pos if tag == "_" else tag
-                    tag = tag + "__" + morph if use_morphology else tag
-                    iob = iob if iob else "O"
-                    tokens.append((id_, word, tag, head, dep, iob))
-                except:  # noqa: E722
-                    print(line)
-                    raise
-            tuples = [list(t) for t in zip(*tokens)]
-            yield (None, [[tuples, []]])
-            i += 1
-            if n >= 1 and i >= n:
-                break
-
-
-def simplify_tags(iob):
-    """
-    Simplify tags obtained from the dataset in order to follow Wikipedia
-    scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
-    'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
-    'MISC'.
-    """
-    new_iob = []
-    for tag in iob:
-        tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
-        if tag_match:
-            prefix = tag_match.group(1)
-            suffix = tag_match.group(2)
-            if suffix == "GPE_LOC":
-                suffix = "LOC"
-            elif suffix == "GPE_ORG":
-                suffix = "ORG"
-            elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
-                suffix = "MISC"
-            tag = prefix + "-" + suffix
-        new_iob.append(tag)
-    return new_iob
-
-
-def generate_sentence(sent, has_ner_tags):
-    (id_, word, tag, head, dep, iob) = sent
-    sentence = {}
-    tokens = []
-    if has_ner_tags:
-        iob = simplify_tags(iob)
-        biluo = iob_to_biluo(iob)
-    for i, id in enumerate(id_):
-        token = {}
-        token["id"] = id
-        token["orth"] = word[i]
-        token["tag"] = tag[i]
-        token["head"] = head[i] - id
-        token["dep"] = dep[i]
-        if has_ner_tags:
-            token["ner"] = biluo[i]
-        tokens.append(token)
-    sentence["tokens"] = tokens
-    return sentence
-
-
-def create_doc(sentences, id):
-    doc = {}
-    paragraph = {}
-    doc["id"] = id
-    doc["paragraphs"] = []
-    paragraph["sentences"] = sentences
-    doc["paragraphs"].append(paragraph)
-    return doc
--- a/spacy/cli/converters/iob2json.py
+++ b/spacy/cli/converters/iob2json.py
@ -1,68 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from wasabi import Printer
-
-from ...gold import iob_to_biluo
-from ...util import minibatch
-from .conll_ner2json import n_sents_info
-
-
-def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
-    """
-    Convert IOB files with one sentence per line and tags separated with '|'
-    into JSON format for use with train cli. IOB and IOB2 are accepted.
-
-    Sample formats:
-
-    I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
-    I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
-    I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
-    I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
-    """
-    msg = Printer(no_print=no_print)
-    docs = read_iob(input_data.split("\n"))
-    if n_sents > 0:
-        n_sents_info(msg, n_sents)
-        docs = merge_sentences(docs, n_sents)
-    return docs
-
-
-def read_iob(raw_sents):
-    sentences = []
-    for line in raw_sents:
-        if not line.strip():
-            continue
-        tokens = [t.split("|") for t in line.split()]
-        if len(tokens[0]) == 3:
-            words, pos, iob = zip(*tokens)
-        elif len(tokens[0]) == 2:
-            words, iob = zip(*tokens)
-            pos = ["-"] * len(words)
-        else:
-            raise ValueError(
-                "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
-            )
-        biluo = iob_to_biluo(iob)
-        sentences.append(
-            [
-                {"orth": w, "tag": p, "ner": ent}
-                for (w, p, ent) in zip(words, pos, biluo)
-            ]
-        )
-    sentences = [{"tokens": sent} for sent in sentences]
-    paragraphs = [{"sentences": [sent]} for sent in sentences]
-    docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)]
-    return docs
-
-
-def merge_sentences(docs, n_sents):
-    merged = []
-    for group in minibatch(docs, size=n_sents):
-        group = list(group)
-        first = group.pop(0)
-        to_extend = first["paragraphs"][0]["sentences"]
-        for sent in group:
-            to_extend.extend(sent["paragraphs"][0]["sentences"])
-        merged.append(first)
-    return merged
--- a/spacy/cli/converters/jsonl2json.py
+++ b/spacy/cli/converters/jsonl2json.py
@ -1,53 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import srsly
-
-from ...gold import docs_to_json
-from ...util import get_lang_class, minibatch
-
-
-def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
-    if lang is None:
-        raise ValueError("No --lang specified, but tokenization required")
-    json_docs = []
-    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
-    nlp = get_lang_class(lang)()
-    sentencizer = nlp.create_pipe("sentencizer")
-    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
-        docs = []
-        for record in batch:
-            raw_text = record["text"]
-            if "entities" in record:
-                ents = record["entities"]
-            else:
-                ents = record["spans"]
-            ents = [(e["start"], e["end"], e["label"]) for e in ents]
-            doc = nlp.make_doc(raw_text)
-            sentencizer(doc)
-            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
-            doc.ents = _cleanup_spans(spans)
-            docs.append(doc)
-        json_docs.append(docs_to_json(docs, id=i))
-    return json_docs
-
-
-def _cleanup_spans(spans):
-    output = []
-    seen = set()
-    for span in spans:
-        if span is not None:
-            # Trim whitespace
-            while len(span) and span[0].is_space:
-                span = span[1:]
-            while len(span) and span[-1].is_space:
-                span = span[:-1]
-            if not len(span):
-                continue
-            for i in range(span.start, span.end):
-                if i in seen:
-                    break
-            else:
-                output.append(span)
-                seen.update(range(span.start, span.end))
-    return output
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@ -1,16 +1,18 @@
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
+from typing import List, Sequence, Dict, Any, Tuple, Optional
 from pathlib import Path
 from collections import Counter
-import plac
 import sys
 import srsly
-from wasabi import Printer, MESSAGES
+from wasabi import Printer, MESSAGES, msg
+import typer

-from ..gold import GoldCorpus
+from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
+from ._util import import_code, debug_cli
+from ..schemas import ConfigSchema
+from ..gold import Corpus, Example
 from ..syntax import nonproj
-from ..util import load_model, get_lang_class
+from ..language import Language
+from .. import util


 # Minimum number of expected occurrences of NER label in data to train new label
@ -22,83 +24,137 @@ BLANK_MODEL_MIN_THRESHOLD = 100
 BLANK_MODEL_THRESHOLD = 2000


-@plac.annotations(
-    # fmt: off
-    lang=("model language", "positional", None, str),
-    train_path=("location of JSON-formatted training data", "positional", None, Path),
-    dev_path=("location of JSON-formatted development data", "positional", None, Path),
-    tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
-    base_model=("name of model to update (optional)", "option", "b", str),
-    pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
-    ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
-    verbose=("Print additional information and explanations", "flag", "V", bool),
-    no_format=("Don't pretty-print the results", "flag", "NF", bool),
-    # fmt: on
+@debug_cli.command(
+    "config",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
-def debug_data(
-    lang,
-    train_path,
-    dev_path,
-    tag_map_path=None,
-    base_model=None,
-    pipeline="tagger,parser,ner",
-    ignore_warnings=False,
-    verbose=False,
-    no_format=False,
+def debug_config_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    # fmt: on
+):
+    """Debug a config.cfg file and show validation errors. The command will
+    create all objects in the tree and validate them. Note that some config
+    validation errors are blocking and will prevent the rest of the config from
+    being resolved. This means that you may not see all validation errors at
+    once and some issues are only shown once previous errors have been fixed.
+    """
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    with show_validation_error():
+        util.load_config(
+            config_path, create_objects=False, schema=ConfigSchema, overrides=overrides,
+        )
+    msg.good("Config is valid")
+
+
+@debug_cli.command(
+    "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+@app.command(
+    "debug-data",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    hidden=True,  # hide this from main CLI help but still allow it to work with warning
+)
+def debug_data_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
+    dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
+    verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
+    no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
+    # fmt: on
 ):
    """
-    Analyze, debug and validate your training and development data, get useful
-    stats, and find problems like invalid entity annotations, cyclic
-    dependencies, low data labels and more.
+    Analyze, debug and validate your training and development data. Outputs
+    useful stats, and can help you find problems like invalid entity annotations,
+    cyclic dependencies, low data labels and more.
    """
-    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
+    if ctx.command.name == "debug-data":
+        msg.warn(
+            "The debug-data command is now available via the 'debug data' "
+            "subcommand (without the hyphen). You can run python -m spacy debug "
+            "--help for an overview of the other available debugging commands."
+        )
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    debug_data(
+        train_path,
+        dev_path,
+        config_path,
+        config_overrides=overrides,
+        ignore_warnings=ignore_warnings,
+        verbose=verbose,
+        no_format=no_format,
+        silent=False,
+    )

+
+def debug_data(
+    train_path: Path,
+    dev_path: Path,
+    config_path: Path,
+    *,
+    config_overrides: Dict[str, Any] = {},
+    ignore_warnings: bool = False,
+    verbose: bool = False,
+    no_format: bool = True,
+    silent: bool = True,
+):
+    msg = Printer(
+        no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
+    )
    # Make sure all files and paths exists if they are needed
    if not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
-
+    if not config_path.exists():
+        msg.fail("Config file not found", config_path, exists=1)
+    with show_validation_error():
+        config = util.load_config(
+            config_path,
+            create_objects=False,
+            schema=ConfigSchema,
+            overrides=config_overrides,
+        )
+    nlp = util.load_model_from_config(config["nlp"])
+    lang = config["nlp"]["lang"]
+    base_model = config["nlp"]["base_model"]
+    pipeline = list(config["nlp"]["pipeline"].keys())
+    tag_map_path = util.ensure_path(config["training"]["tag_map"])
    tag_map = {}
    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
-
-    # Initialize the model and pipeline
-    pipeline = [p.strip() for p in pipeline.split(",")]
-    if base_model:
-        nlp = load_model(base_model)
-    else:
-        lang_cls = get_lang_class(lang)
-        nlp = lang_cls()
+    morph_rules_path = util.ensure_path(config["training"]["morph_rules"])
+    morph_rules = {}
+    if morph_rules_path is not None:
+        morph_rules = srsly.read_json(morph_rules_path)
    # Replace tag map with provided mapping
    nlp.vocab.morphology.load_tag_map(tag_map)
+    # Load morph rules
+    nlp.vocab.morphology.load_morph_exceptions(morph_rules)

-    msg.divider("Data format validation")
-
-    # TODO: Validate data format using the JSON schema
-    # TODO: update once the new format is ready
-    # TODO: move validation to GoldCorpus in order to be able to load from dir
+    msg.divider("Data file validation")

    # Create the gold corpus to be able to better analyze data
    loading_train_error_message = ""
    loading_dev_error_message = ""
    with msg.loading("Loading corpus..."):
-        corpus = GoldCorpus(train_path, dev_path)
+        corpus = Corpus(train_path, dev_path)
        try:
-            train_docs = list(corpus.train_docs(nlp))
-            train_docs_unpreprocessed = list(
-                corpus.train_docs_without_preprocessing(nlp)
-            )
+            train_dataset = list(corpus.train_dataset(nlp))
        except ValueError as e:
-            loading_train_error_message = "Training data cannot be loaded: {}".format(
-                str(e)
-            )
+            loading_train_error_message = f"Training data cannot be loaded: {e}"
        try:
-            dev_docs = list(corpus.dev_docs(nlp))
+            dev_dataset = list(corpus.dev_dataset(nlp))
        except ValueError as e:
-            loading_dev_error_message = "Development data cannot be loaded: {}".format(
-                str(e)
-            )
+            loading_dev_error_message = f"Development data cannot be loaded: {e}"
    if loading_train_error_message or loading_dev_error_message:
        if loading_train_error_message:
            msg.fail(loading_train_error_message)
@ -107,82 +163,68 @@ def debug_data(
        sys.exit(1)
    msg.good("Corpus is loadable")

-    # Create all gold data here to avoid iterating over the train_docs constantly
-    gold_train_data = _compile_gold(train_docs, pipeline, nlp)
+    # Create all gold data here to avoid iterating over the train_dataset constantly
+    gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
    gold_train_unpreprocessed_data = _compile_gold(
-        train_docs_unpreprocessed, pipeline, nlp
+        train_dataset, pipeline, nlp, make_proj=False
    )
-    gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)
+    gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]

    msg.divider("Training stats")
-    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
+    msg.text(f"Training pipeline: {', '.join(pipeline)}")
    for pipe in [p for p in pipeline if p not in nlp.factories]:
-        msg.fail("Pipeline component '{}' not available in factories".format(pipe))
+        msg.fail(f"Pipeline component '{pipe}' not available in factories")
    if base_model:
-        msg.text("Starting with base model '{}'".format(base_model))
+        msg.text(f"Starting with base model '{base_model}'")
    else:
-        msg.text("Starting with blank model '{}'".format(lang))
-    msg.text("{} training docs".format(len(train_docs)))
-    msg.text("{} evaluation docs".format(len(dev_docs)))
+        msg.text(f"Starting with blank model '{lang}'")
+    msg.text(f"{len(train_dataset)} training docs")
+    msg.text(f"{len(dev_dataset)} evaluation docs")

-    if not len(dev_docs):
+    if not len(gold_dev_data):
        msg.fail("No evaluation docs")
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
-        msg.warn("{} training examples also in evaluation data".format(overlap))
+        msg.warn(f"{overlap} training examples also in evaluation data")
    else:
        msg.good("No overlap between training and evaluation data")
-    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
-        text = "Low number of examples to train from a blank model ({})".format(
-            len(train_docs)
+    if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD:
+        text = (
+            f"Low number of examples to train from a blank model ({len(train_dataset)})"
        )
-        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
+        if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
-            "It's recommended to use at least {} examples (minimum {})".format(
-                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
-            ),
+            f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
+            f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info(
-        "{} total {} in the data ({} unique)".format(
-            n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"])
-        )
+        f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
    )
    if gold_train_data["n_misaligned_words"] > 0:
-        msg.warn(
-            "{} misaligned tokens in the training data".format(
-                gold_train_data["n_misaligned_words"]
-            )
-        )
+        n_misaligned = gold_train_data["n_misaligned_words"]
+        msg.warn(f"{n_misaligned} misaligned tokens in the training data")
    if gold_dev_data["n_misaligned_words"] > 0:
-        msg.warn(
-            "{} misaligned tokens in the dev data".format(
-                gold_dev_data["n_misaligned_words"]
-            )
-        )
+        n_misaligned = gold_dev_data["n_misaligned_words"]
+        msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
-        "10 most common words: {}".format(
-            _format_labels(most_common_words, counts=True)
-        ),
+        f"10 most common words: {_format_labels(most_common_words, counts=True)}",
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
-            "{} vectors ({} unique keys, {} dimensions)".format(
-                len(nlp.vocab.vectors),
-                nlp.vocab.vectors.n_keys,
-                nlp.vocab.vectors_length,
-            )
+            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
+            f"unique keys, {nlp.vocab.vectors_length} dimensions)"
        )
        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
        msg.warn(
@ -205,7 +247,7 @@ def debug_data(
    if "ner" in pipeline:
        # Get all unique NER labels present in the data
        labels = set(
-            label for label in gold_train_data["ner"] if label not in ("O", "-")
+            label for label in gold_train_data["ner"] if label not in ("O", "-", None)
        )
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
@ -218,19 +260,10 @@ def debug_data(

        msg.divider("Named Entity Recognition")
        msg.info(
-            "{} new {}, {} existing {}".format(
-                len(new_labels),
-                "label" if len(new_labels) == 1 else "labels",
-                len(existing_labels),
-                "label" if len(existing_labels) == 1 else "labels",
-            )
+            f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)"
        )
        missing_values = label_counts["-"]
-        msg.text(
-            "{} missing {} (tokens with '-' label)".format(
-                missing_values, "value" if missing_values == 1 else "values"
-            )
-        )
+        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
        for label in new_labels:
            if len(label) == 0:
                msg.fail("Empty label found in new labels")
@ -241,43 +274,28 @@ def debug_data(
                if label != "-"
            ]
            labels_with_counts = _format_labels(labels_with_counts, counts=True)
-            msg.text("New: {}".format(labels_with_counts), show=verbose)
+            msg.text(f"New: {labels_with_counts}", show=verbose)
        if existing_labels:
-            msg.text(
-                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
-            )
-
+            msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
        if gold_train_data["ws_ents"]:
-            msg.fail(
-                "{} invalid whitespace entity span(s)".format(
-                    gold_train_data["ws_ents"]
-                )
-            )
+            msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
            has_ws_ents_error = True

        if gold_train_data["punct_ents"]:
-            msg.warn(
-                "{} entity span(s) with punctuation".format(
-                    gold_train_data["punct_ents"]
-                )
-            )
+            msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
            has_punct_ents_warning = True

        for label in new_labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
-                    "Low number of examples for new label '{}' ({})".format(
-                        label, label_counts[label]
-                    )
+                    f"Low number of examples for new label '{label}' ({label_counts[label]})"
                )
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
-                    neg_docs = _get_examples_without_label(train_docs, label)
+                    neg_docs = _get_examples_without_label(train_dataset, label)
                if neg_docs == 0:
-                    msg.warn(
-                        "No examples for texts WITHOUT new label '{}'".format(label)
-                    )
+                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True

        if not has_low_data_warning:
@ -291,8 +309,8 @@ def debug_data(

        if has_low_data_warning:
            msg.text(
-                "To train a new entity type, your data should include at "
-                "least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
+                f"To train a new entity type, your data should include at "
+                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        if has_no_neg_warning:
@ -321,27 +339,21 @@ def debug_data(
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        msg.info(
-            "Text Classification: {} new label(s), {} existing label(s)".format(
-                len(new_labels), len(existing_labels)
-            )
+            f"Text Classification: {len(new_labels)} new label(s), "
+            f"{len(existing_labels)} existing label(s)"
        )
        if new_labels:
            labels_with_counts = _format_labels(
                gold_train_data["cats"].most_common(), counts=True
            )
-            msg.text("New: {}".format(labels_with_counts), show=verbose)
+            msg.text(f"New: {labels_with_counts}", show=verbose)
        if existing_labels:
-            msg.text(
-                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
-            )
+            msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.fail(
-                "The train and dev labels are not the same. "
-                "Train labels: {}. "
-                "Dev labels: {}.".format(
-                    _format_labels(gold_train_data["cats"]),
-                    _format_labels(gold_dev_data["cats"]),
-                )
+                f"The train and dev labels are not the same. "
+                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
+                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            msg.info(
@ -371,27 +383,16 @@ def debug_data(
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_train_data["tags"]]
        tag_map = nlp.vocab.morphology.tag_map
-        msg.info(
-            "{} {} in data ({} {} in tag map)".format(
-                len(labels),
-                "label" if len(labels) == 1 else "labels",
-                len(tag_map),
-                "label" if len(tag_map) == 1 else "labels",
-            )
-        )
+        msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)
        non_tagmap = [l for l in labels if l not in tag_map]
        if not non_tagmap:
-            msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
+            msg.good(f"All labels present in tag map for language '{nlp.lang}'")
        for label in non_tagmap:
-            msg.fail(
-                "Label '{}' not found in tag map for language '{}'".format(
-                    label, nlp.lang
-                )
-            )
+            msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")

    if "parser" in pipeline:
        has_low_data_warning = False
@ -399,21 +400,18 @@ def debug_data(

        # profile sentence length
        msg.info(
-            "Found {} sentence{} with an average length of {:.1f} words.".format(
-                gold_train_data["n_sents"],
-                "s" if len(train_docs) > 1 else "",
-                gold_train_data["n_words"] / gold_train_data["n_sents"],
-            )
+            f"Found {gold_train_data['n_sents']} sentence(s) with an average "
+            f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
        )

        # check for documents with multiple sentences
        sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
        if sents_per_doc < 1.1:
            msg.warn(
-                "The training data contains {:.2f} sentences per "
-                "document. When there are very few documents containing more "
-                "than one sentence, the parser will not learn how to segment "
-                "longer texts into sentences.".format(sents_per_doc)
+                f"The training data contains {sents_per_doc:.2f} sentences per "
+                f"document. When there are very few documents containing more "
+                f"than one sentence, the parser will not learn how to segment "
+                f"longer texts into sentences."
            )

        # profile labels
@ -424,32 +422,13 @@ def debug_data(
        labels_dev = [label for label in gold_dev_data["deps"]]

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
-            msg.info(
-                "Found {} nonprojective train sentence{}".format(
-                    gold_train_unpreprocessed_data["n_nonproj"],
-                    "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
-                )
-            )
+            n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
+            msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
        if gold_dev_data["n_nonproj"] > 0:
-            msg.info(
-                "Found {} nonprojective dev sentence{}".format(
-                    gold_dev_data["n_nonproj"],
-                    "s" if gold_dev_data["n_nonproj"] > 1 else "",
-                )
-            )
-
-        msg.info(
-            "{} {} in train data".format(
-                len(labels_train_unpreprocessed),
-                "label" if len(labels_train) == 1 else "labels",
-            )
-        )
-        msg.info(
-            "{} {} in projectivized train data".format(
-                len(labels_train), "label" if len(labels_train) == 1 else "labels"
-            )
-        )
-
+            n_nonproj = gold_dev_data["n_nonproj"]
+            msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
+        msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
+        msg.info(f"{len(labels_train)} label(s) in projectivized train data")
        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True
        )
@ -459,9 +438,8 @@ def debug_data(
        for label in gold_train_unpreprocessed_data["deps"]:
            if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
                msg.warn(
-                    "Low number of examples for label '{}' ({})".format(
-                        label, gold_train_unpreprocessed_data["deps"][label]
-                    )
+                    f"Low number of examples for label '{label}' "
+                    f"({gold_train_unpreprocessed_data['deps'][label]})"
                )
                has_low_data_warning = True

@ -470,22 +448,19 @@ def debug_data(
        for label in gold_train_data["deps"]:
            if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
                rare_projectivized_labels.append(
-                    "{}: {}".format(label, str(gold_train_data["deps"][label]))
+                    f"{label}: {gold_train_data['deps'][label]}"
                )

        if len(rare_projectivized_labels) > 0:
            msg.warn(
-                "Low number of examples for {} label{} in the "
-                "projectivized dependency trees used for training. You may "
-                "want to projectivize labels such as punct before "
-                "training in order to improve parser performance.".format(
-                    len(rare_projectivized_labels),
-                    "s" if len(rare_projectivized_labels) > 1 else "",
-                )
+                f"Low number of examples for {len(rare_projectivized_labels)} "
+                "label(s) in the projectivized dependency trees used for "
+                "training. You may want to projectivize labels such as punct "
+                "before training in order to improve parser performance."
            )
            msg.warn(
-                "Projectivized labels with low numbers of examples: "
-                "{}".format("\n".join(rare_projectivized_labels)),
+                f"Projectivized labels with low numbers of examples: ",
+                ", ".join(rare_projectivized_labels),
                show=verbose,
            )
            has_low_data_warning = True
@ -493,50 +468,44 @@ def debug_data(
        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn(
-                "The following labels were found only in the train data: "
-                "{}".format(", ".join(set(labels_train) - set(labels_dev))),
+                "The following labels were found only in the train data:",
+                ", ".join(set(labels_train) - set(labels_dev)),
                show=verbose,
            )

        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn(
-                "The following labels were found only in the dev data: "
-                + ", ".join(set(labels_dev) - set(labels_train)),
+                "The following labels were found only in the dev data:",
+                ", ".join(set(labels_dev) - set(labels_train)),
                show=verbose,
            )

        if has_low_data_warning:
            msg.text(
-                "To train a parser, your data should include at "
-                "least {} instances of each label.".format(DEP_LABEL_THRESHOLD),
+                f"To train a parser, your data should include at "
+                f"least {DEP_LABEL_THRESHOLD} instances of each label.",
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
-                "Multiple root labels ({}) ".format(
-                    ", ".join(gold_train_unpreprocessed_data["roots"])
-                )
-                + "found in training data. spaCy's parser uses a single root "
-                "label ROOT so this distinction will not be available."
+                f"Multiple root labels "
+                f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
+                f"found in training data. spaCy's parser uses a single root "
+                f"label ROOT so this distinction will not be available."
            )

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(
-                "Found {} nonprojective projectivized train sentence{}".format(
-                    gold_train_data["n_nonproj"],
-                    "s" if gold_train_data["n_nonproj"] > 1 else "",
-                )
+                f"Found {gold_train_data['n_nonproj']} nonprojective "
+                f"projectivized train sentence(s)"
            )
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
-                "Found {} projectivized train sentence{} with cycles".format(
-                    gold_train_data["n_cycles"],
-                    "s" if gold_train_data["n_cycles"] > 1 else "",
-                )
+                f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
            )

    msg.divider("Summary")
@ -544,42 +513,36 @@ def debug_data(
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
-        msg.good(
-            "{} {} passed".format(
-                good_counts, "check" if good_counts == 1 else "checks"
-            )
-        )
+        msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
    if warn_counts:
-        msg.warn(
-            "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
-        )
-    if fail_counts:
-        msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
-
+        msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
    if fail_counts:
+        msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
        sys.exit(1)


-def _load_file(file_path, msg):
+def _load_file(file_path: Path, msg: Printer) -> None:
    file_name = file_path.parts[-1]
    if file_path.suffix == ".json":
-        with msg.loading("Loading {}...".format(file_name)):
+        with msg.loading(f"Loading {file_name}..."):
            data = srsly.read_json(file_path)
-        msg.good("Loaded {}".format(file_name))
+        msg.good(f"Loaded {file_name}")
        return data
    elif file_path.suffix == ".jsonl":
-        with msg.loading("Loading {}...".format(file_name)):
+        with msg.loading(f"Loading {file_name}..."):
            data = srsly.read_jsonl(file_path)
-        msg.good("Loaded {}".format(file_name))
+        msg.good(f"Loaded {file_name}")
        return data
    msg.fail(
-        "Can't load file extension {}".format(file_path.suffix),
+        f"Can't load file extension {file_path.suffix}",
        "Expected .json or .jsonl",
        exits=1,
    )


-def _compile_gold(train_docs, pipeline, nlp):
+def _compile_gold(
+    examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
+) -> Dict[str, Any]:
    data = {
        "ner": Counter(),
        "cats": Counter(),
@ -598,18 +561,20 @@ def _compile_gold(train_docs, pipeline, nlp):
        "n_cats_multilabel": 0,
        "texts": set(),
    }
-    for doc, gold in train_docs:
-        valid_words = [x for x in gold.words if x is not None]
+    for eg in examples:
+        gold = eg.reference
+        doc = eg.predicted
+        valid_words = [x for x in gold if x is not None]
        data["words"].update(valid_words)
        data["n_words"] += len(valid_words)
-        data["n_misaligned_words"] += len(gold.words) - len(valid_words)
+        data["n_misaligned_words"] += len(gold) - len(valid_words)
        data["texts"].add(doc.text)
        if len(nlp.vocab.vectors):
            for word in valid_words:
                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                    data["words_missing_vectors"].update([word])
        if "ner" in pipeline:
-            for i, label in enumerate(gold.ner):
+            for i, label in enumerate(eg.get_aligned_ner()):
                if label is None:
                    continue
                if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
@ -635,40 +600,42 @@ def _compile_gold(train_docs, pipeline, nlp):
            if list(gold.cats.values()).count(1.0) != 1:
                data["n_cats_multilabel"] += 1
        if "tagger" in pipeline:
-            data["tags"].update([x for x in gold.tags if x is not None])
+            tags = eg.get_aligned("TAG", as_string=True)
+            data["tags"].update([x for x in tags if x is not None])
        if "parser" in pipeline:
-            data["deps"].update([x for x in gold.labels if x is not None])
-            for i, (dep, head) in enumerate(zip(gold.labels, gold.heads)):
+            aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
+            data["deps"].update([x for x in aligned_deps if x is not None])
+            for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):
                if head == i:
                    data["roots"].update([dep])
                    data["n_sents"] += 1
-            if nonproj.is_nonproj_tree(gold.heads):
+            if nonproj.is_nonproj_tree(aligned_heads):
                data["n_nonproj"] += 1
-            if nonproj.contains_cycle(gold.heads):
+            if nonproj.contains_cycle(aligned_heads):
                data["n_cycles"] += 1
    return data


-def _format_labels(labels, counts=False):
+def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str:
    if counts:
-        return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
-    return ", ".join(["'{}'".format(l) for l in labels])
+        return ", ".join([f"'{l}' ({c})" for l, c in labels])
+    return ", ".join([f"'{l}'" for l in labels])


-def _get_examples_without_label(data, label):
+def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
    count = 0
-    for doc, gold in data:
+    for eg in data:
        labels = [
            label.split("-")[1]
-            for label in gold.ner
-            if label is not None and label not in ("O", "-")
+            for label in eg.get_aligned_ner()
+            if label not in ("O", "-", None)
        ]
        if label not in labels:
            count += 1
    return count


-def _get_labels_from_model(nlp, pipe_name):
+def _get_labels_from_model(nlp: Language, pipe_name: str) -> Sequence[str]:
    if pipe_name not in nlp.pipe_names:
        return set()
    pipe = nlp.get_pipe(pipe_name)
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@ -0,0 +1,168 @@
+from pathlib import Path
+from wasabi import msg
+from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
+
+from ._util import Arg, Opt, debug_cli
+from .. import util
+from ..lang.en import English
+
+
+@debug_cli.command("model")
+def debug_model_cli(
+    # fmt: off
+    config_path: Path = Arg(..., help="Path to config file", exists=True),
+    layers: str = Opt("", "--layers", "-l", help="Comma-separated indices of model layers to print (all layers if empty)"),
+    dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
+    parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
+    gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
+    attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"),
+    P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"),
+    P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
+    P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
+    P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
+    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID to use, or -1 for CPU"),
+    seed: int = Opt(None, "--seed", "-s", help="Random seed to fix before running"),
+    # fmt: on
+):
+    """
+    Analyze a Thinc model implementation. Includes checks for internal structure
+    and activations during training.
+    """
+    # Layers are selected by their integer position in the model walk, so the
+    # comma-separated option is parsed into ints. Empty items (e.g. from a
+    # trailing comma) are skipped instead of crashing on int("").
+    print_settings = {
+        "dimensions": dimensions,
+        "parameters": parameters,
+        "gradients": gradients,
+        "attributes": attributes,
+        "layers": [int(x.strip()) for x in layers.split(",") if x.strip()],
+        "print_before_training": P0,
+        "print_after_init": P1,
+        "print_after_training": P2,
+        "print_prediction": P3,
+    }
+
+    if seed is not None:
+        msg.info(f"Fixing random seed: {seed}")
+        fix_random_seed(seed)
+    # A negative GPU ID means "stay on CPU".
+    if use_gpu >= 0:
+        msg.info(f"Using GPU: {use_gpu}")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
+
+    debug_model(
+        config_path, print_settings=print_settings,
+    )
+
+
+def debug_model(config_path: Path, *, print_settings=None):
+    """Load a model from a config file, train it briefly on toy data and
+    report its state along the way.
+
+    config_path (Path): Config file; its "model" entry is the model to debug.
+    print_settings (dict / None): Flags controlling what is printed and at
+        which step. Defaults to an empty dict, so nothing optional is shown.
+    """
+    if print_settings is None:
+        print_settings = {}
+
+    model = util.load_config(config_path, create_objects=True)["model"]
+
+    # STEP 0: Printing before training
+    msg.info(f"Analysing model with ID {model.id}")
+    if print_settings.get("print_before_training"):
+        msg.info("Before training:")
+        _print_model(model, print_settings)
+
+    # STEP 1: Initializing the model and printing again
+    model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
+    if print_settings.get("print_after_init"):
+        msg.info("After initialization:")
+        _print_model(model, print_settings)
+
+    # STEP 2: Updating the model and printing again
+    optimizer = Adam(0.001)
+    set_dropout_rate(model, 0.2)
+    # Three update steps on the same toy batch; the step index is not needed.
+    for _ in range(3):
+        Y, get_dX = model.begin_update(_get_docs())
+        dY = get_gradient(model, Y)
+        get_dX(dY)
+        model.finish_update(optimizer)
+    if print_settings.get("print_after_training"):
+        msg.info("After training:")
+        _print_model(model, print_settings)
+
+    # STEP 3: the final prediction
+    prediction = model.predict(_get_docs())
+    if print_settings.get("print_prediction"):
+        msg.info("Prediction:", str(prediction))
+
+
+def get_gradient(model, Y):
+    """Return the difference between the output Y and the fixed toy target.
+
+    The target is built by `_get_output` on the model's own array module
+    (`model.ops.xp`).
+    """
+    target = _get_output(model.ops.xp)
+    gradient = Y - target
+    return gradient
+
+
+def _sentences():
+    """Return a fresh list with the fixed example sentences used as toy input."""
+    samples = (
+        "Apple is looking at buying U.K. startup for $1 billion",
+        "Autonomous cars shift insurance liability toward manufacturers",
+        "San Francisco considers banning sidewalk delivery robots",
+        "London is a big city in the United Kingdom.",
+    )
+    return list(samples)
+
+
+def _get_docs():
+    """Run the example sentences through a new `English` pipeline.
+
+    RETURNS (list): One processed document per example sentence.
+    """
+    texts = _sentences()
+    pipeline = English()
+    return [doc for doc in pipeline.pipe(texts)]
+
+
+def _get_output(xp):
+    """Build the fixed toy target: one float32 row per example sentence.
+
+    xp: Array module exposing `asarray` (callers pass `model.ops.xp`).
+    RETURNS: Array whose row i is [i + 10, i + 20, i + 30].
+    """
+    # Only the number of rows matters here, so count the sentences directly
+    # instead of building a pipeline and tokenizing them just to enumerate.
+    n_rows = len(_sentences())
+    return xp.asarray(
+        [
+            xp.asarray([i + 10, i + 20, i + 30], dtype="float32")
+            for i in range(n_rows)
+        ]
+    )
+
+
+def _print_model(model, print_settings):
+    """Print the requested details for every selected node of the model.
+
+    model: Object whose `walk()` yields the nodes to describe.
+    print_settings (dict): Flags "dimensions", "parameters", "gradients" and
+        "attributes", plus "layers", a collection of walk indices to show
+        (empty means every node).
+    """
+    selected = print_settings.get("layers", "")
+    show_dims = print_settings.get("dimensions", False)
+    show_params = print_settings.get("parameters", False)
+    show_grads = print_settings.get("gradients", False)
+    show_attrs = print_settings.get("attributes", False)
+
+    for index, node in enumerate(model.walk()):
+        # Skip nodes that were not asked for when a selection is given.
+        if selected and index not in selected:
+            continue
+        msg.info(f"Layer {index}: model ID {node.id}: '{node.name}'")
+
+        if show_dims:
+            for name in node.dim_names:
+                # Unset entries are reported with the falsy `has_*` result.
+                is_set = node.has_dim(name)
+                shown = node.get_dim(name) if is_set else is_set
+                msg.info(f" - dim {name}: {shown}")
+
+        if show_params:
+            for name in node.param_names:
+                is_set = node.has_param(name)
+                shown = _print_matrix(node.get_param(name)) if is_set else is_set
+                msg.info(f" - param {name}: {shown}")
+        if show_grads:
+            for name in node.param_names:
+                is_set = node.has_grad(name)
+                shown = _print_matrix(node.get_grad(name)) if is_set else is_set
+                msg.info(f" - grad {name}: {shown}")
+        if show_attrs:
+            for name, value in node.attrs.items():
+                msg.info(f" - attr {name}: {value}")
+
+
+def _print_matrix(value):
+    """Summarize an array as "<shape> - sample: <up to 5 leading values>".
+
+    value: Array-like with `shape` and `ndim`, or None / bool.
+    RETURNS: None and bool values are passed through unchanged; anything else
+        becomes a string with the shape and a short sample.
+    """
+    if value is None or isinstance(value, bool):
+        return value
+    sample = value
+    # Follow index 0 down to the innermost axis. Stop early at an empty axis,
+    # which cannot be indexed.
+    while sample.ndim > 1 and sample.shape[0] > 0:
+        sample = sample[0]
+    # A 0-d array cannot be sliced, so it is shown as-is.
+    if sample.ndim >= 1:
+        sample = sample[0:5]
+    return f"{value.shape} - sample: {sample}"
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@ -1,30 +1,54 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import plac
+from typing import Optional, Sequence
 import requests
-import os
-import subprocess
 import sys
 from wasabi import msg
+import typer

-from .link import link
-from ..util import get_package_path
+from ._util import app, Arg, Opt
 from .. import about
+from ..util import is_package, get_base_version, run_command
+
+# These are the old shortcuts we previously supported in spacy download. As of
+# v3, shortcuts are deprecated so we're not expecting to add anything to this
+# list. It only exists to show users warnings.
+OLD_SHORTCUTS = {
+    "en": "en_core_web_sm",
+    "de": "de_core_news_sm",
+    "es": "es_core_news_sm",
+    "pt": "pt_core_news_sm",
+    "fr": "fr_core_news_sm",
+    "it": "it_core_news_sm",
+    "nl": "nl_core_news_sm",
+    "el": "el_core_news_sm",
+    "nb": "nb_core_news_sm",
+    "lt": "lt_core_news_sm",
+    "xx": "xx_ent_wiki_sm",
+}


-@plac.annotations(
-    model=("Model to download (shortcut or name)", "positional", None, str),
-    direct=("Force direct download of name + version", "flag", "d", bool),
-    pip_args=("Additional arguments to be passed to `pip install` on model install"),
+@app.command(
+    "download",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
-def download(model, direct=False, *pip_args):
+def download_cli(
+    # fmt: off
+    ctx: typer.Context,
+    model: str = Arg(..., help="Name of model to download"),
+    direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
+    # fmt: on
+):
    """
-    Download compatible model from default download path using pip. Model
-    can be shortcut, model name or, if --direct flag is set, full model name
-    with version. For direct downloads, the compatibility check will be skipped.
+    Download compatible model from default download path using pip. If --direct
+    flag is set, the command expects the full model name with version.
+    For direct downloads, the compatibility check will be skipped. All
+    additional arguments provided to this command will be passed to `pip install`
+    on model installation.
    """
-    if not require_package("spacy") and "--no-deps" not in pip_args:
+    download(model, direct, *ctx.args)
+
+
+def download(model: str, direct: bool = False, *pip_args) -> None:
+    if not is_package("spacy") and "--no-deps" not in pip_args:
        msg.warn(
            "Skipping model package dependencies and setting `--no-deps`. "
            "You don't seem to have the spaCy package itself installed "
@ -39,97 +63,59 @@ def download(model, direct=False, *pip_args):
        components = model.split("-")
        model_name = "".join(components[:-1])
        version = components[-1]
-        dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
+        download_model(dl_tpl.format(m=model_name, v=version), pip_args)
    else:
-        shortcuts = get_json(about.__shortcuts__, "available shortcuts")
-        model_name = shortcuts.get(model, model)
+        model_name = model
+        if model in OLD_SHORTCUTS:
+            msg.warn(
+                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. "
+                f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead."
+            )
+            model_name = OLD_SHORTCUTS[model]
        compatibility = get_compatibility()
        version = get_version(model_name, compatibility)
-        dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
-        if dl != 0:  # if download subprocess doesn't return 0, exit
-            sys.exit(dl)
+        download_model(dl_tpl.format(m=model_name, v=version), pip_args)
    msg.good(
        "Download and installation successful",
-            "You can now load the model via spacy.load('{}')".format(model_name),
+        f"You can now load the model via spacy.load('{model_name}')",
    )
-        # Only create symlink if the model is installed via a shortcut like 'en'.
-        # There's no real advantage over an additional symlink for en_core_web_sm
-        # and if anything, it's more error prone and causes more confusion.
-        if model in shortcuts:
-            try:
-                # Get package path here because link uses
-                # pip.get_installed_distributions() to check if model is a
-                # package, which fails if model was just installed via
-                # subprocess
-                package_path = get_package_path(model_name)
-                link(model_name, model, force=True, model_path=package_path)
-            except:  # noqa: E722
-                # Dirty, but since spacy.download and the auto-linking is
-                # mostly a convenience wrapper, it's best to show a success
-                # message and loading instructions, even if linking fails.
-                msg.warn(
-                    "Download successful but linking failed",
-                    "Creating a shortcut link for '{}' didn't work (maybe you "
-                    "don't have admin permissions?), but you can still load "
-                    "the model via its full package name: "
-                    "nlp = spacy.load('{}')".format(model, model_name),
-                )
-        # If a model is downloaded and then loaded within the same process, our
-        # is_package check currently fails, because pkg_resources.working_set
-        # is not refreshed automatically (see #3923). We're trying to work
-        # around this here be requiring the package explicitly.
-        require_package(model_name)


-def require_package(name):
-    try:
-        import pkg_resources
-
-        pkg_resources.working_set.require(name)
-        return True
-    except:  # noqa: E722
-        return False
-
-
-def get_json(url, desc):
-    r = requests.get(url)
+def get_compatibility() -> dict:
+    version = get_base_version(about.__version__)
+    r = requests.get(about.__compatibility__)
    if r.status_code != 200:
        msg.fail(
-            "Server error ({})".format(r.status_code),
-            "Couldn't fetch {}. Please find a model for your spaCy "
-            "installation (v{}), and download it manually. For more "
-            "details, see the documentation: "
-            "https://spacy.io/usage/models".format(desc, about.__version__),
+            f"Server error ({r.status_code})",
+            f"Couldn't fetch compatibility table. Please find a model for your spaCy "
+            f"installation (v{about.__version__}), and download it manually. "
+            f"For more details, see the documentation: "
+            f"https://spacy.io/usage/models",
            exits=1,
        )
-    return r.json()
-
-
-def get_compatibility():
-    version = about.__version__
-    version = version.rsplit(".dev", 1)[0]
-    comp_table = get_json(about.__compatibility__, "compatibility table")
+    comp_table = r.json()
    comp = comp_table["spacy"]
    if version not in comp:
-        msg.fail("No compatible models found for v{} of spaCy".format(version), exits=1)
+        msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
    return comp[version]


-def get_version(model, comp):
-    model = model.rsplit(".dev", 1)[0]
+def get_version(model: str, comp: dict) -> str:
+    model = get_base_version(model)
    if model not in comp:
        msg.fail(
-            "No compatible model found for '{}' "
-            "(spaCy v{}).".format(model, about.__version__),
+            f"No compatible model found for '{model}' (spaCy v{about.__version__})",
            exits=1,
        )
    return comp[model][0]


-def download_model(filename, user_pip_args=None):
+def download_model(
+    filename: str, user_pip_args: Optional[Sequence[str]] = None
+) -> None:
    download_url = about.__download_url__ + "/" + filename
    pip_args = ["--no-cache-dir"]
    if user_pip_args:
        pip_args.extend(user_pip_args)
    cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
-    return subprocess.call(cmd, env=os.environ.copy())
+    run_command(cmd)
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@ -1,74 +1,111 @@
-# coding: utf8
-from __future__ import unicode_literals, division, print_function
-
-import plac
+from typing import Optional, List, Dict
 from timeit import default_timer as timer
-from wasabi import msg
+from wasabi import Printer
+from pathlib import Path
+import re
+import srsly
+from thinc.api import require_gpu, fix_random_seed

-from ..gold import GoldCorpus
+from ..gold import Corpus
+from ..tokens import Doc
+from ._util import app, Arg, Opt
+from ..scorer import Scorer
 from .. import util
 from .. import displacy


-@plac.annotations(
-    model=("Model name or path", "positional", None, str),
-    data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
-    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
-    gpu_id=("Use GPU", "option", "g", int),
-    displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
-    displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
-    return_scores=("Return dict containing model scores", "flag", "R", bool),
-)
-def evaluate(
-    model,
-    data_path,
-    gpu_id=-1,
-    gold_preproc=False,
-    displacy_path=None,
-    displacy_limit=25,
-    return_scores=False,
+@app.command("evaluate")
+def evaluate_cli(
+    # fmt: off
+    model: str = Arg(..., help="Model name or path"),
+    data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True),
+    output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
+    gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"),
+    gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
+    displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
+    displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
+    # fmt: on
 ):
    """
    Evaluate a model. To render a sample of parses in a HTML file, set an
    output directory as the displacy_path argument.
    """
-    util.fix_random_seed()
+    evaluate(
+        model,
+        data_path,
+        output=output,
+        gpu_id=gpu_id,
+        gold_preproc=gold_preproc,
+        displacy_path=displacy_path,
+        displacy_limit=displacy_limit,
+        silent=False,
+    )
+
+
+def evaluate(
+    model: str,
+    data_path: Path,
+    output: Optional[Path],
+    gpu_id: int = -1,
+    gold_preproc: bool = False,
+    displacy_path: Optional[Path] = None,
+    displacy_limit: int = 25,
+    silent: bool = True,
+) -> Scorer:
+    msg = Printer(no_print=silent, pretty=not silent)
+    fix_random_seed()
    if gpu_id >= 0:
-        util.use_gpu(gpu_id)
+        require_gpu(gpu_id)
    util.set_env_log(False)
    data_path = util.ensure_path(data_path)
+    output_path = util.ensure_path(output)
    displacy_path = util.ensure_path(displacy_path)
    if not data_path.exists():
        msg.fail("Evaluation data not found", data_path, exits=1)
    if displacy_path and not displacy_path.exists():
        msg.fail("Visualization output directory not found", displacy_path, exits=1)
-    corpus = GoldCorpus(data_path, data_path)
-    if model.startswith("blank:"):
-        nlp = util.get_lang_class(model.replace("blank:", ""))()
-    else:
+    corpus = Corpus(data_path, data_path)
    nlp = util.load_model(model)
-    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
+    dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
    begin = timer()
-    scorer = nlp.evaluate(dev_docs, verbose=False)
+    scorer = nlp.evaluate(dev_dataset, verbose=False)
    end = timer()
-    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
+    nwords = sum(len(ex.predicted) for ex in dev_dataset)
    results = {
-        "Time": "%.2f s" % (end - begin),
+        "Time": f"{end - begin:.2f} s",
        "Words": nwords,
-        "Words/s": "%.0f" % (nwords / (end - begin)),
-        "TOK": "%.2f" % scorer.token_acc,
-        "POS": "%.2f" % scorer.tags_acc,
-        "UAS": "%.2f" % scorer.uas,
-        "LAS": "%.2f" % scorer.las,
-        "NER P": "%.2f" % scorer.ents_p,
-        "NER R": "%.2f" % scorer.ents_r,
-        "NER F": "%.2f" % scorer.ents_f,
-        "Textcat": "%.2f" % scorer.textcat_score,
+        "Words/s": f"{nwords / (end - begin):.0f}",
+        "TOK": f"{scorer.token_acc:.2f}",
+        "TAG": f"{scorer.tags_acc:.2f}",
+        "POS": f"{scorer.pos_acc:.2f}",
+        "MORPH": f"{scorer.morphs_acc:.2f}",
+        "UAS": f"{scorer.uas:.2f}",
+        "LAS": f"{scorer.las:.2f}",
+        "NER P": f"{scorer.ents_p:.2f}",
+        "NER R": f"{scorer.ents_r:.2f}",
+        "NER F": f"{scorer.ents_f:.2f}",
+        "Textcat AUC": f"{scorer.textcat_auc:.2f}",
+        "Textcat F": f"{scorer.textcat_f:.2f}",
+        "Sent P": f"{scorer.sent_p:.2f}",
+        "Sent R": f"{scorer.sent_r:.2f}",
+        "Sent F": f"{scorer.sent_f:.2f}",
    }
+    data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}
+
    msg.table(results, title="Results")

+    if scorer.ents_per_type:
+        data["ents_per_type"] = scorer.ents_per_type
+        print_ents_per_type(msg, scorer.ents_per_type)
+    if scorer.textcats_f_per_cat:
+        data["textcats_f_per_cat"] = scorer.textcats_f_per_cat
+        print_textcats_f_per_cat(msg, scorer.textcats_f_per_cat)
+    if scorer.textcats_auc_per_cat:
+        data["textcats_auc_per_cat"] = scorer.textcats_auc_per_cat
+        print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)
+
    if displacy_path:
-        docs, golds = zip(*dev_docs)
+        docs = [ex.predicted for ex in dev_dataset]
        render_deps = "parser" in nlp.meta.get("pipeline", [])
        render_ents = "ner" in nlp.meta.get("pipeline", [])
        render_parses(
@ -79,12 +116,22 @@ def evaluate(
            deps=render_deps,
            ents=render_ents,
        )
-        msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
-    if return_scores:
-        return scorer.scores
+        msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
+
+    if output_path is not None:
+        srsly.write_json(output_path, data)
+        msg.good(f"Saved results to {output_path}")
+    return data


-def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
+def render_parses(
+    docs: List[Doc],
+    output_path: Path,
+    model_name: str = "",
+    limit: int = 250,
+    deps: bool = True,
+    ents: bool = True,
+):
    docs[0].user_data["title"] = model_name
    if ents:
        html = displacy.render(docs[:limit], style="ent", page=True)
@ -96,3 +143,40 @@ def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=T
        )
        with (output_path / "parses.html").open("w", encoding="utf8") as file_:
            file_.write(html)
+
+
+def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
+    """Print a table with precision, recall and F-score per entity type.
+
+    msg (Printer): Printer used to render the table.
+    scores (dict): Scores keyed by type; each value has "p", "r" and "f".
+    """
+    rows = []
+    for label, prf in scores.items():
+        rows.append((label, f"{prf['p']:.2f}", f"{prf['r']:.2f}", f"{prf['f']:.2f}"))
+    msg.table(
+        rows,
+        header=("", "P", "R", "F"),
+        aligns=("l", "r", "r", "r"),
+        title="NER (per type)",
+    )
+
+
+def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
+    """Print a table with precision, recall and F-score per text category.
+
+    msg (Printer): Printer used to render the table.
+    scores (dict): Scores keyed by category; each value has "p", "r" and "f".
+    """
+    rows = []
+    for label, prf in scores.items():
+        rows.append((label, f"{prf['p']:.2f}", f"{prf['r']:.2f}", f"{prf['f']:.2f}"))
+    msg.table(
+        rows,
+        header=("", "P", "R", "F"),
+        aligns=("l", "r", "r", "r"),
+        title="Textcat F (per type)",
+    )
+
+
+def print_textcats_auc_per_cat(
+    msg: Printer, scores: Dict[str, Dict[str, float]]
+) -> None:
+    """Print a table with the ROC AUC score per text category.
+
+    msg (Printer): Printer used to render the table.
+    scores (dict): Scores keyed by category; each value has "roc_auc_score".
+    """
+    rows = []
+    for label, values in scores.items():
+        rows.append((label, f"{values['roc_auc_score']:.2f}"))
+    msg.table(
+        rows,
+        header=("", "ROC AUC"),
+        aligns=("l", "r"),
+        title="Textcat ROC AUC (per label)",
+    )
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@ -1,92 +1,109 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import plac
+from typing import Optional, Dict, Any, Union
 import platform
 from pathlib import Path
-from wasabi import msg
+from wasabi import Printer
 import srsly

-from ..compat import path2str, basestring_, unicode_
+from ._util import app, Arg, Opt
 from .. import util
 from .. import about


-@plac.annotations(
-    model=("Optional shortcut link of model", "positional", None, str),
-    markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
-    silent=("Don't print anything (just return)", "flag", "s"),
-)
-def info(model=None, markdown=False, silent=False):
+@app.command("info")
+def info_cli(
+    # fmt: off
+    model: Optional[str] = Arg(None, help="Optional model name"),
+    markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
+    silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
+    # fmt: on
+):
     """
-    Print info about spaCy installation. If a model shortcut link is
-    speficied as an argument, print model information. Flag --markdown
-    prints details in Markdown for easy copy-pasting to GitHub issues.
+    Print info about spaCy installation. If a model is specified as an argument,
+    print model information. Flag --markdown prints details in Markdown for easy
+    copy-pasting to GitHub issues.
     """
+    info(model, markdown=markdown, silent=silent)
+
+
+def info(
+    model: Optional[str] = None, *, markdown: bool = False, silent: bool = True
+) -> Union[str, dict]:
+    """Collect info about spaCy or about one model, and optionally print it.
+
+    model (str / None): Model to describe; spaCy itself if not given.
+    markdown (bool): Return (and, unless silent, print) Markdown instead.
+    silent (bool): Don't print anything, just return.
+    RETURNS (str / dict): The Markdown string if markdown is set, otherwise
+        the data with lowercased, underscore-separated keys.
+    """
+    msg = Printer(no_print=silent, pretty=not silent)
+    if not model:
+        title = "Info about spaCy"
+        data = info_spacy()
+    else:
+        title = f"Info about model '{model}'"
+        data = info_model(model, silent=silent)
+    # Snapshot the raw values before "Models" is flattened for display.
+    raw_data = {key.lower().replace(" ", "_"): value for key, value in data.items()}
+    models = data.get("Models")
+    if isinstance(models, dict):
+        data["Models"] = ", ".join(
+            f"{name} ({version})" for name, version in models.items()
+        )
+    markdown_data = get_markdown(data, title=title)
+    if markdown:
+        if not silent:
+            print(markdown_data)
+        return markdown_data
+    if not silent:
+        msg.table(dict(data), title=title)
+    return raw_data
+
+
+def info_spacy() -> Dict[str, Any]:
+    """Generate info about the current spaCy installation.
+
+    RETURNS (dict): The spaCy info.
+    """
+    # Installed model packages mapped to their versions, keyed by package
+    # name with dashes replaced by underscores.
+    all_models = {
+        pkg_name.replace("-", "_"): util.get_package_version(pkg_name)
+        for pkg_name in util.get_installed_models()
+    }
+    return {
+        "spaCy version": about.__version__,
+        "Location": str(Path(__file__).parent.parent),
+        "Platform": platform.platform(),
+        "Python version": platform.python_version(),
+        "Models": all_models,
+    }
+
+
+def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
+    """Generate info about a specific model.
+
+    model (str): Model name or path.
+    silent (bool): Don't print anything, just return.
+    RETURNS (dict): The model meta, without the "accuracy" and "speed" keys.
+    """
+    msg = Printer(no_print=silent, pretty=not silent)
     if util.is_package(model):
         model_path = util.get_package_path(model)
     else:
-            model_path = util.get_data_path() / model
+        # Not an installed package: treat the argument as a filesystem path.
+        # It must be a Path, since it is joined with "/" and resolved below.
+        model_path = Path(model)
     meta_path = model_path / "meta.json"
     if not meta_path.is_file():
         msg.fail("Can't find model meta.json", meta_path, exits=1)
     meta = srsly.read_json(meta_path)
     if model_path.resolve() != model_path:
-            meta["link"] = path2str(model_path)
-            meta["source"] = path2str(model_path.resolve())
+        meta["link"] = str(model_path)
+        meta["source"] = str(model_path.resolve())
     else:
-            meta["source"] = path2str(model_path)
-        if not silent:
-            title = "Info about model '{}'".format(model)
-            model_meta = {
-                k: v for k, v in meta.items() if k not in ("accuracy", "speed")
-            }
-            if markdown:
-                print_markdown(model_meta, title=title)
-            else:
-                msg.table(model_meta, title=title)
-        return meta
-    data = {
-        "spaCy version": about.__version__,
-        "Location": path2str(Path(__file__).parent.parent),
-        "Platform": platform.platform(),
-        "Python version": platform.python_version(),
-        "Models": list_models(),
-    }
-    if not silent:
-        title = "Info about spaCy"
-        if markdown:
-            print_markdown(data, title=title)
-        else:
-            msg.table(data, title=title)
-    return data
+        meta["source"] = str(model_path)
+    return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")}


-def list_models():
-    def exclude_dir(dir_name):
-        # exclude common cache directories and hidden directories
-        exclude = ("cache", "pycache", "__pycache__")
-        return dir_name in exclude or dir_name.startswith(".")
-
-    data_path = util.get_data_path()
-    if data_path:
-        models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
-        return ", ".join([m for m in models if not exclude_dir(m)])
-    return "-"
-
-
-def print_markdown(data, title=None):
-    """Print data in GitHub-flavoured Markdown format for issues etc.
+def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str:
+    """Get data in GitHub-flavoured Markdown format for issues etc.

    data (dict or list of tuples): Label/value pairs.
-    title (unicode or None): Title, will be rendered as headline 2.
+    title (str / None): Title, will be rendered as headline 2.
+    RETURNS (str): The Markdown string.
    """
    markdown = []
    for key, value in data.items():
-        if isinstance(value, basestring_) and Path(value).exists():
+        if isinstance(value, str) and Path(value).exists():
            continue
-        markdown.append("* **{}:** {}".format(key, unicode_(value)))
+        markdown.append(f"* **{key}:** {value}")
+    result = "\n{}\n".format("\n".join(markdown))
    if title:
-        print("\n## {}".format(title))
-    print("\n{}\n".format("\n".join(markdown)))
+        result = f"\n## {title}\n{result}"
+    return result
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@ -1,7 +1,4 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import plac
+from typing import Optional, List, Dict, Any, Union, IO
 import math
 from tqdm import tqdm
 import numpy
@ -13,14 +10,15 @@ import gzip
 import zipfile
 import srsly
 import warnings
-from wasabi import msg
+from wasabi import Printer

+from ._util import app, Arg, Opt
 from ..vectors import Vectors
 from ..errors import Errors, Warnings
+from ..language import Language
 from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
 from ..lookups import Lookups

-
 try:
    import ftfy
 except ImportError:
@ -30,49 +28,60 @@ except ImportError:
 DEFAULT_OOV_PROB = -20


-@plac.annotations(
-    lang=("Model language", "positional", None, str),
-    output_dir=("Model output directory", "positional", None, Path),
-    freqs_loc=("Location of words frequencies file", "option", "f", Path),
-    jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
-    clusters_loc=("Optional location of brown clusters data", "option", "c", str),
-    vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
-    truncate_vectors=(
-        "Optional number of vectors to truncate to when reading in vectors file",
-        "option",
-        "t",
-        int,
-    ),
-    prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
-    vectors_name=(
-        "Optional name for the word vectors, e.g. en_core_web_lg.vectors",
-        "option",
-        "vn",
-        str,
-    ),
-    model_name=("Optional name for the model meta", "option", "mn", str),
-    omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
-    base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
-)
-def init_model(
-    lang,
-    output_dir,
-    freqs_loc=None,
-    clusters_loc=None,
-    jsonl_loc=None,
-    vectors_loc=None,
-    truncate_vectors=0,
-    prune_vectors=-1,
-    vectors_name=None,
-    model_name=None,
-    omit_extra_lookups=False,
-    base_model=None,
+@app.command("init-model")
+def init_model_cli(
+    # fmt: off
+    lang: str = Arg(..., help="Model language"),
+    output_dir: Path = Arg(..., help="Model output directory"),
+    freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
+    clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
+    jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
+    vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
+    prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
+    truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
+    vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
+    model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
+    omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
+    base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)")
+    # fmt: on
 ):
    """
-    Create a new model from raw data, like word frequencies, Brown clusters
-    and word vectors. If vectors are provided in Word2Vec format, they can
-    be either a .txt or zipped as a .zip or .tar.gz.
+    Create a new model from raw data. If vectors are provided in Word2Vec format,
+    they can be either a .txt or zipped as a .zip or .tar.gz.
    """
+    init_model(
+        lang,
+        output_dir,
+        freqs_loc=freqs_loc,
+        clusters_loc=clusters_loc,
+        jsonl_loc=jsonl_loc,
+        vectors_loc=vectors_loc,
+        prune_vectors=prune_vectors,
+        truncate_vectors=truncate_vectors,
+        vectors_name=vectors_name,
+        model_name=model_name,
+        omit_extra_lookups=omit_extra_lookups,
+        base_model=base_model,
+        silent=False,
+    )
+
+
+def init_model(
+    lang: str,
+    output_dir: Path,
+    freqs_loc: Optional[Path] = None,
+    clusters_loc: Optional[Path] = None,
+    jsonl_loc: Optional[Path] = None,
+    vectors_loc: Optional[Path] = None,
+    prune_vectors: int = -1,
+    truncate_vectors: int = 0,
+    vectors_name: Optional[str] = None,
+    model_name: Optional[str] = None,
+    omit_extra_lookups: bool = False,
+    base_model: Optional[str] = None,
+    silent: bool = True,
+) -> Language:
+    msg = Printer(no_print=silent, pretty=not silent)
    if jsonl_loc is not None:
        if freqs_loc is not None or clusters_loc is not None:
            settings = ["-j"]
@ -95,7 +104,7 @@ def init_model(
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
-        lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
+        lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)

    with msg.loading("Creating model..."):
        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
@ -110,12 +119,13 @@ def init_model(

    msg.good("Successfully created model")
    if vectors_loc is not None:
-        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
+        add_vectors(
+            msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
+        )
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
-        "Sucessfully compiled vocab",
-        "{} entries, {} vectors".format(lex_added, vec_added),
+        "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
    )
    if not output_dir.exists():
        output_dir.mkdir()
@ -123,7 +133,7 @@ def init_model(
    return nlp


-def open_file(loc):
+def open_file(loc: Union[str, Path]) -> IO:
    """Handle .gz, .tar.gz or unzipped files"""
    loc = ensure_path(loc)
    if tarfile.is_tarfile(str(loc)):
@ -139,7 +149,9 @@ def open_file(loc):
        return loc.open("r", encoding="utf8")


-def read_attrs_from_deprecated(freqs_loc, clusters_loc):
+def read_attrs_from_deprecated(
+    msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
+) -> List[Dict[str, Any]]:
    if freqs_loc is not None:
        with msg.loading("Counting frequencies..."):
            probs, _ = read_freqs(freqs_loc)
@ -167,7 +179,12 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
    return lex_attrs


-def create_model(lang, lex_attrs, name=None, base_model=None):
+def create_model(
+    lang: str,
+    lex_attrs: List[Dict[str, Any]],
+    name: Optional[str] = None,
+    base_model: Optional[Union[str, Path]] = None,
+) -> Language:
    if base_model:
        nlp = load_model(base_model)
        # keep the tokenizer but remove any existing pipeline components due to
@ -194,7 +211,14 @@ def create_model(lang, lex_attrs, name=None, base_model=None):
    return nlp


-def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
+def add_vectors(
+    msg: Printer,
+    nlp: Language,
+    vectors_loc: Optional[Path],
+    truncate_vectors: int,
+    prune_vectors: int,
+    name: Optional[str] = None,
+) -> None:
    vectors_loc = ensure_path(vectors_loc)
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
@ -203,9 +227,11 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
    else:
        if vectors_loc:
-            with msg.loading("Reading vectors from {}".format(vectors_loc)):
-                vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
-            msg.good("Loaded vectors from {}".format(vectors_loc))
+            with msg.loading(f"Reading vectors from {vectors_loc}"):
+                vectors_data, vector_keys = read_vectors(
+                    msg, vectors_loc, truncate_vectors
+                )
+            msg.good(f"Loaded vectors from {vectors_loc}")
        else:
            vectors_data, vector_keys = (None, None)
        if vector_keys is not None:
@ -215,7 +241,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
        if vectors_data is not None:
            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
    if name is None:
-        nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
+        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_model.vectors"
    else:
        nlp.vocab.vectors.name = name
    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
@ -223,7 +249,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
        nlp.vocab.prune_vectors(prune_vectors)


-def read_vectors(vectors_loc, truncate_vectors=0):
+def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
    f = open_file(vectors_loc)
    shape = tuple(int(size) for size in next(f).split())
    if truncate_vectors >= 1:
@ -243,7 +269,9 @@ def read_vectors(vectors_loc, truncate_vectors=0):
    return vectors_data, vectors_keys


-def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
+def read_freqs(
+    freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
+):
    counts = PreshCounter()
    total = 0
    with freqs_loc.open() as f:
@ -265,14 +293,14 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
                    word = literal_eval(key)
                except SyntaxError:
                    # Take odd strings literally.
-                    word = literal_eval("'%s'" % key)
+                    word = literal_eval(f"'{key}'")
                smooth_count = counts.smoother(int(freq))
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob


-def read_clusters(clusters_loc):
+def read_clusters(clusters_loc: Path) -> dict:
    clusters = {}
    if ftfy is None:
        warnings.warn(Warnings.W004)
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@ -1,77 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import plac
-from pathlib import Path
-from wasabi import msg
-
-from ..compat import symlink_to, path2str
-from .. import util
-
-
-@plac.annotations(
-    origin=("package name or local path to model", "positional", None, str),
-    link_name=("name of shortuct link to create", "positional", None, str),
-    force=("force overwriting of existing link", "flag", "f", bool),
-)
-def link(origin, link_name, force=False, model_path=None):
-    """
-    Create a symlink for models within the spacy/data directory. Accepts
-    either the name of a pip package, or the local path to the model data
-    directory. Linking models allows loading them via spacy.load(link_name).
-    """
-    if util.is_package(origin):
-        model_path = util.get_package_path(origin)
-    else:
-        model_path = Path(origin) if model_path is None else Path(model_path)
-    if not model_path.exists():
-        msg.fail(
-            "Can't locate model data",
-            "The data should be located in {}".format(path2str(model_path)),
-            exits=1,
-        )
-    data_path = util.get_data_path()
-    if not data_path or not data_path.exists():
-        spacy_loc = Path(__file__).parent.parent
-        msg.fail(
-            "Can't find the spaCy data path to create model symlink",
-            "Make sure a directory `/data` exists within your spaCy "
-            "installation and try again. The data directory should be located "
-            "here:".format(path=spacy_loc),
-            exits=1,
-        )
-    link_path = util.get_data_path() / link_name
-    if link_path.is_symlink() and not force:
-        msg.fail(
-            "Link '{}' already exists".format(link_name),
-            "To overwrite an existing link, use the --force flag",
-            exits=1,
-        )
-    elif link_path.is_symlink():  # does a symlink exist?
-        # NB: It's important to check for is_symlink here and not for exists,
-        # because invalid/outdated symlinks would return False otherwise.
-        link_path.unlink()
-    elif link_path.exists():  # does it exist otherwise?
-        # NB: Check this last because valid symlinks also "exist".
-        msg.fail(
-            "Can't overwrite symlink '{}'".format(link_name),
-            "This can happen if your data directory contains a directory or "
-            "file of the same name.",
-            exits=1,
-        )
-    details = "%s --> %s" % (path2str(model_path), path2str(link_path))
-    try:
-        symlink_to(link_path, model_path)
-    except:  # noqa: E722
-        # This is quite dirty, but just making sure other errors are caught.
-        msg.fail(
-            "Couldn't link model to '{}'".format(link_name),
-            "Creating a symlink in spacy/data failed. Make sure you have the "
-            "required permissions and try re-running the command as admin, or "
-            "use a virtualenv. You can still import the model as a module and "
-            "call its load() method, or create the symlink manually.",
-        )
-        msg.text(details)
-        raise
-    msg.good("Linking successful", details)
-    msg.text("You can now load the model via spacy.load('{}')".format(link_name))
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@ -1,32 +1,57 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import plac
+from typing import Optional, Union, Any, Dict
 import shutil
 from pathlib import Path
-from wasabi import msg, get_raw_input
+from wasabi import Printer, get_raw_input
 import srsly
+import sys

-from ..compat import path2str
+from ._util import app, Arg, Opt
+from ..schemas import validate, ModelMetaSchema
 from .. import util
 from .. import about


-@plac.annotations(
-    input_dir=("Directory with model data", "positional", None, str),
-    output_dir=("Output parent directory", "positional", None, str),
-    meta_path=("Path to meta.json", "option", "m", str),
-    create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
-    force=("Force overwriting existing model in output directory", "flag", "f", bool),
-)
-def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
+@app.command("package")
+def package_cli(
+    # fmt: off
+    input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
+    output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
+    meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
+    create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
+    version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
+    force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
+    # fmt: on
+):
    """
-    Generate Python package for model data, including meta and required
-    installation files. A new directory will be created in the specified
-    output directory, and model data will be copied over. If --create-meta is
-    set and a meta.json already exists in the output directory, the existing
-    values will be used as the defaults in the command-line prompt.
+    Generate an installable Python package for a model. Includes model data,
+    meta and required installation files. A new directory will be created in the
+    specified output directory, and model data will be copied over. If
+    --create-meta is set and a meta.json already exists in the output directory,
+    the existing values will be used as the defaults in the command-line prompt.
+    After packaging, "python setup.py sdist" is run in the package directory,
+    which will create a .tar.gz archive that can be installed via "pip install".
    """
+    package(
+        input_dir,
+        output_dir,
+        meta_path=meta_path,
+        version=version,
+        create_meta=create_meta,
+        force=force,
+        silent=False,
+    )
+
+
+def package(
+    input_dir: Path,
+    output_dir: Path,
+    meta_path: Optional[Path] = None,
+    version: Optional[str] = None,
+    create_meta: bool = False,
+    force: bool = False,
+    silent: bool = True,
+) -> None:
+    msg = Printer(no_print=silent, pretty=not silent)
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
@ -37,65 +62,69 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
    if meta_path and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)

-    meta_path = meta_path or input_path / "meta.json"
-    if meta_path.is_file():
+    meta_path = meta_path or input_dir / "meta.json"
+    if not meta_path.exists() or not meta_path.is_file():
+        msg.fail("Can't load model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
+    meta = get_meta(input_dir, meta)
+    if version is not None:
+        meta["version"] = version
    if not create_meta:  # only print if user doesn't want to overwrite
        msg.good("Loaded meta.json from file", meta_path)
    else:
-            meta = generate_meta(input_dir, meta, msg)
-    for key in ("lang", "name", "version"):
-        if key not in meta or meta[key] == "":
-            msg.fail(
-                "No '{}' setting found in meta.json".format(key),
-                "This setting is required to build your package.",
-                exits=1,
-            )
+        meta = generate_meta(meta, msg)
+    errors = validate(ModelMetaSchema, meta)
+    if errors:
+        msg.fail("Invalid model meta.json", "\n".join(errors), exits=1)
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
-    main_path = output_path / model_name_v
+    main_path = output_dir / model_name_v
    package_path = main_path / model_name

    if package_path.exists():
        if force:
-            shutil.rmtree(path2str(package_path))
+            shutil.rmtree(str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
-                "`--force` flag to overwrite existing "
-                "directories.".format(path=path2str(package_path)),
+                "`--force` flag to overwrite existing directories.",
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
-    shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
+    shutil.copytree(str(input_dir), str(package_path / model_name_v))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    create_file(package_path / "__init__.py", TEMPLATE_INIT)
-    msg.good("Successfully created package '{}'".format(model_name_v), main_path)
-    msg.text("To build the package, run `python setup.py sdist` in this directory.")
+    msg.good(f"Successfully created package '{model_name_v}'", main_path)
+    with util.working_dir(main_path):
+        util.run_command([sys.executable, "setup.py", "sdist"])
+    zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
+    msg.good(f"Successfully created zipped Python package", zip_file)


-def create_file(file_path, contents):
+def create_file(file_path: Path, contents: str) -> None:
    file_path.touch()
    file_path.open("w", encoding="utf-8").write(contents)


-def generate_meta(model_path, existing_meta, msg):
-    meta = existing_meta or {}
-    settings = [
-        ("lang", "Model language", meta.get("lang", "en")),
-        ("name", "Model name", meta.get("name", "model")),
-        ("version", "Model version", meta.get("version", "0.0.0")),
-        ("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
-        ("description", "Model description", meta.get("description", False)),
-        ("author", "Author", meta.get("author", False)),
-        ("email", "Author email", meta.get("email", False)),
-        ("url", "Author website", meta.get("url", False)),
-        ("license", "License", meta.get("license", "CC BY-SA 3.0")),
-    ]
+def get_meta(
+    model_path: Union[str, Path], existing_meta: Dict[str, Any]
+) -> Dict[str, Any]:
+    meta = {
+        "lang": "en",
+        "name": "model",
+        "version": "0.0.0",
+        "description": None,
+        "author": None,
+        "email": None,
+        "url": None,
+        "license": "MIT",
+    }
+    meta.update(existing_meta)
    nlp = util.load_model_from_path(Path(model_path))
+    meta["spacy_version"] = util.get_model_version_range(about.__version__)
    meta["pipeline"] = nlp.pipe_names
    meta["vectors"] = {
        "width": nlp.vocab.vectors_length,
@ -103,6 +132,23 @@ def generate_meta(model_path, existing_meta, msg):
        "keys": nlp.vocab.vectors.n_keys,
        "name": nlp.vocab.vectors.name,
    }
+    if about.__title__ != "spacy":
+        meta["parent_package"] = about.__title__
+    return meta
+
+
+def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]:
+    meta = existing_meta or {}
+    settings = [
+        ("lang", "Model language", meta.get("lang", "en")),
+        ("name", "Model name", meta.get("name", "model")),
+        ("version", "Model version", meta.get("version", "0.0.0")),
+        ("description", "Model description", meta.get("description", None)),
+        ("author", "Author", meta.get("author", None)),
+        ("email", "Author email", meta.get("email", None)),
+        ("url", "Author website", meta.get("url", None)),
+        ("license", "License", meta.get("license", "MIT")),
+    ]
    msg.divider("Generating meta.json")
    msg.text(
        "Enter the package settings for your model. The following information "
@ -111,16 +157,11 @@ def generate_meta(model_path, existing_meta, msg):
    for setting, desc, default in settings:
        response = get_raw_input(desc, default)
        meta[setting] = default if response == "" and default else response
-    if about.__title__ != "spacy":
-        meta["parent_package"] = about.__title__
    return meta


 TEMPLATE_SETUP = """
 #!/usr/bin/env python
-# coding: utf8
-from __future__ import unicode_literals
-
 import io
 import json
 from os import path, walk
@ -166,16 +207,17 @@ def setup_package():

    setup(
        name=model_name,
-        description=meta['description'],
-        author=meta['author'],
-        author_email=meta['email'],
-        url=meta['url'],
+        description=meta.get('description'),
+        author=meta.get('author'),
+        author_email=meta.get('email'),
+        url=meta.get('url'),
        version=meta['version'],
-        license=meta['license'],
+        license=meta.get('license'),
        packages=[model_name],
        package_data={model_name: list_files(model_dir)},
        install_requires=list_requirements(meta),
        zip_safe=False,
+        entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]}
    )


@ -190,9 +232,6 @@ include meta.json


 TEMPLATE_INIT = """
-# coding: utf8
-from __future__ import unicode_literals
-
 from pathlib import Path
 from spacy.util import load_model_from_init_py, get_model_meta

--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@ -1,217 +1,150 @@
-# coding: utf8
-from __future__ import print_function, unicode_literals
-
-import plac
+from typing import Optional, Dict, Any
 import random
 import numpy
 import time
 import re
 from collections import Counter
 from pathlib import Path
-from thinc.v2v import Affine, Maxout
-from thinc.misc import LayerNorm as LN
-from thinc.neural.util import prefer_gpu
+from thinc.api import use_pytorch_for_gpu_memory, require_gpu
+from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
+from thinc.api import CosineDistance, L2Distance
 from wasabi import msg
 import srsly
+from functools import partial
+import typer

+from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code
+from ..schemas import ConfigSchema
 from ..errors import Errors
+from ..ml.models.multi_task import build_cloze_multi_task_model
+from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..tokens import Doc
 from ..attrs import ID, HEAD
-from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
-from .._ml import masked_language_model, get_cossim_loss, get_characters_loss
-from .._ml import MultiSoftmax
 from .. import util
-from .train import _load_pretrained_tok2vec


-@plac.annotations(
-    texts_loc=(
-        "Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
-        "key 'tokens'",
-        "positional",
-        None,
-        str,
-    ),
-    vectors_model=("Name or path to spaCy model with vectors to learn from"),
-    output_dir=("Directory to write models to on each epoch", "positional", None, str),
-    width=("Width of CNN layers", "option", "cw", int),
-    conv_depth=("Depth of CNN layers", "option", "cd", int),
-    cnn_window=("Window size for CNN layers", "option", "cW", int),
-    cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int),
-    use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
-    sa_depth=("Depth of self-attention layers", "option", "sa", int),
-    bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
-    embed_rows=("Number of embedding rows", "option", "er", int),
-    loss_func=(
-        "Loss function to use for the objective. Either 'characters', 'L2' or 'cosine'",
-        "option",
-        "L",
-        str,
-    ),
-    use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
-    dropout=("Dropout rate", "option", "d", float),
-    batch_size=("Number of words per training batch", "option", "bs", int),
-    max_length=(
-        "Max words per example. Longer examples are discarded",
-        "option",
-        "xw",
-        int,
-    ),
-    min_length=(
-        "Min words per example. Shorter examples are discarded",
-        "option",
-        "nw",
-        int,
-    ),
-    seed=("Seed for random number generators", "option", "s", int),
-    n_iter=("Number of iterations to pretrain", "option", "i", int),
-    n_save_every=("Save model every X batches.", "option", "se", int),
-    init_tok2vec=(
-        "Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
-        "option",
-        "t2v",
-        Path,
-    ),
-    epoch_start=(
-        "The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been "
-        "renamed. Prevents unintended overwriting of existing weight files.",
-        "option",
-        "es",
-        int,
-    ),
+@app.command(
+    "pretrain",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
-def pretrain(
-    texts_loc,
-    vectors_model,
-    output_dir,
-    width=96,
-    conv_depth=4,
-    cnn_pieces=3,
-    sa_depth=0,
-    cnn_window=1,
-    bilstm_depth=0,
-    use_chars=False,
-    embed_rows=2000,
-    loss_func="cosine",
-    use_vectors=False,
-    dropout=0.2,
-    n_iter=1000,
-    batch_size=3000,
-    max_length=500,
-    min_length=5,
-    seed=0,
-    n_save_every=None,
-    init_tok2vec=None,
-    epoch_start=None,
+def pretrain_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
+    output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
+    config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
+    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
+    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
+    # fmt: on
 ):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
-    using an approximate language-modelling objective. Specifically, we load
-    pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
-    vectors which match the pretrained ones. The weights are saved to a directory
-    after each epoch. You can then pass a path to one of these pretrained weights
-    files to the 'spacy train' command.
+    using an approximate language-modelling objective. Two objective types
+    are available, vector-based and character-based.
+
+    In the vector-based objective, we load word vectors that have been trained
+    using a word2vec-style distributional similarity algorithm, and train a
+    component like a CNN, BiLSTM, etc to predict vectors which match the
+    pretrained ones. The weights are saved to a directory after each epoch. You
+    can then pass a path to one of these pretrained weights files to the
+    'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
-    all settings are the same between pretraining and training. The API and
-    errors around this need some improvement.
+    all settings are the same between pretraining and training. Ideally,
+    this is done by using the same config file for both commands.
    """
-    config = dict(locals())
-    for key in config:
-        if isinstance(config[key], Path):
-            config[key] = str(config[key])
-    util.fix_random_seed(seed)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    pretrain(
+        texts_loc,
+        output_dir,
+        config_path,
+        config_overrides=overrides,
+        resume_path=resume_path,
+        epoch_resume=epoch_resume,
+    )

-    has_gpu = prefer_gpu(gpu_id=1)
-    msg.info("Using GPU" if has_gpu else "Not using GPU")

-    output_dir = Path(output_dir)
-    if output_dir.exists() and [p for p in output_dir.iterdir()]:
-        msg.warn(
-            "Output directory is not empty",
-            "It is better to use an empty directory or refer to a new output path, "
-            "then the new directory will be created for you.",
+def pretrain(
+    texts_loc: Path,
+    output_dir: Path,
+    config_path: Path,
+    config_overrides: Dict[str, Any] = {},
+    resume_path: Optional[Path] = None,
+    epoch_resume: Optional[int] = None,
+):
+    verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
+    msg.info(f"Loading config from: {config_path}")
+    with show_validation_error():
+        config = util.load_config(
+            config_path,
+            create_objects=False,
+            validate=True,
+            schema=ConfigSchema,
+            overrides=config_overrides,
        )
    if not output_dir.exists():
        output_dir.mkdir()
-        msg.good("Created output directory: {}".format(output_dir))
-    srsly.write_json(output_dir / "config.json", config)
-    msg.good("Saved settings to config.json")
+        msg.good(f"Created output directory: {output_dir}")
+
+    use_gpu = config["training"]["use_gpu"]
+    if use_gpu >= 0:
+        msg.info("Using GPU")
+        require_gpu(use_gpu)
+    else:
+        msg.info("Using CPU")
+
+    seed = config["pretraining"]["seed"]
+    if seed is not None:
+        fix_random_seed(seed)
+    if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
+        use_pytorch_for_gpu_memory()
+
+    nlp_config = config["nlp"]
+    srsly.write_json(output_dir / "config.json", config)
+    msg.good("Saved config file in the output directory")
+
+    config = util.load_config(config_path, create_objects=True)
+    nlp = util.load_model_from_config(nlp_config)
+    pretrain_config = config["pretraining"]

-    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
-        texts_loc = Path(texts_loc)
-        if not texts_loc.exists():
-            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
-        if not texts:
-            msg.fail("Input file is empty", texts_loc, exits=1)
-        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
-        msg.text("Reading input text from stdin...")
+        msg.info("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

-    with msg.loading("Loading model '{}'...".format(vectors_model)):
-        nlp = util.load_model(vectors_model)
-    msg.good("Loaded model '{}'".format(vectors_model))
-    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
-    model = create_pretraining_model(
-        nlp,
-        Tok2Vec(
-            width,
-            embed_rows,
-            conv_depth=conv_depth,
-            pretrained_vectors=pretrained_vectors,
-            bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
-            subword_features=not use_chars,  # Set to False for Chinese etc
-            cnn_maxout_pieces=cnn_pieces,  # If set to 1, use Mish activation.
-        ),
-        objective=loss_func
-    )
-    # Load in pretrained weights
-    if init_tok2vec is not None:
-        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
-        msg.text("Loaded pretrained tok2vec for: {}".format(components))
-        # Parse the epoch number from the given weight file
-        model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
-        if model_name:
-            # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
-            epoch_start = int(model_name.group(0)[5:][:-4]) + 1
-        else:
-            if not epoch_start:
-                msg.fail(
-                    "You have to use the '--epoch-start' argument when using a renamed weight file for "
-                    "'--init-tok2vec'",
-                    exits=True,
-                )
-            elif epoch_start < 0:
-                msg.fail(
-                    "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
-                    % epoch_start,
-                    exits=True,
-                )
-    else:
-        # Without '--init-tok2vec' the '--epoch-start' argument is ignored
-        epoch_start = 0
+    tok2vec_path = pretrain_config["tok2vec_model"]
+    tok2vec = config
+    for subpath in tok2vec_path.split("."):
+        tok2vec = tok2vec.get(subpath)
+    model = create_pretraining_model(nlp, tok2vec, pretrain_config)
+    optimizer = pretrain_config["optimizer"]
+
+    # Load in pretrained weights to resume from
+    if resume_path is not None:
+        _resume_model(model, resume_path, epoch_resume)
+    else:
+        # Without '--resume-path' the '--epoch-resume' argument is ignored
+        epoch_resume = 0

-    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
-    msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
+    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
-            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
-                "wb"
-            ) as file_:
-                file_.write(model.tok2vec.to_bytes())
+            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
+                file_.write(model.get_ref("tok2vec").to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
@ -222,26 +155,26 @@ def pretrain(
                file_.write(srsly.json_dumps(log) + "\n")

    skip_counter = 0
-    for epoch in range(epoch_start, n_iter + epoch_start):
-        for batch_id, batch in enumerate(
-            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
-        ):
+    objective = create_objective(pretrain_config["objective"])
+    for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
+        batches = util.minibatch_by_words(texts, size=pretrain_config["batch_size"])
+        for batch_id, batch in enumerate(batches):
            docs, count = make_docs(
                nlp,
-                [text for (text, _) in batch],
-                max_length=max_length,
-                min_length=min_length,
+                batch,
+                max_length=pretrain_config["max_length"],
+                min_length=pretrain_config["min_length"],
            )
            skip_counter += count
-            loss = make_update(
-                model, docs, optimizer, objective=loss_func, drop=dropout
-            )
+            loss = make_update(model, docs, optimizer, objective)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                    break
-            if n_save_every and (batch_id % n_save_every == 0):
+            if pretrain_config["n_save_every"] and (
+                batch_id % pretrain_config["n_save_every"] == 0
+            ):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
@ -249,24 +182,36 @@ def pretrain(
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
    if skip_counter > 0:
-        msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
+        msg.warn(f"Skipped {skip_counter} empty values")
    msg.good("Successfully finished pretrain")


-def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
+def _resume_model(model, resume_path, epoch_resume):
+    """Load previously saved tok2vec weights into the pretraining model and
+    determine the epoch to resume from.
+
+    model: The pretraining model. The weights are loaded into its "tok2vec"
+        reference.
+    resume_path (Path): Path to the weights file to resume from.
+    epoch_resume (Optional[int]): The epoch to resume from. Only used if the
+        epoch can't be read from the file name (i.e. not "model<N>.bin").
+    RETURNS (Optional[int]): The epoch to resume from. Rebinding the argument
+        in here doesn't change the caller's variable, so the caller has to use
+        the returned value.
+    """
+    msg.info(f"Resume training tok2vec from: {resume_path}")
+    with resume_path.open("rb") as file_:
+        weights_data = file_.read()
+    model.get_ref("tok2vec").from_bytes(weights_data)
+    # Parse the epoch number from the given weight file
+    model_name = re.search(r"model(\d+)\.bin", str(resume_path))
+    if model_name:
+        # Default weight file name: continue with the epoch after the saved one
+        epoch_resume = int(model_name.group(1)) + 1
+    msg.info(f"Resuming from epoch: {epoch_resume}")
+    return epoch_resume
+
+
+def make_update(model, docs, optimizer, objective_func):
    """Perform an update over a single batch of documents.

    docs (iterable): A batch of `Doc` objects.
-    drop (float): The dropout rate.
    optimizer (callable): An optimizer.
    RETURNS loss: A float for the loss.
    """
-    predictions, backprop = model.begin_update(docs, drop=drop)
-    if objective == "characters":
-        loss, gradients = get_characters_loss(model.ops, docs, predictions)
-    else:
-        loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
-    backprop(gradients, sgd=optimizer)
+    predictions, backprop = model.begin_update(docs)
+    loss, gradients = objective_func(model.ops, docs, predictions)
+    backprop(gradients)
+    model.finish_update(optimizer)
    # Don't want to return a cupy object here
    # The gradients are modified in-place by the BERT MLM,
    # so we get an accurate loss
@ -298,18 +243,43 @@ def make_docs(nlp, batch, min_length, max_length):
            heads = numpy.asarray(heads, dtype="uint64")
            heads = heads.reshape((len(doc), 1))
            doc = doc.from_array([HEAD], heads)
-        if len(doc) >= min_length and len(doc) < max_length:
+        if min_length <= len(doc) < max_length:
            docs.append(doc)
    return docs, skip_count


-def get_vectors_loss(ops, docs, prediction, objective="L2"):
-    """Compute a mean-squared error loss between the documents' vectors and
-    the prediction.
+def create_objective(config):
+    """Create the objective for pretraining.

-    Note that this is ripe for customization! We could compute the vectors
-    in some other word, e.g. with an LSTM language model, or use some other
-    type of objective.
+    We'd like to replace this with a registry function but it's tricky because
+    we're also making a model choice based on this. For now we hard-code support
+    for two types (characters, vectors). For characters you can specify
+    n_characters, for vectors you can specify the loss.
+
+    Bleh.
+    """
+    objective_type = config["type"]
+    if objective_type == "characters":
+        return partial(get_characters_loss, nr_char=config["n_characters"])
+    elif objective_type == "vectors":
+        if config["loss"] == "cosine":
+            return partial(
+                get_vectors_loss,
+                distance=CosineDistance(normalize=True, ignore_zeros=True),
+            )
+        elif config["loss"] == "L2":
+            return partial(
+                get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
+            )
+        else:
+            raise ValueError("Unexpected loss type", config["loss"])
+    else:
+        raise ValueError("Unexpected objective_type", objective_type)
+
+
+def get_vectors_loss(ops, docs, prediction, distance):
+    """Compute a loss based on a distance between the documents' vectors and
+    the prediction.
    """
    # The simplest way to implement this would be to vstack the
    # token.vector values, but that's a bit inefficient, especially on GPU.
@ -317,47 +287,51 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
    # and look them up all at once. This prevents data copying.
    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
    target = docs[0].vocab.vectors.data[ids]
-    if objective == "L2":
-        d_target = prediction - target
-        loss = (d_target ** 2).sum()
-    elif objective == "cosine":
-        loss, d_target = get_cossim_loss(prediction, target)
-    else:
-        raise ValueError(Errors.E142.format(loss_func=objective))
+    d_target, loss = distance(prediction, target)
    return loss, d_target


-def create_pretraining_model(nlp, tok2vec, objective="cosine", nr_char=10):
+def get_characters_loss(ops, docs, prediction, nr_char):
+    """Compute a squared-error loss between the prediction and a target built
+    from the UTF-8 arrays of the docs.
+
+    ops: Backend used to convert the target into an array.
+    docs (iterable): The Doc objects the prediction was made for.
+    prediction: The predicted values. Presumably one row per token with
+        256 * nr_char columns (TODO confirm against the model's output layer).
+    nr_char (int): Number of characters requested per token.
+    RETURNS (tuple): The summed squared error and the gradient, i.e. the
+        difference divided by the number of rows of the prediction.
+    """
+    n_classes = 256
+    utf8_arrays = [doc.to_utf8_array(nr_char=nr_char) for doc in docs]
+    flat_ids = numpy.vstack(utf8_arrays).reshape((-1,))
+    encoded = ops.asarray(to_categorical(flat_ids, n_classes=n_classes), dtype="f")
+    # One row per entry of the stacked arrays, nr_char groups of 256 columns each
+    encoded = encoded.reshape((-1, n_classes * nr_char))
+    residual = prediction - encoded
+    n_rows = float(prediction.shape[0])
+    return (residual ** 2).sum(), residual / n_rows
+
+
+def create_pretraining_model(nlp, tok2vec, pretrain_config):
    """Define a network for the pretraining. We simply add an output layer onto
    the tok2vec input model. The tok2vec input model needs to be a model that
    takes a batch of Doc objects (as a list), and returns a list of arrays.
    Each array in the output needs to have one row per token in the doc.
+    The actual tok2vec layer is stored as a reference, and only this bit will be
+    serialized to file and read back in when calling the 'train' command.
    """
-    if objective == "characters":
-        out_sizes = [256] * nr_char
-        output_layer = chain(
-            LN(Maxout(300, pieces=3)),
-            MultiSoftmax(out_sizes, 300)
+    # TODO
+    maxout_pieces = 3
+    hidden_size = 300
+    if pretrain_config["objective"]["type"] == "vectors":
+        model = build_cloze_multi_task_model(
+            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
        )
-    else:
-        output_size = nlp.vocab.vectors.data.shape[1]
-        output_layer = chain(
-            LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
+    elif pretrain_config["objective"]["type"] == "characters":
+        model = build_cloze_characters_multi_task_model(
+            nlp.vocab,
+            tok2vec,
+            hidden_size=hidden_size,
+            maxout_pieces=maxout_pieces,
+            nr_char=pretrain_config["objective"]["n_characters"],
        )
-    # This is annoying, but the parser etc have the flatten step after
-    # the tok2vec. To load the weights in cleanly, we need to match
-    # the shape of the models' components exactly. So what we cann
-    # "tok2vec" has to be the same set of processes as what the components do.
-    tok2vec = chain(tok2vec, flatten)
-    model = chain(tok2vec, output_layer)
-    model = masked_language_model(nlp.vocab, model)
-    model.tok2vec = tok2vec
-    model.output_layer = output_layer
-    model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
+    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
+    set_dropout_rate(model, pretrain_config["dropout"])
    return model


-class ProgressTracker(object):
+class ProgressTracker:
    def __init__(self, frequency=1000000):
        self.loss = 0.0
        self.prev_loss = 0.0
@ -403,3 +377,44 @@ def _smart_round(figure, width=10, max_decimal=4):
        n_decimal = min(n_decimal, max_decimal)
        format_str = "%." + str(n_decimal) + "f"
        return format_str % figure
+
+
+def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume):
+    """Validate the arguments of the pretrain command before any work is done.
+    Fatal problems are reported via msg.fail and exit the process, a non-empty
+    output directory only produces a warning.
+
+    texts_loc (Union[Path, str]): Location of the input texts, "-" for stdin.
+    output_dir (Path): Directory the weights will be written to.
+    config_path (Path): Path to the config file.
+    resume_path (Optional[Path]): Weights file to resume from, if any.
+    epoch_resume (Optional[int]): Epoch to resume from. Required (and has to
+        be >= 0) if resume_path is not a default "model<N>.bin" file name.
+    """
+    if not config_path or not config_path.exists():
+        msg.fail("Config file not found", config_path, exits=1)
+    # any() stops at the first entry instead of listing the whole directory
+    if output_dir.exists() and any(output_dir.iterdir()):
+        if resume_path:
+            msg.warn(
+                "Output directory is not empty. ",
+                "If you're resuming a run from a previous model in this directory, "
+                "the old models for the consecutive epochs will be overwritten "
+                "with the new ones.",
+            )
+        else:
+            msg.warn(
+                "Output directory is not empty. ",
+                "It is better to use an empty directory or refer to a new output path, "
+                "then the new directory will be created for you.",
+            )
+    if texts_loc != "-":  # reading from a file
+        texts_loc = Path(texts_loc)
+        if not texts_loc.exists():
+            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
+
+        # Only peek at the first record, the else branch runs if there is none
+        for _ in srsly.read_jsonl(texts_loc):
+            break
+        else:
+            msg.fail("Input file is empty", texts_loc, exits=1)
+
+    if resume_path is not None:
+        model_name = re.search(r"model\d+\.bin", str(resume_path))
+        if not model_name:
+            # The epoch can't be read from a renamed file, so it has to be
+            # given explicitly. Check for None instead of truthiness, because
+            # 0 is a valid epoch to resume from.
+            if epoch_resume is None:
+                msg.fail(
+                    "You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
+                    exits=True,
+                )
+            elif epoch_resume < 0:
+                msg.fail(
+                    f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
+                    exits=True,
+                )
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@ -1,7 +1,4 @@
-# coding: utf8
-from __future__ import unicode_literals, division, print_function
-
-import plac
+from typing import Optional, Sequence, Union, Iterator
 import tqdm
 from pathlib import Path
 import srsly
@ -9,36 +6,63 @@ import cProfile
 import pstats
 import sys
 import itertools
-import thinc.extra.datasets
-from wasabi import msg
+from wasabi import msg, Printer
+import typer

+from ._util import app, debug_cli, Arg, Opt, NAME
+from ..language import Language
 from ..util import load_model


-@plac.annotations(
-    model=("Model to load", "positional", None, str),
-    inputs=("Location of input file. '-' for stdin.", "positional", None, str),
-    n_texts=("Maximum number of texts to use if available", "option", "n", int),
-)
-def profile(model, inputs=None, n_texts=10000):
+@debug_cli.command("profile")
+@app.command("profile", hidden=True)
+def profile_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read current calling context
+    model: str = Arg(..., help="Model to load"),
+    inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
+    n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
+    # fmt: on
+):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.sytdin.
    If no input file is specified, the IMDB dataset is loaded via Thinc.
    """
+    if ctx.parent.command.name == NAME:  # called as top-level command
+        msg.warn(
+            "The profile command is now available via the 'debug profile' "
+            "subcommand. You can run python -m spacy debug --help for an "
+            "overview of the other available debugging commands."
+        )
+    profile(model, inputs=inputs, n_texts=n_texts)
+
+
+def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
+
    if inputs is not None:
        inputs = _read_inputs(inputs, msg)
    if inputs is None:
+        try:
+            import ml_datasets
+        except ImportError:
+            msg.fail(
+                "This command, when run without an input file, "
+                "requires the ml_datasets library to be installed: "
+                "pip install ml_datasets",
+                exits=1,
+            )
+
        n_inputs = 25000
        with msg.loading("Loading IMDB dataset via Thinc..."):
-            imdb_train, _ = thinc.extra.datasets.imdb()
+            imdb_train, _ = ml_datasets.imdb()
            inputs, _ = zip(*imdb_train)
-        msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
+        msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
        inputs = inputs[:n_inputs]
-    with msg.loading("Loading model '{}'...".format(model)):
+    with msg.loading(f"Loading model '{model}'..."):
        nlp = load_model(model)
-    msg.good("Loaded model '{}'".format(model))
+    msg.good(f"Loaded model '{model}'")
    texts = list(itertools.islice(inputs, n_texts))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
@ -46,12 +70,12 @@ def profile(model, inputs=None, n_texts=10000):
    s.strip_dirs().sort_stats("time").print_stats()


-def parse_texts(nlp, texts):
+def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
        pass


-def _read_inputs(loc, msg):
+def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
    if loc == "-":
        msg.info("Reading input from sys.stdin")
        file_ = sys.stdin
@ -60,7 +84,7 @@ def _read_inputs(loc, msg):
        input_path = Path(loc)
        if not input_path.exists() or not input_path.is_file():
            msg.fail("Not a valid input data file", loc, exits=1)
-        msg.info("Using data from {}".format(input_path.parts[-1]))
+        msg.info(f"Using data from {input_path.parts[-1]}")
        file_ = input_path.open()
    for line in file_:
        data = srsly.json_loads(line)
--- a/spacy/cli/project/init.py
+++ b/spacy/cli/project/init.py
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@ -0,0 +1,157 @@
+from typing import Optional
+from pathlib import Path
+from wasabi import msg
+import requests
+import tqdm
+import re
+import shutil
+
+from ...util import ensure_path, working_dir
+from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
+
+
+# TODO: find a solution for caches
+# CACHES = [
+#     Path.home() / ".torch",
+#     Path.home() / ".caches" / "torch",
+#     os.environ.get("TORCH_HOME"),
+#     Path.home() / ".keras",
+# ]
+
+
+@project_cli.command("assets")
+def project_assets_cli(
+    # fmt: off
+    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Fetch project assets like datasets and pretrained weights. Assets are
+    defined in the "assets" section of the project.yml. If a checksum is
+    provided in the project.yml, the file is only downloaded if no local file
+    with the same checksum exists.
+    """
+    # Thin CLI entry point: all the work happens in project_assets()
+    project_assets(project_dir)
+
+
+def project_assets(project_dir: Path) -> None:
+    """Fetch the assets defined in the "assets" section of a project's config.
+    Assets with a URL are fetched, assets without a URL have to be provided by
+    the user and are only checked. Exits if no assets are defined.
+
+    project_dir (Path): Path to project directory.
+    """
+    project_path = ensure_path(project_dir)
+    config = load_project_config(project_path)
+    assets = config.get("assets", {})
+    if not assets:
+        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
+    msg.info(f"Fetching {len(assets)} asset(s)")
+    # Destinations and URLs may contain placeholders for the project variables
+    variables = config.get("variables", {})
+    for asset in assets:
+        dest = asset["dest"].format(**variables)
+        url = asset.get("url")
+        checksum = asset.get("checksum")
+        if not url:
+            # project.yml defines asset without URL that the user has to place.
+            # Look it up relative to the project directory (like fetch_asset
+            # does) and not relative to the current working directory.
+            check_private_asset(project_path / dest, checksum)
+            continue
+        url = url.format(**variables)
+        fetch_asset(project_path, url, dest, checksum)
+
+
+def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
+    """Check and validate assets without a URL (private assets that the user
+    has to provide themselves) and give feedback about the checksum.
+
+    dest (Path): Destination path of the asset.
+    checksum (Optional[str]): Optional checksum of the expected file.
+    """
+    if not Path(dest).exists():
+        err = f"No URL provided for asset. You need to add this file yourself: {dest}"
+        msg.warn(err)
+    elif not checksum:
+        # Nothing to compare against, so an existing file is all we can ask for
+        msg.good(f"Asset already exists: {dest}")
+    elif checksum == get_checksum(dest):
+        msg.good(f"Asset exists with matching checksum: {dest}")
+    else:
+        msg.fail(f"Asset available but with incorrect checksum: {dest}")
+
+
+def fetch_asset(
+    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
+) -> Optional[Path]:
+    """Fetch an asset from a given URL or path. If a checksum is provided and a
+    local file exists, it's only re-downloaded if the checksum doesn't match.
+
+    project_path (Path): Path to project directory.
+    url (str): URL or path to asset.
+    dest (Path): Destination of the asset, resolved against the project path.
+    checksum (Optional[str]): Optional expected checksum of local file.
+    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
+        the asset failed. A checksum mismatch of the fetched file is reported,
+        but the path is still returned.
+    """
+    # TODO: add support for caches
+    dest_path = (project_path / dest).resolve()
+    if dest_path.exists() and checksum:
+        # If there's already a file, check for checksum
+        if checksum == get_checksum(dest_path):
+            msg.good(f"Skipping download with matching checksum: {dest}")
+            return dest_path
+    # We might as well support the user here and create parent directories in
+    # case the asset dir isn't listed as a dir to create in the project.yml
+    if not dest_path.parent.exists():
+        dest_path.parent.mkdir(parents=True)
+    # Run inside the project directory so relative local paths can be found
+    with working_dir(project_path):
+        url = convert_asset_url(url)
+        try:
+            download_file(url, dest_path)
+            msg.good(f"Downloaded asset {dest}")
+        except requests.exceptions.RequestException as e:
+            if Path(url).exists() and Path(url).is_file():
+                # If it's a local file, copy to destination
+                shutil.copy(url, str(dest_path))
+                msg.good(f"Copied local asset {dest}")
+            else:
+                msg.fail(f"Download failed: {dest}", e)
+                return None
+    if checksum and checksum != get_checksum(dest_path):
+        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
+    return dest_path
+
+
+def convert_asset_url(url: str) -> str:
+    """Check and convert the asset URL if needed. Regular GitHub URLs are
+    converted to raw URLs (with a warning), all other URLs are returned as
+    they are.
+
+    url (str): The asset URL.
+    RETURNS (str): The converted URL.
+    """
+    # If the asset URL is a regular GitHub URL it's likely a mistake. The host
+    # has to match exactly, so other hosts starting with "github.com" (e.g.
+    # github.community) are left alone.
+    if re.match(r"https?://github\.com(/|$)", url):
+        # Only touch the host and the first /tree/ or /blob/ marker, the rest
+        # of the path may legitimately contain the same strings
+        converted = url.replace("github.com", "raw.githubusercontent.com", 1)
+        converted = re.sub(r"/(tree|blob)/", "/", converted, count=1)
+        msg.warn(
+            "Downloading from a regular GitHub URL. This will only download "
+            "the source of the page, not the actual file. Converting the URL "
+            "to a raw URL.",
+            converted,
+        )
+        return converted
+    return url
+
+
+def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
+    """Download a file using requests and show a progress bar while writing
+    it to the destination. Raises a requests exception if the request fails
+    or the server responds with an error status.
+
+    url (str): The URL of the file.
+    dest (Path): The destination path.
+    chunk_size (int): The size of chunks to read/write.
+    """
+    response = requests.get(url, stream=True)
+    # With stream=True the connection stays open until the body is consumed or
+    # the response is closed, so make sure it's released on errors as well
+    try:
+        response.raise_for_status()
+        total = int(response.headers.get("content-length", 0))
+        progress_settings = {
+            "total": total,
+            "unit": "iB",
+            "unit_scale": True,
+            # Divisor used to scale the displayed byte count. This is
+            # independent of the size of the chunks that are read.
+            "unit_divisor": 1024,
+            "leave": False,
+        }
+        with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
+            for data in response.iter_content(chunk_size=chunk_size):
+                size = f.write(data)
+                bar.update(size)
+    finally:
+        response.close()
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@ -0,0 +1,96 @@
+from typing import Optional
+from pathlib import Path
+from wasabi import msg
+import subprocess
+import shutil
+import re
+
+from ... import about
+from ...util import ensure_path, run_command, make_tempdir
+from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
+
+
+@project_cli.command("clone")
+def project_clone_cli(
+    # fmt: off
+    name: str = Arg(..., help="The name of the template to clone"),
+    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
+    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
+    # fmt: on
+):
+    """Clone a project template from a repository. Calls into "git" and will
+    only download the files from the given subdirectory. The GitHub repo
+    defaults to the official spaCy template repo, but can be customized
+    (including using a private repo).
+    """
+    # Without an explicit destination, clone into a directory named after the
+    # template inside the current working directory
+    if dest is None:
+        dest = Path.cwd() / name
+    project_clone(name, dest, repo=repo)
+
+
+def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
+    """Clone a project template from a repository. The repo is cloned into a
+    temporary directory using a sparse checkout of the given subdirectory,
+    which is then moved to the destination. Exits if a Git command fails.
+
+    name (str): Name of subdirectory to clone.
+    dest (Path): Destination path of cloned project.
+    repo (str): URL of Git repo containing project templates.
+    """
+    dest = ensure_path(dest)
+    check_clone(name, dest, repo)
+    project_dir = dest.resolve()
+    # Shorter name of the repo, only used in the messages
+    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
+    # We're using Git and sparse checkout to only clone the files we need
+    with make_tempdir() as tmp_dir:
+        # Pass the command as a list (like the commands below), so paths
+        # containing spaces are kept together as one argument
+        cmd = [
+            "git",
+            "clone",
+            repo,
+            str(tmp_dir),
+            "--no-checkout",
+            "--depth",
+            "1",
+            "--config",
+            "core.sparseCheckout=true",
+        ]
+        try:
+            run_command(cmd)
+        except subprocess.CalledProcessError:
+            err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
+            # Nothing below can work without the clone, so stop here
+            msg.fail(err, exits=1)
+        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
+            f.write(name)
+        try:
+            run_command(["git", "-C", str(tmp_dir), "fetch"])
+            run_command(["git", "-C", str(tmp_dir), "checkout"])
+        except subprocess.CalledProcessError:
+            err = f"Could not clone '{name}' from repo '{repo_name}'"
+            # Without the checkout there's no directory to move
+            msg.fail(err, exits=1)
+        # We need Path(name) to make sure we also support subdirectories
+        shutil.move(str(tmp_dir / Path(name)), str(project_dir))
+    msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
+    if not (project_dir / PROJECT_FILE).exists():
+        msg.warn(f"No {PROJECT_FILE} found in directory")
+    else:
+        msg.good("Your project is now ready!")
+        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
+
+
+def check_clone(name: str, dest: Path, repo: str) -> None:
+    """Check and validate that the destination path can be used to clone. Will
+    check that Git is available and that the destination path is suitable.
+    Exits with an error message if one of the checks fails.
+
+    name (str): Name of the directory to clone from the repo.
+    dest (Path): Local destination of cloned directory.
+    repo (str): URL of the repo to clone from.
+    """
+    try:
+        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
+    except Exception:
+        # The command to run is part of the text: msg.fail only takes a title
+        # and a text, a third positional argument wouldn't be printed
+        msg.fail(
+            "Cloning spaCy project templates requires Git and the 'git' command. ",
+            f"To clone a project without Git, copy the files from the '{name}' "
+            f"directory in the {repo} to {dest} manually and then run: "
+            f"{COMMAND} project init {dest}",
+            exits=1,
+        )
+    if not dest:
+        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
+    if dest.exists():
+        # Directory already exists (not allowed, clone needs to create it)
+        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
+    if not dest.parent.exists():
+        # We're not creating parents, parent dir should exist
+        msg.fail(
+            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
+            exits=1,
+        )
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@ -0,0 +1,208 @@
+"""This module contains helpers and subcommands for integrating spaCy projects
+with Data Version Control (DVC). https://dvc.org"""
+from typing import Dict, Any, List, Optional
+import subprocess
+from pathlib import Path
+from wasabi import msg
+
+from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
+from .._util import Arg, Opt, NAME, COMMAND
+from ...util import working_dir, split_command, join_command, run_command
+
+
+# File name of the DVC pipeline config inside the project directory.
+DVC_CONFIG = "dvc.yaml"
+# Name of the directory whose presence marks an initialized DVC project
+# (see ensure_dvc).
+DVC_DIR = ".dvc"
+# Name under which the subcommand below is registered on project_cli; also
+# interpolated into the header comment of the generated file.
+UPDATE_COMMAND = "dvc"
+# Comment block written into the generated dvc.yaml, right after the line
+# holding the config hash.
+DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
+# edited your {PROJECT_FILE}, you can regenerate this file by running:
+# {COMMAND} project {UPDATE_COMMAND}"""
+
+
+# NOTE(review): the docstring of this function is presumably rendered as the
+# command's help text by the CLI framework - confirm before rewording it.
+@project_cli.command(UPDATE_COMMAND)
+def project_update_dvc_cli(
+    # fmt: off
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
+    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
+    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
+    # fmt: on
+):
+    """Auto-generate Data Version Control (DVC) config. A DVC
+    project can only define one pipeline, so you need to specify one workflow
+    defined in the project.yml. If no workflow is specified, the first defined
+    workflow is used. The DVC config will only be updated if the project.yml changed.
+    """
+    # Thin CLI entry point: all of the work happens in project_update_dvc
+    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
+
+
+def project_update_dvc(
+    project_dir: Path,
+    workflow: Optional[str] = None,
+    *,
+    verbose: bool = False,
+    force: bool = False,
+) -> None:
+    """Regenerate the auto-generated Data Version Control (DVC) config file for
+    one workflow of the project and report whether the file was rewritten. A
+    DVC project can only define one pipeline, which is why a single workflow
+    from the project.yml is used.
+
+    project_dir (Path): The project directory.
+    workflow (Optional[str]): Optional name of workflow defined in project.yml.
+        If not set, the first workflow will be used.
+    verbose (bool): Print more info.
+    force (bool): Force update DVC config.
+    """
+    project_config = load_project_config(project_dir)
+    was_updated = update_dvc_config(
+        project_dir, project_config, workflow, verbose=verbose, force=force
+    )
+    # The same hint is attached to both possible outcomes
+    repro_hint = "To execute the workflow with DVC, run: dvc repro"
+    if not was_updated:
+        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", repro_hint)
+        return
+    msg.good(f"Updated DVC config from {PROJECT_FILE}", repro_hint)
+
+
+def update_dvc_config(
+    path: Path,
+    config: Dict[str, Any],
+    workflow: Optional[str] = None,
+    verbose: bool = False,
+    silent: bool = False,
+    force: bool = False,
+) -> bool:
+    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
+    project directory. The file is auto-generated based on the config. The
+    first line of the auto-generated file specifies the hash of the config
+    dict, so if any of the config values change, the DVC config is regenerated.
+
+    path (Path): The path to the project directory.
+    config (Dict[str, Any]): The loaded project.yml.
+    workflow (Optional[str]): Name of the workflow to convert. If not set, the
+        first workflow defined in the config is used.
+    verbose (bool): Whether to print additional info (via DVC).
+    silent (bool): Don't output anything (via DVC).
+    force (bool): Force update, even if hashes match.
+    RETURNS (bool): Whether the DVC config file was updated.
+    """
+    ensure_dvc(path)
+    workflows = config.get("workflows", {})
+    workflow_names = list(workflows.keys())
+    check_workflows(workflow_names, workflow)
+    if not workflow:
+        workflow = workflow_names[0]
+    config_hash = get_hash(config)
+    path = path.resolve()
+    dvc_config_path = path / DVC_CONFIG
+    if dvc_config_path.exists():
+        # Check if the file was generated using the current config, if not, redo
+        with dvc_config_path.open("r", encoding="utf8") as f:
+            ref_hash = f.readline().strip().replace("# ", "")
+        if ref_hash == config_hash and not force:
+            return False  # Nothing has changed in project.yml, don't need to update
+    variables = config.get("variables", {})
+    dvc_commands = []
+    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+    for name in workflows[workflow]:
+        command = config_commands[name]
+        deps = command.get("deps", [])
+        outputs = command.get("outputs", [])
+        outputs_no_cache = command.get("outputs_no_cache", [])
+        # Commands without any dependencies or outputs are left out
+        if not deps and not outputs and not outputs_no_cache:
+            continue
+        # Default to the working dir as the project path since dvc.yaml is auto-generated
+        # and we don't want arbitrary paths in there
+        project_cmd = ["python", "-m", NAME, "project", "run", name]
+        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
+        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
+        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
+        dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
+        if command.get("no_skip"):
+            dvc_cmd.append("--always-changed")
+        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
+        dvc_commands.append(join_command(full_cmd))
+    if not dvc_commands:
+        # Without this check, no DVC command would run, the config file would
+        # never be created and reading it back below would raise an error.
+        msg.fail(
+            "No usable commands for DVC found. This can happen if none of your "
+            "commands have dependencies or outputs.",
+            exits=1,
+        )
+    # Only discard the previous file once it's clear that a new one will be
+    # generated, so a failed validation above doesn't destroy the old config
+    if dvc_config_path.exists():
+        dvc_config_path.unlink()
+    with working_dir(path):
+        dvc_flags = {"--verbose": verbose, "--quiet": silent}
+        run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
+    # Prepend the config hash and the explanatory comment to the generated file
+    with dvc_config_path.open("r+", encoding="utf8") as f:
+        content = f.read()
+        f.seek(0, 0)
+        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
+    return True
+
+
+def run_dvc_commands(
+    commands: List[str] = tuple(),
+    variables: Dict[str, str] = {},
+    flags: Dict[str, bool] = {},
+) -> None:
+    """Execute DVC commands one after the other, each in a subprocess.
+
+    commands (List[str]): The string commands without the leading "dvc".
+    variables (Dict[str, str]): Dictionary of variable names, mapped to their
+        values. Will be used to substitute format string variables in the
+        commands.
+    flags (Dict[str, bool]): Flags mapped to whether they're active. Every
+        active flag is appended to each command, which saves the caller from
+        nested conditionals for settings like --quiet.
+    """
+    # The selection of active flags is the same for every command
+    enabled_flags = [flag for flag, is_active in flags.items() if is_active]
+    for raw_command in commands:
+        # Substitute variables, e.g. "./{NAME}.json"
+        formatted = raw_command.format(**variables)
+        run_command(["dvc", *split_command(formatted), *enabled_flags])
+
+
+def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
+    """Make sure a DVC config can be generated: at least one workflow has to
+    exist and a requested workflow has to be one of them. If no workflow was
+    requested, a warning names the first workflow as the one being used.
+
+    workflows (List[str]): Names of the available workflows.
+    workflow (Optional[str]): The name of the workflow to convert.
+    """
+    if not workflows:
+        no_workflows_err = (
+            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
+            f"define at least one list of commands."
+        )
+        msg.fail(no_workflows_err, exits=1)
+    is_unknown = workflow is not None and workflow not in workflows
+    if is_unknown:
+        available = ", ".join(workflows)
+        msg.fail(
+            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
+            f"Available workflows: {available}",
+            exits=1,
+        )
+    if not workflow:
+        fallback = workflows[0]
+        msg.warn(
+            f"No workflow specified for DVC pipeline. Using the first workflow "
+            f"defined in {PROJECT_FILE}: '{fallback}'"
+        )
+
+
+def ensure_dvc(project_dir: Path) -> None:
+    """Ensure that the "dvc" command is available and that the current project
+    directory is an initialized DVC project. Both checks report their failure
+    via msg.fail with exits=1.
+
+    project_dir (Path): The project directory expected to contain the DVC
+        directory.
+    """
+    try:
+        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
+    except Exception:
+        msg.fail(
+            "To use spaCy projects with DVC (Data Version Control), DVC needs "
+            "to be installed and the 'dvc' command needs to be available",
+            "You can install the Python package from pip (pip install dvc) or "
+            "conda (conda install -c conda-forge dvc). For more details, see the "
+            "documentation: https://dvc.org/doc/install",
+            exits=1,
+        )
+    # Use the module-level constant instead of repeating the directory name
+    if not (project_dir / DVC_DIR).exists():
+        msg.fail(
+            "Project not initialized as a DVC project",
+            "To initialize a DVC project, you can run 'dvc init' in the project "
+            "directory. For more details, see the documentation: "
+            "https://dvc.org/doc/command-reference/init",
+            exits=1,
+        )
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@ -0,0 +1,265 @@
+from typing import Optional, List, Dict, Sequence, Any
+from pathlib import Path
+from wasabi import msg
+import sys
+import srsly
+
+from ...util import working_dir, run_command, split_command, is_cwd, join_command
+from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
+from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
+
+
+@project_cli.command("run")
+def project_run_cli(
+    # fmt: off
+    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
+    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
+    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
+    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
+    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
+    # fmt: on
+):
+    """Run a named command or workflow defined in the project.yml. If a workflow
+    name is specified, all commands in the workflow are run, in order. If
+    commands define dependencies and/or outputs, they will only be re-run if
+    state has changed.
+    """
+    # Only execute if a subcommand was given and no help was requested
+    wants_run = bool(subcommand) and not show_help
+    if wants_run:
+        project_run(project_dir, subcommand, force=force, dry=dry)
+        return
+    print_run_help(project_dir, subcommand)
+
+
+def project_run(
+    project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False
+) -> None:
+    """Run a named command or workflow defined in the project.yml. For a
+    workflow, each of its commands is run in order via a recursive call. For a
+    single command, its dependencies are checked first and the script is only
+    executed if check_rerun reports a change or force is set. After a real
+    (non-dry) run, the command's entry in the lockfile is updated.
+
+    project_dir (Path): Path to project directory.
+    subcommand (str): Name of command or workflow to run.
+    force (bool): Force re-running, even if nothing changed.
+    dry (bool): Perform a dry run and don't execute commands.
+    """
+    config = load_project_config(project_dir)
+    variables = config.get("variables", {})
+    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+    workflows = config.get("workflows", {})
+    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
+    if subcommand in workflows:
+        msg.info(f"Running workflow '{subcommand}'")
+        for cmd in workflows[subcommand]:
+            project_run(project_dir, cmd, force=force, dry=dry)
+    else:
+        cmd = commands[subcommand]
+        for dep in cmd.get("deps", []):
+            dep = dep.format(**variables)
+            if not (project_dir / dep).exists():
+                err = f"Missing dependency specified by command '{subcommand}': {dep}"
+                # In a dry run, a missing dependency is reported but not fatal
+                err_kwargs = {"exits": 1} if not dry else {}
+                msg.fail(err, **err_kwargs)
+        with working_dir(project_dir) as current_dir:
+            rerun = check_rerun(current_dir, cmd, variables)
+            if not rerun and not force:
+                msg.info(f"Skipping '{cmd['name']}': nothing changed")
+            else:
+                msg.divider(subcommand)
+                run_commands(cmd["script"], variables, dry=dry)
+                # A dry run executes nothing, so there's no new state to lock
+                if not dry:
+                    update_lockfile(current_dir, cmd, variables)
+
+
+def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
+    """Print a help text assembled from the project.yml, mimicking the help
+    output of a CLI.
+
+    project_dir (Path): The project directory.
+    subcommand (Optional[str]): The subcommand or None. With a subcommand, its
+        usage line plus either the command's help text or the steps of the
+        workflow are printed. Without one, the available commands and
+        workflows are listed.
+    """
+    config = load_project_config(project_dir)
+    command_list = config.get("commands", [])
+    commands = {entry["name"]: entry for entry in command_list}
+    workflows = config.get("workflows", {})
+    project_loc = "" if is_cwd(project_dir) else project_dir
+    if not subcommand:
+        # Top-level help: list everything that can be run
+        print("")
+        if command_list:
+            print(f"Available commands in {PROJECT_FILE}")
+            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
+            command_rows = [
+                (entry["name"], entry.get("help", "")) for entry in command_list
+            ]
+            msg.table(command_rows)
+        if workflows:
+            print(f"Available workflows in {PROJECT_FILE}")
+            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
+            workflow_rows = [
+                (name, " -> ".join(steps)) for name, steps in workflows.items()
+            ]
+            msg.table(workflow_rows)
+        return
+    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
+    print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
+    if subcommand in commands:
+        help_text = commands[subcommand].get("help")
+        if help_text:
+            print(f"\n{help_text}\n")
+        return
+    if subcommand in workflows:
+        steps = workflows[subcommand]
+        print(f"\nWorkflow consisting of {len(steps)} commands:")
+        steps_data = []
+        for position, step in enumerate(steps, start=1):
+            step_help = commands[step].get("help", "")
+            steps_data.append((f"{position}. {step}", step_help))
+        msg.table(steps_data)
+        help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
+        print(f"For command details, run: {help_cmd}")
+
+
+def run_commands(
+    commands: List[str] = tuple(),
+    variables: Dict[str, Any] = {},
+    silent: bool = False,
+    dry: bool = False,
+) -> None:
+    """Execute commands one after the other, each in a subprocess.
+
+    commands (List[str]): The string commands.
+    variables (Dict[str, Any]): Dictionary of variable names, mapped to their
+        values. Will be used to substitute format string variables in the
+        commands.
+    silent (bool): Don't print the commands.
+    dry (bool): Perform a dry run and don't execute anything.
+    """
+    for raw_command in commands:
+        # Substitute variables, e.g. "./{NAME}.json"
+        parts = split_command(raw_command.format(**variables))
+        executable = parts[0] if parts else None
+        # Commands referencing "python" or "pip" are redirected to the
+        # interpreter that's currently running, so they use the same Python and
+        # the pip of the same environment. This also evens out differences
+        # between setups, e.g. one user writing "python3" and another user
+        # not having that shortcut available.
+        if executable in ("python", "python3"):
+            parts[0] = sys.executable
+        elif executable in ("pip", "pip3"):
+            parts = [sys.executable, "-m", "pip", *parts[1:]]
+        if not silent:
+            print(f"Running command: {join_command(parts)}")
+        if dry:
+            continue
+        run_command(parts)
+
+
+def validate_subcommand(
+    commands: Sequence[str], workflows: Sequence[str], subcommand: str
+) -> None:
+    """Make sure the subcommand names an available command or workflow.
+    Otherwise, an error listing the available names is reported via msg.fail
+    with exits=1.
+
+    commands (Sequence[str]): The available commands.
+    workflows (Sequence[str]): The available workflows.
+    subcommand (str): The subcommand.
+    """
+    if not (commands or workflows):
+        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
+    if subcommand in commands or subcommand in workflows:
+        return
+    # Unknown name: show what would have been valid
+    hints = []
+    if commands:
+        hints.append(f"Available commands: {', '.join(commands)}")
+    if workflows:
+        hints.append(f"Available workflows: {', '.join(workflows)}")
+    title = f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}"
+    msg.fail(title, ". ".join(hints), exits=1)
+
+
+def check_rerun(
+    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
+) -> bool:
+    """Decide whether a command has to be executed again, based on the
+    lockfile in the project directory.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    variables (Dict[str, Any]): The variables defined in the project.yml.
+    RETURNS (bool): Whether to re-run the command.
+    """
+    lock_path = project_dir / PROJECT_LOCK
+    # No lockfile means there's nothing to compare against
+    if not lock_path.exists():
+        return True
+    lock_data = srsly.read_yaml(lock_path)
+    # Same if the lockfile doesn't know the command yet
+    if command["name"] not in lock_data:
+        return True
+    locked_entry = lock_data[command["name"]]
+    # A command without recorded outputs is always run, since it could never
+    # be detected as changed and would be skipped every time
+    if not locked_entry.get("outs", []):
+        return True
+    # Compare the recorded entry to the entry the current state would produce.
+    # Identical hashes mean inputs/outputs, file hashes and scripts are
+    # unchanged, so the command doesn't need to run again
+    current_entry = get_lock_entry(project_dir, command, variables)
+    return get_hash(current_entry) != get_hash(locked_entry)
+
+
+def update_lockfile(
+    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
+) -> None:
+    """Record the state of a command in the lockfile after it was run. A
+    missing lockfile is created first. The entry stored under the command's
+    name covers its script and its dependencies/outputs.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    variables (Dict[str, Any]): The variables defined in the project.yml.
+    """
+    lock_path = project_dir / PROJECT_LOCK
+    if lock_path.exists():
+        lock_data = srsly.read_yaml(lock_path)
+    else:
+        # Start out with an empty lockfile on disk
+        srsly.write_yaml(lock_path, {})
+        lock_data = {}
+    new_entry = get_lock_entry(project_dir, command, variables)
+    lock_data[command["name"]] = new_entry
+    srsly.write_yaml(lock_path, lock_data)
+
+
+def get_lock_entry(
+    project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
+) -> Dict[str, Any]:
+    """Build the lockfile entry for a command. The entry holds the command,
+    its script (command steps) and the file info (path and checksum) of its
+    dependencies and outputs, with cached and non-cached outputs combined into
+    one list. The format follows the dvc.lock files to keep things consistent.
+
+    project_dir (Path): The current project directory.
+    command (Dict[str, Any]): The command, as defined in the project.yml.
+    variables (Dict[str, Any]): The variables defined in the project.yml.
+    RETURNS (Dict[str, Any]): The lockfile entry.
+    """
+    file_info = {
+        key: get_fileinfo(project_dir, command.get(key, []), variables)
+        for key in ("deps", "outputs", "outputs_no_cache")
+    }
+    entry = {}
+    entry["cmd"] = f"{COMMAND} run {command['name']}"
+    entry["script"] = command["script"]
+    entry["deps"] = file_info["deps"]
+    entry["outs"] = file_info["outputs"] + file_info["outputs_no_cache"]
+    return entry
+
+
+def get_fileinfo(
+    project_dir: Path, paths: List[str], variables: Dict[str, Any]
+) -> List[Dict[str, str]]:
+    """Collect path and checksum for each of the given paths (dependencies,
+    outputs). Variables are substituted in every path first. For a path that
+    doesn't exist in the project directory, the checksum is None.
+
+    project_dir (Path): The current project directory.
+    paths (List[str]): The file paths.
+    variables (Dict[str, Any]): The variables defined in the project.yml.
+    RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
+    """
+
+    def describe(raw_path: str) -> Dict[str, str]:
+        resolved = raw_path.format(**variables)
+        location = project_dir / resolved
+        checksum = get_checksum(location) if location.exists() else None
+        return {"path": resolved, "md5": checksum}
+
+    return [describe(raw_path) for raw_path in paths]
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@ -1,67 +1,49 @@
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
+from typing import Tuple
 from pathlib import Path
 import sys
 import requests
-import srsly
-from wasabi import msg
+from wasabi import msg, Printer

-from ..compat import path2str
-from ..util import get_data_path
+from ._util import app
 from .. import about
+from ..util import get_package_version, get_installed_models, get_base_version
+from ..util import get_package_path, get_model_meta, is_compatible_version


-def validate():
+@app.command("validate")
+def validate_cli():
     """
     Validate that the currently installed version of spaCy is compatible
     with the installed models. Should be run after `pip install -U spacy`.
     """
-    with msg.loading("Loading compatibility table..."):
-        r = requests.get(about.__compatibility__)
-        if r.status_code != 200:
-            msg.fail(
-                "Server error ({})".format(r.status_code),
-                "Couldn't fetch compatibility table.",
-                exits=1,
-            )
-    msg.good("Loaded compatibility table")
-    compat = r.json()["spacy"]
-    version = about.__version__
-    version = version.rsplit(".dev", 1)[0]
-    current_compat = compat.get(version)
+    # CLI entry point only: the actual checks are implemented in validate()
+    validate()
+
+
+def validate() -> None:
+    model_pkgs, compat = get_model_pkgs()
+    spacy_version = get_base_version(about.__version__)
+    current_compat = compat.get(spacy_version, {})
    if not current_compat:
-        msg.fail(
-            "Can't find spaCy v{} in compatibility table".format(version),
-            about.__compatibility__,
-            exits=1,
-        )
-    all_models = set()
-    for spacy_v, models in dict(compat).items():
-        all_models.update(models.keys())
-        for model, model_vs in models.items():
-            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
-    model_links = get_model_links(current_compat)
-    model_pkgs = get_model_pkgs(current_compat, all_models)
-    incompat_links = {l for l, d in model_links.items() if not d["compat"]}
+        msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
    incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
-    incompat_models.update(
-        [d["name"] for _, d in model_links.items() if not d["compat"]]
-    )
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

-    msg.divider("Installed models (spaCy v{})".format(about.__version__))
-    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
+    msg.divider(f"Installed models (spaCy v{about.__version__})")
+    msg.info(f"spaCy installation: {spacy_dir}")

-    if model_links or model_pkgs:
-        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
+    if model_pkgs:
+        header = ("NAME", "SPACY", "VERSION", "")
        rows = []
        for name, data in model_pkgs.items():
-            rows.append(get_model_row(current_compat, name, data, msg))
-        for name, data in model_links.items():
-            rows.append(get_model_row(current_compat, name, data, msg, "link"))
+            if data["compat"]:
+                comp = msg.text("", color="green", icon="good", no_print=True)
+                version = msg.text(data["version"], color="green", no_print=True)
+            else:
+                version = msg.text(data["version"], color="red", no_print=True)
+                comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
+            rows.append((data["name"], data["spacy"], version, comp))
        msg.table(rows, header=header)
    else:
        msg.text("No models found in your current environment.", exits=0)
@ -71,78 +53,55 @@ def validate():
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
-        msg.text(
-            "The following models are not available for spaCy "
-            "v{}: {}".format(about.__version__, ", ".join(na_models))
+        msg.info(
+            f"The following models are custom spaCy models or not "
+            f"available for spaCy v{about.__version__}:",
+            ", ".join(na_models),
        )
-    if incompat_links:
-        msg.text(
-            "You may also want to overwrite the incompatible links using the "
-            "`python -m spacy link` command with `--force`, or remove them "
-            "from the data directory. "
-            "Data path: {path}".format(path=path2str(get_data_path()))
-        )
-    if incompat_models or incompat_links:
+    if incompat_models:
        sys.exit(1)


-def get_model_links(compat):
-    links = {}
-    data_path = get_data_path()
-    if data_path:
-        models = [p for p in data_path.iterdir() if is_model_path(p)]
-        for model in models:
-            meta_path = Path(model) / "meta.json"
-            if not meta_path.exists():
-                continue
-            meta = srsly.read_json(meta_path)
-            link = model.parts[-1]
-            name = meta["lang"] + "_" + meta["name"]
-            links[link] = {
-                "name": name,
-                "version": meta["version"],
-                "compat": is_compat(compat, name, meta["version"]),
-            }
-    return links
-
-
-def get_model_pkgs(compat, all_models):
-    import pkg_resources
-
+def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
+    """Fetch the compatibility table and collect info about the installed
+    model packages. Exits via msg.fail if the table can't be fetched.
+
+    silent (bool): Don't print anything (used to configure the Printer).
+    RETURNS (Tuple[dict, dict]): The installed packages, keyed by package name
+        and mapped to a dict with "name", "version", "spacy" and "compat",
+        and the "spacy" section of the compatibility table with its model
+        versions reformatted.
+    """
+    msg = Printer(no_print=silent, pretty=not silent)
+    with msg.loading("Loading compatibility table..."):
+        r = requests.get(about.__compatibility__)
+        if r.status_code != 200:
+            msg.fail(
+                f"Server error ({r.status_code})",
+                "Couldn't fetch compatibility table.",
+                exits=1,
+            )
+    msg.good("Loaded compatibility table")
+    compat = r.json()["spacy"]
+    # NOTE(review): all_models is filled below but not read anywhere else in
+    # this function - confirm whether it's still needed
+    all_models = set()
+    installed_models = get_installed_models()
+    # Iterate over a copy of the top level, since the version lists are
+    # replaced in place
+    for spacy_v, models in dict(compat).items():
+        all_models.update(models.keys())
+        for model, model_vs in models.items():
+            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
     pkgs = {}
-    for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
+    for pkg_name in installed_models:
         package = pkg_name.replace("-", "_")
-        if package in all_models:
-            version = pkg_data.version
+        version = get_package_version(pkg_name)
+        # NOTE(review): the loop above treats the keys of compat as spaCy
+        # versions, so a package name presumably never matches here and the
+        # else branch is taken - confirm which table this should look up
+        if package in compat:
+            is_compat = version in compat[package]
+            spacy_version = about.__version__
+        else:
+            # Fall back to the spaCy version recorded in the model's meta
+            model_path = get_package_path(package)
+            model_meta = get_model_meta(model_path)
+            spacy_version = model_meta.get("spacy_version", "n/a")
+            is_compat = is_compatible_version(about.__version__, spacy_version)
         pkgs[pkg_name] = {
             "name": package,
             "version": version,
-                "compat": is_compat(compat, package, version),
+            "spacy": spacy_version,
+            "compat": is_compat,
         }
-    return pkgs
+    return pkgs, compat


-def get_model_row(compat, name, data, msg, model_type="package"):
-    if data["compat"]:
-        comp = msg.text("", color="green", icon="good", no_print=True)
-        version = msg.text(data["version"], color="green", no_print=True)
-    else:
-        version = msg.text(data["version"], color="red", no_print=True)
-        comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
-    return (model_type, name, data["name"], version, comp)
-
-
-def is_model_path(model_path):
-    exclude = ["cache", "pycache", "__pycache__"]
-    name = model_path.parts[-1]
-    return model_path.is_dir() and name not in exclude and not name.startswith(".")
-
-
-def is_compat(compat, name, version):
-    return name in compat and version in compat[name]
-
-
-def reformat_version(version):
+def reformat_version(version: str) -> str:
    """Hack to reformat old versions ending on '-alpha' to match pip format."""
    if version.endswith("-alpha"):
        return version.replace("-alpha", "a0")
--- a/spacy/compat.py
+++ b/spacy/compat.py
@ -1,4 +1,3 @@
-# coding: utf8
 """
 Helpers for Python and platform compatibility. To distinguish them from
 the builtin functions, replacement functions are suffixed with an underscore,
@ -6,15 +5,9 @@ e.g. `unicode_`.

 DOCS: https://spacy.io/api/top-level#compat
 """
-from __future__ import unicode_literals
-
-import os
 import sys
-import itertools
-import ast
-import types

-from thinc.neural.util import copy_array
+from thinc.util import copy_array

 try:
    import cPickle as pickle
@ -36,91 +29,23 @@ try:
 except ImportError:
    cupy = None

-try:
-    from thinc.neural.optimizers import Optimizer  # noqa: F401
-except ImportError:
-    from thinc.neural.optimizers import Adam as Optimizer  # noqa: F401
+from thinc.api import Optimizer  # noqa: F401

 pickle = pickle
 copy_reg = copy_reg
 CudaStream = CudaStream
 cupy = cupy
 copy_array = copy_array
-izip = getattr(itertools, "izip", zip)

 is_windows = sys.platform.startswith("win")
 is_linux = sys.platform.startswith("linux")
 is_osx = sys.platform == "darwin"

-# See: https://github.com/benjaminp/six/blob/master/six.py
-is_python2 = sys.version_info[0] == 2
-is_python3 = sys.version_info[0] == 3
-is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)

-if is_python2:
-    bytes_ = str
-    unicode_ = unicode  # noqa: F821
-    basestring_ = basestring  # noqa: F821
-    input_ = raw_input  # noqa: F821
-    path2str = lambda path: str(path).decode("utf8")
-    class_types = (type, types.ClassType)
-
-elif is_python3:
-    bytes_ = bytes
-    unicode_ = str
-    basestring_ = str
-    input_ = input
-    path2str = lambda path: str(path)
-    class_types = (type, types.ClassType) if is_python_pre_3_5 else type
-
-
-def b_to_str(b_str):
-    """Convert a bytes object to a string.
-
-    b_str (bytes): The object to convert.
-    RETURNS (unicode): The converted string.
-    """
-    if is_python2:
-        return b_str
-    # Important: if no encoding is set, string becomes "b'...'"
-    return str(b_str, encoding="utf8")
-
-
-def symlink_to(orig, dest):
-    """Create a symlink. Used for model shortcut links.
-
-    orig (unicode / Path): The origin path.
-    dest (unicode / Path): The destination path of the symlink.
-    """
-    if is_windows:
-        import subprocess
-
-        subprocess.check_call(
-            ["mklink", "/d", path2str(orig), path2str(dest)], shell=True
-        )
-    else:
-        orig.symlink_to(dest)
-
-
-def symlink_remove(link):
-    """Remove a symlink. Used for model shortcut links.
-
-    link (unicode / Path): The path to the symlink.
-    """
-    # https://stackoverflow.com/q/26554135/6400719
-    if os.path.isdir(path2str(link)) and is_windows:
-        # this should only be on Py2.7 and windows
-        os.rmdir(path2str(link))
-    else:
-        os.unlink(path2str(link))
-
-
-def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
+def is_config(windows=None, linux=None, osx=None, **kwargs):
    """Check if a specific configuration of Python version and operating system
    matches the user's setup. Mostly used to display targeted error messages.

-    python2 (bool): spaCy is executed with Python 2.x.
-    python3 (bool): spaCy is executed with Python 3.x.
    windows (bool): spaCy is executed on Windows.
    linux (bool): spaCy is executed on Linux.
    osx (bool): spaCy is executed on OS X or macOS.
@ -129,53 +54,7 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
    DOCS: https://spacy.io/api/top-level#compat.is_config
    """
    return (
-        python2 in (None, is_python2)
-        and python3 in (None, is_python3)
-        and windows in (None, is_windows)
+        windows in (None, is_windows)
        and linux in (None, is_linux)
        and osx in (None, is_osx)
    )
-
-
-def import_file(name, loc):
-    """Import module from a file. Used to load models from a directory.
-
-    name (unicode): Name of module to load.
-    loc (unicode / Path): Path to the file.
-    RETURNS: The loaded module.
-    """
-    loc = path2str(loc)
-    if is_python_pre_3_5:
-        import imp
-
-        return imp.load_source(name, loc)
-    else:
-        import importlib.util
-
-        spec = importlib.util.spec_from_file_location(name, str(loc))
-        module = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(module)
-        return module
-
-
-def unescape_unicode(string):
-    """Python2.7's re module chokes when compiling patterns that have ranges
-    between escaped unicode codepoints if the two codepoints are unrecognised
-    in the unicode database. For instance:
-
-        re.compile('[\\uAA77-\\uAA79]').findall("hello")
-
-    Ends up matching every character (on Python 2). This problem doesn't occur
-    if we're dealing with unicode literals.
-    """
-    if string is None:
-        return string
-    # We only want to unescape the unicode, so we first must protect the other
-    # backslashes.
-    string = string.replace("\\", "\\\\")
-    # Now we remove that protection for the unicode.
-    string = string.replace("\\\\u", "\\u")
-    string = string.replace("\\\\U", "\\U")
-    # Now we unescape by evaling the string with the AST. This can't execute
-    # code -- it only does the representational level.
-    return ast.literal_eval("u'''" + string + "'''")
--- a/spacy/displacy/init.py
+++ b/spacy/displacy/init.py
@ -1,17 +1,13 @@
-# coding: utf8
 """
 spaCy's built in visualization suite for dependencies and named entities.

 DOCS: https://spacy.io/api/top-level#displacy
 USAGE: https://spacy.io/usage/visualizers
 """
-from __future__ import unicode_literals
-
 import warnings

 from .render import DependencyRenderer, EntityRenderer
 from ..tokens import Doc, Span
-from ..compat import b_to_str
 from ..errors import Errors, Warnings
 from ..util import is_in_jupyter

@ -26,13 +22,13 @@ def render(
    """Render displaCy visualisation.

    docs (list or Doc): Document(s) to visualise.
-    style (unicode): Visualisation style, 'dep' or 'ent'.
+    style (str): Visualisation style, 'dep' or 'ent'.
    page (bool): Render markup as full HTML page.
    minify (bool): Minify HTML markup.
    jupyter (bool): Override Jupyter auto-detection.
    options (dict): Visualiser-specific options, e.g. colors.
    manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
-    RETURNS (unicode): Rendered HTML markup.
+    RETURNS (str): Rendered HTML markup.

    DOCS: https://spacy.io/api/top-level#displacy.render
    USAGE: https://spacy.io/usage/visualizers
@ -77,13 +73,13 @@ def serve(
    """Serve displaCy visualisation.

    docs (list or Doc): Document(s) to visualise.
-    style (unicode): Visualisation style, 'dep' or 'ent'.
+    style (str): Visualisation style, 'dep' or 'ent'.
    page (bool): Render markup as full HTML page.
    minify (bool): Minify HTML markup.
    options (dict): Visualiser-specific options, e.g. colors.
    manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
    port (int): Port to serve visualisation.
-    host (unicode): Host to serve visualisation.
+    host (str): Host to serve visualisation.

    DOCS: https://spacy.io/api/top-level#displacy.serve
    USAGE: https://spacy.io/usage/visualizers
@ -95,20 +91,20 @@ def serve(

    render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
    httpd = simple_server.make_server(host, port, app)
-    print("\nUsing the '{}' visualizer".format(style))
-    print("Serving on http://{}:{} ...\n".format(host, port))
+    print(f"\nUsing the '{style}' visualizer")
+    print(f"Serving on http://{host}:{port} ...\n")
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
-        print("Shutting down server on port {}.".format(port))
+        print(f"Shutting down server on port {port}.")
    finally:
        httpd.server_close()


 def app(environ, start_response):
    # Headers and status need to be bytes in Python 2, see #1227
-    headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))]
-    start_response(b_to_str(b"200 OK"), headers)
+    headers = [("Content-type", "text/html; charset=utf-8")]
+    start_response("200 OK", headers)
    res = _html["parsed"].encode(encoding="utf-8")
    return [res]

--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import uuid

 from .templates import (
@ -19,7 +16,7 @@ DEFAULT_LANG = "en"
 DEFAULT_DIR = "ltr"


-class DependencyRenderer(object):
+class DependencyRenderer:
    """Render dependency parses as SVGs."""

    style = "dep"
@ -50,7 +47,7 @@ class DependencyRenderer(object):
        parsed (list): Dependency parses to render.
        page (bool): Render parses wrapped as full HTML page.
        minify (bool): Minify HTML markup.
-        RETURNS (unicode): Rendered SVG or HTML markup.
+        RETURNS (str): Rendered SVG or HTML markup.
        """
        # Create a random ID prefix to make sure parses don't receive the
        # same ID, even if they're identical
@ -61,7 +58,7 @@ class DependencyRenderer(object):
                settings = p.get("settings", {})
                self.direction = settings.get("direction", DEFAULT_DIR)
                self.lang = settings.get("lang", DEFAULT_LANG)
-            render_id = "{}-{}".format(id_prefix, i)
+            render_id = f"{id_prefix}-{i}"
            svg = self.render_svg(render_id, p["words"], p["arcs"])
            rendered.append(svg)
        if page:
@ -81,7 +78,7 @@ class DependencyRenderer(object):
        render_id (int): Unique ID, typically index of document.
        words (list): Individual words and their tags.
        arcs (list): Individual arcs and their start, end, direction and label.
-        RETURNS (unicode): Rendered SVG markup.
+        RETURNS (str): Rendered SVG markup.
        """
        self.levels = self.get_levels(arcs)
        self.highest_level = len(self.levels)
@ -115,10 +112,10 @@ class DependencyRenderer(object):
    ):
        """Render individual word.

-        text (unicode): Word text.
-        tag (unicode): Part-of-speech tag.
+        text (str): Word text.
+        tag (str): Part-of-speech tag.
        i (int): Unique ID, typically word index.
-        RETURNS (unicode): Rendered SVG markup.
+        RETURNS (str): Rendered SVG markup.
        """
        y = self.offset_y + self.word_spacing
        x = self.offset_x + i * self.distance
@ -134,12 +131,12 @@ class DependencyRenderer(object):
    def render_arrow(self, label, start, end, direction, i):
        """Render individual arrow.

-        label (unicode): Dependency label.
+        label (str): Dependency label.
        start (int): Index of start word.
        end (int): Index of end word.
-        direction (unicode): Arrow direction, 'left' or 'right'.
+        direction (str): Arrow direction, 'left' or 'right'.
        i (int): Unique ID, typically arrow index.
-        RETURNS (unicode): Rendered SVG markup.
+        RETURNS (str): Rendered SVG markup.
        """
        if start < 0 or end < 0:
            error_args = dict(start=start, end=end, label=label, dir=direction)
@ -182,7 +179,7 @@ class DependencyRenderer(object):
        y (int): Y-coordinate of arrow start and end point.
        y_curve (int): Y-corrdinate of Cubic Bézier y_curve point.
        x_end (int): X-coordinate of arrow end point.
-        RETURNS (unicode): Definition of the arc path ('d' attribute).
+        RETURNS (str): Definition of the arc path ('d' attribute).
        """
        template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
        if self.compact:
@ -192,11 +189,11 @@ class DependencyRenderer(object):
    def get_arrowhead(self, direction, x, y, end):
        """Render individual arrow head.

-        direction (unicode): Arrow direction, 'left' or 'right'.
+        direction (str): Arrow direction, 'left' or 'right'.
        x (int): X-coordinate of arrow start point.
        y (int): Y-coordinate of arrow start and end point.
        end (int): X-coordinate of arrow end point.
-        RETURNS (unicode): Definition of the arrow head path ('d' attribute).
+        RETURNS (str): Definition of the arrow head path ('d' attribute).
        """
        if direction == "left":
            pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
@ -227,7 +224,7 @@ class DependencyRenderer(object):
        return sorted(list(levels))


-class EntityRenderer(object):
+class EntityRenderer:
    """Render named entities as HTML."""

    style = "ent"
@ -282,7 +279,7 @@ class EntityRenderer(object):
        parsed (list): Dependency parses to render.
        page (bool): Render parses wrapped as full HTML page.
        minify (bool): Minify HTML markup.
-        RETURNS (unicode): Rendered HTML markup.
+        RETURNS (str): Rendered HTML markup.
        """
        rendered = []
        for i, p in enumerate(parsed):
@ -303,9 +300,9 @@ class EntityRenderer(object):
    def render_ents(self, text, spans, title):
        """Render entities in text.

-        text (unicode): Original text.
+        text (str): Original text.
        spans (list): Individual entity spans and their start, end and label.
-        title (unicode or None): Document title set in Doc.user_data['title'].
+        title (str / None): Document title set in Doc.user_data['title'].
        """
        markup = ""
        offset = 0
--- a/spacy/displacy/templates.py
+++ b/spacy/displacy/templates.py
@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 # Setting explicit height and max-width: none on the SVG is required for
 # Jupyter to render it properly in a cell

--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 def add_codes(err_cls):
    """Add error codes to string messages via class attribute names."""

@ -19,17 +15,7 @@ def add_codes(err_cls):
 # fmt: off

@add_codes
-class Warnings(object):
-    W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
-            "You can now call spacy.load with the path as its first argument, "
-            "and the model's meta.json will be used to determine the language "
-            "to load. For example:\nnlp = spacy.load('{path}')")
-    W002 = ("Tokenizer.from_list is now deprecated. Create a new Doc object "
-            "instead and pass in the strings as the `words` keyword argument, "
-            "for example:\nfrom spacy.tokens import Doc\n"
-            "doc = Doc(nlp.vocab, words=[...])")
-    W003 = ("Positional arguments to Doc.merge are deprecated. Instead, use "
-            "the keyword arguments, for example tag=, lemma= or ent_type=.")
+class Warnings:
    W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing "
            "using ftfy.fix_text if necessary.")
    W005 = ("Doc object not parsed. This means displaCy won't be able to "
@ -49,12 +35,6 @@ class Warnings(object):
            "use context-sensitive tensors. You can always add your own word "
            "vectors, or use one of the larger models instead if available.")
    W008 = ("Evaluating {obj}.similarity based on empty vectors.")
-    W009 = ("Custom factory '{name}' provided by entry points of another "
-            "package overwrites built-in factory.")
-    W010 = ("As of v2.1.0, the PhraseMatcher doesn't have a phrase length "
-            "limit anymore, so the max_length argument is now deprecated. "
-            "If you did not specify this parameter, make sure you call the "
-            "constructor with named arguments instead of positional ones.")
    W011 = ("It looks like you're calling displacy.serve from within a "
            "Jupyter notebook or a similar environment. This likely means "
            "you're already running a local web server, so there's no need to "
@ -68,23 +48,9 @@ class Warnings(object):
            "components are applied. To only create tokenized Doc objects, "
            "try using `nlp.make_doc(text)` or process all texts as a stream "
            "using `list(nlp.tokenizer.pipe(all_texts))`.")
-    W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
-            "efficient and less error-prone Doc.retokenize context manager "
-            "instead.")
-    W014 = ("As of v2.1.0, the `disable` keyword argument on the serialization "
-            "methods is and should be replaced with `exclude`. This makes it "
-            "consistent with the other serializable objects.")
-    W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from "
-            "being serialized or deserialized is deprecated. Please use the "
-            "`exclude` argument instead. For example: exclude=['{arg}'].")
-    W016 = ("The keyword argument `n_threads` is now deprecated. As of v2.2.2, "
-            "the argument `n_process` controls parallel inference via "
-            "multiprocessing.")
    W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
    W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
            "ignoring the duplicate entry.")
-    W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
-            "previously loaded vectors. See Issue #3853.")
    W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
            "loaded. (Shape: {shape})")
    W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
@ -93,10 +59,8 @@ class Warnings(object):
            "lemmatization rules or data. This means that the trained model "
            "may not be able to lemmatize correctly. If this is intentional "
            "or the language you're using doesn't have lemmatization data, "
-            "please ignore this warning. If this is surprising, make sure you "
+            "you can ignore this warning. If this is surprising, make sure you "
            "have the spacy-lookups-data package installed.")
-    W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
-            "'n_process' will be set to 1.")
    W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
            "the Knowledge Base.")
    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
@ -108,25 +72,11 @@ class Warnings(object):
    W028 = ("Doc.from_array was called with a vector of type '{type}', "
            "but is expecting one of type 'uint64' instead. This may result "
            "in problems with the vocab further on in the pipeline.")
-    W029 = ("Unable to align tokens with entities from character offsets. "
-            "Discarding entity annotation for the text: {text}.")
    W030 = ("Some entities could not be aligned in the text \"{text}\" with "
            "entities \"{entities}\". Use "
            "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
-            " to check the alignment. Misaligned entities (with BILUO tag '-') "
-            "will be ignored during training.")
-    W031 = ("Model '{model}' ({model_version}) requires spaCy {version} and "
-            "is incompatible with the current spaCy version ({current}). This "
-            "may lead to unexpected results or runtime errors. To resolve "
-            "this, download a newer compatible model or retrain your custom "
-            "model with the current spaCy version. For more details and "
-            "available updates, run: python -m spacy validate")
-    W032 = ("Unable to determine model compatibility for model '{model}' "
-            "({model_version}) with the current spaCy version ({current}). "
-            "This may lead to unexpected results or runtime errors. To resolve "
-            "this, download a newer compatible model or retrain your custom "
-            "model with the current spaCy version. For more details and "
-            "available updates, run: python -m spacy validate")
+            " to check the alignment. Misaligned entities ('-') will be "
+            "ignored during training.")
    W033 = ("Training a new {model} using a model with no lexeme normalization "
            "table. This may degrade the performance of the model to some "
            "degree. If this is intentional or the language you're using "
@ -135,9 +85,44 @@ class Warnings(object):
            "package installed. The languages with lexeme normalization tables "
            "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")

+    # TODO: fix numbering after merging develop into master
+    W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
+    W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
+    W093 = ("Could not find any data to train the {name} on. Is your "
+            "input data correctly formatted ?")
+    W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
+            "spaCy version requirement: {version}. This can lead to compatibility "
+            "problems with older versions, or as new spaCy versions are "
+            "released, because the model may say it's compatible when it's "
+            'not. Consider changing the "spacy_version" in your meta.json to a '
+            "version range, with a lower and upper pin. For example: {example}")
+    W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is "
+            "incompatible with the current version ({current}). This may lead "
+            "to unexpected results or runtime errors. To resolve this, "
+            "download a newer compatible model or retrain your custom model "
+            "with the current spaCy version. For more details and available "
+            "updates, run: python -m spacy validate")
+    W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' "
+            "instead.")
+    W097 = ("No Model config was provided to create the '{name}' component, "
+            "and no default configuration could be found either.")
+    W098 = ("No Model config was provided to create the '{name}' component, "
+            "so a default configuration was used.")
+    W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
+            "but got '{type}' instead, so ignoring it.")
+    W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
+            "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
+            "string \"Field1=Value1,Value2|Field2=Value3\".")
+    W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
+    W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
+    W103 = ("Unknown {lang} word segmenter '{segmenter}'. Supported "
+            "word segmenters: {supported}. Defaulting to {default}.")
+    W104 = ("Skipping modifications for '{target}' segmenter. The current "
+            "segmenter is '{current}'.")
+

@add_codes
-class Errors(object):
+class Errors:
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
    E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
            "calls `nlp.create_pipe` with a component name that's not built "
@ -156,21 +141,16 @@ class Errors(object):
    E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
    E008 = ("Some current components would be lost when restoring previous "
            "pipeline state. If you added components after calling "
-            "`nlp.disable_pipes()`, you should remove them explicitly with "
+            "`nlp.select_pipes()`, you should remove them explicitly with "
            "`nlp.remove_pipe()` before the pipeline is restored. Names of "
            "the new components: {names}")
-    E009 = ("The `update` method expects same number of docs and golds, but "
-            "got: {n_docs} docs, {n_golds} golds.")
    E010 = ("Word vectors set to length 0. This may be because you don't have "
            "a model installed or loaded, or because your model doesn't "
            "include word vectors. For more info, see the docs:\n"
            "https://spacy.io/usage/models")
    E011 = ("Unknown operator: '{op}'. Options: {opts}")
    E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
-    E013 = ("Error selecting action in matcher")
    E014 = ("Unknown tag ID: {tag}")
-    E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
-            "`force=True` to overwrite.")
    E016 = ("MultitaskObjective target should be function or one of: dep, "
            "tag, ent, dep_tag_offset, ent_tag.")
    E017 = ("Can only add unicode or bytes. Got type: {value_type}")
@ -178,21 +158,8 @@ class Errors(object):
            "refers to an issue with the `Vocab` or `StringStore`.")
    E019 = ("Can't create transition with unknown action ID: {action}. Action "
            "IDs are enumerated in spacy/syntax/{src}.pyx.")
-    E020 = ("Could not find a gold-standard action to supervise the "
-            "dependency parser. The tree is non-projective (i.e. it has "
-            "crossing arcs - see spacy/syntax/nonproj.pyx for definitions). "
-            "The ArcEager transition system only supports projective trees. "
-            "To learn non-projective representations, transform the data "
-            "before training and after parsing. Either pass "
-            "`make_projective=True` to the GoldParse class, or use "
-            "spacy.syntax.nonproj.preprocess_training_data.")
-    E021 = ("Could not find a gold-standard action to supervise the "
-            "dependency parser. The GoldParse was projective. The transition "
-            "system has {n_actions} actions. State at failure: {state}")
    E022 = ("Could not find a transition with the name '{name}' in the NER "
            "model.")
-    E023 = ("Error cleaning up beam: The same state occurred twice at "
-            "memory address {addr} and position {i}.")
    E024 = ("Could not find an optimal move to supervise the parser. Usually, "
            "this means that the model can't be updated in a way that's valid "
            "and satisfies the correct annotations specified in the GoldParse. "
@ -217,7 +184,7 @@ class Errors(object):
            "the documentation:\nhttps://spacy.io/usage/models")
    E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
            "component to the pipeline with: "
-            "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
+            "nlp.add_pipe(nlp.create_pipe('sentencizer')). "
            "Alternatively, add the dependency parser, or set sentence "
            "boundaries by setting doc[i].is_sent_start.")
    E031 = ("Invalid token: empty string ('') at position {i}.")
@ -227,16 +194,12 @@ class Errors(object):
            "the HEAD attribute would potentially override the sentence "
            "boundaries set by SENT_START.")
    E033 = ("Cannot load into non-empty Doc of length {length}.")
-    E034 = ("Doc.merge received {n_args} non-keyword arguments. Expected "
-            "either 3 arguments (deprecated), or 0 (use keyword arguments).\n"
-            "Arguments supplied:\n{args}\nKeyword arguments:{kwargs}")
    E035 = ("Error creating span with start {start} and end {end} for Doc of "
            "length {length}.")
    E036 = ("Error calculating span: Can't find a token starting at character "
            "offset {start}.")
    E037 = ("Error calculating span: Can't find a token ending at character "
            "offset {end}.")
-    E038 = ("Error finding sentence for span. Infinite loop detected.")
    E039 = ("Array bounds exceeded while searching for root word. This likely "
            "means the parse tree is in an invalid state. Please report this "
            "issue here: http://github.com/explosion/spaCy/issues")
@ -253,15 +216,10 @@ class Errors(object):
    E047 = ("Can't assign a value to unregistered extension attribute "
            "'{name}'. Did you forget to call the `set_extension` method?")
    E048 = ("Can't import language {lang} from spacy.lang: {err}")
-    E049 = ("Can't find spaCy data directory: '{path}'. Check your "
-            "installation and permissions, or use spacy.util.set_data_path "
-            "to customise the location if necessary.")
-    E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut "
-            "link, a Python package or a valid path to a data directory.")
-    E051 = ("Cant' load '{name}'. If you're using a shortcut link, make sure "
-            "it points to a valid package (not just a data directory).")
+    E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
+            "package or a valid path to a data directory.")
    E052 = ("Can't find model directory: {path}")
-    E053 = ("Could not read meta.json from {path}")
+    E053 = ("Could not read {name} from {path}")
    E054 = ("No valid '{setting}' setting found in model meta.json.")
    E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}")
    E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
@ -272,8 +230,6 @@ class Errors(object):
    E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
    E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
            "({rows}, {cols}).")
-    E061 = ("Bad file name: {filename}. Example of a valid file name: "
-            "'vectors.128.f.bin'")
    E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 "
            "and 63 are occupied. You can replace one by specifying the "
            "`flag_id` explicitly, e.g. "
@ -287,39 +243,17 @@ class Errors(object):
            "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
    E065 = ("Only one of the vector table's width and shape can be specified. "
            "Got width {width} and shape {shape}.")
-    E066 = ("Error creating model helper for extracting columns. Can only "
-            "extract columns by positive integer. Got: {value}.")
    E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
            "an entity) without a preceding 'B' (beginning of an entity). "
            "Tag sequence:\n{tags}")
    E068 = ("Invalid BILUO tag: '{tag}'.")
-    E069 = ("Invalid gold-standard parse tree. Found cycle between word "
-            "IDs: {cycle} (tokens: {cycle_tokens}) in the document starting "
-            "with tokens: {doc_tokens}.")
-    E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) "
-            "does not align with number of annotations ({n_annots}).")
    E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
            "match the one in the vocab ({vocab_orth}).")
-    E072 = ("Error serializing lexeme: expected data length {length}, "
-            "got {bad_length}.")
    E073 = ("Cannot assign vector of length {new_length}. Existing vectors "
            "are of length {length}. You can use `vocab.reset_vectors` to "
            "clear the existing vectors and resize the table.")
    E074 = ("Error interpreting compiled match pattern: patterns are expected "
            "to end with the attribute {attr}. Got: {bad_attr}.")
-    E075 = ("Error accepting match: length ({length}) > maximum length "
-            "({max_len}).")
-    E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc "
-            "has {words} words.")
-    E077 = ("Error computing {value}: number of Docs ({n_docs}) does not "
-            "equal number of GoldParse objects ({n_golds}) in batch.")
-    E078 = ("Error computing score: number of words in Doc ({words_doc}) does "
-            "not equal number of words in GoldParse ({words_gold}).")
-    E079 = ("Error computing states in beam: number of predicted beams "
-            "({pbeams}) does not equal number of gold beams ({gbeams}).")
-    E080 = ("Duplicate state found in beam: {key}.")
-    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
-            "does not equal number of losses ({losses}).")
    E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
            "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
            "match.")
@ -327,8 +261,6 @@ class Errors(object):
            "`getter` (plus optional `setter`) is allowed. Got: {nr_defined}")
    E084 = ("Error assigning label ID {label} to span: not in StringStore.")
    E085 = ("Can't create lexeme for string '{string}'.")
-    E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does "
-            "not match hash {hash_id} in StringStore.")
    E087 = ("Unknown displaCy style: {style}.")
    E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
            "v2.x parser and NER models require roughly 1GB of temporary "
@ -370,17 +302,11 @@ class Errors(object):
    E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
            "token can only be part of one entity, so make sure the entities "
            "you're setting don't overlap.")
-    E104 = ("Can't find JSON schema for '{name}'.")
-    E105 = ("The Doc.print_tree() method is now deprecated. Please use "
-            "Doc.to_json() instead or write your own function.")
    E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
            "settings: {opts}")
    E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
-    E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
-            "in favor of the pipe name `sentencizer`, which does the same "
-            "thing. For example, use `nlp.create_pipeline('sentencizer')`")
-    E109 = ("Model for component '{name}' not initialized. Did you forget to "
-            "load a model, or forget to call begin_training()?")
+    E109 = ("Component '{name}' could not be run. Did you forget to "
+            "call begin_training()?")
    E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
    E111 = ("Pickling a token is not supported, because tokens are only views "
            "of the parent Doc and can't exist on their own. A pickled token "
@ -393,8 +319,6 @@ class Errors(object):
            "practically no advantage over pickling the parent Doc directly. "
            "So instead of pickling the span, pickle the Doc it belongs to or "
            "use Span.as_doc to convert the span to a standalone Doc object.")
-    E113 = ("The newly split token can only have one root (head = 0).")
-    E114 = ("The newly split token needs to have a root (head = 0).")
    E115 = ("All subtokens must have associated heads.")
    E116 = ("Cannot currently add labels to pretrained text classifier. Add "
            "labels before training begins. This functionality was available "
@ -417,16 +341,9 @@ class Errors(object):
            "equal to span length ({span_len}).")
    E122 = ("Cannot find token to be split. Did it get merged?")
    E123 = ("Cannot find head of token to be split. Did it get merged?")
-    E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
    E125 = ("Unexpected value: {value}")
    E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
            "This is likely a bug in spaCy, so feel free to open an issue.")
-    E127 = ("Cannot create phrase pattern representation for length 0. This "
-            "is likely a bug in spaCy.")
-    E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
-            "arguments to exclude fields from being serialized or deserialized "
-            "is now deprecated. Please use the `exclude` argument instead. "
-            "For example: exclude=['{arg}'].")
    E129 = ("Cannot write the label of an existing Span object because a Span "
            "is a read-only view of the underlying Token objects stored in the "
            "Doc. Instead, create a new Span object and specify the `label` "
@ -450,8 +367,6 @@ class Errors(object):
    E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
    E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
            "`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
-    E136 = ("This additional feature requires the jsonschema library to be "
-            "installed:\npip install jsonschema")
    E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
            "to provide a valid JSON object as input with either the `text` "
            "or `tokens` key. For more info, see the docs:\n"
@ -459,18 +374,13 @@ class Errors(object):
    E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
            "includes either the `text` or `tokens` key. For more info, see "
            "the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
-    E139 = ("Knowledge Base for component '{name}' not initialized. Did you "
-            "forget to call set_kb()?")
+    E139 = ("Knowledge Base for component '{name}' is empty.")
    E140 = ("The list of entities, prior probabilities and entity vectors "
            "should be of equal length.")
    E141 = ("Entity vectors should be of length {required} instead of the "
            "provided {found}.")
-    E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or "
-            "'cosine'.")
    E143 = ("Labels for component '{name}' not initialized. Did you forget to "
            "call add_label()?")
-    E144 = ("Could not find parameter `{param}` when building the entity "
-            "linker model.")
    E145 = ("Error reading `{param}` from input file.")
    E146 = ("Could not access `{path}`.")
    E147 = ("Unexpected error in the {method} functionality of the "
@ -482,8 +392,6 @@ class Errors(object):
            "the component matches the model being loaded.")
    E150 = ("The language of the `nlp` object and the `vocab` should be the "
            "same, but found '{nlp}' and '{vocab}' respectively.")
-    E151 = ("Trying to call nlp.update without required annotation types. "
-            "Expected top-level keys: {exp}. Got: {unexp}.")
    E152 = ("The attribute {attr} is not supported for token patterns. "
            "Please use the option validate=True with Matcher, PhraseMatcher, "
            "or EntityRuler for more details.")
@ -520,11 +428,6 @@ class Errors(object):
            "that case.")
    E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
            "Current DocBin: {current}\nOther DocBin: {other}")
-    E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can "
-            "happen if the tagger was trained with a different set of "
-            "morphological features. If you're using a pretrained model, make "
-            "sure that your models are up to date:\npython -m spacy validate")
-    E168 = ("Unknown field: {field}")
    E169 = ("Can't find module: {module}")
    E170 = ("Cannot apply transition {name}: invalid for the current state.")
    E171 = ("Matcher.add received invalid on_match callback argument: expected "
@ -532,11 +435,6 @@ class Errors(object):
    E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
            "Lemmatizer, initialize the class directly. See the docs for "
            "details: https://spacy.io/api/lemmatizer")
-    E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
-            "Lookups containing the lemmatization tables. See the docs for "
-            "details: https://spacy.io/api/lemmatizer#init")
-    E174 = ("Architecture '{name}' not found in registry. Available "
-            "names: {names}")
    E175 = ("Can't remove rule for unknown match pattern ID: {key}")
    E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
    E177 = ("Ill-formed IOB input detected: {tag}")
@ -564,9 +462,6 @@ class Errors(object):
            "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
    E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
    E187 = ("Only unicode strings are supported as labels.")
-    E188 = ("Could not match the gold entity links to entities in the doc - "
-            "make sure the gold EL data refers to valid results of the "
-            "named entity recognizer in the `nlp` pipeline.")
    E189 = ("Each argument to `get_doc` should be of equal length.")
    E190 = ("Token head out of range in `Doc.from_array()` for token index "
            "'{index}' with value '{value}' (equivalent to relative head "
@ -587,20 +482,74 @@ class Errors(object):
    E198 = ("Unable to return {n} most similar vectors for the current vectors "
            "table, which contains {n_rows} vectors.")
    E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
-    E200 = ("Specifying a base model with a pretrained component '{component}' "
-            "can not be combined with adding a pretrained Tok2Vec layer.")
+
+    # TODO: fix numbering after merging develop into master
+    E969 = ("Expected string values for field '{field}', but received {types} instead. ")
+    E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
+    E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
+            "array and {doc_length} for the Doc itself.")
+    E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
+    E973 = ("Unexpected type for NER data")
+    E974 = ("Unknown {obj} attribute: {key}")
+    E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
+            "but received None.")
+    E977 = ("Can not compare a MorphAnalysis with a string object. "
+            "This is likely a bug in spaCy, so feel free to open an issue.")
+    E978 = ("The '{method}' method of {name} takes a list of Example objects, "
+            "but found {types} instead.")
+    E979 = ("Cannot convert {type} to an Example object.")
+    E980 = ("Each link annotation should refer to a dictionary with at most one "
+            "identifier mapping to 1.0, and all others to 0.0.")
+    E981 = ("The offsets of the annotations for 'links' could not be aligned "
+            "to token boundaries.")
+    E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
+            "into {values}, but found {value}.")
+    E983 = ("Invalid key for '{dict}': {key}. Available keys: "
+            "{keys}")
+    E985 = ("The pipeline component '{component}' is already available in the base "
+            "model. The settings in the component block in the config file are "
+            "being ignored. If you want to replace this component instead, set "
+            "'replace' to True in the training configuration.")
+    E986 = ("Could not create any training batches: check your input. "
+            "Perhaps discard_oversize should be set to False ?")
+    E987 = ("The text of an example training instance is either a Doc or "
+            "a string, but found {type} instead.")
+    E988 = ("Could not parse any training examples. Ensure the data is "
+            "formatted correctly.")
+    E989 = ("'nlp.update()' was called with two positional arguments. This "
+            "may be due to a backwards-incompatible change to the format "
+            "of the training data in spaCy 3.0 onwards. The 'update' "
+            "function should now be called with a batch of 'Example' "
+            "objects, instead of (text, annotation) tuples. ")
+    E990 = ("An entity linking component needs to be initialized with a "
+            "KnowledgeBase object, but found {type} instead.")
+    E991 = ("The function 'select_pipes' should be called with either a "
+            "'disable' argument to list the names of the pipe components "
+            "that should be disabled, or with an 'enable' argument that "
+            "specifies which pipes should not be disabled.")
+    E992 = ("The function `select_pipes` was called with `enable`={enable} "
+            "and `disable`={disable} but that information is conflicting "
+            "for the `nlp` pipeline with components {names}.")
+    E993 = ("The config for 'nlp' should include either a key 'name' to "
+            "refer to an existing model by name or path, or a key 'lang' "
+            "to create a new blank model.")
+    E996 = ("Could not parse {file}: {msg}")
+    E997 = ("Tokenizer special cases are not allowed to modify the text. "
+            "This would map '{chunk}' to '{orth}' given token attributes "
+            "'{token_attrs}'.")
+    E999 = ("Unable to merge the `Doc` objects because they do not all share "
+            "the same `Vocab`.")
+    E1000 = ("No pkuseg model available. Provide a pkuseg model when "
+            "initializing the pipeline: "
+            '`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
+            'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')


@add_codes
-class TempErrors(object):
+class TempErrors:
    T003 = ("Resizing pretrained Tagger models is not currently supported.")
-    T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
    T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
            "issue tracker: http://github.com/explosion/spaCy/issues")
-    T008 = ("Bad configuration of Tagger. This is probably a bug within "
-            "spaCy. We changed the name of an internal attribute for loading "
-            "pretrained vectors, and the class has been passed the old name "
-            "(pretrained_dims) but not the new name (pretrained_vectors).")


 # fmt: on
@ -610,14 +559,14 @@ class MatchPatternError(ValueError):
    def __init__(self, key, errors):
        """Custom error for validating match patterns.

-        key (unicode): The name of the matcher rule.
+        key (str): The name of the matcher rule.
        errors (dict): Validation errors (sequence of strings) mapped to pattern
            ID, i.e. the index of the added pattern.
        """
-        msg = "Invalid token patterns for matcher rule '{}'\n".format(key)
+        msg = f"Invalid token patterns for matcher rule '{key}'\n"
        for pattern_idx, error_msgs in errors.items():
-            pattern_errors = "\n".join(["- {}".format(e) for e in error_msgs])
-            msg += "\nPattern {}:\n{}\n".format(pattern_idx, pattern_errors)
+            pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
+            msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
        ValueError.__init__(self, msg)


--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@ -1,12 +1,8 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 def explain(term):
    """Get a description for a given POS tag, dependency label or entity type.

-    term (unicode): The term to explain.
-    RETURNS (unicode): The explanation, or `None` if not found in the glossary.
+    term (str): The term to explain.
+    RETURNS (str): The explanation, or `None` if not found in the glossary.

    EXAMPLE:
        >>> spacy.explain(u'NORP')
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@ -1,41 +0,0 @@
-from cymem.cymem cimport Pool
-
-from .structs cimport TokenC
-from .typedefs cimport attr_t
-from .syntax.transition_system cimport Transition
-
-
-cdef struct GoldParseC:
-    int* tags
-    int* heads
-    int* has_dep
-    int* sent_start
-    attr_t* labels
-    int** brackets
-    Transition* ner
-
-
-cdef class GoldParse:
-    cdef Pool mem
-
-    cdef GoldParseC c
-
-    cdef int length
-    cdef public int loss
-    cdef public list words
-    cdef public list tags
-    cdef public list morphology
-    cdef public list heads
-    cdef public list labels
-    cdef public dict orths
-    cdef public list ner
-    cdef public list ents
-    cdef public dict brackets
-    cdef public object cats
-    cdef public dict links
-
-    cdef readonly list cand_to_gold
-    cdef readonly list gold_to_cand
-    cdef readonly list orig_annot
-
-
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
--- a/spacy/gold/init.pxd
+++ b/spacy/gold/init.pxd
--- a/spacy/gold/init.py
+++ b/spacy/gold/init.py
@ -0,0 +1,11 @@
+from .corpus import Corpus
+from .example import Example
+from .align import Alignment
+
+from .iob_utils import iob_to_biluo, biluo_to_iob
+from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
+from .iob_utils import spans_from_biluo_tags
+from .iob_utils import tags_to_entities
+
+from .gold_io import docs_to_json
+from .gold_io import read_json_file
--- a/spacy/gold/align.py
+++ b/spacy/gold/align.py
@ -0,0 +1,30 @@
+from typing import List
+import numpy
+from thinc.types import Ragged
+from dataclasses import dataclass
+import tokenizations
+
+
+@dataclass
+class Alignment:
+    """Pair of ragged index arrays relating two tokenizations.
+
+    Both fields are produced by `_make_ragged` from nested index lists.
+    NOTE(review): presumably `x2y[i]` holds the positions in the second
+    sequence aligned to item `i` of the first, and `y2x` the reverse -
+    confirm against `tokenizations.get_alignments`.
+    """
+
+    # Ragged array built from the first list of index lists.
+    x2y: Ragged
+    # Ragged array built from the second list of index lists.
+    y2x: Ragged
+
+    @classmethod
+    def from_indices(cls, x2y: List[List[int]], y2x: List[List[int]]) -> "Alignment":
+        """Create an alignment from two nested lists of indices.
+
+        x2y (List[List[int]]): One list of indices per item of the first sequence.
+        y2x (List[List[int]]): One list of indices per item of the second sequence.
+        RETURNS (Alignment): The alignment holding both lists as Ragged arrays.
+        """
+        # Instantiate via `cls` rather than the hard-coded class name so that
+        # subclasses get instances of their own type.
+        return cls(x2y=_make_ragged(x2y), y2x=_make_ragged(y2x))
+
+    @classmethod
+    def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
+        """Create an alignment by aligning two lists of token strings.
+
+        A (List[str]): The first tokenization.
+        B (List[str]): The second tokenization.
+        RETURNS (Alignment): The alignment computed by
+            `tokenizations.get_alignments(A, B)`.
+        """
+        x2y, y2x = tokenizations.get_alignments(A, B)
+        return cls.from_indices(x2y=x2y, y2x=y2x)
+
+
+def _make_ragged(indices):
+    """Pack a list of index lists into a single Ragged array.
+
+    indices (list): A sequence of sequences of ints.
+    RETURNS (Ragged): All indices concatenated into one int array, together
+        with an int array holding the length of each original sub-list.
+    """
+    row_lengths = numpy.array([len(row) for row in indices], dtype="i")
+    flattened = [index for row in indices for index in row]
+    return Ragged(numpy.array(flattened, dtype="i"), row_lengths)
--- a/spacy/gold/augment.py
+++ b/spacy/gold/augment.py
@ -0,0 +1,111 @@
+import random
+import itertools
+
+
+def make_orth_variants_example(nlp, example, orth_variant_level=0.0):  # TODO: naming
+    """Build a new example whose text and token annotation have been passed
+    through `make_orth_variants`.
+
+    nlp: Object providing `make_doc` and the orth variant defaults.
+    example: The example to derive the variant from; read via `text` and
+        `to_dict()`, rebuilt via `from_dict()`.
+    orth_variant_level (float): Forwarded unchanged to `make_orth_variants`.
+    RETURNS: The result of `example.from_dict` on the new doc and annotations.
+    """
+    text = example.text
+    annotations = example.to_dict()
+    new_text, new_token_annot = make_orth_variants(
+        nlp, text, annotations["token_annotation"], orth_variant_level
+    )
+    variant_doc = nlp.make_doc(new_text)
+    annotations["token_annotation"] = new_token_annot
+    return example.from_dict(variant_doc, annotations)
+
+
+def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
+    """Randomly replace tokens with one of their orthographic variants and
+    rebuild the raw text to match.
+
+    The variants are read from `nlp.Defaults.single_orth_variants` and
+    `nlp.Defaults.paired_orth_variants`; each entry is a dict with the keys
+    "tags" and "variants". The input annotations are never modified: all
+    changes are made on a copy, so the untouched originals can be returned
+    when the raw text cannot be re-aligned with the words.
+
+    nlp: Object exposing the orth variant settings on `nlp.Defaults`.
+    raw_text (str): The raw text, or None if no raw text is available.
+    orig_token_dict (dict): Token annotation; the keys "words" and "tags"
+        are used if present.
+    orth_variant_level (float): Variants are only applied if a random draw
+        from [0, 1) is below this value.
+    RETURNS (tuple): The (possibly modified) raw text and token annotation.
+    """
+    if random.random() >= orth_variant_level:
+        return raw_text, orig_token_dict
+    if not orig_token_dict:
+        return raw_text, orig_token_dict
+    raw = raw_text
+    # Copy the dict (and the word list below) so that the caller's data is
+    # left intact. Previously the input was modified in place, which meant
+    # the "original" annotations returned on the abort path had already been
+    # altered and no longer matched the returned raw text.
+    token_dict = dict(orig_token_dict)
+    lower = False
+    if random.random() >= 0.5:
+        lower = True
+        if raw is not None:
+            raw = raw.lower()
+    ndsv = nlp.Defaults.single_orth_variants
+    ndpv = nlp.Defaults.paired_orth_variants
+    words = list(token_dict.get("words", []))
+    tags = token_dict.get("tags", [])
+    # keep unmodified if words or tags are not defined
+    if words and tags:
+        if lower:
+            words = [w.lower() for w in words]
+        # single variants: one random replacement is drawn per variant set
+        punct_choices = [random.choice(x["variants"]) for x in ndsv]
+        for word_idx in range(len(words)):
+            for variant_set, choice in zip(ndsv, punct_choices):
+                if (
+                    tags[word_idx] in variant_set["tags"]
+                    and words[word_idx] in variant_set["variants"]
+                ):
+                    words[word_idx] = choice
+        # paired variants: one random pair is drawn per variant set
+        punct_choices = [random.choice(x["variants"]) for x in ndpv]
+        for word_idx in range(len(words)):
+            for variant_set, choice in zip(ndpv, punct_choices):
+                if tags[word_idx] in variant_set["tags"] and words[
+                    word_idx
+                ] in itertools.chain.from_iterable(variant_set["variants"]):
+                    # backup option: random left vs. right from pair
+                    pair_idx = random.choice([0, 1])
+                    # best option: rely on paired POS tags like `` / ''
+                    if len(variant_set["tags"]) == 2:
+                        pair_idx = variant_set["tags"].index(tags[word_idx])
+                    # next best option: rely on position in variants
+                    # (may not be unambiguous, so order of variants matters)
+                    else:
+                        for pair in variant_set["variants"]:
+                            if words[word_idx] in pair:
+                                pair_idx = pair.index(words[word_idx])
+                    words[word_idx] = choice[pair_idx]
+        token_dict["words"] = words
+        token_dict["tags"] = tags
+    # modify raw
+    if raw is not None:
+        variants = []
+        for single_variants in ndsv:
+            variants.extend(single_variants["variants"])
+        for paired_variants in ndpv:
+            variants.extend(
+                list(itertools.chain.from_iterable(paired_variants["variants"]))
+            )
+        # store variants in reverse length order to be able to prioritize
+        # longer matches (e.g., "---" before "--")
+        variants = sorted(variants, key=len)
+        variants.reverse()
+        variant_raw = ""
+        raw_idx = 0
+        # add initial whitespace
+        while raw_idx < len(raw) and raw[raw_idx].isspace():
+            variant_raw += raw[raw_idx]
+            raw_idx += 1
+        for word in words:
+            match_found = False
+            # skip whitespace words
+            if word.isspace():
+                match_found = True
+            # add identical word
+            elif word not in variants and raw.startswith(word, raw_idx):
+                variant_raw += word
+                raw_idx += len(word)
+                match_found = True
+            # add variant word: consume the first (longest) variant found at
+            # the current position and emit the chosen word in its place
+            else:
+                for variant in variants:
+                    if raw.startswith(variant, raw_idx):
+                        raw_idx += len(variant)
+                        variant_raw += word
+                        match_found = True
+                        break
+            # something went wrong, abort
+            # (add a warning message?)
+            if not match_found:
+                return raw_text, orig_token_dict
+            # add following whitespace
+            while raw_idx < len(raw) and raw[raw_idx].isspace():
+                variant_raw += raw[raw_idx]
+                raw_idx += 1
+        raw = variant_raw
+    return raw, token_dict
--- a/spacy/gold/converters/init.py
+++ b/spacy/gold/converters/init.py
@ -0,0 +1,4 @@
+from .iob2docs import iob2docs  # noqa: F401
+from .conll_ner2docs import conll_ner2docs  # noqa: F401
+from .json2docs import json2docs
+from .conllu2docs import conllu2docs  # noqa: F401
--- a/spacy/gold/converters/conll_ner2docs.py
+++ b/spacy/gold/converters/conll_ner2docs.py
@ -1,20 +1,18 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from wasabi import Printer

+from .. import tags_to_entities
 from ...gold import iob_to_biluo
 from ...lang.xx import MultiLanguage
-from ...tokens.doc import Doc
+from ...tokens import Doc, Span
 from ...util import load_model


-def conll_ner2json(
+def conll_ner2docs(
    input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
    """
    Convert files in the CoNLL-2003 NER format and similar
-    whitespace-separated columns into JSON format for use with train cli.
+    whitespace-separated columns into Doc objects.

    The first column is the tokens, the final column is the IOB tags. If an
    additional second column is present, the second column is the tags.
@ -64,9 +62,9 @@ def conll_ner2json(
        # sentence segmentation required for document segmentation
        if n_sents > 0 and not seg_sents:
            msg.warn(
-                "No sentence boundaries found to use with option `-n {}`. "
-                "Use `-s` to automatically segment sentences or `-n 0` "
-                "to disable.".format(n_sents)
+                f"No sentence boundaries found to use with option `-n {n_sents}`. "
+                f"Use `-s` to automatically segment sentences or `-n 0` "
+                f"to disable."
            )
        else:
            n_sents_info(msg, n_sents)
@ -84,17 +82,25 @@ def conll_ner2json(
            "No document delimiters found. Use `-n` to automatically group "
            "sentences into documents."
        )
+
+    if model:
+        nlp = load_model(model)
+    else:
+        nlp = MultiLanguage()
    output_docs = []
-    for doc in input_data.strip().split(doc_delimiter):
-        doc = doc.strip()
-        if not doc:
+    for conll_doc in input_data.strip().split(doc_delimiter):
+        conll_doc = conll_doc.strip()
+        if not conll_doc:
            continue
-        output_doc = []
-        for sent in doc.split("\n\n"):
-            sent = sent.strip()
-            if not sent:
+        words = []
+        sent_starts = []
+        pos_tags = []
+        biluo_tags = []
+        for conll_sent in conll_doc.split("\n\n"):
+            conll_sent = conll_sent.strip()
+            if not conll_sent:
                continue
-            lines = [line.strip() for line in sent.split("\n") if line.strip()]
+            lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
            cols = list(zip(*[line.split() for line in lines]))
            if len(cols) < 2:
                raise ValueError(
@ -102,25 +108,19 @@ def conll_ner2json(
                    "Try checking whitespace and delimiters. See "
                    "https://spacy.io/api/cli#convert"
                )
-            words = cols[0]
-            iob_ents = cols[-1]
-            if len(cols) > 2:
-                tags = cols[1]
-            else:
-                tags = ["-"] * len(words)
-            biluo_ents = iob_to_biluo(iob_ents)
-            output_doc.append(
-                {
-                    "tokens": [
-                        {"orth": w, "tag": tag, "ner": ent}
-                        for (w, tag, ent) in zip(words, tags, biluo_ents)
-                    ]
-                }
-            )
-        output_docs.append(
-            {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
-        )
-        output_doc = []
+            length = len(cols[0])
+            words.extend(cols[0])
+            sent_starts.extend([True] + [False] * (length - 1))
+            biluo_tags.extend(iob_to_biluo(cols[-1]))
+            pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length)
+
+        doc = Doc(nlp.vocab, words=words)
+        for i, token in enumerate(doc):
+            token.tag_ = pos_tags[i]
+            token.is_sent_start = sent_starts[i]
+        entities = tags_to_entities(biluo_tags)
+        doc.ents = [Span(doc, start=s, end=e + 1, label=L) for L, s, e in entities]
+        output_docs.append(doc)
    return output_docs


@ -129,7 +129,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
    if model:
        nlp = load_model(model)
        if "parser" in nlp.pipe_names:
-            msg.info("Segmenting sentences with parser from model '{}'.".format(model))
+            msg.info(f"Segmenting sentences with parser from model '{model}'.")
            sentencizer = nlp.get_pipe("parser")
    if not sentencizer:
        msg.info(
@ -166,7 +166,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):


 def n_sents_info(msg, n_sents):
-    msg.info("Grouping every {} sentences into a document.".format(n_sents))
+    msg.info(f"Grouping every {n_sents} sentences into a document.")
    if n_sents == 1:
        msg.warn(
            "To generate better training data, you may want to group "
--- a/spacy/gold/converters/conllu2docs.py
+++ b/spacy/gold/converters/conllu2docs.py
@ -0,0 +1,295 @@
+import re
+
+from .conll_ner2docs import n_sents_info
+from ...gold import Example
+from ...gold import iob_to_biluo, spans_from_biluo_tags
+from ...language import Language
+from ...tokens import Doc, Token, Span
+from wasabi import Printer
+
+
+def conllu2docs(
+    input_data,
+    n_sents=10,
+    append_morphology=False,
+    ner_map=None,
+    merge_subtokens=False,
+    no_print=False,
+    **_
+):
+    """
+    Convert CoNLL-U formatted data into a list of Doc objects.
+
+    One Doc is created per sentence by `read_conllx`; every `n_sents`
+    consecutive sentence Docs are then combined with `Doc.from_docs`, and any
+    remaining sentences form a final, shorter Doc.
+
+    The append_morphology parameter enables appending morphology to tags,
+    which is useful for languages such as Spanish, where UD tags are not so
+    rich. NER tags are extracted from the MISC column if available.
+
+    input_data (str): The CoNLL-U data.
+    n_sents (int): Number of sentences to group into one Doc.
+    append_morphology (bool): Forwarded to `read_conllx`.
+    ner_map (dict): Forwarded to `read_conllx`.
+    merge_subtokens (bool): Forwarded to `read_conllx`.
+    no_print (bool): Passed to the `Printer` used for status messages.
+    RETURNS (list): The merged Doc objects.
+    """
+    MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
+    msg = Printer(no_print=no_print)
+    n_sents_info(msg, n_sents)
+    sentence_docs = read_conllx(
+        input_data,
+        append_morphology=append_morphology,
+        ner_tag_pattern=MISC_NER_PATTERN,
+        ner_map=ner_map,
+        merge_subtokens=merge_subtokens,
+    )
+    merged_docs = []
+    pending = []
+    for sentence_doc in sentence_docs:
+        pending.append(sentence_doc)
+        # flush the buffer once it holds a full group of sentences
+        if len(pending) % n_sents == 0:
+            merged_docs.append(Doc.from_docs(pending))
+            pending = []
+    # leftover sentences that did not fill a complete group
+    if pending:
+        merged_docs.append(Doc.from_docs(pending))
+    return merged_docs
+
+
+def has_ner(input_data, ner_tag_pattern):
+    """
+    Check the MISC column for NER tags.
+
+    Sentences are separated by blank lines. Leading comment lines (starting
+    with "#") are skipped, as are empty sentence blocks, so empty input or a
+    block consisting only of comments yields False instead of an error.
+
+    input_data (str): The CoNLL-U data.
+    ner_tag_pattern (str): Regex matched against each "|"-separated part of
+        the MISC column.
+    RETURNS (bool): True as soon as one MISC part matches the pattern.
+    """
+    for sent in input_data.strip().split("\n\n"):
+        lines = sent.strip().split("\n")
+        # `lines and` guards against a block made up of comments only, which
+        # would otherwise index into an empty list.
+        while lines and lines[0].startswith("#"):
+            lines.pop(0)
+        for line in lines:
+            # an empty block splits into a single empty string; nothing to parse
+            if not line:
+                continue
+            parts = line.split("\t")
+            # the unpacking enforces the ten tab-separated CoNLL-U columns
+            id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+            for misc_part in misc.split("|"):
+                if re.match(ner_tag_pattern, misc_part):
+                    return True
+    return False
+
+
+def read_conllx(
+    input_data,
+    append_morphology=False,
+    merge_subtokens=False,
+    ner_tag_pattern="",
+    ner_map=None,
+):
+    """Yield docs, one for each sentence.
+
+    Sentences are separated by blank lines. Leading comment lines (starting
+    with "#") are dropped, and blocks with no remaining content are skipped
+    rather than raising.
+
+    input_data (str): The CoNLL-U data.
+    append_morphology (bool): Forwarded to `doc_from_conllu_sentence`.
+    merge_subtokens (bool): Forwarded to `doc_from_conllu_sentence`.
+    ner_tag_pattern (str): Forwarded to `doc_from_conllu_sentence`.
+    ner_map (dict): Forwarded to `doc_from_conllu_sentence`.
+    YIELDS: The result of `doc_from_conllu_sentence` for each sentence.
+    """
+    vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc
+    for sent in input_data.strip().split("\n\n"):
+        lines = sent.strip().split("\n")
+        # `lines and` guards against a block made up of comments only, which
+        # would otherwise index into an empty list.
+        while lines and lines[0].startswith("#"):
+            lines.pop(0)
+        # nothing left to parse (empty input or comment-only block)
+        if not any(lines):
+            continue
+        doc = doc_from_conllu_sentence(
+            vocab,
+            lines,
+            ner_tag_pattern,
+            merge_subtokens=merge_subtokens,
+            append_morphology=append_morphology,
+            ner_map=ner_map,
+        )
+        yield doc
+
+
+def get_entities(lines, tag_pattern, ner_map=None):
+    """Find entities in the MISC column according to the pattern and map to
+    final entity type with `ner_map` if mapping present. Entity tag is 'O' if
+    the pattern is not matched.
+
+    lines (list): CoNLL-U lines for one sentence.
+    tag_pattern (str): Regex pattern for entity tag; group 2 is read as the
+        tag prefix and group 3 as the entity type.
+    ner_map (dict): Map old NER tag names to new ones; a name mapped to ''
+        results in the tag 'O'.
+    RETURNS (list): The result of `iob_to_biluo` on the collected tags.
+    """
+    # First pass: keep the MISC column of every regular token line, skipping
+    # lines whose ID contains "-" or ".".
+    misc_columns = []
+    for line in lines:
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = line.split("\t")
+        if "-" not in id_ and "." not in id_:
+            misc_columns.append(misc)
+
+    # Second pass: derive one tag per token from its MISC column.
+    iob_tags = []
+    for misc in misc_columns:
+        current = "O"
+        for part in misc.split("|"):
+            found = re.match(tag_pattern, part)
+            if not found:
+                continue
+            prefix, suffix = found.group(2), found.group(3)
+            if prefix and suffix:
+                if ner_map:
+                    suffix = ner_map.get(suffix, suffix)
+                current = "O" if suffix == "" else prefix + "-" + suffix
+            # only the first matching MISC part is considered
+            break
+        iob_tags.append(current)
+    return iob_to_biluo(iob_tags)
+
+
+def doc_from_conllu_sentence(
+    vocab,
+    lines,
+    ner_tag_pattern,
+    merge_subtokens=False,
+    append_morphology=False,
+    ner_map=None,
+):
+    """Create a Doc from the lines for one CoNLL-U sentence, merging
+    subtokens and appending morphology to tags if required.
+
+    An intermediate Doc carries the annotation in custom `merged_*` token
+    extensions (registered here if missing); the returned Doc is rebuilt
+    from those extensions after the optional subtoken merge.
+
+    vocab (Vocab): The vocab used to create the Docs.
+    lines (list): The non-comment lines for a CoNLL-U sentence.
+    ner_tag_pattern (str): The regex pattern for matching NER in MISC col.
+    merge_subtokens (bool): Whether to merge subtoken ranges into one token.
+    append_morphology (bool): Whether to append "__" plus the morphology to
+        each tag that has morphological features.
+    ner_map (dict): Forwarded to `get_entities`.
+    RETURNS (Doc): A Doc containing the annotation.
+    """
+    # create a Doc with each subtoken as its own token
+    # if merging subtokens, each subtoken orth is the merged subtoken form
+    for ext_name in (
+        "merged_orth",
+        "merged_lemma",
+        "merged_morph",
+        "merged_spaceafter",
+    ):
+        if not Token.has_extension(ext_name):
+            Token.set_extension(ext_name, default="")
+    words, spaces, tags, poses, morphs, lemmas = [], [], [], [], [], []
+    heads, deps = [], []
+    subtok_word = ""
+    in_subtok = False
+    for line in lines:
+        parts = line.split("\t")
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+        # lines with a "." in the ID are skipped entirely
+        if "." in id_:
+            continue
+        # a range ID ("start-end") opens a subtoken span; remember its surface
+        # form and trailing space, the parts follow on the next lines
+        if "-" in id_:
+            in_subtok = True
+            subtok_word = word
+            subtok_start, subtok_end = id_.split("-")
+            subtok_spaceafter = "SpaceAfter=No" not in misc
+            continue
+        if merge_subtokens and in_subtok:
+            words.append(subtok_word)
+        else:
+            words.append(word)
+        if in_subtok:
+            # only the last part of a subtoken span can be followed by a space
+            if id_ == subtok_end:
+                spaces.append(subtok_spaceafter)
+            else:
+                spaces.append(False)
+        elif "SpaceAfter=No" in misc:
+            spaces.append(False)
+        else:
+            spaces.append(True)
+        if in_subtok and id_ == subtok_end:
+            subtok_word = ""
+            in_subtok = False
+        # convert the 1-based ID to a 0-based index
+        id_ = int(id_) - 1
+        # a head of "0" or "_" makes the token its own head
+        head = (int(head) - 1) if head not in ("0", "_") else id_
+        tag = pos if tag == "_" else tag
+        morph = morph if morph != "_" else ""
+        dep = "ROOT" if dep == "root" else dep
+        lemmas.append(lemma)
+        poses.append(pos)
+        tags.append(tag)
+        morphs.append(morph)
+        heads.append(head)
+        deps.append(dep)
+
+    doc = Doc(vocab, words=words, spaces=spaces)
+    for i in range(len(doc)):
+        doc[i].tag_ = tags[i]
+        doc[i].pos_ = poses[i]
+        doc[i].dep_ = deps[i]
+        doc[i].lemma_ = lemmas[i]
+        doc[i].head = doc[heads[i]]
+        doc[i]._.merged_orth = words[i]
+        doc[i]._.merged_morph = morphs[i]
+        doc[i]._.merged_lemma = lemmas[i]
+        doc[i]._.merged_spaceafter = spaces[i]
+    ents = get_entities(lines, ner_tag_pattern, ner_map)
+    doc.ents = spans_from_biluo_tags(doc, ents)
+    doc.is_parsed = True
+    doc.is_tagged = True
+
+    if merge_subtokens:
+        doc = merge_conllu_subtokens(lines, doc)
+
+    # create final Doc from custom Doc annotation
+    words, spaces, tags, morphs, lemmas, poses = [], [], [], [], [], []
+    heads, deps = [], []
+    for t in doc:
+        words.append(t._.merged_orth)
+        lemmas.append(t._.merged_lemma)
+        spaces.append(t._.merged_spaceafter)
+        morphs.append(t._.merged_morph)
+        if append_morphology and t._.merged_morph:
+            tags.append(t.tag_ + "__" + t._.merged_morph)
+        else:
+            tags.append(t.tag_)
+        poses.append(t.pos_)
+        heads.append(t.head.i)
+        deps.append(t.dep_)
+
+    doc_x = Doc(vocab, words=words, spaces=spaces)
+    for i in range(len(doc)):
+        doc_x[i].tag_ = tags[i]
+        doc_x[i].morph_ = morphs[i]
+        doc_x[i].lemma_ = lemmas[i]
+        doc_x[i].pos_ = poses[i]
+        doc_x[i].dep_ = deps[i]
+        doc_x[i].head = doc_x[heads[i]]
+    doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
+    doc_x.is_parsed = True
+    doc_x.is_tagged = True
+
+    return doc_x
+
+
+def merge_conllu_subtokens(lines, doc):
+    """Merge every subtoken range of a CoNLL-U sentence into a single token.
+
+    For each line whose ID is a range ("start-end"), the covered tokens get
+    identical merged tag, lemma, morph and space-after values before the
+    spans are merged with the retokenizer.
+
+    lines (list): The non-comment lines for a CoNLL-U sentence.
+    doc (Doc): The Doc with one token per subtoken; modified in place.
+    RETURNS (Doc): The same Doc after merging.
+    """
+    # identify and process all subtoken spans to prepare attrs for merging
+    spans_to_merge = []
+    for line in lines:
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = line.split("\t")
+        if "-" not in id_:
+            continue
+        subtok_start, subtok_end = id_.split("-")
+        subtok_span = doc[int(subtok_start) - 1 : int(subtok_end)]
+        spans_to_merge.append(subtok_span)
+        # collect tags, lemmas and morph feature values across the span
+        span_tags = []
+        span_lemmas = []
+        feature_values = {}
+        for token in subtok_span:
+            span_tags.append(token.tag_)
+            span_lemmas.append(token.lemma_)
+            if token._.merged_morph:
+                for feature in token._.merged_morph.split("|"):
+                    field, values = feature.split("=", 1)
+                    feature_values.setdefault(field, set()).update(values.split(","))
+        # one "Field=v1,v2" string per morph field, with sorted values
+        merged_features = [
+            field + "=" + ",".join(sorted(values))
+            for field, values in feature_values.items()
+        ]
+        merged_morph = "|".join(sorted(merged_features))
+        merged_lemma = " ".join(span_lemmas)
+        merged_tag = "_".join(span_tags)
+        # set the same attrs on all subtok tokens so that whatever head the
+        # retokenizer chooses, the final attrs are available on that token
+        for token in subtok_span:
+            token._.merged_orth = token.orth_
+            token._.merged_lemma = merged_lemma
+            token.tag_ = merged_tag
+            token._.merged_morph = merged_morph
+            token._.merged_spaceafter = bool(subtok_span[-1].whitespace_)
+
+    with doc.retokenize() as retokenizer:
+        for span in spans_to_merge:
+            retokenizer.merge(span)
+
+    return doc
--- a/spacy/gold/converters/iob2docs.py
+++ b/spacy/gold/converters/iob2docs.py
@ -0,0 +1,64 @@
+from wasabi import Printer
+
+from .conll_ner2docs import n_sents_info
+from ...gold import iob_to_biluo, tags_to_entities
+from ...tokens import Doc, Span
+from ...util import minibatch
+
+
+def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs):
+    """
+    Turn sentence-per-line IOB data, where each token's annotations are joined
+    with '|', into Doc objects so they can be saved. Both IOB and IOB2 tagging
+    are accepted.
+
+    Sample formats:
+
+    I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
+    I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
+    I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
+    I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
+
+    input_data (str): The raw file contents; split on newlines.
+    vocab: The vocabulary handed to read_iob for building the Docs.
+    n_sents (int): Number of input lines grouped into one Doc by read_iob.
+    no_print (bool): Forwarded to the wasabi Printer.
+    RETURNS (list): The Docs created by read_iob.
+    """
+    printer = Printer(no_print=no_print)
+    # Only report the grouping when grouping was actually requested.
+    if n_sents > 0:
+        n_sents_info(printer, n_sents)
+    raw_lines = input_data.split("\n")
+    return read_iob(raw_lines, vocab, n_sents)
+
+
+def read_iob(raw_sents, vocab, n_sents):
+    """Build one Doc per group of `n_sents` sentence-per-line IOB lines.
+
+    Every whitespace-separated token of a line is either `word|iob` or
+    `word|tag|iob`. All tokens of a line must use the same layout.
+
+    raw_sents (iterable): The input lines; blank lines are skipped.
+    vocab: The vocabulary used to create the Docs.
+    n_sents (int): Batch size passed to `minibatch` for grouping lines.
+    RETURNS (list): The Docs with tags, sentence starts and entities set.
+    RAISES (ValueError): If a line's tokens do not all have 2 or all have 3
+        '|'-separated fields.
+    """
+    docs = []
+    for group in minibatch(raw_sents, size=n_sents):
+        words = []
+        tags = []
+        iob = []
+        sent_starts = []
+        for line in group:
+            if not line.strip():
+                continue
+            sent_tokens = [t.split("|") for t in line.split()]
+            n_fields = len(sent_tokens[0])
+            # Validate every token, not just the first one: zip() would
+            # otherwise silently truncate or shift the columns of ragged lines.
+            if n_fields not in (2, 3) or any(
+                len(token) != n_fields for token in sent_tokens
+            ):
+                raise ValueError(
+                    "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
+                )
+            if n_fields == 3:
+                sent_words, sent_tags, sent_iob = zip(*sent_tokens)
+            else:
+                sent_words, sent_iob = zip(*sent_tokens)
+                # No tag column: use "-" as the placeholder tag.
+                sent_tags = ["-"] * len(sent_words)
+            words.extend(sent_words)
+            tags.extend(sent_tags)
+            iob.extend(sent_iob)
+            # The first token of each line opens a sentence.
+            sent_starts.append(True)
+            sent_starts.extend([False for _ in sent_words[1:]])
+        doc = Doc(vocab, words=words)
+        for i, tag in enumerate(tags):
+            doc[i].tag_ = tag
+        for i, sent_start in enumerate(sent_starts):
+            doc[i].is_sent_start = sent_start
+        biluo = iob_to_biluo(iob)
+        entities = tags_to_entities(biluo)
+        # tags_to_entities yields inclusive end indices; Span ends are exclusive.
+        doc.ents = [Span(doc, start=s, end=e + 1, label=L) for (L, s, e) in entities]
+        docs.append(doc)
+    return docs
--- a/spacy/gold/converters/json2docs.py
+++ b/spacy/gold/converters/json2docs.py
@ -0,0 +1,22 @@
+import srsly
+from ..gold_io import json_iterate, json_to_annotations
+from ..example import annotations2doc
+from ..example import _fix_legacy_dict_data, _parse_example_dict_data
+from ...util import load_model
+from ...lang.xx import MultiLanguage
+
+
+def json2docs(input_data, model=None, **kwargs):
+    """Convert spaCy's JSON training format into Doc objects.
+
+    input_data (bytes / str / object): The JSON data. Anything that is
+        neither bytes nor str is first serialized with srsly.json_dumps.
+    model (str): Optional model to load for its vocab; a MultiLanguage
+        pipeline is used when omitted.
+    RETURNS (list): One Doc per paragraph found in the JSON data.
+    """
+    if model is None:
+        nlp = MultiLanguage()
+    else:
+        nlp = load_model(model)
+    # json_iterate works on UTF-8 bytes, so normalize the input to bytes.
+    if isinstance(input_data, bytes):
+        raw_bytes = input_data
+    else:
+        if isinstance(input_data, str):
+            json_text = input_data
+        else:
+            json_text = srsly.json_dumps(input_data)
+        raw_bytes = json_text.encode("utf8")
+    docs = []
+    for json_doc in json_iterate(raw_bytes):
+        for json_para in json_to_annotations(json_doc):
+            fixed = _fix_legacy_dict_data(json_para)
+            tok_dict, doc_dict = _parse_example_dict_data(fixed)
+            docs.append(annotations2doc(nlp.vocab, tok_dict, doc_dict))
+    return docs
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@ -0,0 +1,129 @@
+import random
+from .. import util
+from .example import Example
+from ..tokens import DocBin, Doc
+
+
+class Corpus:
+    """An annotated corpus, reading train and dev datasets from
+    the DocBin (.spacy) format.
+
+    DOCS: https://spacy.io/api/corpus
+    """
+
+    def __init__(self, train_loc, dev_loc, limit=0):
+        """Create a Corpus.
+
+        train_loc (str / Path): File or directory of training data.
+        dev_loc (str / Path): File or directory of development data.
+        limit (int): Max. number of examples returned. Values below 1 mean
+            "no limit".
+        RETURNS (Corpus): The newly created object.
+        """
+        self.train_loc = train_loc
+        self.dev_loc = dev_loc
+        self.limit = limit
+
+    @staticmethod
+    def walk_corpus(path):
+        """Collect the .spacy files found at or below `path`.
+
+        path (str / Path): A file or a directory.
+        RETURNS (list): `[path]` if it is not a directory, otherwise every
+            file ending in ".spacy" below it, skipping entries whose name
+            starts with ".".
+        """
+        path = util.ensure_path(path)
+        if not path.is_dir():
+            return [path]
+        # `paths` doubles as a work queue: directories append their children
+        # while the loop is running.
+        paths = [path]
+        locs = []
+        seen = set()
+        for path in paths:
+            if str(path) in seen:
+                continue
+            seen.add(str(path))
+            if path.parts[-1].startswith("."):
+                continue
+            elif path.is_dir():
+                paths.extend(path.iterdir())
+            elif path.parts[-1].endswith(".spacy"):
+                locs.append(path)
+        return locs
+
+    def _make_example(self, nlp, reference, gold_preproc):
+        """Pair a reference Doc with a fresh, unannotated predicted Doc.
+
+        With `gold_preproc`, or when the reference has unknown spaces, the
+        predicted Doc reuses the reference tokenization; otherwise the
+        reference text is re-tokenized with `nlp.make_doc`.
+        """
+        if gold_preproc or reference.has_unknown_spaces:
+            return Example(
+                Doc(
+                    nlp.vocab,
+                    words=[word.text for word in reference],
+                    spaces=[bool(word.whitespace_) for word in reference],
+                ),
+                reference,
+            )
+        else:
+            return Example(nlp.make_doc(reference.text), reference)
+
+    def make_examples(self, nlp, reference_docs, max_length=0):
+        """Yield Examples for the reference docs, honoring `max_length`.
+
+        Empty docs are skipped. A doc with `max_length` or more tokens is
+        split into sentences when it is sentenced (only sentences shorter
+        than `max_length` are kept) and dropped otherwise. `max_length=0`
+        disables the length check.
+        """
+        for reference in reference_docs:
+            if len(reference) == 0:
+                continue
+            elif max_length == 0 or len(reference) < max_length:
+                yield self._make_example(nlp, reference, False)
+            elif reference.is_sentenced:
+                for ref_sent in reference.sents:
+                    if len(ref_sent) == 0:
+                        continue
+                    elif max_length == 0 or len(ref_sent) < max_length:
+                        yield self._make_example(nlp, ref_sent.as_doc(), False)
+
+    def make_examples_gold_preproc(self, nlp, reference_docs):
+        """Yield one Example per sentence (or per doc if it is not sentenced),
+        reusing the reference tokenization. Examples whose predicted Doc is
+        empty are skipped.
+        """
+        for reference in reference_docs:
+            if reference.is_sentenced:
+                ref_sents = [sent.as_doc() for sent in reference.sents]
+            else:
+                ref_sents = [reference]
+            for ref_sent in ref_sents:
+                eg = self._make_example(nlp, ref_sent, True)
+                if len(eg.x):
+                    yield eg
+
+    def read_docbin(self, vocab, locs):
+        """Yield the non-empty Docs stored in the given .spacy files.
+
+        vocab: The vocab used to deserialize the docs.
+        locs (iterable): Paths to read; entries not ending in ".spacy" are
+            ignored.
+        YIELDS (Doc): At most `self.limit` docs in total (across all files)
+            when `self.limit >= 1`.
+        """
+        i = 0
+        for loc in locs:
+            loc = util.ensure_path(loc)
+            if loc.parts[-1].endswith(".spacy"):
+                with loc.open("rb") as file_:
+                    doc_bin = DocBin().from_bytes(file_.read())
+                docs = doc_bin.get_docs(vocab)
+                for doc in docs:
+                    if len(doc):
+                        yield doc
+                        i += 1
+                        if self.limit >= 1 and i >= self.limit:
+                            # Stop reading altogether: a plain `break` would
+                            # only leave this file and keep yielding docs
+                            # from the remaining files.
+                            return
+
+    def count_train(self, nlp):
+        """Returns count of words in train examples"""
+        n = 0
+        i = 0
+        for example in self.train_dataset(nlp):
+            n += len(example.predicted)
+            i += 1
+            # limit < 1 means "no limit", matching read_docbin.
+            if self.limit >= 1 and i >= self.limit:
+                break
+        return n
+
+    def train_dataset(
+        self, nlp, *, shuffle=True, gold_preproc=False, max_length=0, **kwargs
+    ):
+        """Yield training Examples read from `self.train_loc`.
+
+        shuffle (bool): Materialize all examples and shuffle them in place
+            with `random.shuffle` before yielding.
+        gold_preproc (bool): Use the reference tokenization and split into
+            sentences.
+        max_length (int): Length cut-off passed to `make_examples`; ignored
+            when `gold_preproc` is set.
+        """
+        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
+        if gold_preproc:
+            examples = self.make_examples_gold_preproc(nlp, ref_docs)
+        else:
+            examples = self.make_examples(nlp, ref_docs, max_length)
+        if shuffle:
+            examples = list(examples)
+            random.shuffle(examples)
+        yield from examples
+
+    def dev_dataset(self, nlp, *, gold_preproc=False, **kwargs):
+        """Yield development Examples read from `self.dev_loc`, unshuffled
+        and without a length cut-off.
+        """
+        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
+        if gold_preproc:
+            examples = self.make_examples_gold_preproc(nlp, ref_docs)
+        else:
+            examples = self.make_examples(nlp, ref_docs, max_length=0)
+        yield from examples
--- a/spacy/gold/example.pxd
+++ b/spacy/gold/example.pxd
@ -0,0 +1,7 @@
+from ..tokens.doc cimport Doc
+
+
+cdef class Example:
+    # C-level attribute declarations for the Example extension type.
+    # x: Doc passed as `predicted` to Example.__init__ (see example.pyx).
+    cdef readonly Doc x
+    # y: Doc passed as `reference` to Example.__init__ (see example.pyx).
+    cdef readonly Doc y
+    # Alignment between x and y; None until it is supplied or first computed.
+    cdef readonly object _alignment
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@ -0,0 +1,427 @@
+import warnings
+
+import numpy
+
+from ..tokens.doc cimport Doc
+from ..tokens.span cimport Span
+from ..tokens.span import Span
+from ..attrs import IDS
+from .align import Alignment
+from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
+from .iob_utils import spans_from_biluo_tags
+from ..errors import Errors, Warnings
+from ..syntax import nonproj
+
+
+cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
+    """ Create a Doc from dictionaries with token and doc annotations.
+
+    vocab: The vocab used for the new Doc.
+    tok_annot (dict): Token-level annotations; must contain "ORTH" and
+        "SPACY", which provide the words and spaces of the Doc.
+    doc_annot (dict): Doc-level annotations; "entities" and "cats" are read
+        here, other keys are handled by _annot2array.
+    RETURNS (Doc): The annotated Doc.
+    """
+    attrs, array = _annot2array(vocab, tok_annot, doc_annot)
+    output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
+    # Entities are set before the remaining token attributes are loaded.
+    if "entities" in doc_annot:
+       _add_entities_to_doc(output, doc_annot["entities"])
+    # Skip from_array when no token-level attribute columns were produced.
+    if array.size:
+        output = output.from_array(attrs, array)
+    # links are currently added with ENT_KB_ID on the token level
+    output.cats.update(doc_annot.get("cats", {}))
+    return output
+
+
+cdef class Example:
+    # Pairs a predicted Doc (x) with a reference Doc (y) and lazily computes
+    # the token alignment between the two.
+    def __init__(self, Doc predicted, Doc reference, *, alignment=None):
+        """Create an Example.
+
+        predicted (Doc): Stored as `x`.
+        reference (Doc): Stored as `y`.
+        alignment: Optional precomputed alignment; computed on first access
+            of `alignment` when None.
+        RAISES (TypeError): If `predicted` or `reference` is None.
+        """
+        if predicted is None:
+            raise TypeError(Errors.E972.format(arg="predicted"))
+        if reference is None:
+            raise TypeError(Errors.E972.format(arg="reference"))
+        self.x = predicted
+        self.y = reference
+        self._alignment = alignment
+
+    def __len__(self):
+        """Number of tokens in the predicted Doc."""
+        return len(self.predicted)
+
+    property predicted:
+        # Python-level alias for `x`.
+        def __get__(self):
+            return self.x
+
+        def __set__(self, doc):
+            self.x = doc
+
+    property reference:
+        # Python-level alias for `y`.
+        def __get__(self):
+            return self.y
+
+        def __set__(self, doc):
+            self.y = doc
+
+    def copy(self):
+        """Return a new Example built from copies of both Docs. The cached
+        alignment is not carried over."""
+        return Example(
+            self.x.copy(),
+            self.y.copy()
+        )
+
+    @classmethod
+    def from_dict(cls, Doc predicted, dict example_dict):
+        """Create an Example from a predicted Doc and a dict of annotations.
+
+        predicted (Doc): The predicted Doc.
+        example_dict (dict): Annotations; legacy keys are normalized with
+            _fix_legacy_dict_data before the reference Doc is built.
+        RETURNS (Example): The new Example.
+        RAISES (ValueError): If either argument is None.
+        """
+        if predicted is None:
+            raise ValueError(Errors.E976.format(n="first", type="Doc"))
+        if example_dict is None:
+            raise ValueError(Errors.E976.format(n="second", type="dict"))
+        example_dict = _fix_legacy_dict_data(example_dict)
+        tok_dict, doc_dict = _parse_example_dict_data(example_dict)
+        # Without explicit words, the reference reuses the predicted
+        # tokenization.
+        if "ORTH" not in tok_dict:
+            tok_dict["ORTH"] = [tok.text for tok in predicted]
+            tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
+        return Example(
+            predicted,
+            annotations2doc(predicted.vocab, tok_dict, doc_dict)
+        )
+
+    @property
+    def alignment(self):
+        """The alignment between predicted and reference tokens, computed
+        from the token texts on first access and cached afterwards."""
+        if self._alignment is None:
+            spacy_words = [token.orth_ for token in self.predicted]
+            gold_words = [token.orth_ for token in self.reference]
+            # An empty reference is aligned as if it had the predicted words.
+            if gold_words == []:
+                gold_words = spacy_words
+            self._alignment = Alignment.from_strings(spacy_words, gold_words)
+        return self._alignment
+
+    def get_aligned(self, field, as_string=False):
+        """Return an aligned array for a token attribute.
+
+        field (str): The attribute to read from the reference Doc.
+        as_string (bool): Look the values up in the vocab's string store,
+            except for "ENT_IOB" and "SENT_START".
+        RETURNS (list): One entry per predicted token; None for whitespace
+            tokens, tokens without aligned reference tokens, and tokens
+            whose aligned reference tokens disagree on the value.
+        """
+        align = self.alignment.x2y
+
+        vocab = self.reference.vocab
+        gold_values = self.reference.to_array([field])
+        output = [None] * len(self.predicted)
+        for token in self.predicted:
+            if token.is_space:
+                output[token.i] = None
+            else:
+                values = gold_values[align[token.i].dataXd]
+                values = values.ravel()
+                if len(values) == 0:
+                    output[token.i] = None
+                elif len(values) == 1:
+                    output[token.i] = values[0]
+                elif len(set(list(values))) == 1:
+                    # If all aligned tokens have the same value, use it.
+                    output[token.i] = values[0]
+                else:
+                    output[token.i] = None
+        if as_string and field not in ["ENT_IOB", "SENT_START"]:
+            output = [vocab.strings[o] if o is not None else o for o in output]
+        return output
+
+    def get_aligned_parse(self, projectivize=True):
+        """Project the reference dependency parse onto the predicted tokens.
+
+        projectivize (bool): Run nonproj.projectivize on the reference heads
+            and labels first.
+        RETURNS (tuple): `(heads, deps)`, one entry per predicted token. An
+            entry stays None unless the token aligns to exactly one
+            reference token whose head aligns to exactly one predicted token.
+        """
+        cand_to_gold = self.alignment.x2y
+        gold_to_cand = self.alignment.y2x
+        aligned_heads = [None] * self.x.length
+        aligned_deps = [None] * self.x.length
+        heads = [token.head.i for token in self.y]
+        deps = [token.dep_ for token in self.y]
+        if projectivize:
+            heads, deps = nonproj.projectivize(heads, deps)
+        for cand_i in range(self.x.length):
+            if cand_to_gold.lengths[cand_i] == 1:
+                gold_i = cand_to_gold[cand_i].dataXd[0, 0]
+                if gold_to_cand.lengths[heads[gold_i]] == 1:
+                    aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]].dataXd[0, 0])
+                    aligned_deps[cand_i] = deps[gold_i]
+        return aligned_heads, aligned_deps
+
+    def get_aligned_spans_x2y(self, x_spans):
+        """Map spans of the predicted Doc onto the reference Doc."""
+        return self._get_aligned_spans(self.y, x_spans, self.alignment.x2y)
+
+    def get_aligned_spans_y2x(self, y_spans):
+        """Map spans of the reference Doc onto the predicted Doc."""
+        return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x)
+    
+    def _get_aligned_spans(self, doc, spans, align):
+        """Return the spans of `doc` that correspond to `spans` under `align`.
+
+        A span is kept only if its aligned counterpart has the same text
+        once case and spaces are ignored. Target tokens already used by an
+        earlier kept span are not reused.
+        """
+        seen = set()
+        output = []
+        for span in spans:
+            indices = align[span.start : span.end].data.ravel()
+            indices = [idx for idx in indices if idx not in seen]
+            if len(indices) >= 1:
+                aligned_span = Span(doc, indices[0], indices[-1] + 1, label=span.label)
+                target_text = span.text.lower().strip().replace(" ", "")
+                our_text = aligned_span.text.lower().strip().replace(" ", "")
+                if our_text == target_text:
+                    output.append(aligned_span)
+                    seen.update(indices)
+        return output
+
+    def get_aligned_ner(self):
+        """Return BILUO tags for the predicted tokens, derived from the
+        reference entities. Tokens that cannot be tagged are None."""
+        if not self.y.is_nered:
+            return [None] * len(self.x)  # should this be 'missing' instead of 'None' ?
+        # Reference entities projected onto the predicted tokenization.
+        x_ents = self.get_aligned_spans_y2x(self.y.ents)
+        # Default to 'None' for missing values
+        x_tags = biluo_tags_from_offsets(
+            self.x,
+            [(e.start_char, e.end_char, e.label_) for e in x_ents],
+            missing=None
+        )
+        # Now fill the tokens we can align to O.
+        O = 2 # I=1, O=2, B=3
+        for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
+            if x_tags[i] is None:
+                if ent_iob == O:
+                    x_tags[i] = "O"
+                elif self.x[i].is_space:
+                    x_tags[i] = "O"
+        return x_tags
+
+    def to_dict(self):
+        """Return the reference annotations as a dict with the keys
+        "doc_annotation" and "token_annotation"."""
+        return {
+            "doc_annotation": {
+                "cats": dict(self.reference.cats),
+                "entities": biluo_tags_from_doc(self.reference),
+                "links": self._links_to_dict()
+            },
+            "token_annotation": {
+                "ids": [t.i+1 for t in self.reference],
+                "words": [t.text for t in self.reference],
+                "tags": [t.tag_ for t in self.reference],
+                "lemmas": [t.lemma_ for t in self.reference],
+                "pos": [t.pos_ for t in self.reference],
+                "morphs": [t.morph_ for t in self.reference],
+                "heads": [t.head.i for t in self.reference],
+                "deps": [t.dep_ for t in self.reference],
+                "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference]
+            }
+        }
+
+    def _links_to_dict(self):
+        """Map `(start_char, end_char)` of every reference entity that has a
+        KB id to `{kb_id: 1.0}`."""
+        links = {}
+        for ent in self.reference.ents:
+            if ent.kb_id_:
+                links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
+        return links
+
+    def split_sents(self):
+        """ Split the token annotations into multiple Examples based on
+        sent_starts and return a list of the new Examples"""
+        if not self.reference.is_sentenced:
+            return [self]
+        
+        align = self.alignment.y2x
+        seen_indices = set()
+        output = []
+        for y_sent in self.reference.sents:
+            indices = align[y_sent.start : y_sent.end].data.ravel()
+            # Predicted tokens already assigned to an earlier sentence are
+            # not reused.
+            indices = [idx for idx in indices if idx not in seen_indices]
+            if indices:
+                x_sent = self.predicted[indices[0] : indices[-1] + 1]
+                output.append(Example(x_sent.as_doc(), y_sent.as_doc()))
+                seen_indices.update(indices)
+        return output
+
+    property text:
+        # Text of the predicted Doc.
+        def __get__(self):
+            return self.x.text
+
+    def __str__(self):
+        return str(self.to_dict())
+
+    def __repr__(self):
+        return str(self.to_dict())
+
+
+def _annot2array(vocab, tok_annot, doc_annot):
+    """Convert annotation dicts into the inputs expected by Doc.from_array.
+
+    vocab: Vocab whose string store and morphology receive the values.
+    tok_annot (dict): Token annotations keyed by attribute name. An
+        "ENT_KB_ID" entry is added in place when `doc_annot` has links.
+    doc_annot (dict): Doc annotations; only "entities", "links" and "cats"
+        are allowed to carry a value.
+    RETURNS (tuple): `(attrs, array)` with one uint64 column per attribute
+        in `attrs`.
+    RAISES (ValueError): For unknown doc or token keys.
+    RAISES (TypeError): If string-valued attributes contain non-strings.
+    """
+    for doc_key, doc_value in doc_annot.items():
+        if not doc_value:
+            continue
+        if doc_key in ("entities", "cats"):
+            # Handled by the caller, not part of the array.
+            continue
+        if doc_key != "links":
+            raise ValueError(Errors.E974.format(obj="doc", key=doc_key))
+        tok_annot["ENT_KB_ID"] = _parse_links(
+            vocab, tok_annot["ORTH"], tok_annot["SPACY"], doc_value
+        )
+
+    attrs = []
+    values = []
+    for key, value in tok_annot.items():
+        if key not in IDS:
+            raise ValueError(Errors.E974.format(obj="token", key=key))
+        if key in ("ORTH", "SPACY"):
+            # Words and spaces are passed to the Doc constructor instead.
+            continue
+        if key == "HEAD":
+            # Absolute head indices become offsets relative to the token.
+            column = [h-i for i, h in enumerate(value)]
+        elif key == "SENT_START":
+            column = value
+        elif key == "MORPH":
+            column = [vocab.morphology.add(v) for v in value]
+        else:
+            try:
+                column = [vocab.strings.add(v) for v in value]
+            except TypeError:
+                types = set([type(v) for v in value])
+                raise TypeError(Errors.E969.format(field=key, types=types))
+        attrs.append(key)
+        values.append(column)
+
+    array = numpy.asarray(values, dtype="uint64")
+    return attrs, array.T
+
+
+def _add_entities_to_doc(doc, ner_data):
+    """Set `doc.ents` from entity annotations in one of several formats.
+
+    doc (Doc): The Doc to modify in place.
+    ner_data: None (no-op), an empty list (clears the entities), a list of
+        offset tuples, a list of BILUO tag strings (may start with None), or
+        a list of Spans. The format is detected from the first element.
+    RAISES (ValueError): If the first element has an unsupported type.
+    """
+    if ner_data is None:
+        return
+    if ner_data == []:
+        doc.ents = []
+        return
+    first = ner_data[0]
+    if isinstance(first, tuple):
+        # Offsets -> BILUO tags, then handled by the string branch.
+        _add_entities_to_doc(doc, biluo_tags_from_offsets(doc, ner_data))
+    elif isinstance(first, str) or first is None:
+        # BILUO tags -> Spans, then handled by the Span branch.
+        _add_entities_to_doc(doc, spans_from_biluo_tags(doc, ner_data))
+    elif isinstance(first, Span):
+        # Ugh, this is super messy. Really hard to set O entities
+        doc.ents = ner_data
+        labelled = [span for span in ner_data if span.label_]
+        doc.ents = labelled
+    else:
+        raise ValueError(Errors.E973)
+
+
+def _parse_example_dict_data(example_dict):
+    """Split a normalized example dict into its two annotation parts.
+
+    example_dict (dict): Must contain "token_annotation" and
+        "doc_annotation".
+    RETURNS (tuple): `(token_annotation, doc_annotation)`.
+    """
+    tok_annot = example_dict["token_annotation"]
+    doc_annot = example_dict["doc_annotation"]
+    return tok_annot, doc_annot
+
+
+def _fix_legacy_dict_data(example_dict):
+    """Normalize an annotation dict into the nested, upper-case-keyed form.
+
+    Top-level annotation keys are sorted into the token or doc annotations,
+    legacy token keys ("words", "tags", ...) are renamed to attribute names
+    ("ORTH", "TAG", ...), and missing spaces are guessed from the text.
+
+    example_dict (dict): The annotations. Existing "token_annotation" and
+        "doc_annotation" dicts are used as the starting point; the
+        "doc_annotation" dict is updated in place.
+    RETURNS (dict): A dict with the keys "token_annotation" and
+        "doc_annotation".
+    RAISES (KeyError): If a token annotation key is not recognized.
+    """
+    token_dict = example_dict.get("token_annotation", {})
+    doc_dict = example_dict.get("doc_annotation", {})
+    for key, value in example_dict.items():
+        if value:
+            if key in ("token_annotation", "doc_annotation"):
+                pass
+            elif key == "ids":
+                pass
+            elif key in ("cats", "links"):
+                doc_dict[key] = value
+            elif key in ("ner", "entities"):
+                doc_dict["entities"] = value
+            else:
+                token_dict[key] = value
+    # Remap keys
+    remapping = {
+        "words": "ORTH",
+        "tags": "TAG",
+        "pos": "POS",
+        "lemmas": "LEMMA",
+        "deps": "DEP",
+        "heads": "HEAD",
+        "sent_starts": "SENT_START",
+        "morphs": "MORPH",
+        "spaces": "SPACY",
+    }
+    old_token_dict = token_dict
+    token_dict = {}
+    for key, value in old_token_dict.items():
+        # "raw" is the alternative spelling of "text" that is read below; it
+        # must be skipped here as well, otherwise it raises E983 before the
+        # fallback can ever be used.
+        if key in ("text", "raw", "ids", "brackets"):
+            pass
+        elif key.lower() in remapping:
+            token_dict[remapping[key.lower()]] = value
+        else:
+            raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
+    text = example_dict.get("text", example_dict.get("raw"))
+    if _has_field(token_dict, "ORTH") and not _has_field(token_dict, "SPACY"):
+        token_dict["SPACY"] = _guess_spaces(text, token_dict["ORTH"])
+    if "HEAD" in token_dict and "SENT_START" in token_dict:
+        # If heads are set, we don't also redundantly specify SENT_START.
+        token_dict.pop("SENT_START")
+        warnings.warn(Warnings.W092)
+    return {
+        "token_annotation": token_dict,
+        "doc_annotation": doc_dict
+    }
+
+def _has_field(annot, field):
+    """Check whether `annot[field]` holds at least one real value.
+
+    RETURNS (bool): False if the field is absent, None, empty, or consists
+        only of None entries; True otherwise.
+    """
+    if field not in annot:
+        return False
+    entries = annot[field]
+    if entries is None or len(entries) == 0:
+        return False
+    return not all([entry is None for entry in entries])
+
+
+def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
+    """Split entity annotations into per-token IOB codes and entity types.
+
+    biluo_or_offsets (list): BILUO tag strings, or offset tuples/lists that
+        are first converted to BILUO tags. May be empty.
+    vocab, words, spaces: Used to build the Doc the offsets refer to.
+    RETURNS (tuple): `(ent_iobs, ent_types)`, two lists of strings with one
+        entry per tag. Missing tags (None or "-") give empty strings.
+    """
+    # Guard against an empty list before peeking at the first element.
+    if biluo_or_offsets and isinstance(biluo_or_offsets[0], (list, tuple)):
+        # Convert to biluo if necessary
+        # This is annoying but to convert the offsets we need a Doc
+        # that has the target tokenization.
+        reference = Doc(vocab, words=words, spaces=spaces)
+        biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
+    else:
+        biluo = biluo_or_offsets
+    ent_iobs = []
+    ent_types = []
+    for iob_tag in biluo_to_iob(biluo):
+        if iob_tag in (None, "-"):
+            ent_iobs.append("")
+            ent_types.append("")
+        else:
+            ent_iobs.append(iob_tag.split("-")[0])
+            if iob_tag.startswith(("I", "B")):
+                ent_types.append(iob_tag.split("-", 1)[1])
+            else:
+                ent_types.append("")
+    return ent_iobs, ent_types
+
+def _parse_links(vocab, words, spaces, links):
+    """Turn character-offset entity links into per-token KB ids.
+
+    vocab, words, spaces: Used to build the Doc the offsets refer to.
+    links (dict): Maps `(start_char, end_char)` to a dict of
+        `{kb_id: value}`; only ids whose value equals 1.0 are used.
+    RETURNS (list): One KB id string per token, "" where no link applies.
+    RAISES (ValueError): If a span has more than one id with value 1.0, or
+        if its offsets do not fall on token boundaries.
+    """
+    reference = Doc(vocab, words=words, spaces=spaces)
+    token_by_start = {token.idx: token.i for token in reference}
+    token_by_end = {token.idx + len(token): token.i for token in reference}
+    ent_kb_ids = ["" for _ in reference]
+
+    for offsets, candidates in links.items():
+        gold_ids = [kb_id for kb_id, score in candidates.items() if score == 1.0]
+        if len(gold_ids) > 1:
+            raise ValueError(Errors.E980)
+        if len(gold_ids) != 1:
+            continue
+
+        start_char, end_char = offsets
+        first_token = token_by_start.get(start_char)
+        last_token = token_by_end.get(end_char)
+        if first_token is None or last_token is None:
+            raise ValueError(Errors.E981)
+        # last_token is inclusive.
+        for i in range(first_token, last_token+1):
+            ent_kb_ids[i] = gold_ids[0]
+
+    return ent_kb_ids
+
+
+def _guess_spaces(text, words):
+    """Guess for each word whether it is followed by a space in `text`.
+
+    text (str): The raw text, or None.
+    words (list): The words, in the order they occur in the text.
+    RETURNS (list): One bool per word, or None if `text` is None. Words that
+        cannot be found after the previous match are assumed to be followed
+        by a space.
+    """
+    if text is None:
+        return None
+    spaces = []
+    text_pos = 0
+    # align words with text
+    for word in words:
+        # Search from the current offset instead of slicing the text, which
+        # would copy the remaining text once per word.
+        word_start = text.find(word, text_pos)
+        if word_start == -1:
+            spaces.append(True)
+            continue
+        text_pos = word_start + len(word)
+        spaces.append(text_pos < len(text) and text[text_pos] == " ")
+    return spaces
--- a/spacy/gold/gold_io.pyx
+++ b/spacy/gold/gold_io.pyx
@ -0,0 +1,201 @@
+import warnings
+import srsly
+from .. import util
+from ..errors import Warnings
+from ..tokens import Doc
+from .iob_utils import biluo_tags_from_offsets, tags_to_entities
+import json
+
+
+def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
+    """Convert a list of Doc objects into the JSON-serializable format used by
+    the spacy train command.
+
+    docs (iterable / Doc): The Doc object(s) to convert.
+    doc_id (int): Id for the JSON.
+    ner_missing_tag (str): Passed as `missing` to biluo_tags_from_offsets
+        when computing the token-level "ner" tags.
+    RETURNS (dict): The data in spaCy's JSON format
+        - each input doc will be treated as a paragraph in the output doc
+    """
+    if isinstance(docs, Doc):
+        docs = [docs]
+    paragraphs = []
+    for doc in docs:
+        raw_text = doc.text
+        cats = [{"label": label, "value": score} for label, score in doc.cats.items()]
+        # warning: entities information is currently duplicated as
+        # doc-level "entities" and token-level "ner"
+        entities = []
+        links = []
+        for ent in doc.ents:
+            entities.append((ent.start_char, ent.end_char, ent.label_))
+            if ent.kb_id_:
+                links.append({(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}})
+        biluo_tags = biluo_tags_from_offsets(doc, entities, missing=ner_missing_tag)
+        sentences = []
+        for sent in doc.sents:
+            tokens = []
+            for token in sent:
+                json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
+                if doc.is_tagged:
+                    json_token["tag"] = token.tag_
+                    json_token["pos"] = token.pos_
+                    json_token["morph"] = token.morph_
+                    json_token["lemma"] = token.lemma_
+                if doc.is_parsed:
+                    # Heads are stored as offsets relative to the token.
+                    json_token["head"] = token.head.i-token.i
+                    json_token["dep"] = token.dep_
+                json_token["ner"] = biluo_tags[token.i]
+                tokens.append(json_token)
+            sentences.append({"tokens": tokens, "brackets": []})
+        paragraphs.append(
+            {'raw': raw_text, "sentences": sentences, "cats": cats, "entities": entities, "links": links}
+        )
+    return {"id": doc_id, "paragraphs": paragraphs}
+
+
+def read_json_file(loc, docs_filter=None, limit=None):
+    """Read Example dictionaries from a json file or directory.
+
+    loc (str / Path): A JSON file, or a directory that is read recursively.
+    docs_filter (callable): Optional predicate on each JSON doc; docs for
+        which it returns a falsy value are skipped.
+    limit: Accepted and passed on when recursing, but not otherwise used by
+        this function.
+    YIELDS (dict): One annotation dict per paragraph.
+    """
+    loc = util.ensure_path(loc)
+    if loc.is_dir():
+        for filename in loc.iterdir():
+            # iterdir() already yields paths that include `loc`; joining them
+            # onto `loc` again breaks relative directories. Also keep the
+            # filter active for nested files.
+            yield from read_json_file(filename, docs_filter=docs_filter, limit=limit)
+    else:
+        with loc.open("rb") as file_:
+            utf8_str = file_.read()
+        for json_doc in json_iterate(utf8_str):
+            if docs_filter is not None and not docs_filter(json_doc):
+                continue
+            for json_paragraph in json_to_annotations(json_doc):
+                yield json_paragraph
+
+
+def json_to_annotations(doc):
+    """Convert an item in the JSON-formatted training data to the format
+    used by Example.
+
+    doc (dict): One entry in the training data.
+    YIELDS (dict): The reformatted data - one training example per paragraph,
+        with the keys "text", "token_annotation" and "doc_annotation".
+    """
+    for paragraph in doc["paragraphs"]:
+        words, spaces, ids = [], [], []
+        tags, pos, morphs, lemmas = [], [], [], []
+        heads, labels, ner_tags = [], [], []
+        sent_starts = []
+        brackets = []
+        for sent in paragraph["sentences"]:
+            # Index of this sentence's first token within the paragraph.
+            offset = len(words)
+            for i, token in enumerate(sent["tokens"]):
+                words.append(token["orth"])
+                spaces.append(token.get("space", None))
+                ids.append(token.get('id', offset + i))
+                tags.append(token.get("tag", None))
+                pos.append(token.get("pos", None))
+                morphs.append(token.get("morph", None))
+                lemmas.append(token.get("lemma", None))
+                # Heads are stored relative to the token; make them absolute.
+                heads.append(token["head"] + offset + i if "head" in token else None)
+                if "dep" in token:
+                    dep = token["dep"]
+                    # Ensure ROOT label is case-insensitive
+                    if dep.lower() == "root":
+                        dep = "ROOT"
+                    labels.append(dep)
+                else:
+                    labels.append(None)
+                ner_tags.append(token.get("ner", None))
+                sent_starts.append(1 if i == 0 else 0)
+            if "brackets" in sent:
+                brackets.extend(
+                    [
+                        (b["first"] + offset, b["last"] + offset, b["label"])
+                        for b in sent["brackets"]
+                    ]
+                )
+
+        token_annotation = {
+            "ids": ids,
+            "words": words,
+            "spaces": spaces,
+            "sent_starts": sent_starts,
+            "brackets": brackets,
+        }
+        # Avoid including dummy values that look like gold info was present.
+        if any(tags):
+            token_annotation["tags"] = tags
+        if any(pos):
+            token_annotation["pos"] = pos
+        if any(morphs):
+            token_annotation["morphs"] = morphs
+        if any(lemmas):
+            token_annotation["lemmas"] = lemmas
+        if any(head is not None for head in heads):
+            token_annotation["heads"] = heads
+        if any(labels):
+            token_annotation["deps"] = labels
+
+        cats = {cat["label"]: cat["value"] for cat in paragraph.get("cats", {})}
+        yield {
+            "text": paragraph.get("raw", None),
+            "token_annotation": token_annotation,
+            "doc_annotation": {
+                "cats": cats,
+                "entities": ner_tags,
+                "links": paragraph.get("links", []),
+            },
+        }
+
+def json_iterate(bytes utf8_str):
+    """Yield the JSON objects contained in a top-level JSON array, parsing
+    them one at a time.
+
+    utf8_str (bytes): The UTF-8 encoded JSON data.
+    YIELDS: The result of srsly.json_loads for each `{...}` object found
+        directly inside the outermost `[...]`.
+    """
+    # We should've made these files jsonl...But since we didn't, parse out
+    # the docs one-by-one to reduce memory usage.
+    # It's okay to read in the whole file -- just don't parse it into JSON.
+    cdef long file_length = len(utf8_str)
+    if file_length > 2 ** 30:
+        warnings.warn(Warnings.W027.format(size=file_length))
+
+    raw = <char*>utf8_str
+    # Scanner state: bracket nesting depths, whether we are inside a string
+    # literal, and whether the previous byte was a backslash.
+    cdef int square_depth = 0
+    cdef int curly_depth = 0
+    cdef int inside_string = 0
+    cdef int escape = 0
+    # Byte offset where the current top-level object starts (-1: none open).
+    cdef long start = -1
+    cdef char c
+    cdef char quote = ord('"')
+    cdef char backslash = ord("\\")
+    cdef char open_square = ord("[")
+    cdef char close_square = ord("]")
+    cdef char open_curly = ord("{")
+    cdef char close_curly = ord("}")
+    for i in range(file_length):
+        c = raw[i]
+        # The byte following a backslash is skipped, so an escaped quote
+        # does not toggle the string state.
+        if escape:
+            escape = False
+            continue
+        if c == backslash:
+            escape = True
+            continue
+        if c == quote:
+            inside_string = not inside_string
+            continue
+        # Brackets inside string literals do not affect nesting.
+        if inside_string:
+            continue
+        if c == open_square:
+            square_depth += 1
+        elif c == close_square:
+            square_depth -= 1
+        elif c == open_curly:
+            # An object opening directly inside the outermost array starts
+            # a new document.
+            if square_depth == 1 and curly_depth == 0:
+                start = i
+            curly_depth += 1
+        elif c == close_curly:
+            curly_depth -= 1
+            # The matching close of that object: decode and parse the slice.
+            if square_depth == 1 and curly_depth == 0:
+                substr = utf8_str[start : i + 1].decode("utf8")
+                yield srsly.json_loads(substr)
+                start = -1
--- a/spacy/gold/iob_utils.py
+++ b/spacy/gold/iob_utils.py
@ -0,0 +1,209 @@
+import warnings
+from ..errors import Errors, Warnings
+from ..tokens import Span
+
+
+def iob_to_biluo(tags):
+    """Convert a sequence of IOB tags into the equivalent BILUO tags.
+
+    tags (iterable): The IOB tags, one per token.
+    RETURNS (list): The BILUO tags, one per token.
+    """
+    # Work on a private copy: the helpers consume tags from the front.
+    remaining = list(tags)
+    biluo = []
+    while remaining:
+        # First any run of "O" tags, then exactly one entity.
+        biluo += _consume_os(remaining)
+        biluo += _consume_ent(remaining)
+    return biluo
+
+
+def biluo_to_iob(tags):
+    """Convert a sequence of BILUO tags into the equivalent IOB tags.
+
+    Only the leading action prefix is rewritten ("U-" becomes "B-" and "L-"
+    becomes "I-"); the label is left untouched, so labels that themselves
+    contain "U-" or "L-" (e.g. "B-MODEL-X") are preserved.
+
+    tags (iterable): The BILUO tags, one per token. None entries are kept.
+    RETURNS (list): The IOB tags, one per token.
+    """
+    out = []
+    for tag in tags:
+        if tag is not None:
+            # Match the prefix at position 0 only, never inside the label.
+            if tag.startswith("U-"):
+                tag = "B-" + tag[2:]
+            elif tag.startswith("L-"):
+                tag = "I-" + tag[2:]
+        out.append(tag)
+    return out
+
+
+def _consume_os(tags):
+    """Pop and yield the leading run of "O" tags from `tags` (in place)."""
+    while True:
+        if not tags or tags[0] != "O":
+            return
+        yield tags.pop(0)
+
+
+def _consume_ent(tags):
+    """Pop one entity from the front of `tags` and return its BILUO tags.
+
+    The first tag is always consumed; following tags are consumed for as long
+    as they are the "I" or "L" form of that first tag.
+
+    tags (list): The remaining tags; modified in place.
+    RETURNS (list): The BILUO tags for the consumed entity ([] if `tags` is
+        empty).
+    RAISES (ValueError): If a single-token entity has an empty label.
+    """
+    if not tags:
+        return []
+    first = tags.pop(0)
+    suffix = first[1:]
+    continuations = {"I" + suffix, "L" + suffix}
+    length = 1
+    while tags and tags[0] in continuations:
+        tags.pop(0)
+        length += 1
+    label = first[2:]
+    if length > 1:
+        # B, then (length - 2) inner tokens, then L.
+        inner = [f"I-{label}"] * (length - 2)
+        return ["B-" + label, *inner, "L-" + label]
+    if not label:
+        raise ValueError(Errors.E177.format(tag=first))
+    return ["U-" + label]
+
+
+def biluo_tags_from_doc(doc, missing="O"):
+    """Return per-token BILUO tags for the entities in `doc.ents`.
+
+    doc (Doc): The document whose entities are encoded.
+    missing (str): Passed through to `biluo_tags_from_offsets`.
+    RETURNS (list): One BILUO tag per token.
+    """
+    ent_offsets = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
+    return biluo_tags_from_offsets(doc, ent_offsets, missing=missing)
+
+
+def biluo_tags_from_offsets(doc, entities, missing="O"):
+    """Encode labelled spans into per-token tags, using the
+    Begin/In/Last/Unit/Out scheme (BILUO).
+
+    doc (Doc): The document that the entity offsets refer to. The output tags
+        will refer to the token boundaries within the document.
+    entities (iterable): A sequence of `(start, end, label)` triples. `start`
+        and `end` should be character-offset integers denoting the slice into
+        the original string. A triple with an empty label marks the tokens
+        starting inside it as "O".
+    missing (str): The tag given to tokens that contain no character of any
+        entity span. Defaults to "O".
+    RETURNS (list): A list of unicode strings, describing the tags. Each tag
+        string will be of the form either "", "O" or "{action}-{label}", where
+        action is one of "B", "I", "L", "U". The string "-" is used where the
+        entity offsets don't align with the tokenization in the `Doc` object.
+        The training algorithm will view these as missing values. "O" denotes a
+        non-entity token. "B" denotes the beginning of a multi-token entity,
+        "I" the inside of an entity of three or more tokens, and "L" the end
+        of an entity of two or more tokens. "U" denotes a single-token entity.
+    RAISES (ValueError): If two labelled spans cover the same character
+        (Errors.E103).
+
+    EXAMPLE:
+        >>> text = 'I like London.'
+        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
+        >>> doc = nlp.tokenizer(text)
+        >>> tags = biluo_tags_from_offsets(doc, entities)
+        >>> assert tags == ["O", "O", 'U-LOC', "O"]
+    """
+    # Ensure no overlapping entity labels exist: maps each character offset
+    # covered by a labelled span to the (start, end, label) that claimed it.
+    tokens_in_ents = {}
+
+    # Character offset of each token's first char / one-past-last char,
+    # mapped to the token's index.
+    starts = {token.idx: token.i for token in doc}
+    ends = {token.idx + len(token): token.i for token in doc}
+    # Every token starts out as "-" (misaligned / unknown).
+    biluo = ["-" for _ in doc]
+    # Handle entity cases
+    for start_char, end_char, label in entities:
+        if not label:
+            # Span without a label: tag every token that starts inside it "O".
+            for s in starts:  # account for many-to-one
+                if s >= start_char and s < end_char:
+                    biluo[starts[s]] = "O"
+        else:
+            # NOTE: despite its name, `token_index` is a character offset.
+            for token_index in range(start_char, end_char):
+                if token_index in tokens_in_ents.keys():
+                    raise ValueError(
+                        Errors.E103.format(
+                            span1=(
+                                tokens_in_ents[token_index][0],
+                                tokens_in_ents[token_index][1],
+                                tokens_in_ents[token_index][2],
+                            ),
+                            span2=(start_char, end_char, label),
+                        )
+                    )
+                tokens_in_ents[token_index] = (start_char, end_char, label)
+
+            start_token = starts.get(start_char)
+            end_token = ends.get(end_char)
+            # Only interested if the tokenization is correct, i.e. the span
+            # begins and ends exactly on token boundaries.
+            if start_token is not None and end_token is not None:
+                if start_token == end_token:
+                    biluo[start_token] = f"U-{label}"
+                else:
+                    biluo[start_token] = f"B-{label}"
+                    for i in range(start_token + 1, end_token):
+                        biluo[i] = f"I-{label}"
+                    biluo[end_token] = f"L-{label}"
+    # Now distinguish the O cases from ones where we miss the tokenization
+    entity_chars = set()
+    for start_char, end_char, label in entities:
+        for i in range(start_char, end_char):
+            entity_chars.add(i)
+    for token in doc:
+        for i in range(token.idx, token.idx + len(token)):
+            if i in entity_chars:
+                break
+        else:
+            # No character of this token lies inside any entity span.
+            biluo[token.i] = missing
+    # Any "-" left over is a token that overlaps a span without aligning to it.
+    if "-" in biluo and missing != "-":
+        ent_str = str(entities)
+        warnings.warn(
+            Warnings.W030.format(
+                text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
+                entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
+            )
+        )
+    return biluo
+
+
+def spans_from_biluo_tags(doc, tags):
+    """Convert per-token tags following the BILUO scheme into Span objects,
+    e.g. to overwrite the doc.ents.
+
+    doc (Doc): The document that the BILUO tags refer to.
+    tags (iterable): A sequence of BILUO tags with each tag describing one
+        token. Each tag is expected to be "O", "-" or "{action}-{label}",
+        where action is one of "B", "I", "L", "U".
+    RETURNS (list): A sequence of Span objects.
+    """
+    # tags_to_entities returns inclusive end indices; Span takes an
+    # exclusive end, hence the + 1.
+    return [
+        Span(doc, first, last + 1, label=label)
+        for label, first, last in tags_to_entities(tags)
+    ]
+
+
+def offsets_from_biluo_tags(doc, tags):
+    """Convert per-token tags following the BILUO scheme into entity offsets.
+
+    doc (Doc): The document that the BILUO tags refer to.
+    tags (iterable): A sequence of BILUO tags with each tag describing one
+        token. Each tag is expected to be "O", "-" or "{action}-{label}",
+        where action is one of "B", "I", "L", "U".
+    RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
+        `end` will be character-offset integers denoting the slice into the
+        original string.
+    """
+    offsets = []
+    for span in spans_from_biluo_tags(doc, tags):
+        offsets.append((span.start_char, span.end_char, span.label_))
+    return offsets
+
+
+def tags_to_entities(tags):
+    """Turn a BILUO tag sequence into `(label, start, end)` token triples.
+
+    Note that the end index returned by this function is inclusive.
+    To use it for Span creation, increment the end by 1.
+
+    tags (iterable): The BILUO tags, one per token. None and "-" are skipped.
+    RETURNS (list): `(label, start, end)` triples. An "O" tag outside an open
+        entity yields an entry with an empty label for that token.
+    RAISES (ValueError): For an "I" tag with no open entity (E067) or a tag
+        with an unknown prefix (E068).
+    """
+    found = []
+    open_at = None
+    for idx, tag in enumerate(tags):
+        if tag is None or tag == "-":
+            continue
+        if tag.startswith("O"):
+            # TODO: We shouldn't be getting these malformed inputs. Fix this.
+            if open_at is None:
+                found.append(("", idx, idx))
+            else:
+                open_at = None
+        elif tag.startswith("I"):
+            if open_at is None:
+                raise ValueError(Errors.E067.format(tags=tags[: idx + 1]))
+        elif tag.startswith("U"):
+            found.append((tag[2:], idx, idx))
+        elif tag.startswith("B"):
+            open_at = idx
+        elif tag.startswith("L"):
+            found.append((tag[2:], open_at, idx))
+            open_at = None
+        else:
+            raise ValueError(Errors.E068.format(tag=tag))
+    return found
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@ -1,15 +1,15 @@
 """Knowledge-base for entity or concept linking."""
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
-
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 from libc.stdio cimport FILE

 from .vocab cimport Vocab
 from .typedefs cimport hash_t
-
 from .structs cimport KBEntryC, AliasC
+
+
 ctypedef vector[KBEntryC] entry_vec
 ctypedef vector[AliasC] alias_vec
 ctypedef vector[float] float_vec
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@ -1,6 +1,4 @@
-# cython: infer_types=True
-# cython: profile=True
-# coding: utf8
+# cython: infer_types=True, profile=True
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from cpython.exc cimport PyErr_SetFromErrno
@ -8,12 +6,11 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
 from libc.stdint cimport int32_t, int64_t
 from libcpp.vector cimport vector

+from pathlib import Path
 import warnings
 from os import path
-from pathlib import Path

 from .typedefs cimport hash_t
-
 from .errors import Errors, Warnings


@ -41,7 +38,7 @@ cdef class Candidate:

    @property
    def entity_(self):
-        """RETURNS (unicode): ID/name of this entity in the KB"""
+        """RETURNS (str): ID/name of this entity in the KB"""
        return self.kb.vocab.strings[self.entity_hash]

    @property
@ -51,7 +48,7 @@ cdef class Candidate:

    @property
    def alias_(self):
-        """RETURNS (unicode): ID of the original alias"""
+        """RETURNS (str): ID of the original alias"""
        return self.kb.vocab.strings[self.alias_hash]

    @property
@ -445,6 +442,8 @@ cdef class KnowledgeBase:

 cdef class Writer:
    def __init__(self, object loc):
+        if path.exists(loc):
+            assert not path.isdir(loc), f"{loc} is directory"
        if isinstance(loc, Path):
            loc = bytes(loc)
        if path.exists(loc):
--- a/spacy/lang/af/init.py
+++ b/spacy/lang/af/init.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from ...language import Language
 from ...attrs import LANG
--- a/Show More
+++ b/Show More