Merge branch 'develop' into master-tmp

Ines Montani 2020-06-20 15:52:00 +02:00
commit 52728d8fa3
711 changed files with 11924 additions and 11926 deletions

.gitignore

@ -44,6 +44,7 @@ __pycache__/
.env*
.~env/
.venv
env3.6/
venv/
env3.*/
.dev
@ -118,3 +119,6 @@ Desktop.ini
# Pycharm project files
*.idea
# IPython
.ipynb_checkpoints/


@ -1,23 +0,0 @@
language: python
sudo: false
cache: pip
dist: trusty
group: edge
python:
- "2.7"
os:
- linux
install:
- "pip install -r requirements.txt"
- "python setup.py build_ext --inplace"
- "pip install -e ."
script:
- "cat /proc/cpuinfo | grep flags | head -n 1"
- "python -m pytest --tb=native spacy"
branches:
except:
- spacy.io
notifications:
slack:
secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
email: false


@ -280,23 +280,7 @@ except: # noqa: E722
### Python conventions
All Python code must be written in an **intersection of Python 2 and Python 3**.
This is easy in Cython, but somewhat ugly in Python. Logic that deals with
Python or platform compatibility should only live in
[`spacy.compat`](spacy/compat.py). To distinguish them from the builtin
functions, replacement functions are suffixed with an underscore, for example
`unicode_`. If you need to access the user's version or platform information,
for example to show more specific error messages, you can use the `is_config()`
helper function.
```python
from .compat import unicode_, is_config
compatible_unicode = unicode_('hello world')
if is_config(windows=True, python2=True):
print("You are using Python 2 on Windows.")
```
All Python code must be written to be **compatible with Python 3.6+**.
Code that interacts with the file-system should accept objects that follow the
`pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`.
If the function is user-facing and takes a path as an argument, it should check
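
As a rough illustration of the convention above (a minimal sketch with a made-up function name, not an actual spaCy helper):

```python
from pathlib import Path

def read_labels(path):
    # Hypothetical user-facing function: accept a plain string or any object
    # that follows the pathlib.Path API, without requiring a Path subclass.
    if isinstance(path, str):
        path = Path(path)
    with path.open("r", encoding="utf8") as file_:
        return [line.strip() for line in file_ if line.strip()]
```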


@ -1,5 +1,5 @@
recursive-include include *.h
recursive-include spacy *.txt *.pyx *.pxd
recursive-include spacy *.pyx *.pxd *.txt *.cfg
include LICENSE
include README.md
include bin/spacy


@ -5,7 +5,7 @@ VENV := ./env$(PYVER)
version := $(shell "bin/get-version.sh")
dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core
$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core
chmod a+rx $@
cp $@ dist/spacy.pex
@ -15,7 +15,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl
wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
$(VENV)/bin/pip wheel . -w ./wheelhouse
$(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core -w ./wheelhouse
$(VENV)/bin/pip wheel spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core -w ./wheelhouse
touch $@
wheelhouse/pytest-%.whl : $(VENV)/bin/pex


@ -15,7 +15,6 @@ It's commercial open-source software, released under the MIT license.
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](<https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build+(3.x)>)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
[![Travis Build Status](<https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis-ci&logoColor=white&label=build+(2.7)>)](https://travis-ci.org/explosion/spaCy)
[![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases)
[![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
[![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
@ -98,12 +97,19 @@ For detailed installation instructions, see the
- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
Studio)
- **Python version**: Python 2.7, 3.5+ (only 64 bit)
- **Python version**: Python 3.6+ (only 64 bit)
- **Package managers**: [pip] · [conda] (via `conda-forge`)
[pip]: https://pypi.org/project/spacy/
[conda]: https://anaconda.org/conda-forge/spacy
> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary
> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI
> providers and other tooling to support it. This means that in order to run
> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile
> the library and its Cython dependencies locally. If this is causing problems
> for you, the easiest solution is to **use Python 3.7** in the meantime.
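
If you're unsure whether your interpreter matches the requirements above, a quick check using only the standard library (nothing spaCy-specific) is:

```python
import platform
import struct

# Check against the requirements above: Python 3.6+ and a 64-bit build
# (a pointer size of 8 bytes indicates 64-bit).
print("Python", platform.python_version())
print(struct.calcsize("P") * 8, "bit")
```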
### pip
Using pip, spaCy releases are available as source packages and binary wheels (as
@ -263,9 +269,7 @@ and git preinstalled.
Install a version of the
[Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that
matches the version that was used to compile your Python interpreter. For
official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and
VS 2015 (Python 3.5).
matches the version that was used to compile your Python interpreter.
## Run tests


@ -27,7 +27,7 @@ jobs:
inputs:
versionSpec: '3.7'
- script: |
pip install flake8
pip install flake8==3.5.0
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
displayName: 'flake8'
@ -35,12 +35,6 @@ jobs:
dependsOn: 'Validate'
strategy:
matrix:
Python35Linux:
imageName: 'ubuntu-16.04'
python.version: '3.5'
Python35Windows:
imageName: 'vs2017-win2016'
python.version: '3.5'
Python36Linux:
imageName: 'ubuntu-16.04'
python.version: '3.6'
@ -58,7 +52,7 @@ jobs:
# imageName: 'vs2017-win2016'
# python.version: '3.7'
# Python37Mac:
# imageName: 'macos-10.13'
# imageName: 'macos-10.14'
# python.version: '3.7'
Python38Linux:
imageName: 'ubuntu-16.04'


@ -1,169 +0,0 @@
#!/usr/bin/env python
""" cythonize.py
Cythonize pyx files into C++ files as needed.
Usage: cythonize.py [root]
Checks pyx files to see if they have been changed relative to their
corresponding C++ files. If they have, then runs cython on these files to
recreate the C++ files.
Additionally, checks pxd files and setup.py if they have been changed. If
they have, rebuilds everything.
Change detection based on file hashes stored in JSON format.
For now, this script should be run by developers when changing Cython files
and the resulting C++ files checked in, so that end-users (and Python-only
developers) do not get the Cython dependencies.
Based upon:
https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py
https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py
Note: this script does not check any of the dependent C++ libraries.
"""
from __future__ import print_function
import os
import sys
import json
import hashlib
import subprocess
import argparse
HASH_FILE = "cythonize.json"
def process_pyx(fromfile, tofile, language_level="-2"):
print("Processing %s" % fromfile)
try:
from Cython.Compiler.Version import version as cython_version
from distutils.version import LooseVersion
if LooseVersion(cython_version) < LooseVersion("0.19"):
raise Exception("Require Cython >= 0.19")
except ImportError:
pass
flags = ["--fast-fail", language_level]
if tofile.endswith(".cpp"):
flags += ["--cplus"]
try:
try:
r = subprocess.call(
["cython"] + flags + ["-o", tofile, fromfile], env=os.environ
) # See Issue #791
if r != 0:
raise Exception("Cython failed")
except OSError:
# There are ways of installing Cython that don't result in a cython
# executable on the path, see gh-2397.
r = subprocess.call(
[
sys.executable,
"-c",
"import sys; from Cython.Compiler.Main import "
"setuptools_main as main; sys.exit(main())",
]
+ flags
+ ["-o", tofile, fromfile]
)
if r != 0:
raise Exception("Cython failed")
except OSError:
raise OSError("Cython needs to be installed")
def preserve_cwd(path, func, *args):
orig_cwd = os.getcwd()
try:
os.chdir(path)
func(*args)
finally:
os.chdir(orig_cwd)
def load_hashes(filename):
try:
return json.load(open(filename))
except (ValueError, IOError):
return {}
def save_hashes(hash_db, filename):
with open(filename, "w") as f:
f.write(json.dumps(hash_db))
def get_hash(path):
return hashlib.md5(open(path, "rb").read()).hexdigest()
def hash_changed(base, path, db):
full_path = os.path.normpath(os.path.join(base, path))
return not get_hash(full_path) == db.get(full_path)
def hash_add(base, path, db):
full_path = os.path.normpath(os.path.join(base, path))
db[full_path] = get_hash(full_path)
def process(base, filename, db):
root, ext = os.path.splitext(filename)
if ext in [".pyx", ".cpp"]:
if hash_changed(base, filename, db) or not os.path.isfile(
os.path.join(base, root + ".cpp")
):
preserve_cwd(base, process_pyx, root + ".pyx", root + ".cpp")
hash_add(base, root + ".cpp", db)
hash_add(base, root + ".pyx", db)
def check_changes(root, db):
res = False
new_db = {}
setup_filename = "setup.py"
hash_add(".", setup_filename, new_db)
if hash_changed(".", setup_filename, db):
res = True
for base, _, files in os.walk(root):
for filename in files:
if filename.endswith(".pxd"):
hash_add(base, filename, new_db)
if hash_changed(base, filename, db):
res = True
if res:
db.clear()
db.update(new_db)
return res
def run(root):
db = load_hashes(HASH_FILE)
try:
check_changes(root, db)
for base, _, files in os.walk(root):
for filename in files:
process(base, filename, db)
finally:
save_hashes(db, HASH_FILE)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Cythonize pyx files into C++ files as needed"
)
parser.add_argument("root", help="root directory")
args = parser.parse_args()
run(args.root)


@ -13,23 +13,12 @@ import srsly
import spacy
import spacy.util
from spacy.tokens import Token, Doc
from spacy.gold import GoldParse
from spacy.util import compounding, minibatch_by_words
from spacy.syntax.nonproj import projectivize
from spacy.matcher import Matcher
# from spacy.morphology import Fused_begin, Fused_inside
from spacy import displacy
from collections import defaultdict, Counter
from timeit import default_timer as timer
Fused_begin = None
Fused_inside = None
import itertools
import random
import numpy.random
from . import conll17_ud_eval
from spacy import lang
@ -268,7 +257,7 @@ def load_nlp(experiments_dir, corpus):
return nlp
def initialize_pipeline(nlp, docs, golds, config, device):
def initialize_pipeline(nlp, examples, config, device):
nlp.add_pipe(nlp.create_pipe("parser"))
return nlp


@ -14,7 +14,7 @@ import spacy
import spacy.util
from bin.ud import conll17_ud_eval
from spacy.tokens import Token, Doc
from spacy.gold import GoldParse
from spacy.gold import GoldParse, Example
from spacy.util import compounding, minibatch, minibatch_by_words
from spacy.syntax.nonproj import projectivize
from spacy.matcher import Matcher
@ -53,7 +53,7 @@ def read_data(
max_doc_length=None,
limit=None,
):
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
"""Read the CONLLU format into Example objects. If raw_text=True,
include Doc objects created using nlp.make_doc and then aligned against
the gold-standard sequences. If oracle_segments=True, include Doc objects
created from the gold-standard segments. At least one must be True."""
@ -98,15 +98,16 @@ def read_data(
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
return docs, golds
return golds_to_gold_data(docs, golds)
if raw_text and sent_annots:
doc, gold = _make_gold(nlp, None, sent_annots)
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
return docs, golds
return docs, golds
return golds_to_gold_data(docs, golds)
return golds_to_gold_data(docs, golds)
def _parse_morph_string(morph_string):
if morph_string == '_':
@ -120,6 +121,7 @@ def _parse_morph_string(morph_string):
output.append('%s_%s' % (key, value.lower()))
return set(output)
def read_conllu(file_):
docs = []
sent = []
@ -180,16 +182,18 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
#############################
def golds_to_gold_tuples(docs, golds):
"""Get out the annoying 'tuples' format used by begin_training, given the
def golds_to_gold_data(docs, golds):
"""Get out the training data format used by begin_training, given the
GoldParse objects."""
tuples = []
data = []
for doc, gold in zip(docs, golds):
text = doc.text
ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
sents = [((ids, words, tags, heads, labels, iob), [])]
tuples.append((text, sents))
return tuples
example = Example(doc=doc)
example.add_doc_annotation(cats=gold.cats)
token_annotation_dict = gold.orig.to_dict()
example.add_token_annotation(**token_annotation_dict)
example.goldparse = gold
data.append(example)
return data
##############
@ -327,7 +331,6 @@ def get_token_conllu(token, i):
return "\n".join(lines)
##################
# Initialization #
##################
@ -348,7 +351,7 @@ def load_nlp(corpus, config, vectors=None):
return nlp
def initialize_pipeline(nlp, docs, golds, config, device):
def initialize_pipeline(nlp, examples, config, device):
nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False}))
nlp.add_pipe(nlp.create_pipe("morphologizer"))
nlp.add_pipe(nlp.create_pipe("parser"))
@ -356,14 +359,15 @@ def initialize_pipeline(nlp, docs, golds, config, device):
nlp.parser.add_multitask_objective("tag")
if config.multitask_sent:
nlp.parser.add_multitask_objective("sent_start")
for gold in golds:
for ex in examples:
gold = ex.gold
for tag in gold.tags:
if tag is not None:
nlp.tagger.add_label(tag)
if torch is not None and device != -1:
torch.set_default_tensor_type("torch.cuda.FloatTensor")
optimizer = nlp.begin_training(
lambda: golds_to_gold_tuples(docs, golds),
lambda: examples,
device=device,
subword_features=config.subword_features,
conv_depth=config.conv_depth,
@ -382,8 +386,8 @@ def _load_pretrained_tok2vec(nlp, loc):
weights_data = file_.read()
loaded = []
for name, component in nlp.pipeline:
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
component.tok2vec.from_bytes(weights_data)
if hasattr(component, "model") and component.model.has_ref("tok2vec"):
component.get_ref("tok2vec").from_bytes(weights_data)
loaded.append(name)
return loaded
@ -491,6 +495,10 @@ def main(
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
spacy.util.fix_random_seed()
lang.zh.Chinese.Defaults.use_jieba = False
lang.ja.Japanese.Defaults.use_janome = False
@ -505,7 +513,7 @@ def main(
print("Train and evaluate", corpus, "using lang", paths.lang)
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
docs, golds = read_data(
examples = read_data(
nlp,
paths.train.conllu.open(encoding="utf8"),
paths.train.text.open(encoding="utf8"),
@ -513,12 +521,12 @@ def main(
limit=limit,
)
optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
optimizer = initialize_pipeline(nlp, examples, config, gpu_device)
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
beam_prob = compounding(0.2, 0.8, 1.001)
for i in range(config.nr_epoch):
docs, golds = read_data(
examples = read_data(
nlp,
paths.train.conllu.open(encoding="utf8"),
paths.train.text.open(encoding="utf8"),
@ -527,22 +535,19 @@ def main(
oracle_segments=use_oracle_segments,
raw_text=not use_oracle_segments,
)
Xs = list(zip(docs, golds))
random.shuffle(Xs)
random.shuffle(examples)
if config.batch_by_words:
batches = minibatch_by_words(Xs, size=batch_sizes)
batches = minibatch_by_words(examples, size=batch_sizes)
else:
batches = minibatch(Xs, size=batch_sizes)
batches = minibatch(examples, size=batch_sizes)
losses = {}
n_train_words = sum(len(doc) for doc in docs)
n_train_words = sum(len(ex.doc) for ex in examples)
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
for batch in batches:
batch_docs, batch_gold = zip(*batch)
pbar.update(sum(len(doc) for doc in batch_docs))
pbar.update(sum(len(ex.doc) for ex in batch))
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
nlp.update(
batch_docs,
batch_gold,
batch,
sgd=optimizer,
drop=config.dropout,
losses=losses,


@ -14,7 +14,7 @@ pip install keras==2.0.9
Compatible with: spaCy v2.0.0+
"""
import ml_datasets
import plac
import random
import pathlib
@ -24,7 +24,6 @@ from keras.models import Sequential, model_from_json
from keras.layers import LSTM, Dense, Embedding, Bidirectional
from keras.layers import TimeDistributed
from keras.optimizers import Adam
import thinc.extra.datasets
from spacy.compat import pickle
import spacy
@ -224,7 +223,7 @@ def main(
if model_dir is not None:
model_dir = pathlib.Path(model_dir)
if train_dir is None or dev_dir is None:
imdb_data = thinc.extra.datasets.imdb()
imdb_data = ml_datasets.imdb()
if is_runtime:
if dev_dir is None:
dev_texts, dev_labels = zip(*imdb_data[1])
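
Several of the example scripts in this commit swap `thinc.extra.datasets` for the standalone `ml_datasets` package. Going only by how the surrounding diffs unpack the return value, a minimal usage sketch:

```python
import ml_datasets

# As used in the examples here: imdb() returns a (train, dev) pair, and each
# split is a sequence of (text, label) tuples.
train_data, dev_data = ml_datasets.imdb()
train_texts, train_labels = zip(*train_data)
print(len(train_texts), "training texts")
```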


@ -0,0 +1,126 @@
# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 0
limit = 0
# Data augmentation
orth_variant_level = 0.0
noise_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 400
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false
omit_extra_lookups = false
[training.batch_size]
@schedules = "compounding.v1"
start = 1000
stop = 1000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
#[optimizer.learn_rate]
#@schedules = "warmup_linear.v1"
#warmup_steps = 250
#total_steps = 20000
#initial_rate = 0.001
[nlp]
lang = "en"
vectors = null
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.senter]
factory = "senter"
[nlp.pipeline.ner]
factory = "ner"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0
[nlp.pipeline.tagger]
factory = "tagger"
[nlp.pipeline.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0
[nlp.pipeline.senter.model]
@architectures = "spacy.Tagger.v1"
[nlp.pipeline.senter.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"
[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = false
[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 3
use_upper = false
[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 256
depth = 6
window_size = 1
embed_size = 10000
maxout_pieces = 3
subword_features = true
dropout = null
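
The `${section:option}` references in these new config files use configparser-style extended interpolation. spaCy resolves them with its own config machinery; the snippet below is only a stdlib way to inspect such a file (the file name is hypothetical):

```python
from configparser import ConfigParser, ExtendedInterpolation

# NOT spaCy's loader -- just a quick look at how the ${section:option}
# references in the config above resolve.
parser = ConfigParser(interpolation=ExtendedInterpolation())
parser.read("defaults.cfg")  # hypothetical path to the config shown above
print(parser["training"]["dropout"])                         # "0.1"
print(parser["nlp.pipeline.tagger.model.tok2vec"]["width"])  # "256", via ${nlp.pipeline.tok2vec.model:width}
```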


@ -0,0 +1,145 @@
# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 0
limit = 0
# Data augmentation
orth_variant_level = 0.0
noise_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 400
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false
[training.batch_size]
@schedules = "compounding.v1"
start = 1000
stop = 1000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
[pretraining]
max_epochs = 1000
min_length = 5
max_length = 500
dropout = 0.2
n_save_every = null
batch_size = 3000
seed = ${training:seed}
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
tok2vec_model = "nlp.pipeline.tok2vec.model"
[pretraining.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
[pretraining.loss_func]
@losses = "CosineDistance.v1"
normalize = true
[nlp]
lang = "en"
vectors = null
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.senter]
factory = "senter"
[nlp.pipeline.ner]
factory = "ner"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0
[nlp.pipeline.tagger]
factory = "tagger"
[nlp.pipeline.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0
[nlp.pipeline.senter.model]
@architectures = "spacy.Tagger.v1"
[nlp.pipeline.senter.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"
[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = false
[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 3
use_upper = false
[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 256
depth = 6
window_size = 1
embed_size = 10000
maxout_pieces = 3
subword_features = true
dropout = null


@ -0,0 +1,74 @@
[training]
patience = 10000
eval_frequency = 200
dropout = 0.2
init_tok2vec = null
vectors = null
max_epochs = 100
orth_variant_level = 0.0
noise_level = 0.0
gold_preproc = true
max_length = 0
use_gpu = 0
scores = ["tags_acc", "uas", "las"]
score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false
[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999
[nlp]
lang = "en"
vectors = ${training:vectors}
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.tagger]
factory = "tagger"
[nlp.pipeline.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0
[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"
[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 64
maxout_pieces = 3
[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedBiLSTM.v1"
pretrained_vectors = ${nlp:vectors}
width = 96
depth = 4
embed_size = 2000
subword_features = true
maxout_pieces = 3
dropout = null


@ -0,0 +1,75 @@
[training]
patience = 10000
eval_frequency = 200
dropout = 0.2
init_tok2vec = null
vectors = null
max_epochs = 100
orth_variant_level = 0.0
noise_level = 0.0
gold_preproc = true
max_length = 0
use_gpu = -1
scores = ["tags_acc", "uas", "las"]
score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false
[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999
[nlp]
lang = "en"
vectors = ${training:vectors}
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.tagger]
factory = "tagger"
[nlp.pipeline.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0
[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"
[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 64
maxout_pieces = 3
[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 96
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
dropout = null


@ -0,0 +1,69 @@
[training]
use_gpu = -1
limit = 0
dropout = 0.2
patience = 10000
eval_frequency = 200
scores = ["ents_f"]
score_weights = {"ents_f": 1}
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
batch_size = 25
seed = 0
accumulate_gradient = 2
discard_oversize = false
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999
[nlp]
lang = "en"
vectors = null
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
[nlp.pipeline.tok2vec.model.extract]
@architectures = "spacy.CharacterEmbed.v1"
width = 96
nM = 64
nC = 8
rows = 2000
columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
dropout = null
[nlp.pipeline.tok2vec.model.extract.features]
@architectures = "spacy.Doc2Feats.v1"
columns = ${nlp.pipeline.tok2vec.model.extract:columns}
[nlp.pipeline.tok2vec.model.embed]
@architectures = "spacy.LayerNormalizedMaxout.v1"
width = ${nlp.pipeline.tok2vec.model.extract:width}
maxout_pieces = 4
[nlp.pipeline.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = ${nlp.pipeline.tok2vec.model.extract:width}
window_size = 1
maxout_pieces = 2
depth = 2
[nlp.pipeline.ner]
factory = "ner"
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2
[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model.extract:width}


@ -0,0 +1,48 @@
[training]
use_gpu = -1
limit = 0
dropout = 0.2
patience = 10000
eval_frequency = 200
scores = ["ents_p", "ents_r", "ents_f"]
score_weights = {"ents_f": 1}
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false
[training.batch_size]
@schedules = "compounding.v1"
start = 3000
stop = 3000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999
[nlp]
lang = "en"
vectors = null
[nlp.pipeline.ner]
factory = "simple_ner"
[nlp.pipeline.ner.model]
@architectures = "spacy.BiluoTagger.v1"
[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
width = 128
depth = 4
embed_size = 7000
maxout_pieces = 3
window_size = 1
subword_features = true
pretrained_vectors = null
dropout = null


@ -13,9 +13,10 @@ Prerequisites: pip install joblib
from __future__ import print_function, unicode_literals
from pathlib import Path
import ml_datasets
from joblib import Parallel, delayed
from functools import partial
import thinc.extra.datasets
import plac
import spacy
from spacy.util import minibatch
@ -35,7 +36,7 @@ def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10
output_dir.mkdir()
# load and pre-process the IMBD dataset
print("Loading IMDB data...")
data, _ = thinc.extra.datasets.imdb()
data, _ = ml_datasets.imdb()
texts, _ = zip(*data[-limit:])
print("Processing texts...")
partitions = minibatch(texts, size=batch_size)


@ -1,7 +1,7 @@
# coding: utf-8
"""
Example of a Streamlit app for an interactive spaCy model visualizer. You can
either download the script, or point streamlit run to the raw URL of this
either download the script, or point `streamlit run` to the raw URL of this
file. For more details, see https://streamlit.io.
Installation:
@ -15,6 +15,8 @@ streamlit run streamlit_spacy.py
"""
from __future__ import unicode_literals
import base64
import streamlit as st
import spacy
from spacy import displacy
@ -54,6 +56,14 @@ model_load_state.empty()
text = st.text_area("Text to analyze", DEFAULT_TEXT)
doc = process_text(spacy_model, text)
def render_svg(svg):
"""Renders the given svg string."""
b64 = base64.b64encode(svg.encode('utf-8')).decode("utf-8")
html = r'<img src="data:image/svg+xml;base64,%s"/>' % b64
st.write(html, unsafe_allow_html=True)
if "parser" in nlp.pipe_names:
st.header("Dependency Parse & Part-of-speech tags")
st.sidebar.header("Dependency Parse")
@ -68,12 +78,14 @@ if "parser" in nlp.pipe_names:
}
docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
for sent in docs:
html = displacy.render(sent, options=options)
html = displacy.render(sent, options=options, style="dep")
# Double newlines seem to mess with the rendering
html = html.replace("\n\n", "\n")
if split_sents and len(docs) > 1:
st.markdown(f"> {sent.text}")
st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
render_svg(html)
# this didn't show the dep arc labels properly, cf #5089
# st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
if "ner" in nlp.pipe_names:
st.header("Named Entities")


@ -12,7 +12,7 @@ import tqdm
import spacy
import spacy.util
from spacy.tokens import Token, Doc
from spacy.gold import GoldParse
from spacy.gold import GoldParse, Example
from spacy.syntax.nonproj import projectivize
from collections import defaultdict
from spacy.matcher import Matcher
@ -33,25 +33,25 @@ random.seed(0)
numpy.random.seed(0)
def minibatch_by_words(items, size=5000):
random.shuffle(items)
def minibatch_by_words(examples, size=5000):
random.shuffle(examples)
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
items = iter(items)
examples = iter(examples)
while True:
batch_size = next(size_)
batch = []
while batch_size >= 0:
try:
doc, gold = next(items)
example = next(examples)
except StopIteration:
if batch:
yield batch
return
batch_size -= len(doc)
batch.append((doc, gold))
batch_size -= len(example.doc)
batch.append(example)
if batch:
yield batch
else:
@ -78,7 +78,7 @@ def read_data(
max_doc_length=None,
limit=None,
):
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
"""Read the CONLLU format into Example objects. If raw_text=True,
include Doc objects created using nlp.make_doc and then aligned against
the gold-standard sequences. If oracle_segments=True, include Doc objects
created from the gold-standard segments. At least one must be True."""
@ -119,15 +119,15 @@ def read_data(
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
return docs, golds
return golds_to_gold_data(docs, golds)
if raw_text and sent_annots:
doc, gold = _make_gold(nlp, None, sent_annots)
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
return docs, golds
return docs, golds
return golds_to_gold_data(docs, golds)
return golds_to_gold_data(docs, golds)
def read_conllu(file_):
@ -181,16 +181,18 @@ def _make_gold(nlp, text, sent_annots):
#############################
def golds_to_gold_tuples(docs, golds):
"""Get out the annoying 'tuples' format used by begin_training, given the
def golds_to_gold_data(docs, golds):
"""Get out the training data format used by begin_training, given the
GoldParse objects."""
tuples = []
data = []
for doc, gold in zip(docs, golds):
text = doc.text
ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
sents = [((ids, words, tags, heads, labels, iob), [])]
tuples.append((text, sents))
return tuples
example = Example(doc=doc)
example.add_doc_annotation(cats=gold.cats)
token_annotation_dict = gold.orig.to_dict()
example.add_token_annotation(**token_annotation_dict)
example.goldparse = gold
data.append(example)
return data
##############
@ -303,7 +305,7 @@ def load_nlp(corpus, config):
return nlp
def initialize_pipeline(nlp, docs, golds, config):
def initialize_pipeline(nlp, examples, config):
nlp.add_pipe(nlp.create_pipe("parser"))
if config.multitask_tag:
nlp.parser.add_multitask_objective("tag")
@ -311,18 +313,19 @@ def initialize_pipeline(nlp, docs, golds, config):
nlp.parser.add_multitask_objective("sent_start")
nlp.parser.moves.add_action(2, "subtok")
nlp.add_pipe(nlp.create_pipe("tagger"))
for gold in golds:
for tag in gold.tags:
for ex in examples:
for tag in ex.gold.tags:
if tag is not None:
nlp.tagger.add_label(tag)
# Replace labels that didn't make the frequency cutoff
actions = set(nlp.parser.labels)
label_set = set([act.split("-")[1] for act in actions if "-" in act])
for gold in golds:
for ex in examples:
gold = ex.gold
for i, label in enumerate(gold.labels):
if label is not None and label not in label_set:
gold.labels[i] = label.split("||")[0]
return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
return nlp.begin_training(lambda: examples)
########################
@ -391,13 +394,17 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
paths = TreebankPaths(ud_dir, corpus)
if not (parses_dir / corpus).exists():
(parses_dir / corpus).mkdir()
print("Train and evaluate", corpus, "using lang", paths.lang)
nlp = load_nlp(paths.lang, config)
docs, golds = read_data(
examples = read_data(
nlp,
paths.train.conllu.open(encoding="utf8"),
paths.train.text.open(encoding="utf8"),
@ -405,23 +412,18 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
limit=limit,
)
optimizer = initialize_pipeline(nlp, docs, golds, config)
optimizer = initialize_pipeline(nlp, examples, config)
for i in range(config.nr_epoch):
docs = [nlp.make_doc(doc.text) for doc in docs]
batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
docs = [nlp.make_doc(example.doc.text) for example in examples]
batches = minibatch_by_words(examples, size=config.batch_size)
losses = {}
n_train_words = sum(len(doc) for doc in docs)
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
for batch in batches:
batch_docs, batch_gold = zip(*batch)
pbar.update(sum(len(doc) for doc in batch_docs))
pbar.update(sum(len(ex.doc) for ex in batch))
nlp.update(
batch_docs,
batch_gold,
sgd=optimizer,
drop=config.dropout,
losses=losses,
examples=batch, sgd=optimizer, drop=config.dropout, losses=losses,
)
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)


@ -31,14 +31,13 @@ random.seed(0)
PWD = os.path.dirname(__file__)
TRAIN_DATA = list(read_json_file(
os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json")))
TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json")))
def get_position_label(i, words, tags, heads, labels, ents):
def get_position_label(i, token_annotation):
"""Return labels indicating the position of the word in the document.
"""
if len(words) < 20:
if len(token_annotation.words) < 20:
return "short-doc"
elif i == 0:
return "first-word"
@ -46,7 +45,7 @@ def get_position_label(i, words, tags, heads, labels, ents):
return "early-word"
elif i < 20:
return "mid-word"
elif i == len(words) - 1:
elif i == len(token_annotation.words) - 1:
return "last-word"
else:
return "late-word"
@ -60,17 +59,17 @@ def main(n_iter=10):
print(nlp.pipeline)
print("Create data", len(TRAIN_DATA))
optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
optimizer = nlp.begin_training(get_examples=lambda: TRAIN_DATA)
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
losses = {}
for text, annot_brackets in TRAIN_DATA:
for annotations, _ in annot_brackets:
doc = Doc(nlp.vocab, words=annotations[1])
gold = GoldParse.from_annot_tuples(doc, annotations)
for example in TRAIN_DATA:
for token_annotation in example.token_annotations:
doc = Doc(nlp.vocab, words=token_annotation.words)
gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation)
nlp.update(
[doc], # batch of texts
[gold], # batch of annotations
examples=[(doc, gold)], # 1 example
drop=0.2, # dropout - make it harder to memorise data
sgd=optimizer, # callable to update weights
losses=losses,
@ -78,9 +77,9 @@ def main(n_iter=10):
print(losses.get("nn_labeller", 0.0), losses["ner"])
# test the trained model
for text, _ in TRAIN_DATA:
if text is not None:
doc = nlp(text)
for example in TRAIN_DATA:
if example.text is not None:
doc = nlp(example.text)
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


@ -1,217 +0,0 @@
"""This script is experimental.
Try pre-training the CNN component of the text categorizer using a cheap
language modelling-like objective. Specifically, we load pretrained vectors
(from something like word2vec, GloVe, FastText etc), and use the CNN to
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
we're not merely doing compression here, because heavy dropout is applied,
including over the input words. This means the model must often (50% of the time)
use the context in order to predict the word.
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
corpus, and then training with only 100 labels. Note that it's a bit dirty to
pre-train with the development data, but also not *so* terrible: we're not using
the development labels, after all --- only the unlabelled text.
"""
import plac
import tqdm
import random
import spacy
import thinc.extra.datasets
from spacy.util import minibatch, use_gpu, compounding
from spacy._ml import Tok2Vec
from spacy.pipeline import TextCategorizer
import numpy
def load_texts(limit=0):
train, dev = thinc.extra.datasets.imdb()
train_texts, train_labels = zip(*train)
dev_texts, dev_labels = zip(*train)
train_texts = list(train_texts)
dev_texts = list(dev_texts)
random.shuffle(train_texts)
random.shuffle(dev_texts)
if limit >= 1:
return train_texts[:limit]
else:
return list(train_texts) + list(dev_texts)
def load_textcat_data(limit=0):
"""Load data from the IMDB dataset."""
# Partition off part of the train data for evaluation
train_data, eval_data = thinc.extra.datasets.imdb()
random.shuffle(train_data)
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
eval_texts, eval_labels = zip(*eval_data)
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
return (texts, cats), (eval_texts, eval_cats)
def prefer_gpu():
used = spacy.util.use_gpu(0)
if used is None:
return False
else:
import cupy.random
cupy.random.seed(0)
return True
def build_textcat_model(tok2vec, nr_class, width):
from thinc.v2v import Model, Softmax, Maxout
from thinc.api import flatten_add_lengths, chain
from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool
from thinc.misc import Residual, LayerNorm
from spacy._ml import logistic, zero_init
with Model.define_operators({">>": chain}):
model = (
tok2vec
>> flatten_add_lengths
>> Pooling(mean_pool)
>> Softmax(nr_class, width)
)
model.tok2vec = tok2vec
return model
def block_gradients(model):
from thinc.api import wrap
def forward(X, drop=0.0):
Y, _ = model.begin_update(X, drop=drop)
return Y, None
return wrap(forward, model)
def create_pipeline(width, embed_size, vectors_model):
print("Load vectors")
nlp = spacy.load(vectors_model)
print("Start training")
textcat = TextCategorizer(
nlp.vocab,
labels=["POSITIVE", "NEGATIVE"],
model=build_textcat_model(
Tok2Vec(width=width, embed_size=embed_size), 2, width
),
)
nlp.add_pipe(textcat)
return nlp
def train_tensorizer(nlp, texts, dropout, n_iter):
tensorizer = nlp.create_pipe("tensorizer")
nlp.add_pipe(tensorizer)
optimizer = nlp.begin_training()
for i in range(n_iter):
losses = {}
for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
docs = [nlp.make_doc(text) for text in batch]
tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout)
print(losses)
return optimizer
def train_textcat(nlp, n_texts, n_iter=10):
textcat = nlp.get_pipe("textcat")
tok2vec_weights = textcat.model.tok2vec.to_bytes()
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
print(
"Using {} examples ({} training, {} evaluation)".format(
n_texts, len(train_texts), len(dev_texts)
)
)
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
# get names of other pipes to disable them during training
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes): # only train textcat
optimizer = nlp.begin_training()
textcat.model.tok2vec.from_bytes(tok2vec_weights)
print("Training the model...")
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
for i in range(n_iter):
losses = {"textcat": 0.0}
# batch up the examples using spaCy's minibatch
batches = minibatch(tqdm.tqdm(train_data), size=2)
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
with textcat.model.use_params(optimizer.averages):
# evaluate on the dev data split off in load_data()
scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
print(
"{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table
losses["textcat"],
scores["textcat_p"],
scores["textcat_r"],
scores["textcat_f"],
)
)
def evaluate_textcat(tokenizer, textcat, texts, cats):
docs = (tokenizer(text) for text in texts)
tp = 1e-8
fp = 1e-8
tn = 1e-8
fn = 1e-8
for i, doc in enumerate(textcat.pipe(docs)):
gold = cats[i]
for label, score in doc.cats.items():
if label not in gold:
continue
if score >= 0.5 and gold[label] >= 0.5:
tp += 1.0
elif score >= 0.5 and gold[label] < 0.5:
fp += 1.0
elif score < 0.5 and gold[label] < 0.5:
tn += 1
elif score < 0.5 and gold[label] >= 0.5:
fn += 1
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f_score = 2 * (precision * recall) / (precision + recall)
return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
@plac.annotations(
width=("Width of CNN layers", "positional", None, int),
embed_size=("Embedding rows", "positional", None, int),
pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
train_iters=("Number of iterations to train", "option", "tn", int),
train_examples=("Number of labelled examples", "option", "eg", int),
vectors_model=("Name or path to vectors model to learn from"),
)
def main(
width,
embed_size,
vectors_model,
pretrain_iters=30,
train_iters=30,
train_examples=1000,
):
random.seed(0)
numpy.random.seed(0)
use_gpu = prefer_gpu()
print("Using GPU?", use_gpu)
nlp = create_pipeline(width, embed_size, vectors_model)
print("Load data")
texts = load_texts(limit=0)
print("Train tensorizer")
optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)
print("Train textcat")
train_textcat(nlp, train_examples, n_iter=train_iters)
if __name__ == "__main__":
plac.call(main)


@ -59,17 +59,14 @@ def main(model_name, unlabelled_loc):
# yet, but I'm getting weird results from Adam. Try commenting out the
# nlp.update(), and using Adam -- you'll find the models drift apart.
# I guess Adam is losing precision, introducing gradient noise?
optimizer.alpha = 0.1
optimizer.learn_rate = 0.1
optimizer.b1 = 0.0
optimizer.b2 = 0.0
# get names of other pipes to disable them during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
sizes = compounding(1.0, 4.0, 1.001)
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
with nlp.select_pipes(enable="ner") and warnings.catch_warnings():
# show warnings for misaligned entity spans once
warnings.filterwarnings("once", category=UserWarning, module='spacy')
warnings.filterwarnings("once", category=UserWarning, module="spacy")
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
@ -79,8 +76,7 @@ def main(model_name, unlabelled_loc):
# batch up the examples using spaCy's minibatch
raw_batches = minibatch(raw_docs, size=4)
for batch in minibatch(TRAIN_DATA, size=sizes):
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses)
raw_batch = list(next(raw_batches))
nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
print("Losses", losses)


@ -5,16 +5,17 @@ from spacy.gold import docs_to_json
import srsly
import sys
@plac.annotations(
model=("Model name. Defaults to 'en'.", "option", "m", str),
input_file=("Input file (jsonl)", "positional", None, Path),
output_dir=("Output directory", "positional", None, Path),
n_texts=("Number of texts to convert", "option", "t", int),
)
def convert(model='en', input_file=None, output_dir=None, n_texts=0):
def convert(model="en", input_file=None, output_dir=None, n_texts=0):
# Load model with tokenizer + sentencizer only
nlp = spacy.load(model)
nlp.disable_pipes(*nlp.pipe_names)
nlp.select_pipes(disable=nlp.pipe_names)
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer, first=True)
@ -49,5 +50,6 @@ def convert(model='en', input_file=None, output_dir=None, n_texts=0):
srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)])
if __name__ == "__main__":
plac.call(convert)


@ -18,7 +18,6 @@ import random
from pathlib import Path
from spacy.vocab import Vocab
import spacy
from spacy.kb import KnowledgeBase
from spacy.pipeline import EntityRuler
@ -66,36 +65,38 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
vocab = Vocab().from_disk(vocab_path)
# create blank English model with correct vocab
nlp = spacy.blank("en", vocab=vocab)
nlp.vocab.vectors.name = "spacy_pretrained_vectors"
nlp.vocab.vectors.name = "nel_vectors"
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
# Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
nlp.add_pipe(nlp.create_pipe('sentencizer'))
nlp.add_pipe(nlp.create_pipe("sentencizer"))
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
# Note that in a realistic application, an actual NER algorithm should be used instead.
ruler = EntityRuler(nlp)
patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
patterns = [
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
# Create the Entity Linker component and add it to the pipeline.
if "entity_linker" not in nlp.pipe_names:
# use only the predicted EL score and not the prior probability (for demo purposes)
cfg = {"incl_prior": False}
entity_linker = nlp.create_pipe("entity_linker", cfg)
kb = KnowledgeBase(vocab=nlp.vocab)
kb.load_bulk(kb_path)
print("Loaded Knowledge Base from '%s'" % kb_path)
entity_linker.set_kb(kb)
# use only the predicted EL score and not the prior probability (for demo purposes)
cfg = {"kb": kb, "incl_prior": False}
entity_linker = nlp.create_pipe("entity_linker", cfg)
nlp.add_pipe(entity_linker, last=True)
# Convert the texts to docs to make sure we have doc.ents set for the training examples.
# Also ensure that the annotated examples correspond to known identifiers in the knowlege base.
# Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
TRAIN_DOCS = []
for text, annotation in TRAIN_DATA:
with nlp.disable_pipes("entity_linker"):
with nlp.select_pipes(disable="entity_linker"):
doc = nlp(text)
annotation_clean = annotation
for offset, kb_id_dict in annotation["links"].items():
@ -110,22 +111,18 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
annotation_clean["links"][offset] = new_dict
TRAIN_DOCS.append((doc, annotation_clean))
# get names of other pipes to disable them during training
pipe_exceptions = ["entity_linker", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes): # only train entity linker
with nlp.select_pipes(enable="entity_linker"): # only train entity linker
# reset and initialize the weights randomly
optimizer = nlp.begin_training()
for itn in range(n_iter):
random.shuffle(TRAIN_DOCS)
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(
texts, # batch of texts
annotations, # batch of annotations
batch,
drop=0.2, # dropout - make it harder to memorise data
losses=losses,
sgd=optimizer,


@ -124,9 +124,7 @@ def main(model=None, output_dir=None, n_iter=15):
for dep in annotations.get("deps", []):
parser.add_label(dep)
pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes): # only train parser
with nlp.select_pipes(enable="parser"): # only train parser
optimizer = nlp.begin_training()
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
@ -134,8 +132,7 @@ def main(model=None, output_dir=None, n_iter=15):
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
nlp.update(batch, sgd=optimizer, losses=losses)
print("Losses", losses)
# test the trained model


@ -0,0 +1,133 @@
#!/usr/bin/env python
# coding: utf8
"""
A simple example for training a morphologizer. For more details, see
the documentation:
* Training: https://spacy.io/usage/training
Compatible with: spaCy v3.0.0+
Last tested with: v3.0.0
"""
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from spacy.morphology import Morphology
# Usually you'll read this in, of course. Data formats vary. Ensure your
# strings are unicode and that the number of tags assigned matches spaCy's
# tokenization. If not, you can always add a 'words' key to the annotations
# that specifies the gold-standard tokenization, e.g.:
# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
TRAIN_DATA = [
(
"I like green eggs",
{
"morphs": [
"PronType=Prs|Person=1",
"VerbForm=Fin",
"Degree=Pos",
"Number=Plur",
],
"pos": ["PRON", "VERB", "ADJ", "NOUN"],
},
),
(
"Eat blue ham",
{
"morphs": ["VerbForm=Inf", "Degree=Pos", "Number=Sing"],
"pos": ["VERB", "ADJ", "NOUN"],
},
),
(
"She was blue",
{
"morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos"],
"pos": ["PRON", "VERB", "ADJ"],
},
),
(
"He was blue today",
{
"morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos", ""],
"pos": ["PRON", "VERB", "ADJ", "ADV"],
},
),
]
# The POS tags are optional, set `with_pos_tags = False` to omit them for
# this example:
with_pos_tags = True
if not with_pos_tags:
for i in range(len(TRAIN_DATA)):
del TRAIN_DATA[i][1]["pos"]
@plac.annotations(
lang=("ISO Code of language to use", "option", "l", str),
output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int),
)
def main(lang="en", output_dir=None, n_iter=25):
"""Create a new model, set up the pipeline and train the tagger. In order to
train the tagger with a custom tag map, we're creating a new Language
instance with a custom vocab.
"""
nlp = spacy.blank(lang)
# add the tagger to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
morphologizer = nlp.create_pipe("morphologizer")
nlp.add_pipe(morphologizer)
# add labels
for _, annotations in TRAIN_DATA:
morph_labels = annotations.get("morphs")
pos_labels = annotations.get("pos", [""] * len(annotations.get("morphs")))
assert len(morph_labels) == len(pos_labels)
for morph, pos in zip(morph_labels, pos_labels):
morph_dict = Morphology.feats_to_dict(morph)
if pos:
morph_dict["POS"] = pos
morph = Morphology.dict_to_feats(morph_dict)
morphologizer.add_label(morph)
optimizer = nlp.begin_training()
for i in range(n_iter):
random.shuffle(TRAIN_DATA)
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)
print("Losses", losses)
# test the trained model
test_text = "I like blue eggs"
doc = nlp(test_text)
print("Morphs", [(t.text, t.morph) for t in doc])
# save model to output directory
if output_dir is not None:
output_dir = Path(output_dir)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
print("Saved model to", output_dir)
# test the save model
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
doc = nlp2(test_text)
print("Morphs", [(t.text, t.morph) for t in doc])
if __name__ == "__main__":
plac.call(main)
# Expected output:
# Morphs [('I', POS=PRON|Person=1|PronType=Prs), ('like', POS=VERB|VerbForm=Fin), ('blue', Degree=Pos|POS=ADJ), ('eggs', Number=Plur|POS=NOUN)]


@ -43,41 +43,39 @@ def main(model=None, output_dir=None, n_iter=100):
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if "ner" not in nlp.pipe_names:
ner = nlp.create_pipe("ner")
if "simple_ner" not in nlp.pipe_names:
ner = nlp.create_pipe("simple_ner")
nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
ner = nlp.get_pipe("ner")
ner = nlp.get_pipe("simple_ner")
# add labels
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
print("Add label", ent[2])
ner.add_label(ent[2])
# get names of other pipes to disable them during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
# only train NER
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
with nlp.select_pipes(enable="ner") and warnings.catch_warnings():
# show warnings for misaligned entity spans once
warnings.filterwarnings("once", category=UserWarning, module='spacy')
warnings.filterwarnings("once", category=UserWarning, module="spacy")
# reset and initialize the weights randomly but only if we're
# training a new model
if model is None:
nlp.begin_training()
print(
"Transitions", list(enumerate(nlp.get_pipe("simple_ner").get_tag_names()))
)
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(
texts, # batch of texts
annotations, # batch of annotations
drop=0.5, # dropout - make it harder to memorise data
batch,
drop=0.0, # no dropout in this example (increase to make it harder to memorise data)
losses=losses,
)
print("Losses", losses)

View File

@ -95,13 +95,9 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
else:
optimizer = nlp.resume_training()
move_names = list(ner.move_names)
# get names of other pipes to disable them during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
# only train NER
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
with nlp.select_pipes(enable="ner") and warnings.catch_warnings():
# show warnings for misaligned entity spans once
warnings.filterwarnings("once", category=UserWarning, module='spacy')
warnings.filterwarnings("once", category=UserWarning, module="spacy")
sizes = compounding(1.0, 4.0, 1.001)
# batch up the examples using spaCy's minibatch
@ -110,8 +106,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
batches = minibatch(TRAIN_DATA, size=sizes)
losses = {}
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)
print("Losses", losses)
# test the trained model

View File

@ -64,10 +64,7 @@ def main(model=None, output_dir=None, n_iter=15):
for dep in annotations.get("deps", []):
parser.add_label(dep)
# get names of other pipes to disable them during training
pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes): # only train parser
with nlp.select_pipes(enable="parser"): # only train parser
optimizer = nlp.begin_training()
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
@ -75,8 +72,7 @@ def main(model=None, output_dir=None, n_iter=15):
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
nlp.update(batch, sgd=optimizer, losses=losses)
print("Losses", losses)
# test the trained model

View File

@ -65,8 +65,7 @@ def main(lang="en", output_dir=None, n_iter=25):
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
nlp.update(batch, sgd=optimizer, losses=losses)
print("Losses", losses)
# test the trained model

View File

@ -2,89 +2,87 @@
# coding: utf8
"""Train a convolutional neural network text classifier on the
IMDB dataset, using the TextCategorizer component. The dataset will be loaded
automatically via Thinc's built-in dataset loader. The model is added to
automatically via the package `ml_datasets`. The model is added to
spacy.pipeline, and predictions are available via `doc.cats`. For more details,
see the documentation:
* Training: https://spacy.io/usage/training
Compatible with: spaCy v2.0.0+
Compatible with: spaCy v3.0.0+
"""
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import thinc.extra.datasets
from ml_datasets import loaders
import spacy
from spacy import util
from spacy.util import minibatch, compounding
from spacy.gold import Example, GoldParse
@plac.annotations(
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
config_path=("Path to config file", "positional", None, Path),
output_dir=("Optional output directory", "option", "o", Path),
n_texts=("Number of texts to train from", "option", "t", int),
n_iter=("Number of training iterations", "option", "n", int),
init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path),
dataset=("Dataset to train on (default: imdb)", "option", "d", str),
threshold=("Min. number of instances for a given label (default 20)", "option", "m", int)
)
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None, dataset="imdb", threshold=20):
if not config_path or not config_path.exists():
raise ValueError(f"Config file not found at {config_path}")
spacy.util.fix_random_seed()
if output_dir is not None:
output_dir = Path(output_dir)
if not output_dir.exists():
output_dir.mkdir()
if model is not None:
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
else:
nlp = spacy.blank("en") # create blank Language class
print("Created blank 'en' model")
print(f"Loading nlp model from {config_path}")
nlp_config = util.load_config(config_path, create_objects=False)["nlp"]
nlp = util.load_model_from_config(nlp_config)
# add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
# ensure the nlp object was defined with a textcat component
if "textcat" not in nlp.pipe_names:
textcat = nlp.create_pipe(
"textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
)
nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
textcat = nlp.get_pipe("textcat")
raise ValueError(f"The nlp definition in the config does not contain a textcat component")
# add label to text classifier
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
textcat = nlp.get_pipe("textcat")
# load the IMDB dataset
print("Loading IMDB data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()
train_texts = train_texts[:n_texts]
train_cats = train_cats[:n_texts]
# load the dataset
print(f"Loading dataset {dataset} ...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(dataset=dataset, threshold=threshold, limit=n_texts)
print(
"Using {} examples ({} training, {} evaluation)".format(
n_texts, len(train_texts), len(dev_texts)
)
)
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
train_examples = []
for text, cats in zip(train_texts, train_cats):
doc = nlp.make_doc(text)
gold = GoldParse(doc, cats=cats)
for cat in cats:
textcat.add_label(cat)
ex = Example.from_gold(gold, doc=doc)
train_examples.append(ex)
# get names of other pipes to disable them during training
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes): # only train textcat
with nlp.select_pipes(enable="textcat"): # only train textcat
optimizer = nlp.begin_training()
if init_tok2vec is not None:
with init_tok2vec.open("rb") as file_:
textcat.model.tok2vec.from_bytes(file_.read())
textcat.model.get_ref("tok2vec").from_bytes(file_.read())
print("Training the model...")
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
batch_sizes = compounding(4.0, 32.0, 1.001)
for i in range(n_iter):
losses = {}
# batch up the examples using spaCy's minibatch
random.shuffle(train_data)
batches = minibatch(train_data, size=batch_sizes)
random.shuffle(train_examples)
batches = minibatch(train_examples, size=batch_sizes)
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
with textcat.model.use_params(optimizer.averages):
# evaluate on the dev data split off in load_data()
scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
@ -97,7 +95,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
)
)
# test the trained model
# test the trained model (only makes sense for sentiment analysis)
test_text = "This movie sucked"
doc = nlp(test_text)
print(test_text, doc.cats)
@ -114,14 +112,48 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
print(test_text, doc2.cats)
def load_data(limit=0, split=0.8):
"""Load data from the IMDB dataset."""
def load_data(dataset, threshold, limit=0, split=0.8):
"""Load data from the provided dataset."""
# Partition off part of the train data for evaluation
train_data, _ = thinc.extra.datasets.imdb()
data_loader = loaders.get(dataset)
train_data, _ = data_loader(limit=int(limit/split))
random.shuffle(train_data)
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
unique_labels = set()
for label_set in labels:
if isinstance(label_set, int) or isinstance(label_set, str):
unique_labels.add(label_set)
elif isinstance(label_set, list) or isinstance(label_set, set):
unique_labels.update(label_set)
unique_labels = sorted(unique_labels)
print(f"# of unique_labels: {len(unique_labels)}")
count_values_train = dict()
for text, annot_list in train_data:
if isinstance(annot_list, int) or isinstance(annot_list, str):
count_values_train[annot_list] = count_values_train.get(annot_list, 0) + 1
else:
for annot in annot_list:
count_values_train[annot] = count_values_train.get(annot, 0) + 1
for value, count in sorted(count_values_train.items(), key=lambda item: item[1]):
if count < threshold:
unique_labels.remove(value)
print(f"# of unique_labels after filtering with threshold {threshold}: {len(unique_labels)}")
if set(unique_labels) == {0, 1}:
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
else:
cats = []
for y in labels:
if isinstance(y, str) or isinstance(y, int):
cats.append({str(label): (label == y) for label in unique_labels})
elif isinstance(y, set):
cats.append({str(label): (label in y) for label in unique_labels})
else:
raise ValueError(f"Unrecognised type of labels: {type(y)}")
split = int(len(train_data) * split)
return (texts[:split], cats[:split]), (texts[split:], cats[split:])
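To make the shape of the returned category dicts concrete: a binary label set keeps the original POSITIVE/NEGATIVE convention, while any other label set is expanded into one boolean entry per label that survived the threshold filter. A small standalone sketch with hypothetical labels (not real ml_datasets output):
# Binary case, e.g. IMDB sentiment, where the labels are 0/1
labels = [1, 0]
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
# -> [{"POSITIVE": True, "NEGATIVE": False}, {"POSITIVE": False, "NEGATIVE": True}]
# Multi-label case: one boolean per remaining label for each example
unique_labels = ["business", "sports", "tech"]  # hypothetical label set
y = {"sports", "tech"}
cats_entry = {str(label): (label in y) for label in unique_labels}
# -> {"business": False, "sports": True, "tech": True}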

View File

@ -0,0 +1,19 @@
[nlp]
lang = "en"
[nlp.pipeline.textcat]
factory = "textcat"
[nlp.pipeline.textcat.model]
@architectures = "spacy.TextCatCNN.v1"
exclusive_classes = false
[nlp.pipeline.textcat.model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
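This config is the input the rewritten text classification example above expects: the [nlp] block declares the language and a textcat component built from the TextCatCNN architecture with a HashEmbedCNN tok2vec. A minimal sketch of the loading step, reusing the calls from the script above (the config file name is illustrative):
from pathlib import Path
from spacy import util
config_path = Path("textcat_config.cfg")  # illustrative file name
# Build the Language object from the [nlp] section of the config,
# as done at the top of the example script.
nlp_config = util.load_config(config_path, create_objects=False)["nlp"]
nlp = util.load_model_from_config(nlp_config)
textcat = nlp.get_pipe("textcat")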

9
fabfile.py vendored
View File

@ -1,9 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals, print_function
import contextlib
from pathlib import Path
from fabric.api import local, lcd, env, settings, prefix
from fabric.api import local, lcd
from os import path, environ
import shutil
import sys
@ -82,9 +79,7 @@ def pex():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
sha = local("git rev-parse --short HEAD", capture=True)
venv_local(
"pex dist/*.whl -e spacy -o dist/spacy-%s.pex" % sha, direct=True
)
venv_local(f"pex dist/*.whl -e spacy -o dist/spacy-{sha}.pex", direct=True)
def clean():

View File

@ -6,6 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc==7.4.1",
"thinc==8.0.0a9",
"blis>=0.4.0,<0.5.0"
]
build-backend = "setuptools.build_meta"

View File

@ -1,20 +1,23 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc==7.4.1
thinc==8.0.0a9
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
wasabi>=0.4.0,<1.1.0
srsly>=1.0.2,<1.1.0
srsly>=2.0.0,<3.0.0
catalogue>=0.0.7,<1.1.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
pathlib==1.0.1; python_version < "3.4"
tqdm>=4.38.0,<5.0.0
# Optional dependencies
jsonschema>=2.6.0,<3.1.0
pydantic>=1.3.0,<2.0.0
# Official Python utilities
setuptools
packaging
importlib_metadata>=0.20; python_version < "3.8"
# Development dependencies
cython>=0.25
pytest>=4.6.5

View File

@ -16,10 +16,7 @@ classifiers =
Operating System :: MacOS :: MacOS X
Operating System :: Microsoft :: Windows
Programming Language :: Cython
Programming Language :: Python :: 2
Programming Language :: Python :: 2.7
Programming Language :: Python :: 3
Programming Language :: Python :: 3.5
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
@ -30,32 +27,37 @@ zip_safe = false
include_package_data = true
scripts =
bin/spacy
python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*
python_requires = >=3.6
setup_requires =
wheel
cython>=0.25
numpy>=1.15.0
# We also need our Cython packages here to compile against
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc==7.4.1
thinc==8.0.0a9
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc==7.4.1
thinc==8.0.0a9
blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
srsly>=1.0.2,<1.1.0
srsly>=2.0.0,<3.0.0
catalogue>=0.0.7,<1.1.0
ml_datasets>=0.1.1
# Third-party dependencies
tqdm>=4.38.0,<5.0.0
setuptools
numpy>=1.15.0
plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0
pathlib==1.0.1; python_version < "3.4"
pydantic>=1.3.0,<2.0.0
# Official Python utilities
setuptools
packaging
importlib_metadata>=0.20; python_version < "3.8"
[options.extras_require]
lookups =

180
setup.py
View File

@ -1,35 +1,27 @@
#!/usr/bin/env python
from __future__ import print_function
import io
import os
import subprocess
import sys
import contextlib
import platform
from distutils.command.build_ext import build_ext
from distutils.sysconfig import get_python_inc
import distutils.util
from distutils import ccompiler, msvccompiler
from setuptools import Extension, setup, find_packages
import numpy
from pathlib import Path
import shutil
from Cython.Build import cythonize
from Cython.Compiler import Options
def is_new_osx():
"""Check whether we're on OSX >= 10.10"""
name = distutils.util.get_platform()
if sys.platform != "darwin":
return False
elif name.startswith("macosx-10"):
minor_version = int(name.split("-")[1].split(".")[1])
if minor_version >= 7:
return True
else:
return False
else:
return False
ROOT = Path(__file__).parent
PACKAGE_ROOT = ROOT / "spacy"
# Preserve `__doc__` on functions and classes
# http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options
Options.docstrings = True
PACKAGES = find_packages()
MOD_NAMES = [
"spacy.parts_of_speech",
"spacy.strings",
@ -62,16 +54,38 @@ MOD_NAMES = [
"spacy.symbols",
"spacy.vectors",
]
COMPILE_OPTIONS = {
"msvc": ["/Ox", "/EHsc"],
"mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
"other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
}
LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}
COMPILER_DIRECTIVES = {
"language_level": -3,
"embedsignature": True,
"annotation_typing": False,
}
# Files to copy into the package that are otherwise not included
COPY_FILES = {
ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
}
def is_new_osx():
"""Check whether we're on OSX >= 10.7"""
name = distutils.util.get_platform()
if sys.platform != "darwin":
return False
mac_ver = platform.mac_ver()[0]
if mac_ver.startswith("10"):
minor_version = int(mac_ver.split('.')[1])
if minor_version >= 7:
return True
else:
return False
return False
if is_new_osx():
@ -104,95 +118,53 @@ class build_ext_subclass(build_ext, build_ext_options):
build_ext.build_extensions(self)
def generate_cython(root, source):
print("Cythonizing sources")
p = subprocess.call(
[sys.executable, os.path.join(root, "bin", "cythonize.py"), source],
env=os.environ,
)
if p != 0:
raise RuntimeError("Running cythonize failed")
def is_source_release(path):
return os.path.exists(os.path.join(path, "PKG-INFO"))
def clean(path):
for name in MOD_NAMES:
name = name.replace(".", "/")
for ext in [".so", ".html", ".cpp", ".c"]:
file_path = os.path.join(path, name + ext)
if os.path.exists(file_path):
os.unlink(file_path)
@contextlib.contextmanager
def chdir(new_dir):
old_dir = os.getcwd()
try:
os.chdir(new_dir)
sys.path.insert(0, new_dir)
yield
finally:
del sys.path[0]
os.chdir(old_dir)
for path in path.glob("**/*"):
if path.is_file() and path.suffix in (".so", ".cpp"):
print(f"Deleting {path.name}")
path.unlink()
def setup_package():
root = os.path.abspath(os.path.dirname(__file__))
if len(sys.argv) > 1 and sys.argv[1] == "clean":
return clean(root)
return clean(PACKAGE_ROOT)
with chdir(root):
with io.open(os.path.join(root, "spacy", "about.py"), encoding="utf8") as f:
about = {}
exec(f.read(), about)
with (PACKAGE_ROOT / "about.py").open("r") as f:
about = {}
exec(f.read(), about)
include_dirs = [
get_python_inc(plat_specific=True),
os.path.join(root, "include"),
]
for copy_file, target_dir in COPY_FILES.items():
if copy_file.exists():
shutil.copy(str(copy_file), str(target_dir))
print(f"Copied {copy_file} -> {target_dir}")
if (
ccompiler.new_compiler().compiler_type == "msvc"
and msvccompiler.get_build_version() == 9
):
include_dirs.append(os.path.join(root, "include", "msvc9"))
include_dirs = [
get_python_inc(plat_specific=True),
numpy.get_include(),
str(ROOT / "include"),
]
if (
ccompiler.new_compiler().compiler_type == "msvc"
and msvccompiler.get_build_version() == 9
):
include_dirs.append(str(ROOT / "include" / "msvc9"))
ext_modules = []
for name in MOD_NAMES:
mod_path = name.replace(".", "/") + ".pyx"
ext = Extension(name, [mod_path], language="c++")
ext_modules.append(ext)
print("Cythonizing sources")
ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)
ext_modules = []
for mod_name in MOD_NAMES:
mod_path = mod_name.replace(".", "/") + ".cpp"
extra_link_args = []
# ???
# Imported from patch from @mikepb
# See Issue #267. Running blind here...
if sys.platform == "darwin":
dylib_path = [".." for _ in range(mod_name.count("."))]
dylib_path = "/".join(dylib_path)
dylib_path = "@loader_path/%s/spacy/platform/darwin/lib" % dylib_path
extra_link_args.append("-Wl,-rpath,%s" % dylib_path)
ext_modules.append(
Extension(
mod_name,
[mod_path],
language="c++",
include_dirs=include_dirs,
extra_link_args=extra_link_args,
)
)
if not is_source_release(root):
generate_cython(root, "spacy")
setup(
name="spacy",
packages=PACKAGES,
version=about["__version__"],
ext_modules=ext_modules,
cmdclass={"build_ext": build_ext_subclass},
)
setup(
name="spacy",
packages=PACKAGES,
version=about["__version__"],
ext_modules=ext_modules,
cmdclass={"build_ext": build_ext_subclass},
include_dirs=include_dirs,
package_data={"": ["*.pyx", "*.pxd", "*.pxi", "*.cpp"]},
)
if __name__ == "__main__":

View File

@ -1,5 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
import warnings
import sys
@ -7,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# These are imported as part of the API
from thinc.neural.util import prefer_gpu, require_gpu
from thinc.api import prefer_gpu, require_gpu
from . import pipeline
from .cli.info import info as cli_info
@ -23,6 +21,9 @@ if sys.maxunicode == 65535:
raise SystemError(Errors.E130)
config = registry
def load(name, **overrides):
depr_path = overrides.get("path")
if depr_path not in (True, False, None):

View File

@ -1,21 +1,16 @@
# coding: utf8
from __future__ import print_function
# NB! This breaks in plac on Python 2!!
# from __future__ import unicode_literals
if __name__ == "__main__":
import plac
import sys
from wasabi import msg
from spacy.cli import download, link, info, package, train, pretrain, convert
from spacy.cli import download, link, info, package, pretrain, convert
from spacy.cli import init_model, profile, evaluate, validate, debug_data
from spacy.cli import train_cli
commands = {
"download": download,
"link": link,
"info": info,
"train": train,
"train": train_cli,
"pretrain": pretrain,
"debug-data": debug_data,
"evaluate": evaluate,
@ -28,9 +23,9 @@ if __name__ == "__main__":
if len(sys.argv) == 1:
msg.info("Available commands", ", ".join(commands), exits=1)
command = sys.argv.pop(1)
sys.argv[0] = "spacy %s" % command
sys.argv[0] = f"spacy {command}"
if command in commands:
plac.call(commands[command], sys.argv[1:])
else:
available = "Available: {}".format(", ".join(commands))
msg.fail("Unknown command: {}".format(command), available, exits=1)
available = f"Available: {', '.join(commands)}"
msg.fail(f"Unknown command: {command}", available, exits=1)

View File

@ -1,988 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
import numpy
import warnings
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.t2v import Pooling, sum_pool, mean_pool
from thinc.i2v import HashEmbed
from thinc.misc import Residual, FeatureExtracter
from thinc.misc import LayerNorm as LN
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.api import with_getitem, flatten_add_lengths
from thinc.api import uniqued, wrap, noop
from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module, copy_array
from thinc.neural.optimizers import Adam
from thinc import describe
from thinc.describe import Dimension, Synapses, Biases, Gradient
from thinc.neural._classes.affine import _set_dimensions_if_needed
import thinc.extra.load_nlp
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
from .errors import Errors, Warnings
from . import util
from . import ml as new_ml
from .ml import _legacy_tok2vec
VECTORS_KEY = "spacy_pretrained_vectors"
# Backwards compatibility with <2.2.2
USE_MODEL_REGISTRY_TOK2VEC = False
def cosine(vec1, vec2):
xp = get_array_module(vec1)
norm1 = xp.linalg.norm(vec1)
norm2 = xp.linalg.norm(vec2)
if norm1 == 0.0 or norm2 == 0.0:
return 0
else:
return vec1.dot(vec2) / (norm1 * norm2)
def create_default_optimizer(ops, **cfg):
learn_rate = util.env_opt("learn_rate", 0.001)
beta1 = util.env_opt("optimizer_B1", 0.9)
beta2 = util.env_opt("optimizer_B2", 0.999)
eps = util.env_opt("optimizer_eps", 1e-8)
L2 = util.env_opt("L2_penalty", 1e-6)
max_grad_norm = util.env_opt("grad_norm_clip", 1.0)
optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps)
optimizer.max_grad_norm = max_grad_norm
optimizer.device = ops.device
return optimizer
@layerize
def _flatten_add_lengths(seqs, pad=0, drop=0.0):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=pad)
X = ops.flatten(seqs, pad=pad)
return (X, lengths), finish_update
def _zero_init(model):
def _zero_init_impl(self, *args, **kwargs):
self.W.fill(0)
model.on_init_hooks.append(_zero_init_impl)
if model.W is not None:
model.W.fill(0.0)
return model
def with_cpu(ops, model):
"""Wrap a model that should run on CPU, transferring inputs and outputs
as necessary."""
model.to_cpu()
def with_cpu_forward(inputs, drop=0.0):
cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop)
gpu_outputs = _to_device(ops, cpu_outputs)
def with_cpu_backprop(d_outputs, sgd=None):
cpu_d_outputs = _to_cpu(d_outputs)
return backprop(cpu_d_outputs, sgd=sgd)
return gpu_outputs, with_cpu_backprop
return wrap(with_cpu_forward, model)
def _to_cpu(X):
if isinstance(X, numpy.ndarray):
return X
elif isinstance(X, tuple):
return tuple([_to_cpu(x) for x in X])
elif isinstance(X, list):
return [_to_cpu(x) for x in X]
elif hasattr(X, "get"):
return X.get()
else:
return X
def _to_device(ops, X):
if isinstance(X, tuple):
return tuple([_to_device(ops, x) for x in X])
elif isinstance(X, list):
return [_to_device(ops, x) for x in X]
else:
return ops.asarray(X)
class extract_ngrams(Model):
def __init__(self, ngram_size, attr=LOWER):
Model.__init__(self)
self.ngram_size = ngram_size
self.attr = attr
def begin_update(self, docs, drop=0.0):
batch_keys = []
batch_vals = []
for doc in docs:
unigrams = doc.to_array([self.attr])
ngrams = [unigrams]
for n in range(2, self.ngram_size + 1):
ngrams.append(self.ops.ngrams(n, unigrams))
keys = self.ops.xp.concatenate(ngrams)
keys, vals = self.ops.xp.unique(keys, return_counts=True)
batch_keys.append(keys)
batch_vals.append(vals)
# The dtype here matches what thinc is expecting -- which differs per
# platform (by int definition). This should be fixed once the problem
# is fixed on Thinc's side.
lengths = self.ops.asarray(
[arr.shape[0] for arr in batch_keys], dtype=numpy.int_
)
batch_keys = self.ops.xp.concatenate(batch_keys)
batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
return (batch_keys, batch_vals, lengths), None
@describe.on_data(
_set_dimensions_if_needed, lambda model, X, y: model.init_weights(model)
)
@describe.attributes(
nI=Dimension("Input size"),
nF=Dimension("Number of features"),
nO=Dimension("Output size"),
nP=Dimension("Maxout pieces"),
W=Synapses("Weights matrix", lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)),
pad=Synapses(
"Pad",
lambda obj: (1, obj.nF, obj.nO, obj.nP),
lambda M, ops: ops.normal_init(M, 1.0),
),
d_W=Gradient("W"),
d_pad=Gradient("pad"),
d_b=Gradient("b"),
)
class PrecomputableAffine(Model):
def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
Model.__init__(self, **kwargs)
self.nO = nO
self.nP = nP
self.nI = nI
self.nF = nF
def begin_update(self, X, drop=0.0):
Yf = self.ops.gemm(
X, self.W.reshape((self.nF * self.nO * self.nP, self.nI)), trans2=True
)
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
Yf = self._add_padding(Yf)
def backward(dY_ids, sgd=None):
dY, ids = dY_ids
dY, ids = self._backprop_padding(dY, ids)
Xf = X[ids]
Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))
self.d_b += dY.sum(axis=0)
dY = dY.reshape((dY.shape[0], self.nO * self.nP))
Wopfi = self.W.transpose((1, 2, 0, 3))
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
Wopfi = Wopfi.reshape((self.nO * self.nP, self.nF * self.nI))
dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO * self.nP)), Wopfi)
# Reuse the buffer
dWopfi = Wopfi
dWopfi.fill(0.0)
self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
# (o, p, f, i) --> (f, o, p, i)
self.d_W += dWopfi.transpose((2, 0, 1, 3))
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
return dXf.reshape((dXf.shape[0], self.nF, self.nI))
return Yf, backward
def _add_padding(self, Yf):
Yf_padded = self.ops.xp.vstack((self.pad, Yf))
return Yf_padded
def _backprop_padding(self, dY, ids):
# (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
mask = ids < 0.0
mask = mask.sum(axis=1)
d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
self.d_pad += d_pad.sum(axis=0)
return dY, ids
@staticmethod
def init_weights(model):
"""This is like the 'layer sequential unit variance', but instead
of taking the actual inputs, we randomly generate whitened data.
Why's this all so complicated? We have a huge number of inputs,
and the maxout unit makes guessing the dynamics tricky. Instead
we set the maxout weights to values that empirically result in
whitened outputs given whitened inputs.
"""
if (model.W ** 2).sum() != 0.0:
return
ops = model.ops
xp = ops.xp
ops.normal_init(model.W, model.nF * model.nI, inplace=True)
ids = ops.allocate((5000, model.nF), dtype="f")
ids += xp.random.uniform(0, 1000, ids.shape)
ids = ops.asarray(ids, dtype="i")
tokvecs = ops.allocate((5000, model.nI), dtype="f")
tokvecs += xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
tokvecs.shape
)
def predict(ids, tokvecs):
# nS ids. nW tokvecs. Exclude the padding array.
hiddens = model(tokvecs[:-1]) # (nW, f, o, p)
vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype="f")
# need nS vectors
hiddens = hiddens.reshape(
(hiddens.shape[0] * model.nF, model.nO * model.nP)
)
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP))
vectors += model.b
vectors = model.ops.asarray(vectors)
if model.nP >= 2:
return model.ops.maxout(vectors)[0]
else:
return vectors * (vectors >= 0)
tol_var = 0.01
tol_mean = 0.01
t_max = 10
t_i = 0
for t_i in range(t_max):
acts1 = predict(ids, tokvecs)
var = model.ops.xp.var(acts1)
mean = model.ops.xp.mean(acts1)
if abs(var - 1.0) >= tol_var:
model.W /= model.ops.xp.sqrt(var)
elif abs(mean) >= tol_mean:
model.b -= mean
else:
break
def link_vectors_to_models(vocab, skip_rank=False):
vectors = vocab.vectors
if vectors.name is None:
vectors.name = VECTORS_KEY
if vectors.data.size != 0:
warnings.warn(Warnings.W020.format(shape=vectors.data.shape))
ops = Model.ops
if not skip_rank:
for word in vocab:
if word.orth in vectors.key2row:
word.rank = vectors.key2row[word.orth]
else:
word.rank = util.OOV_RANK
data = ops.asarray(vectors.data)
# Set an entry here, so that vectors are accessed by StaticVectors
# (unideal, I know)
key = (ops.device, vectors.name)
if key in thinc.extra.load_nlp.VECTORS:
if thinc.extra.load_nlp.VECTORS[key].shape != data.shape:
# This is a hack to avoid the problem in #3853.
old_name = vectors.name
new_name = vectors.name + "_%d" % data.shape[0]
warnings.warn(Warnings.W019.format(old=old_name, new=new_name))
vectors.name = new_name
key = (ops.device, vectors.name)
thinc.extra.load_nlp.VECTORS[key] = data
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
import torch.nn
from thinc.api import with_square_sequences
from thinc.extra.wrappers import PyTorchWrapperRNN
if depth == 0:
return layerize(noop())
model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
return with_square_sequences(PyTorchWrapperRNN(model))
def Tok2Vec(width, embed_size, **kwargs):
if not USE_MODEL_REGISTRY_TOK2VEC:
# Preserve prior tok2vec for backwards compat, in v2.2.2
return _legacy_tok2vec.Tok2Vec(width, embed_size, **kwargs)
pretrained_vectors = kwargs.get("pretrained_vectors", None)
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
subword_features = kwargs.get("subword_features", True)
char_embed = kwargs.get("char_embed", False)
conv_depth = kwargs.get("conv_depth", 4)
bilstm_depth = kwargs.get("bilstm_depth", 0)
conv_window = kwargs.get("conv_window", 1)
cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
doc2feats_cfg = {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}}
if char_embed:
embed_cfg = {
"arch": "spacy.CharacterEmbed.v1",
"config": {
"width": 64,
"chars": 6,
"@mix": {
"arch": "spacy.LayerNormalizedMaxout.v1",
"config": {"width": width, "pieces": 3},
},
"@embed_features": None,
},
}
else:
embed_cfg = {
"arch": "spacy.MultiHashEmbed.v1",
"config": {
"width": width,
"rows": embed_size,
"columns": cols,
"use_subwords": subword_features,
"@pretrained_vectors": None,
"@mix": {
"arch": "spacy.LayerNormalizedMaxout.v1",
"config": {"width": width, "pieces": 3},
},
},
}
if pretrained_vectors:
embed_cfg["config"]["@pretrained_vectors"] = {
"arch": "spacy.PretrainedVectors.v1",
"config": {
"vectors_name": pretrained_vectors,
"width": width,
"column": cols.index("ID"),
},
}
if cnn_maxout_pieces >= 2:
cnn_cfg = {
"arch": "spacy.MaxoutWindowEncoder.v1",
"config": {
"width": width,
"window_size": conv_window,
"pieces": cnn_maxout_pieces,
"depth": conv_depth,
},
}
else:
cnn_cfg = {
"arch": "spacy.MishWindowEncoder.v1",
"config": {"width": width, "window_size": conv_window, "depth": conv_depth},
}
bilstm_cfg = {
"arch": "spacy.TorchBiLSTMEncoder.v1",
"config": {"width": width, "depth": bilstm_depth},
}
if conv_depth == 0 and bilstm_depth == 0:
encode_cfg = {}
elif conv_depth >= 1 and bilstm_depth >= 1:
encode_cfg = {
"arch": "thinc.FeedForward.v1",
"config": {"children": [cnn_cfg, bilstm_cfg]},
}
elif conv_depth >= 1:
encode_cfg = cnn_cfg
else:
encode_cfg = bilstm_cfg
config = {"@doc2feats": doc2feats_cfg, "@embed": embed_cfg, "@encode": encode_cfg}
return new_ml.Tok2Vec(config)
def reapply(layer, n_times):
def reapply_fwd(X, drop=0.0):
backprops = []
for i in range(n_times):
Y, backprop = layer.begin_update(X, drop=drop)
X = Y
backprops.append(backprop)
def reapply_bwd(dY, sgd=None):
dX = None
for backprop in reversed(backprops):
dY = backprop(dY, sgd=sgd)
if dX is None:
dX = dY
else:
dX += dY
return dX
return Y, reapply_bwd
return wrap(reapply_fwd, layer)
def asarray(ops, dtype):
def forward(X, drop=0.0):
return ops.asarray(X, dtype=dtype), None
return layerize(forward)
def _divide_array(X, size):
parts = []
index = 0
while index < len(X):
parts.append(X[index : index + size])
index += size
return parts
def get_col(idx):
if idx < 0:
raise IndexError(Errors.E066.format(value=idx))
def forward(X, drop=0.0):
if isinstance(X, numpy.ndarray):
ops = NumpyOps()
else:
ops = CupyOps()
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
def backward(y, sgd=None):
dX = ops.allocate(X.shape)
dX[:, idx] += y
return dX
return output, backward
return layerize(forward)
def doc2feats(cols=None):
if cols is None:
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
def forward(docs, drop=0.0):
feats = []
for doc in docs:
feats.append(doc.to_array(cols))
return feats, None
model = layerize(forward)
model.cols = cols
return model
def print_shape(prefix):
def forward(X, drop=0.0):
return X, lambda dX, **kwargs: dX
return layerize(forward)
@layerize
def get_token_vectors(tokens_attrs_vectors, drop=0.0):
tokens, attrs, vectors = tokens_attrs_vectors
def backward(d_output, sgd=None):
return (tokens, d_output)
return vectors, backward
@layerize
def logistic(X, drop=0.0):
xp = get_array_module(X)
if not isinstance(X, xp.ndarray):
X = xp.asarray(X)
# Clip to range (-10, 10)
X = xp.minimum(X, 10.0, X)
X = xp.maximum(X, -10.0, X)
Y = 1.0 / (1.0 + xp.exp(-X))
def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1 - Y))
return dX
return Y, logistic_bwd
def zero_init(model):
def _zero_init_impl(self, X, y):
self.W.fill(0)
model.on_data_hooks.append(_zero_init_impl)
return model
def getitem(i):
def getitem_fwd(X, drop=0.0):
return X[i], None
return layerize(getitem_fwd)
@describe.attributes(
W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None)
)
class MultiSoftmax(Affine):
"""Neural network layer that predicts several multi-class attributes at once.
For instance, we might predict one class with 6 variables, and another with 5.
We predict the 11 neurons required for this, and then softmax them such
that columns 0-6 make a probability distribution and columns 6-11 make another.
"""
name = "multisoftmax"
def __init__(self, out_sizes, nI=None, **kwargs):
Model.__init__(self, **kwargs)
self.out_sizes = out_sizes
self.nO = sum(out_sizes)
self.nI = nI
def predict(self, input__BI):
output__BO = self.ops.affine(self.W, self.b, input__BI)
i = 0
for out_size in self.out_sizes:
self.ops.softmax(output__BO[:, i : i + out_size], inplace=True)
i += out_size
return output__BO
def begin_update(self, input__BI, drop=0.0):
output__BO = self.predict(input__BI)
def finish_update(grad__BO, sgd=None):
self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True)
self.d_b += grad__BO.sum(axis=0)
grad__BI = self.ops.gemm(grad__BO, self.W)
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
return grad__BI
return output__BO, finish_update
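As a concrete illustration of the docstring above (for out_sizes = [6, 5], the 11 output neurons are normalised as two independent distributions), a standalone numpy sketch that is not tied to thinc:
import numpy as np
def blockwise_softmax(scores, out_sizes):
    # Normalise each block of columns independently, as MultiSoftmax.predict does.
    out = scores.astype(float).copy()
    i = 0
    for size in out_sizes:
        block = np.exp(out[:, i:i + size] - out[:, i:i + size].max(axis=1, keepdims=True))
        out[:, i:i + size] = block / block.sum(axis=1, keepdims=True)
        i += size
    return out
probs = blockwise_softmax(np.random.randn(3, 11), [6, 5])
# probs[:, :6] and probs[:, 6:] each sum to 1.0 per row (up to float error)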
def build_tagger_model(nr_class, **cfg):
embed_size = util.env_opt("embed_size", 2000)
if "token_vector_width" in cfg:
token_vector_width = cfg["token_vector_width"]
else:
token_vector_width = util.env_opt("token_vector_width", 96)
pretrained_vectors = cfg.get("pretrained_vectors")
subword_features = cfg.get("subword_features", True)
with Model.define_operators({">>": chain, "+": add}):
if "tok2vec" in cfg:
tok2vec = cfg["tok2vec"]
else:
tok2vec = Tok2Vec(
token_vector_width,
embed_size,
subword_features=subword_features,
pretrained_vectors=pretrained_vectors,
)
softmax = with_flatten(Softmax(nr_class, token_vector_width))
model = tok2vec >> softmax
model.nI = None
model.tok2vec = tok2vec
model.softmax = softmax
return model
def build_morphologizer_model(class_nums, **cfg):
embed_size = util.env_opt("embed_size", 7000)
if "token_vector_width" in cfg:
token_vector_width = cfg["token_vector_width"]
else:
token_vector_width = util.env_opt("token_vector_width", 128)
pretrained_vectors = cfg.get("pretrained_vectors")
char_embed = cfg.get("char_embed", True)
with Model.define_operators({">>": chain, "+": add, "**": clone}):
if "tok2vec" in cfg:
tok2vec = cfg["tok2vec"]
else:
tok2vec = Tok2Vec(
token_vector_width,
embed_size,
char_embed=char_embed,
pretrained_vectors=pretrained_vectors,
)
softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
softmax.out_sizes = class_nums
model = tok2vec >> softmax
model.nI = None
model.tok2vec = tok2vec
model.softmax = softmax
return model
@layerize
def SpacyVectors(docs, drop=0.0):
batch = []
for doc in docs:
indices = numpy.zeros((len(doc),), dtype="i")
for i, word in enumerate(doc):
if word.orth in doc.vocab.vectors.key2row:
indices[i] = doc.vocab.vectors.key2row[word.orth]
else:
indices[i] = 0
vectors = doc.vocab.vectors.data[indices]
batch.append(vectors)
return batch, None
def build_text_classifier(nr_class, width=64, **cfg):
depth = cfg.get("depth", 2)
nr_vector = cfg.get("nr_vector", 5000)
pretrained_dims = cfg.get("pretrained_dims", 0)
with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
if cfg.get("low_data") and pretrained_dims:
model = (
SpacyVectors
>> flatten_add_lengths
>> with_getitem(0, Affine(width, pretrained_dims))
>> ParametricAttention(width)
>> Pooling(sum_pool)
>> Residual(ReLu(width, width)) ** 2
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
>> logistic
)
return model
lower = HashEmbed(width, nr_vector, column=1)
prefix = HashEmbed(width // 2, nr_vector, column=2)
suffix = HashEmbed(width // 2, nr_vector, column=3)
shape = HashEmbed(width // 2, nr_vector, column=4)
trained_vectors = FeatureExtracter(
[ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
) >> with_flatten(
uniqued(
(lower | prefix | suffix | shape)
>> LN(Maxout(width, width + (width // 2) * 3)),
column=0,
)
)
if pretrained_dims:
static_vectors = SpacyVectors >> with_flatten(
Affine(width, pretrained_dims)
)
# TODO Make concatenate support lists
vectors = concatenate_lists(trained_vectors, static_vectors)
vectors_width = width * 2
else:
vectors = trained_vectors
vectors_width = width
static_vectors = None
tok2vec = vectors >> with_flatten(
LN(Maxout(width, vectors_width))
>> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
pad=depth,
)
cnn_model = (
tok2vec
>> flatten_add_lengths
>> ParametricAttention(width)
>> Pooling(sum_pool)
>> Residual(zero_init(Maxout(width, width)))
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
)
linear_model = build_bow_text_classifier(
nr_class,
ngram_size=cfg.get("ngram_size", 1),
exclusive_classes=cfg.get("exclusive_classes", False),
)
if cfg.get("exclusive_classes", False):
output_layer = Softmax(nr_class, nr_class * 2)
else:
output_layer = (
zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
)
model = (linear_model | cnn_model) >> output_layer
model.tok2vec = chain(tok2vec, flatten)
model.nO = nr_class
model.lsuv = False
return model
def build_bow_text_classifier(
nr_class, ngram_size=1, exclusive_classes=False, no_output_layer=False, **cfg
):
with Model.define_operators({">>": chain}):
model = with_cpu(
Model.ops, extract_ngrams(ngram_size, attr=ORTH) >> LinearModel(nr_class)
)
if not no_output_layer:
model = model >> (cpu_softmax if exclusive_classes else logistic)
model.nO = nr_class
return model
@layerize
def cpu_softmax(X, drop=0.0):
ops = NumpyOps()
def cpu_softmax_backward(dY, sgd=None):
return dY
return ops.softmax(X), cpu_softmax_backward
def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
"""
Build a simple CNN text classifier, given a token-to-vector model as inputs.
If exclusive_classes=True, a softmax non-linearity is applied, so that the
outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
is applied instead, so that outputs are in the range [0, 1].
"""
with Model.define_operators({">>": chain}):
if exclusive_classes:
output_layer = Softmax(nr_class, tok2vec.nO)
else:
output_layer = (
zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
)
model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
model.tok2vec = chain(tok2vec, flatten)
model.nO = nr_class
return model
def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
if "entity_width" not in cfg:
raise ValueError(Errors.E144.format(param="entity_width"))
conv_depth = cfg.get("conv_depth", 2)
cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
pretrained_vectors = cfg.get("pretrained_vectors", None)
context_width = cfg.get("entity_width")
with Model.define_operators({">>": chain, "**": clone}):
# context encoder
tok2vec = Tok2Vec(
width=hidden_width,
embed_size=embed_width,
pretrained_vectors=pretrained_vectors,
cnn_maxout_pieces=cnn_maxout_pieces,
subword_features=True,
conv_depth=conv_depth,
bilstm_depth=0,
)
model = (
tok2vec
>> flatten_add_lengths
>> Pooling(mean_pool)
>> Residual(zero_init(Maxout(hidden_width, hidden_width)))
>> zero_init(Affine(context_width, hidden_width, drop_factor=0.0))
)
model.tok2vec = tok2vec
model.nO = context_width
return model
@layerize
def flatten(seqs, drop=0.0):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=0)
X = ops.flatten(seqs, pad=0)
return X, finish_update
def concatenate_lists(*layers, **kwargs): # pragma: no cover
"""Compose two or more models `f`, `g`, etc, such that their outputs are
concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
"""
if not layers:
return noop()
drop_factor = kwargs.get("drop_factor", 1.0)
ops = layers[0].ops
layers = [chain(layer, flatten) for layer in layers]
concat = concatenate(*layers)
def concatenate_lists_fwd(Xs, drop=0.0):
if drop is not None:
drop *= drop_factor
lengths = ops.asarray([len(X) for X in Xs], dtype="i")
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
ys = ops.unflatten(flat_y, lengths)
def concatenate_lists_bwd(d_ys, sgd=None):
return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
return ys, concatenate_lists_bwd
model = wrap(concatenate_lists_fwd, concat)
return model
def masked_language_model(vocab, model, mask_prob=0.15):
"""Convert a model into a BERT-style masked language model"""
random_words = _RandomWords(vocab)
def mlm_forward(docs, drop=0.0):
mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
output, backprop = model.begin_update(docs, drop=drop)
def mlm_backward(d_output, sgd=None):
d_output *= 1 - mask
return backprop(d_output, sgd=sgd)
return output, mlm_backward
return wrap(mlm_forward, model)
class _RandomWords(object):
def __init__(self, vocab):
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
self.words = self.words[:10000]
self.probs = self.probs[:10000]
self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
self.probs /= self.probs.sum()
self._cache = []
def next(self):
if not self._cache:
self._cache.extend(
numpy.random.choice(len(self.words), 10000, p=self.probs)
)
index = self._cache.pop()
return self.words[index]
def _apply_mask(docs, random_words, mask_prob=0.15):
# This needs to be here to avoid circular imports
from .tokens.doc import Doc
N = sum(len(doc) for doc in docs)
mask = numpy.random.uniform(0.0, 1.0, (N,))
mask = mask >= mask_prob
i = 0
masked_docs = []
for doc in docs:
words = []
for token in doc:
if not mask[i]:
word = _replace_word(token.text, random_words)
else:
word = token.text
words.append(word)
i += 1
spaces = [bool(w.whitespace_) for w in doc]
# NB: If you change this implementation to instead modify
# the docs in place, take care that the IDs reflect the original
# words. Currently we use the original docs to make the vectors
# for the target, so we don't lose the original tokens. But if
# you modified the docs in place here, you would.
masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces))
return mask, masked_docs
def _replace_word(word, random_words, mask="[MASK]"):
roll = numpy.random.random()
if roll < 0.8:
return mask
elif roll < 0.9:
return random_words.next()
else:
return word
def _uniform_init(lo, hi):
def wrapped(W, ops):
copy_array(W, ops.xp.random.uniform(lo, hi, W.shape))
return wrapped
@describe.attributes(
nM=Dimension("Vector dimensions"),
nC=Dimension("Number of characters per word"),
vectors=Synapses(
"Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1)
),
d_vectors=Gradient("vectors"),
)
class CharacterEmbed(Model):
def __init__(self, nM=None, nC=None, **kwargs):
Model.__init__(self, **kwargs)
self.nM = nM
self.nC = nC
@property
def nO(self):
return self.nM * self.nC
@property
def nV(self):
return 256
def begin_update(self, docs, drop=0.0):
if not docs:
return []
ids = []
output = []
weights = self.vectors
# This assists in indexing; it's like looping over this dimension.
# Still consider this weird witchcraft... but thanks to Mark Neumann
# for the tip.
nCv = self.ops.xp.arange(self.nC)
for doc in docs:
doc_ids = doc.to_utf8_array(nr_char=self.nC)
doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
# Let's say I have a 2d array of indices, and a 3d table of data. What numpy
# incantation do I chant to get
# output[i, j, k] == data[j, ids[i, j], k]?
doc_vectors[:, nCv] = weights[nCv, doc_ids[:, nCv]]
output.append(doc_vectors.reshape((len(doc), self.nO)))
ids.append(doc_ids)
def backprop_character_embed(d_vectors, sgd=None):
gradient = self.d_vectors
for doc_ids, d_doc_vectors in zip(ids, d_vectors):
d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), self.nC, self.nM))
gradient[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
return None
return output, backprop_character_embed
def get_cossim_loss(yh, y, ignore_zeros=False):
xp = get_array_module(yh)
# Find the zero vectors
if ignore_zeros:
zero_indices = xp.abs(y).sum(axis=1) == 0
# Add a small constant to avoid 0 vectors
yh = yh + 1e-8
y = y + 1e-8
# https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
mul_norms = norm_yh * norm_y
cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
losses = xp.abs(cosine - 1)
if ignore_zeros:
# If the target was a zero vector, don't count it in the loss.
d_yh[zero_indices] = 0
losses[zero_indices] = 0
loss = losses.sum()
return loss, -d_yh
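For reference, the gradient computed above is the standard partial derivative of cosine similarity (see the Stack Exchange link in the code): with per-row loss L = |cos(yh, y) - 1|,
$$\frac{\partial}{\partial \hat{y}} \cos(\hat{y}, y) = \frac{y}{\lVert \hat{y} \rVert\,\lVert y \rVert} - \cos(\hat{y}, y)\,\frac{\hat{y}}{\lVert \hat{y} \rVert^{2}},$$
and because cos(yh, y) <= 1, the loss gradient is the negation of this derivative, which is exactly the -d_yh the function returns.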

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "2.3.0"
__version__ = "3.0.0.dev9"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -91,6 +91,7 @@ cdef enum attr_id_t:
LANG
ENT_KB_ID = symbols.ENT_KB_ID
MORPH
ENT_ID = symbols.ENT_ID
IDX

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
IDS = {
"": NULL_ATTR,
@ -92,6 +89,7 @@ IDS = {
"SPACY": SPACY,
"PROB": PROB,
"LANG": LANG,
"MORPH": MORPH,
"IDX": IDX
}

View File

@ -1,12 +1,20 @@
from wasabi import msg
from .download import download # noqa: F401
from .info import info # noqa: F401
from .link import link # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train import train # noqa: F401
from .train_from_config import train_cli # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
from .init_model import init_model # noqa: F401
from .validate import validate # noqa: F401
def link(*args, **kwargs):
msg.warn(
"As of spaCy v3.0, model symlinks are deprecated. You can load models "
"using their full names or from a directory path."
)

View File

@ -1,220 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
# NB: This schema describes the new format of the training data, see #2928
TRAINING_SCHEMA = {
"$schema": "http://json-schema.org/draft-06/schema",
"title": "Training data for spaCy models",
"type": "array",
"items": {
"type": "object",
"properties": {
"text": {
"title": "The text of the training example",
"type": "string",
"minLength": 1,
},
"ents": {
"title": "Named entity spans in the text",
"type": "array",
"items": {
"type": "object",
"properties": {
"start": {
"title": "Start character offset of the span",
"type": "integer",
"minimum": 0,
},
"end": {
"title": "End character offset of the span",
"type": "integer",
"minimum": 0,
},
"label": {
"title": "Entity label",
"type": "string",
"minLength": 1,
"pattern": "^[A-Z0-9]*$",
},
},
"required": ["start", "end", "label"],
},
},
"sents": {
"title": "Sentence spans in the text",
"type": "array",
"items": {
"type": "object",
"properties": {
"start": {
"title": "Start character offset of the span",
"type": "integer",
"minimum": 0,
},
"end": {
"title": "End character offset of the span",
"type": "integer",
"minimum": 0,
},
},
"required": ["start", "end"],
},
},
"cats": {
"title": "Text categories for the text classifier",
"type": "object",
"patternProperties": {
"*": {
"title": "A text category",
"oneOf": [
{"type": "boolean"},
{"type": "number", "minimum": 0},
],
}
},
"propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1},
},
"tokens": {
"title": "The tokens in the text",
"type": "array",
"items": {
"type": "object",
"minProperties": 1,
"properties": {
"id": {
"title": "Token ID, usually token index",
"type": "integer",
"minimum": 0,
},
"start": {
"title": "Start character offset of the token",
"type": "integer",
"minimum": 0,
},
"end": {
"title": "End character offset of the token",
"type": "integer",
"minimum": 0,
},
"pos": {
"title": "Coarse-grained part-of-speech tag",
"type": "string",
"minLength": 1,
},
"tag": {
"title": "Fine-grained part-of-speech tag",
"type": "string",
"minLength": 1,
},
"dep": {
"title": "Dependency label",
"type": "string",
"minLength": 1,
},
"head": {
"title": "Index of the token's head",
"type": "integer",
"minimum": 0,
},
},
"required": ["start", "end"],
},
},
"_": {"title": "Custom user space", "type": "object"},
},
"required": ["text"],
},
}
META_SCHEMA = {
"$schema": "http://json-schema.org/draft-06/schema",
"type": "object",
"properties": {
"lang": {
"title": "Two-letter language code, e.g. 'en'",
"type": "string",
"minLength": 2,
"maxLength": 2,
"pattern": "^[a-z]*$",
},
"name": {
"title": "Model name",
"type": "string",
"minLength": 1,
"pattern": "^[a-z_]*$",
},
"version": {
"title": "Model version",
"type": "string",
"minLength": 1,
"pattern": "^[0-9a-z.-]*$",
},
"spacy_version": {
"title": "Compatible spaCy version identifier",
"type": "string",
"minLength": 1,
"pattern": "^[0-9a-z.-><=]*$",
},
"parent_package": {
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
"type": "string",
"minLength": 1,
"default": "spacy",
},
"pipeline": {
"title": "Names of pipeline components",
"type": "array",
"items": {"type": "string", "minLength": 1},
},
"description": {"title": "Model description", "type": "string"},
"license": {"title": "Model license", "type": "string"},
"author": {"title": "Model author name", "type": "string"},
"email": {"title": "Model author email", "type": "string", "format": "email"},
"url": {"title": "Model author URL", "type": "string", "format": "uri"},
"sources": {
"title": "Training data sources",
"type": "array",
"items": {"type": "string"},
},
"vectors": {
"title": "Included word vectors",
"type": "object",
"properties": {
"keys": {
"title": "Number of unique keys",
"type": "integer",
"minimum": 0,
},
"vectors": {
"title": "Number of unique vectors",
"type": "integer",
"minimum": 0,
},
"width": {
"title": "Number of dimensions",
"type": "integer",
"minimum": 0,
},
},
},
"accuracy": {
"title": "Accuracy numbers",
"type": "object",
"patternProperties": {"*": {"type": "number", "minimum": 0.0}},
},
"speed": {
"title": "Speed evaluation numbers",
"type": "object",
"patternProperties": {
"*": {
"oneOf": [
{"type": "number", "minimum": 0.0},
{"type": "integer", "minimum": 0},
]
}
},
},
},
"required": ["lang", "name", "version"],
}
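These jsonschema definitions are removed here; the dependency changes elsewhere in this merge swap jsonschema for pydantic. As a purely hypothetical sketch (not spaCy's actual schema module) of how the same meta fields could be validated with pydantic v1, which is what the new requirements pin:
from typing import List, Optional
from pydantic import BaseModel, constr
class ModelMetaSketch(BaseModel):
    # Field constraints mirror the META_SCHEMA patterns above.
    lang: constr(min_length=2, max_length=2, regex="^[a-z]*$")
    name: constr(min_length=1, regex="^[a-z_]*$")
    version: constr(min_length=1, regex="^[0-9a-z.-]*$")
    spacy_version: Optional[str] = None
    description: Optional[str] = None
    pipeline: List[str] = []
meta = ModelMetaSketch(lang="en", name="core_web_sm", version="3.0.0")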

View File

@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
import plac
from pathlib import Path
from wasabi import Printer
import srsly
@ -29,27 +25,20 @@ FILE_TYPES = ("json", "jsonl", "msg")
FILE_TYPES_STDOUT = ("json", "jsonl")
@plac.annotations(
input_file=("Input file", "positional", None, str),
output_dir=("Output directory. '-' for stdout.", "positional", None, str),
file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
model=("Model for sentence segmentation (for -s)", "option", "b", str),
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
lang=("Language (if tokenizer required)", "option", "l", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool),
)
def convert(
input_file,
output_dir="-",
file_type="json",
n_sents=1,
seg_sents=False,
model=None,
morphology=False,
converter="auto",
lang=None,
# fmt: off
input_file: ("Input file", "positional", None, str),
output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-",
file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json",
n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
lang: ("Language (if tokenizer required)", "option", "l", str) = None,
# fmt: on
):
"""
Convert files into JSON format for use with train command and other
@ -60,16 +49,10 @@ def convert(
no_print = output_dir == "-"
msg = Printer(no_print=no_print)
input_path = Path(input_file)
if file_type not in FILE_TYPES:
msg.fail(
"Unknown file type: '{}'".format(file_type),
"Supported file types: '{}'".format(", ".join(FILE_TYPES)),
exits=1,
)
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
# TODO: support msgpack via stdout in srsly?
msg.fail(
"Can't write .{} data to stdout.".format(file_type),
f"Can't write .{file_type} data to stdout",
"Please specify an output directory.",
exits=1,
)
@ -93,21 +76,26 @@ def convert(
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
)
if converter not in CONVERTERS:
msg.fail("Can't find converter for {}".format(converter), exits=1)
msg.fail(f"Can't find converter for {converter}", exits=1)
ner_map = None
if ner_map_path is not None:
ner_map = srsly.read_json(ner_map_path)
# Use converter function to convert data
func = CONVERTERS[converter]
data = func(
input_data,
n_sents=n_sents,
seg_sents=seg_sents,
use_morphology=morphology,
append_morphology=morphology,
merge_subtokens=merge_subtokens,
lang=lang,
model=model,
no_print=no_print,
ner_map=ner_map,
)
if output_dir != "-":
# Export data to a file
suffix = ".{}".format(file_type)
suffix = f".{file_type}"
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
if file_type == "json":
srsly.write_json(output_file, data)
@ -115,9 +103,7 @@ def convert(
srsly.write_jsonl(output_file, data)
elif file_type == "msg":
srsly.write_msgpack(output_file, data)
msg.good(
"Generated output file ({} documents): {}".format(len(data), output_file)
)
msg.good(f"Generated output file ({len(data)} documents): {output_file}")
else:
# Print to stdout
if file_type == "json":

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from wasabi import Printer
from ...gold import iob_to_biluo
@ -64,9 +61,9 @@ def conll_ner2json(
# sentence segmentation required for document segmentation
if n_sents > 0 and not seg_sents:
msg.warn(
"No sentence boundaries found to use with option `-n {}`. "
"Use `-s` to automatically segment sentences or `-n 0` "
"to disable.".format(n_sents)
f"No sentence boundaries found to use with option `-n {n_sents}`. "
f"Use `-s` to automatically segment sentences or `-n 0` "
f"to disable."
)
else:
n_sents_info(msg, n_sents)
@ -129,7 +126,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
if model:
nlp = load_model(model)
if "parser" in nlp.pipe_names:
msg.info("Segmenting sentences with parser from model '{}'.".format(model))
msg.info(f"Segmenting sentences with parser from model '{model}'.")
sentencizer = nlp.get_pipe("parser")
if not sentencizer:
msg.info(
@ -166,7 +163,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):
def n_sents_info(msg, n_sents):
msg.info("Grouping every {} sentences into a document.".format(n_sents))
msg.info(f"Grouping every {n_sents} sentences into a document.")
if n_sents == 1:
msg.warn(
"To generate better training data, you may want to group "


@ -1,141 +1,349 @@
# coding: utf8
from __future__ import unicode_literals
import re
from ...gold import iob_to_biluo
from ...gold import Example
from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
from ...language import Language
from ...tokens import Doc, Token
from .conll_ner2json import n_sents_info
from wasabi import Printer
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
def conllu2json(
input_data,
n_sents=10,
append_morphology=False,
lang=None,
ner_map=None,
merge_subtokens=False,
no_print=False,
**_
):
"""
Convert conllu files into JSON format for use with train cli.
use_morphology parameter enables appending morphology to tags, which is
append_morphology parameter enables appending morphology to tags, which is
useful for languages such as Spanish, where UD tags are not so rich.
Extract NER tags if available and convert them so that they follow
BILUO and the Wikipedia scheme
"""
# by @dvsrepo, via #11 explosion/spacy-dev-resources
# by @katarkor
MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
msg = Printer(no_print=no_print)
n_sents_info(msg, n_sents)
docs = []
raw = ""
sentences = []
conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
checked_for_ner = False
has_ner_tags = False
for i, (raw_text, tokens) in enumerate(conll_tuples):
sentence, brackets = tokens[0]
if not checked_for_ner:
has_ner_tags = is_ner(sentence[5][0])
checked_for_ner = True
sentences.append(generate_sentence(sentence, has_ner_tags))
conll_data = read_conllx(
input_data,
append_morphology=append_morphology,
ner_tag_pattern=MISC_NER_PATTERN,
ner_map=ner_map,
merge_subtokens=merge_subtokens,
)
has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
for i, example in enumerate(conll_data):
raw += example.text
sentences.append(
generate_sentence(
example.token_annotation,
has_ner_tags,
MISC_NER_PATTERN,
ner_map=ner_map,
)
)
# Real-sized documents could be extracted using the comments on the
# conluu document
# conllu document
if len(sentences) % n_sents == 0:
doc = create_doc(sentences, i)
doc = create_json_doc(raw, sentences, i)
docs.append(doc)
raw = ""
sentences = []
if sentences:
doc = create_doc(sentences, i)
doc = create_json_doc(raw, sentences, i)
docs.append(doc)
return docs
def is_ner(tag):
def has_ner(input_data, ner_tag_pattern):
"""
Check the 10th column of the first token to determine if the file contains
NER tags
Check the MISC column for NER tags.
"""
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
if tag_match:
return True
elif tag == "O":
return True
else:
return False
def read_conllx(input_data, use_morphology=False, n=0):
i = 0
for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n")
if lines:
while lines[0].startswith("#"):
lines.pop(0)
tokens = []
for line in lines:
parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
if "-" in id_ or "." in id_:
continue
try:
id_ = int(id_) - 1
head = (int(head) - 1) if head not in ["0", "_"] else id_
dep = "ROOT" if dep == "root" else dep
tag = pos if tag == "_" else tag
tag = tag + "__" + morph if use_morphology else tag
iob = iob if iob else "O"
tokens.append((id_, word, tag, head, dep, iob))
except: # noqa: E722
print(line)
raise
tuples = [list(t) for t in zip(*tokens)]
yield (None, [[tuples, []]])
i += 1
if n >= 1 and i >= n:
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
for misc_part in misc.split("|"):
if re.match(ner_tag_pattern, misc_part):
return True
return False
def read_conllx(
input_data,
append_morphology=False,
merge_subtokens=False,
ner_tag_pattern="",
ner_map=None,
):
""" Yield examples, one for each sentence """
vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc
for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n")
if lines:
while lines[0].startswith("#"):
lines.pop(0)
example = example_from_conllu_sentence(
vocab,
lines,
ner_tag_pattern,
merge_subtokens=merge_subtokens,
append_morphology=append_morphology,
ner_map=ner_map,
)
yield example
def get_entities(lines, tag_pattern, ner_map=None):
"""Find entities in the MISC column according to the pattern and map to
final entity type with `ner_map` if mapping present. Entity tag is 'O' if
the pattern is not matched.
lines (str): CoNLL-U lines for one sentence
tag_pattern (str): Regex pattern for entity tag
ner_map (dict): Map old NER tag names to new ones, '' maps to O.
RETURNS (list): List of BILUO entity tags
"""
miscs = []
for line in lines:
parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
if "-" in id_ or "." in id_:
continue
miscs.append(misc)
iob = []
for misc in miscs:
iob_tag = "O"
for misc_part in misc.split("|"):
tag_match = re.match(tag_pattern, misc_part)
if tag_match:
prefix = tag_match.group(2)
suffix = tag_match.group(3)
if prefix and suffix:
iob_tag = prefix + "-" + suffix
if ner_map:
suffix = ner_map.get(suffix, suffix)
if suffix == "":
iob_tag = "O"
else:
iob_tag = prefix + "-" + suffix
break
iob.append(iob_tag)
return iob_to_biluo(iob)
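The MISC-column pattern above accepts an optional `name=`/`NE=` prefix, a BILU prefix and an uppercase entity type, or a bare `O`. A self-contained sketch of what it extracts:

```python
import re

# Same pattern as MISC_NER_PATTERN above.
pattern = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"

match = re.match(pattern, "name=B-GPE_LOC")
print(match.group(2), match.group(3))            # -> B GPE_LOC
print(bool(re.match(pattern, "O")))              # -> True
print(bool(re.match(pattern, "SpaceAfter=No")))  # -> False
```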
def simplify_tags(iob):
"""
Simplify tags obtained from the dataset in order to follow Wikipedia
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
'MISC'.
"""
new_iob = []
for tag in iob:
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
if tag_match:
prefix = tag_match.group(1)
suffix = tag_match.group(2)
if suffix == "GPE_LOC":
suffix = "LOC"
elif suffix == "GPE_ORG":
suffix = "ORG"
elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
suffix = "MISC"
tag = prefix + "-" + suffix
new_iob.append(tag)
return new_iob
def generate_sentence(sent, has_ner_tags):
(id_, word, tag, head, dep, iob) = sent
def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None):
sentence = {}
tokens = []
if has_ner_tags:
iob = simplify_tags(iob)
biluo = iob_to_biluo(iob)
for i, id in enumerate(id_):
for i, id_ in enumerate(token_annotation.ids):
token = {}
token["id"] = id
token["orth"] = word[i]
token["tag"] = tag[i]
token["head"] = head[i] - id
token["dep"] = dep[i]
token["id"] = id_
token["orth"] = token_annotation.get_word(i)
token["tag"] = token_annotation.get_tag(i)
token["pos"] = token_annotation.get_pos(i)
token["lemma"] = token_annotation.get_lemma(i)
token["morph"] = token_annotation.get_morph(i)
token["head"] = token_annotation.get_head(i) - id_
token["dep"] = token_annotation.get_dep(i)
if has_ner_tags:
token["ner"] = biluo[i]
token["ner"] = token_annotation.get_entity(i)
tokens.append(token)
sentence["tokens"] = tokens
return sentence
def create_doc(sentences, id):
def create_json_doc(raw, sentences, id_):
doc = {}
paragraph = {}
doc["id"] = id
doc["id"] = id_
doc["paragraphs"] = []
paragraph["raw"] = raw.strip()
paragraph["sentences"] = sentences
doc["paragraphs"].append(paragraph)
return doc
def example_from_conllu_sentence(
vocab,
lines,
ner_tag_pattern,
merge_subtokens=False,
append_morphology=False,
ner_map=None,
):
"""Create an Example from the lines for one CoNLL-U sentence, merging
subtokens and appending morphology to tags if required.
lines (str): The non-comment lines for a CoNLL-U sentence
ner_tag_pattern (str): The regex pattern for matching NER in MISC col
RETURNS (Example): An example containing the annotation
"""
# create a Doc with each subtoken as its own token
# if merging subtokens, each subtoken orth is the merged subtoken form
if not Token.has_extension("merged_orth"):
Token.set_extension("merged_orth", default="")
if not Token.has_extension("merged_lemma"):
Token.set_extension("merged_lemma", default="")
if not Token.has_extension("merged_morph"):
Token.set_extension("merged_morph", default="")
if not Token.has_extension("merged_spaceafter"):
Token.set_extension("merged_spaceafter", default="")
words, spaces, tags, poses, morphs, lemmas = [], [], [], [], [], []
heads, deps = [], []
subtok_word = ""
in_subtok = False
for i in range(len(lines)):
line = lines[i]
parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
if "." in id_:
continue
if "-" in id_:
in_subtok = True
if "-" in id_:
in_subtok = True
subtok_word = word
subtok_start, subtok_end = id_.split("-")
subtok_spaceafter = "SpaceAfter=No" not in misc
continue
if merge_subtokens and in_subtok:
words.append(subtok_word)
else:
words.append(word)
if in_subtok:
if id_ == subtok_end:
spaces.append(subtok_spaceafter)
else:
spaces.append(False)
elif "SpaceAfter=No" in misc:
spaces.append(False)
else:
spaces.append(True)
if in_subtok and id_ == subtok_end:
subtok_word = ""
in_subtok = False
id_ = int(id_) - 1
head = (int(head) - 1) if head not in ("0", "_") else id_
tag = pos if tag == "_" else tag
morph = morph if morph != "_" else ""
dep = "ROOT" if dep == "root" else dep
lemmas.append(lemma)
poses.append(pos)
tags.append(tag)
morphs.append(morph)
heads.append(head)
deps.append(dep)
doc = Doc(vocab, words=words, spaces=spaces)
for i in range(len(doc)):
doc[i].tag_ = tags[i]
doc[i].pos_ = poses[i]
doc[i].dep_ = deps[i]
doc[i].lemma_ = lemmas[i]
doc[i].head = doc[heads[i]]
doc[i]._.merged_orth = words[i]
doc[i]._.merged_morph = morphs[i]
doc[i]._.merged_lemma = lemmas[i]
doc[i]._.merged_spaceafter = spaces[i]
ents = get_entities(lines, ner_tag_pattern, ner_map)
doc.ents = spans_from_biluo_tags(doc, ents)
doc.is_parsed = True
doc.is_tagged = True
if merge_subtokens:
doc = merge_conllu_subtokens(lines, doc)
# create Example from custom Doc annotation
ids, words, tags, heads, deps = [], [], [], [], []
pos, lemmas, morphs, spaces = [], [], [], []
for i, t in enumerate(doc):
ids.append(i)
words.append(t._.merged_orth)
if append_morphology and t._.merged_morph:
tags.append(t.tag_ + "__" + t._.merged_morph)
else:
tags.append(t.tag_)
pos.append(t.pos_)
morphs.append(t._.merged_morph)
lemmas.append(t._.merged_lemma)
heads.append(t.head.i)
deps.append(t.dep_)
spaces.append(t._.merged_spaceafter)
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
ents = biluo_tags_from_offsets(doc, ent_offsets)
raw = ""
for word, space in zip(words, spaces):
raw += word
if space:
raw += " "
example = Example(doc=raw)
example.set_token_annotation(
ids=ids,
words=words,
tags=tags,
pos=pos,
morphs=morphs,
lemmas=lemmas,
heads=heads,
deps=deps,
entities=ents,
)
return example
def merge_conllu_subtokens(lines, doc):
# identify and process all subtoken spans to prepare attrs for merging
subtok_spans = []
for line in lines:
parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
if "-" in id_:
subtok_start, subtok_end = id_.split("-")
subtok_span = doc[int(subtok_start) - 1 : int(subtok_end)]
subtok_spans.append(subtok_span)
# create merged tag, morph, and lemma values
tags = []
morphs = {}
lemmas = []
for token in subtok_span:
tags.append(token.tag_)
lemmas.append(token.lemma_)
if token._.merged_morph:
for feature in token._.merged_morph.split("|"):
field, values = feature.split("=", 1)
if field not in morphs:
morphs[field] = set()
for value in values.split(","):
morphs[field].add(value)
# create merged features for each morph field
for field, values in morphs.items():
morphs[field] = field + "=" + ",".join(sorted(values))
# set the same attrs on all subtok tokens so that whatever head the
# retokenizer chooses, the final attrs are available on that token
for token in subtok_span:
token._.merged_orth = token.orth_
token._.merged_lemma = " ".join(lemmas)
token.tag_ = "_".join(tags)
token._.merged_morph = "|".join(sorted(morphs.values()))
token._.merged_spaceafter = (
True if subtok_span[-1].whitespace_ else False
)
with doc.retokenize() as retokenizer:
for span in subtok_spans:
retokenizer.merge(span)
return doc
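A rough end-to-end sketch of the new converter on a single two-token sentence with a NER tag in the MISC column. The import path and the `GREETING` label are assumptions for illustration; adjust the import to match your checkout.

```python
from spacy.cli.converters import conllu2json  # assumed re-export on this branch

# Two-token CoNLL-U sentence with a NER tag in the MISC column (made-up data).
conllu = (
    "1\tHello\thello\tINTJ\tUH\t_\t0\troot\t_\tname=U-GREETING\n"
    "2\tworld\tworld\tNOUN\tNN\tNumber=Sing\t1\tvocative\t_\tO\n"
)

docs = conllu2json(conllu, n_sents=1, no_print=True)
tokens = docs[0]["paragraphs"][0]["sentences"][0]["tokens"]
print([(t["orth"], t["ner"]) for t in tokens])
# -> [('Hello', 'U-GREETING'), ('world', 'O')]
```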


@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from wasabi import Printer
from ...gold import iob_to_biluo


@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
import srsly
from ...gold import docs_to_json


@ -1,9 +1,5 @@
# coding: utf8
from __future__ import unicode_literals, print_function
from pathlib import Path
from collections import Counter
import plac
import sys
import srsly
from wasabi import Printer, MESSAGES
@ -22,29 +18,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100
BLANK_MODEL_THRESHOLD = 2000
@plac.annotations(
# fmt: off
lang=("model language", "positional", None, str),
train_path=("location of JSON-formatted training data", "positional", None, Path),
dev_path=("location of JSON-formatted development data", "positional", None, Path),
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
base_model=("name of model to update (optional)", "option", "b", str),
pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
verbose=("Print additional information and explanations", "flag", "V", bool),
no_format=("Don't pretty-print the results", "flag", "NF", bool),
# fmt: on
)
def debug_data(
lang,
train_path,
dev_path,
tag_map_path=None,
base_model=None,
pipeline="tagger,parser,ner",
ignore_warnings=False,
verbose=False,
no_format=False,
# fmt: off
lang: ("Model language", "positional", None, str),
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
base_model: ("Name of model to update (optional)", "option", "b", str) = None,
pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner",
ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False,
verbose: ("Print additional information and explanations", "flag", "V", bool) = False,
no_format: ("Don't pretty-print the results", "flag", "NF", bool) = False,
# fmt: on
):
"""
Analyze, debug and validate your training and development data, get useful
@ -85,20 +70,16 @@ def debug_data(
with msg.loading("Loading corpus..."):
corpus = GoldCorpus(train_path, dev_path)
try:
train_docs = list(corpus.train_docs(nlp))
train_docs_unpreprocessed = list(
corpus.train_docs_without_preprocessing(nlp)
train_dataset = list(corpus.train_dataset(nlp))
train_dataset_unpreprocessed = list(
corpus.train_dataset_without_preprocessing(nlp)
)
except ValueError as e:
loading_train_error_message = "Training data cannot be loaded: {}".format(
str(e)
)
loading_train_error_message = f"Training data cannot be loaded: {e}"
try:
dev_docs = list(corpus.dev_docs(nlp))
dev_dataset = list(corpus.dev_dataset(nlp))
except ValueError as e:
loading_dev_error_message = "Development data cannot be loaded: {}".format(
str(e)
)
loading_dev_error_message = f"Development data cannot be loaded: {e}"
if loading_train_error_message or loading_dev_error_message:
if loading_train_error_message:
msg.fail(loading_train_error_message)
@ -107,82 +88,68 @@ def debug_data(
sys.exit(1)
msg.good("Corpus is loadable")
# Create all gold data here to avoid iterating over the train_docs constantly
gold_train_data = _compile_gold(train_docs, pipeline, nlp)
# Create all gold data here to avoid iterating over the train_dataset constantly
gold_train_data = _compile_gold(train_dataset, pipeline, nlp)
gold_train_unpreprocessed_data = _compile_gold(
train_docs_unpreprocessed, pipeline, nlp
train_dataset_unpreprocessed, pipeline, nlp
)
gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)
gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp)
train_texts = gold_train_data["texts"]
dev_texts = gold_dev_data["texts"]
msg.divider("Training stats")
msg.text("Training pipeline: {}".format(", ".join(pipeline)))
msg.text(f"Training pipeline: {', '.join(pipeline)}")
for pipe in [p for p in pipeline if p not in nlp.factories]:
msg.fail("Pipeline component '{}' not available in factories".format(pipe))
msg.fail(f"Pipeline component '{pipe}' not available in factories")
if base_model:
msg.text("Starting with base model '{}'".format(base_model))
msg.text(f"Starting with base model '{base_model}'")
else:
msg.text("Starting with blank model '{}'".format(lang))
msg.text("{} training docs".format(len(train_docs)))
msg.text("{} evaluation docs".format(len(dev_docs)))
msg.text(f"Starting with blank model '{lang}'")
msg.text(f"{len(train_dataset)} training docs")
msg.text(f"{len(dev_dataset)} evaluation docs")
if not len(dev_docs):
if not len(dev_dataset):
msg.fail("No evaluation docs")
overlap = len(train_texts.intersection(dev_texts))
if overlap:
msg.warn("{} training examples also in evaluation data".format(overlap))
msg.warn(f"{overlap} training examples also in evaluation data")
else:
msg.good("No overlap between training and evaluation data")
if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
text = "Low number of examples to train from a blank model ({})".format(
len(train_docs)
if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD:
text = (
f"Low number of examples to train from a blank model ({len(train_dataset)})"
)
if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
msg.fail(text)
else:
msg.warn(text)
msg.text(
"It's recommended to use at least {} examples (minimum {})".format(
BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
),
f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
show=verbose,
)
msg.divider("Vocab & Vectors")
n_words = gold_train_data["n_words"]
msg.info(
"{} total {} in the data ({} unique)".format(
n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"])
)
f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
)
if gold_train_data["n_misaligned_words"] > 0:
msg.warn(
"{} misaligned tokens in the training data".format(
gold_train_data["n_misaligned_words"]
)
)
n_misaligned = gold_train_data["n_misaligned_words"]
msg.warn(f"{n_misaligned} misaligned tokens in the training data")
if gold_dev_data["n_misaligned_words"] > 0:
msg.warn(
"{} misaligned tokens in the dev data".format(
gold_dev_data["n_misaligned_words"]
)
)
n_misaligned = gold_dev_data["n_misaligned_words"]
msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
most_common_words = gold_train_data["words"].most_common(10)
msg.text(
"10 most common words: {}".format(
_format_labels(most_common_words, counts=True)
),
f"10 most common words: {_format_labels(most_common_words, counts=True)}",
show=verbose,
)
if len(nlp.vocab.vectors):
msg.info(
"{} vectors ({} unique keys, {} dimensions)".format(
len(nlp.vocab.vectors),
nlp.vocab.vectors.n_keys,
nlp.vocab.vectors_length,
)
f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
f"unique keys, {nlp.vocab.vectors_length} dimensions)"
)
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
msg.warn(
@ -205,7 +172,7 @@ def debug_data(
if "ner" in pipeline:
# Get all unique NER labels present in the data
labels = set(
label for label in gold_train_data["ner"] if label not in ("O", "-")
label for label in gold_train_data["ner"] if label not in ("O", "-", None)
)
label_counts = gold_train_data["ner"]
model_labels = _get_labels_from_model(nlp, "ner")
@ -218,19 +185,10 @@ def debug_data(
msg.divider("Named Entity Recognition")
msg.info(
"{} new {}, {} existing {}".format(
len(new_labels),
"label" if len(new_labels) == 1 else "labels",
len(existing_labels),
"label" if len(existing_labels) == 1 else "labels",
)
f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)"
)
missing_values = label_counts["-"]
msg.text(
"{} missing {} (tokens with '-' label)".format(
missing_values, "value" if missing_values == 1 else "values"
)
)
msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
for label in new_labels:
if len(label) == 0:
msg.fail("Empty label found in new labels")
@ -241,43 +199,28 @@ def debug_data(
if label != "-"
]
labels_with_counts = _format_labels(labels_with_counts, counts=True)
msg.text("New: {}".format(labels_with_counts), show=verbose)
msg.text(f"New: {labels_with_counts}", show=verbose)
if existing_labels:
msg.text(
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
)
msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
if gold_train_data["ws_ents"]:
msg.fail(
"{} invalid whitespace entity span(s)".format(
gold_train_data["ws_ents"]
)
)
msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
has_ws_ents_error = True
if gold_train_data["punct_ents"]:
msg.warn(
"{} entity span(s) with punctuation".format(
gold_train_data["punct_ents"]
)
)
msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
has_punct_ents_warning = True
for label in new_labels:
if label_counts[label] <= NEW_LABEL_THRESHOLD:
msg.warn(
"Low number of examples for new label '{}' ({})".format(
label, label_counts[label]
)
f"Low number of examples for new label '{label}' ({label_counts[label]})"
)
has_low_data_warning = True
with msg.loading("Analyzing label distribution..."):
neg_docs = _get_examples_without_label(train_docs, label)
neg_docs = _get_examples_without_label(train_dataset, label)
if neg_docs == 0:
msg.warn(
"No examples for texts WITHOUT new label '{}'".format(label)
)
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
has_no_neg_warning = True
if not has_low_data_warning:
@ -291,8 +234,8 @@ def debug_data(
if has_low_data_warning:
msg.text(
"To train a new entity type, your data should include at "
"least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
f"To train a new entity type, your data should include at "
f"least {NEW_LABEL_THRESHOLD} instances of the new label",
show=verbose,
)
if has_no_neg_warning:
@ -321,27 +264,21 @@ def debug_data(
new_labels = [l for l in labels if l not in model_labels]
existing_labels = [l for l in labels if l in model_labels]
msg.info(
"Text Classification: {} new label(s), {} existing label(s)".format(
len(new_labels), len(existing_labels)
)
f"Text Classification: {len(new_labels)} new label(s), "
f"{len(existing_labels)} existing label(s)"
)
if new_labels:
labels_with_counts = _format_labels(
gold_train_data["cats"].most_common(), counts=True
)
msg.text("New: {}".format(labels_with_counts), show=verbose)
msg.text(f"New: {labels_with_counts}", show=verbose)
if existing_labels:
msg.text(
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
)
msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
msg.fail(
"The train and dev labels are not the same. "
"Train labels: {}. "
"Dev labels: {}.".format(
_format_labels(gold_train_data["cats"]),
_format_labels(gold_dev_data["cats"]),
)
f"The train and dev labels are not the same. "
f"Train labels: {_format_labels(gold_train_data['cats'])}. "
f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
)
if gold_train_data["n_cats_multilabel"] > 0:
msg.info(
@ -371,27 +308,16 @@ def debug_data(
msg.divider("Part-of-speech Tagging")
labels = [label for label in gold_train_data["tags"]]
tag_map = nlp.vocab.morphology.tag_map
msg.info(
"{} {} in data ({} {} in tag map)".format(
len(labels),
"label" if len(labels) == 1 else "labels",
len(tag_map),
"label" if len(tag_map) == 1 else "labels",
)
)
msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
labels_with_counts = _format_labels(
gold_train_data["tags"].most_common(), counts=True
)
msg.text(labels_with_counts, show=verbose)
non_tagmap = [l for l in labels if l not in tag_map]
if not non_tagmap:
msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
msg.good(f"All labels present in tag map for language '{nlp.lang}'")
for label in non_tagmap:
msg.fail(
"Label '{}' not found in tag map for language '{}'".format(
label, nlp.lang
)
)
msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
if "parser" in pipeline:
has_low_data_warning = False
@ -399,21 +325,18 @@ def debug_data(
# profile sentence length
msg.info(
"Found {} sentence{} with an average length of {:.1f} words.".format(
gold_train_data["n_sents"],
"s" if len(train_docs) > 1 else "",
gold_train_data["n_words"] / gold_train_data["n_sents"],
)
f"Found {gold_train_data['n_sents']} sentence(s) with an average "
f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
)
# check for documents with multiple sentences
sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
if sents_per_doc < 1.1:
msg.warn(
"The training data contains {:.2f} sentences per "
"document. When there are very few documents containing more "
"than one sentence, the parser will not learn how to segment "
"longer texts into sentences.".format(sents_per_doc)
f"The training data contains {sents_per_doc:.2f} sentences per "
f"document. When there are very few documents containing more "
f"than one sentence, the parser will not learn how to segment "
f"longer texts into sentences."
)
# profile labels
@ -424,32 +347,13 @@ def debug_data(
labels_dev = [label for label in gold_dev_data["deps"]]
if gold_train_unpreprocessed_data["n_nonproj"] > 0:
msg.info(
"Found {} nonprojective train sentence{}".format(
gold_train_unpreprocessed_data["n_nonproj"],
"s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
)
)
n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
if gold_dev_data["n_nonproj"] > 0:
msg.info(
"Found {} nonprojective dev sentence{}".format(
gold_dev_data["n_nonproj"],
"s" if gold_dev_data["n_nonproj"] > 1 else "",
)
)
msg.info(
"{} {} in train data".format(
len(labels_train_unpreprocessed),
"label" if len(labels_train) == 1 else "labels",
)
)
msg.info(
"{} {} in projectivized train data".format(
len(labels_train), "label" if len(labels_train) == 1 else "labels"
)
)
n_nonproj = gold_dev_data["n_nonproj"]
msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
msg.info(f"{labels_train_unpreprocessed} label(s) in train data")
msg.info(f"{len(labels_train)} label(s) in projectivized train data")
labels_with_counts = _format_labels(
gold_train_unpreprocessed_data["deps"].most_common(), counts=True
)
@ -459,9 +363,8 @@ def debug_data(
for label in gold_train_unpreprocessed_data["deps"]:
if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
msg.warn(
"Low number of examples for label '{}' ({})".format(
label, gold_train_unpreprocessed_data["deps"][label]
)
f"Low number of examples for label '{label}' "
f"({gold_train_unpreprocessed_data['deps'][label]})"
)
has_low_data_warning = True
@ -470,22 +373,19 @@ def debug_data(
for label in gold_train_data["deps"]:
if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
rare_projectivized_labels.append(
"{}: {}".format(label, str(gold_train_data["deps"][label]))
f"{label}: {gold_train_data['deps'][label]}"
)
if len(rare_projectivized_labels) > 0:
msg.warn(
"Low number of examples for {} label{} in the "
"projectivized dependency trees used for training. You may "
"want to projectivize labels such as punct before "
"training in order to improve parser performance.".format(
len(rare_projectivized_labels),
"s" if len(rare_projectivized_labels) > 1 else "",
)
f"Low number of examples for {len(rare_projectivized_labels)} "
"label(s) in the projectivized dependency trees used for "
"training. You may want to projectivize labels such as punct "
"before training in order to improve parser performance."
)
msg.warn(
"Projectivized labels with low numbers of examples: "
"{}".format("\n".join(rare_projectivized_labels)),
f"Projectivized labels with low numbers of examples: ",
", ".join(rare_projectivized_labels),
show=verbose,
)
has_low_data_warning = True
@ -493,50 +393,44 @@ def debug_data(
# labels only in train
if set(labels_train) - set(labels_dev):
msg.warn(
"The following labels were found only in the train data: "
"{}".format(", ".join(set(labels_train) - set(labels_dev))),
"The following labels were found only in the train data:",
", ".join(set(labels_train) - set(labels_dev)),
show=verbose,
)
# labels only in dev
if set(labels_dev) - set(labels_train):
msg.warn(
"The following labels were found only in the dev data: "
+ ", ".join(set(labels_dev) - set(labels_train)),
"The following labels were found only in the dev data:",
", ".join(set(labels_dev) - set(labels_train)),
show=verbose,
)
if has_low_data_warning:
msg.text(
"To train a parser, your data should include at "
"least {} instances of each label.".format(DEP_LABEL_THRESHOLD),
f"To train a parser, your data should include at "
f"least {DEP_LABEL_THRESHOLD} instances of each label.",
show=verbose,
)
# multiple root labels
if len(gold_train_unpreprocessed_data["roots"]) > 1:
msg.warn(
"Multiple root labels ({}) ".format(
", ".join(gold_train_unpreprocessed_data["roots"])
)
+ "found in training data. spaCy's parser uses a single root "
"label ROOT so this distinction will not be available."
f"Multiple root labels "
f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
f"found in training data. spaCy's parser uses a single root "
f"label ROOT so this distinction will not be available."
)
# these should not happen, but just in case
if gold_train_data["n_nonproj"] > 0:
msg.fail(
"Found {} nonprojective projectivized train sentence{}".format(
gold_train_data["n_nonproj"],
"s" if gold_train_data["n_nonproj"] > 1 else "",
)
f"Found {gold_train_data['n_nonproj']} nonprojective "
f"projectivized train sentence(s)"
)
if gold_train_data["n_cycles"] > 0:
msg.fail(
"Found {} projectivized train sentence{} with cycles".format(
gold_train_data["n_cycles"],
"s" if gold_train_data["n_cycles"] > 1 else "",
)
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
)
msg.divider("Summary")
@ -544,42 +438,34 @@ def debug_data(
warn_counts = msg.counts[MESSAGES.WARN]
fail_counts = msg.counts[MESSAGES.FAIL]
if good_counts:
msg.good(
"{} {} passed".format(
good_counts, "check" if good_counts == 1 else "checks"
)
)
msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
if warn_counts:
msg.warn(
"{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
)
if fail_counts:
msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
if fail_counts:
msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
sys.exit(1)
def _load_file(file_path, msg):
file_name = file_path.parts[-1]
if file_path.suffix == ".json":
with msg.loading("Loading {}...".format(file_name)):
with msg.loading(f"Loading {file_name}..."):
data = srsly.read_json(file_path)
msg.good("Loaded {}".format(file_name))
msg.good(f"Loaded {file_name}")
return data
elif file_path.suffix == ".jsonl":
with msg.loading("Loading {}...".format(file_name)):
with msg.loading(f"Loading {file_name}..."):
data = srsly.read_jsonl(file_path)
msg.good("Loaded {}".format(file_name))
msg.good(f"Loaded {file_name}")
return data
msg.fail(
"Can't load file extension {}".format(file_path.suffix),
f"Can't load file extension {file_path.suffix}",
"Expected .json or .jsonl",
exits=1,
)
def _compile_gold(train_docs, pipeline, nlp):
def _compile_gold(examples, pipeline, nlp):
data = {
"ner": Counter(),
"cats": Counter(),
@ -598,7 +484,9 @@ def _compile_gold(train_docs, pipeline, nlp):
"n_cats_multilabel": 0,
"texts": set(),
}
for doc, gold in train_docs:
for example in examples:
gold = example.gold
doc = example.doc
valid_words = [x for x in gold.words if x is not None]
data["words"].update(valid_words)
data["n_words"] += len(valid_words)
@ -651,17 +539,17 @@ def _compile_gold(train_docs, pipeline, nlp):
def _format_labels(labels, counts=False):
if counts:
return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
return ", ".join(["'{}'".format(l) for l in labels])
return ", ".join([f"'{l}' ({c})" for l, c in labels])
return ", ".join([f"'{l}'" for l in labels])
def _get_examples_without_label(data, label):
count = 0
for doc, gold in data:
for ex in data:
labels = [
label.split("-")[1]
for label in gold.ner
if label is not None and label not in ("O", "-")
for label in ex.gold.ner
if label not in ("O", "-", None)
]
if label not in labels:
count += 1

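`_format_labels` is the small formatting helper used throughout the report; reproduced below with illustrative counts to show the exact strings it emits:

```python
from collections import Counter

def _format_labels(labels, counts=False):
    # Same helper as in debug_data above.
    if counts:
        return ", ".join([f"'{l}' ({c})" for l, c in labels])
    return ", ".join([f"'{l}'" for l in labels])

deps = Counter({"nsubj": 120, "dobj": 80, "punct": 3})
print(_format_labels(deps.most_common(), counts=True))
# -> 'nsubj' (120), 'dobj' (80), 'punct' (3)
print(_format_labels(deps))
# -> 'nsubj', 'dobj', 'punct'
```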

@ -1,30 +1,24 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import requests
import os
import subprocess
import sys
from wasabi import msg
from .link import link
from ..util import get_package_path
from .. import about
from ..util import is_package, get_base_version
@plac.annotations(
model=("Model to download (shortcut or name)", "positional", None, str),
direct=("Force direct download of name + version", "flag", "d", bool),
pip_args=("Additional arguments to be passed to `pip install` on model install"),
)
def download(model, direct=False, *pip_args):
def download(
model: ("Model to download (shortcut or name)", "positional", None, str),
direct: ("Force direct download of name + version", "flag", "d", bool) = False,
*pip_args: ("Additional arguments to be passed to `pip install` on model install"),
):
"""
Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
with version. For direct downloads, the compatibility check will be skipped.
Download compatible model from default download path using pip. If --direct
flag is set, the command expects the full model name with version.
For direct downloads, the compatibility check will be skipped.
"""
if not require_package("spacy") and "--no-deps" not in pip_args:
if not is_package("spacy") and "--no-deps" not in pip_args:
msg.warn(
"Skipping model package dependencies and setting `--no-deps`. "
"You don't seem to have the spaCy package itself installed "
@ -50,77 +44,38 @@ def download(model, direct=False, *pip_args):
sys.exit(dl)
msg.good(
"Download and installation successful",
"You can now load the model via spacy.load('{}')".format(model_name),
f"You can now load the model via spacy.load('{model_name}')",
)
# Only create symlink if the model is installed via a shortcut like 'en'.
# There's no real advantage over an additional symlink for en_core_web_sm
# and if anything, it's more error prone and causes more confusion.
if model in shortcuts:
try:
# Get package path here because link uses
# pip.get_installed_distributions() to check if model is a
# package, which fails if model was just installed via
# subprocess
package_path = get_package_path(model_name)
link(model_name, model, force=True, model_path=package_path)
except: # noqa: E722
# Dirty, but since spacy.download and the auto-linking is
# mostly a convenience wrapper, it's best to show a success
# message and loading instructions, even if linking fails.
msg.warn(
"Download successful but linking failed",
"Creating a shortcut link for '{}' didn't work (maybe you "
"don't have admin permissions?), but you can still load "
"the model via its full package name: "
"nlp = spacy.load('{}')".format(model, model_name),
)
# If a model is downloaded and then loaded within the same process, our
# is_package check currently fails, because pkg_resources.working_set
# is not refreshed automatically (see #3923). We're trying to work
# around this here by requiring the package explicitly.
require_package(model_name)
def require_package(name):
try:
import pkg_resources
pkg_resources.working_set.require(name)
return True
except: # noqa: E722
return False
def get_json(url, desc):
r = requests.get(url)
if r.status_code != 200:
msg.fail(
"Server error ({})".format(r.status_code),
"Couldn't fetch {}. Please find a model for your spaCy "
"installation (v{}), and download it manually. For more "
"details, see the documentation: "
"https://spacy.io/usage/models".format(desc, about.__version__),
f"Server error ({r.status_code})",
f"Couldn't fetch {desc}. Please find a model for your spaCy "
f"installation (v{about.__version__}), and download it manually. "
f"For more details, see the documentation: "
f"https://spacy.io/usage/models",
exits=1,
)
return r.json()
def get_compatibility():
version = about.__version__
version = version.rsplit(".dev", 1)[0]
version = get_base_version(about.__version__)
comp_table = get_json(about.__compatibility__, "compatibility table")
comp = comp_table["spacy"]
if version not in comp:
msg.fail("No compatible models found for v{} of spaCy".format(version), exits=1)
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
return comp[version]
def get_version(model, comp):
model = model.rsplit(".dev", 1)[0]
model = get_base_version(model)
if model not in comp:
msg.fail(
"No compatible model found for '{}' "
"(spaCy v{}).".format(model, about.__version__),
f"No compatible model found for '{model}' (spaCy v{about.__version__})",
exits=1,
)
return comp[model][0]
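With shortcut links and the auto-linking step removed, `download` simply pip-installs the compatible package, and the model is then loaded by its full package name. A hedged sketch (requires network access and pip in the active environment):

```python
import spacy
from spacy.cli import download  # assumed re-export, as in spacy/cli/__init__.py

# Resolve the compatible version via the compatibility table and pip-install it.
download("en_core_web_sm")

# No symlink step anymore: load the model by its full package name.
nlp = spacy.load("en_core_web_sm")
```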


@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import plac
from timeit import default_timer as timer
from wasabi import msg
@ -10,23 +6,16 @@ from .. import util
from .. import displacy
@plac.annotations(
model=("Model name or path", "positional", None, str),
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
gpu_id=("Use GPU", "option", "g", int),
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
return_scores=("Return dict containing model scores", "flag", "R", bool),
)
def evaluate(
model,
data_path,
gpu_id=-1,
gold_preproc=False,
displacy_path=None,
displacy_limit=25,
return_scores=False,
# fmt: off
model: ("Model name or path", "positional", None, str),
data_path: ("Location of JSON-formatted evaluation data", "positional", None, str),
gpu_id: ("Use GPU", "option", "g", int) = -1,
gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None,
displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25,
return_scores: ("Return dict containing model scores", "flag", "R", bool) = False,
# fmt: on
):
"""
Evaluate a model. To render a sample of parses in an HTML file, set an
@ -47,28 +36,34 @@ def evaluate(
nlp = util.get_lang_class(model.replace("blank:", ""))()
else:
nlp = util.load_model(model)
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
begin = timer()
scorer = nlp.evaluate(dev_docs, verbose=False)
scorer = nlp.evaluate(dev_dataset, verbose=False)
end = timer()
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
nwords = sum(len(ex.doc) for ex in dev_dataset)
results = {
"Time": "%.2f s" % (end - begin),
"Time": f"{end - begin:.2f} s",
"Words": nwords,
"Words/s": "%.0f" % (nwords / (end - begin)),
"TOK": "%.2f" % scorer.token_acc,
"POS": "%.2f" % scorer.tags_acc,
"UAS": "%.2f" % scorer.uas,
"LAS": "%.2f" % scorer.las,
"NER P": "%.2f" % scorer.ents_p,
"NER R": "%.2f" % scorer.ents_r,
"NER F": "%.2f" % scorer.ents_f,
"Textcat": "%.2f" % scorer.textcat_score,
"Words/s": f"{nwords / (end - begin):.0f}",
"TOK": f"{scorer.token_acc:.2f}",
"TAG": f"{scorer.tags_acc:.2f}",
"POS": f"{scorer.pos_acc:.2f}",
"MORPH": f"{scorer.morphs_acc:.2f}",
"UAS": f"{scorer.uas:.2f}",
"LAS": f"{scorer.las:.2f}",
"NER P": f"{scorer.ents_p:.2f}",
"NER R": f"{scorer.ents_r:.2f}",
"NER F": f"{scorer.ents_f:.2f}",
"Textcat AUC": f"{scorer.textcat_auc:.2f}",
"Textcat F": f"{scorer.textcat_f:.2f}",
"Sent P": f"{scorer.sent_p:.2f}",
"Sent R": f"{scorer.sent_r:.2f}",
"Sent F": f"{scorer.sent_f:.2f}",
}
msg.table(results, title="Results")
if displacy_path:
docs, golds = zip(*dev_docs)
docs = [ex.doc for ex in dev_dataset]
render_deps = "parser" in nlp.meta.get("pipeline", [])
render_ents = "ner" in nlp.meta.get("pipeline", [])
render_parses(
@ -79,7 +74,7 @@ def evaluate(
deps=render_deps,
ents=render_ents,
)
msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
if return_scores:
return scorer.scores
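A sketch of the updated `evaluate` entry point, which now also reports TAG/MORPH/sentence scores and both textcat metrics; the model name and paths are placeholders, and the displaCy output directory must already exist.

```python
from spacy.cli import evaluate  # assumed re-export

scores = evaluate(
    "en_core_web_sm",        # installed model package or path (placeholder)
    "corpus/dev.json",       # JSON-formatted evaluation data (placeholder)
    gold_preproc=False,
    displacy_path="parses",  # existing directory for rendered HTML parses
    displacy_limit=25,
    return_scores=True,
)
# Keys follow Scorer.scores, e.g. "las" and "ents_f".
print(scores["las"], scores["ents_f"])
```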


@ -1,44 +1,39 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import platform
from pathlib import Path
from wasabi import msg
import srsly
from ..compat import path2str, basestring_, unicode_
from .validate import get_model_pkgs
from .. import util
from .. import about
@plac.annotations(
model=("Optional shortcut link of model", "positional", None, str),
markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
silent=("Don't print anything (just return)", "flag", "s"),
)
def info(model=None, markdown=False, silent=False):
def info(
model: ("Optional model name", "positional", None, str) = None,
markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False,
silent: ("Don't print anything (just return)", "flag", "s") = False,
):
"""
Print info about spaCy installation. If a model shortcut link is
speficied as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.
Print info about spaCy installation. If a model is specified as an argument,
print model information. Flag --markdown prints details in Markdown for easy
copy-pasting to GitHub issues.
"""
if model:
if util.is_package(model):
model_path = util.get_package_path(model)
else:
model_path = util.get_data_path() / model
model_path = model
meta_path = model_path / "meta.json"
if not meta_path.is_file():
msg.fail("Can't find model meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path)
if model_path.resolve() != model_path:
meta["link"] = path2str(model_path)
meta["source"] = path2str(model_path.resolve())
meta["link"] = str(model_path)
meta["source"] = str(model_path.resolve())
else:
meta["source"] = path2str(model_path)
meta["source"] = str(model_path)
if not silent:
title = "Info about model '{}'".format(model)
title = f"Info about model '{model}'"
model_meta = {
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
}
@ -47,12 +42,15 @@ def info(model=None, markdown=False, silent=False):
else:
msg.table(model_meta, title=title)
return meta
all_models, _ = get_model_pkgs()
data = {
"spaCy version": about.__version__,
"Location": path2str(Path(__file__).parent.parent),
"Location": str(Path(__file__).parent.parent),
"Platform": platform.platform(),
"Python version": platform.python_version(),
"Models": list_models(),
"Models": ", ".join(
f"{m['name']} ({m['version']})" for m in all_models.values()
),
}
if not silent:
title = "Info about spaCy"
@ -63,30 +61,17 @@ def info(model=None, markdown=False, silent=False):
return data
def list_models():
def exclude_dir(dir_name):
# exclude common cache directories and hidden directories
exclude = ("cache", "pycache", "__pycache__")
return dir_name in exclude or dir_name.startswith(".")
data_path = util.get_data_path()
if data_path:
models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
return ", ".join([m for m in models if not exclude_dir(m)])
return "-"
def print_markdown(data, title=None):
"""Print data in GitHub-flavoured Markdown format for issues etc.
data (dict or list of tuples): Label/value pairs.
title (unicode or None): Title, will be rendered as headline 2.
title (str / None): Title, will be rendered as headline 2.
"""
markdown = []
for key, value in data.items():
if isinstance(value, basestring_) and Path(value).exists():
if isinstance(value, str) and Path(value).exists():
continue
markdown.append("* **{}:** {}".format(key, unicode_(value)))
markdown.append(f"* **{key}:** {value}")
if title:
print("\n## {}".format(title))
print(f"\n## {title}")
print("\n{}\n".format("\n".join(markdown)))


@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import math
from tqdm import tqdm
import numpy
@ -20,7 +16,6 @@ from ..errors import Errors, Warnings
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
from ..lookups import Lookups
try:
import ftfy
except ImportError:
@ -30,43 +25,21 @@ except ImportError:
DEFAULT_OOV_PROB = -20
@plac.annotations(
lang=("Model language", "positional", None, str),
output_dir=("Model output directory", "positional", None, Path),
freqs_loc=("Location of words frequencies file", "option", "f", Path),
jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
truncate_vectors=(
"Optional number of vectors to truncate to when reading in vectors file",
"option",
"t",
int,
),
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
vectors_name=(
"Optional name for the word vectors, e.g. en_core_web_lg.vectors",
"option",
"vn",
str,
),
model_name=("Optional name for the model meta", "option", "mn", str),
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
)
def init_model(
lang,
output_dir,
freqs_loc=None,
clusters_loc=None,
jsonl_loc=None,
vectors_loc=None,
truncate_vectors=0,
prune_vectors=-1,
vectors_name=None,
model_name=None,
omit_extra_lookups=False,
base_model=None,
# fmt: off
lang: ("Model language", "positional", None, str),
output_dir: ("Model output directory", "positional", None, Path),
freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None,
clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None,
jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None,
vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None,
prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1,
truncate_vectors: ("Optional number of vectors to truncate to when reading in vectors file", "option", "t", int) = 0,
vectors_name: ("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "option", "vn", str) = None,
model_name: ("Optional name for the model meta", "option", "mn", str) = None,
omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
base_model: ("Base model (for languages with custom tokenizers)", "option", "b", str) = None
# fmt: on
):
"""
Create a new model from raw data, like word frequencies, Brown clusters
@ -114,8 +87,7 @@ def init_model(
vec_added = len(nlp.vocab.vectors)
lex_added = len(nlp.vocab)
msg.good(
"Sucessfully compiled vocab",
"{} entries, {} vectors".format(lex_added, vec_added),
"Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
)
if not output_dir.exists():
output_dir.mkdir()
@ -203,9 +175,9 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
else:
if vectors_loc:
with msg.loading("Reading vectors from {}".format(vectors_loc)):
vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
msg.good("Loaded vectors from {}".format(vectors_loc))
with msg.loading(f"Reading vectors from {vectors_loc}"):
vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
msg.good(f"Loaded vectors from {vectors_loc}")
else:
vectors_data, vector_keys = (None, None)
if vector_keys is not None:
@ -215,7 +187,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
if vectors_data is not None:
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
if name is None:
nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_model.vectors"
else:
nlp.vocab.vectors.name = name
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
@ -265,7 +237,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
word = literal_eval(key)
except SyntaxError:
# Take odd strings literally.
word = literal_eval("'%s'" % key)
word = literal_eval(f"'{key}'")
smooth_count = counts.smoother(int(freq))
probs[word] = math.log(smooth_count) - log_total
oov_prob = math.log(counts.smoother(0)) - log_total

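A sketch of the reworked `init_model` signature for building a vocab-and-vectors-only base model; every path below is a placeholder, and `truncate_vectors` (`-t`) is the newly added option.

```python
from pathlib import Path
from spacy.cli import init_model  # assumed re-export

init_model(
    "en",
    Path("models/en_base"),        # output directory (placeholder)
    freqs_loc=Path("freqs.txt"),   # word frequency counts (placeholder)
    vectors_loc="vectors.txt.gz",  # Word2Vec-format vectors (placeholder)
    truncate_vectors=100000,       # read at most 100k vectors
    prune_vectors=20000,           # then prune to the 20k most frequent
    vectors_name="en_base.vectors",
)
```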

@ -1,77 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
import plac
from pathlib import Path
from wasabi import msg
from ..compat import symlink_to, path2str
from .. import util
@plac.annotations(
origin=("package name or local path to model", "positional", None, str),
link_name=("name of shortuct link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool),
)
def link(origin, link_name, force=False, model_path=None):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""
if util.is_package(origin):
model_path = util.get_package_path(origin)
else:
model_path = Path(origin) if model_path is None else Path(model_path)
if not model_path.exists():
msg.fail(
"Can't locate model data",
"The data should be located in {}".format(path2str(model_path)),
exits=1,
)
data_path = util.get_data_path()
if not data_path or not data_path.exists():
spacy_loc = Path(__file__).parent.parent
msg.fail(
"Can't find the spaCy data path to create model symlink",
"Make sure a directory `/data` exists within your spaCy "
"installation and try again. The data directory should be located "
"here:".format(path=spacy_loc),
exits=1,
)
link_path = util.get_data_path() / link_name
if link_path.is_symlink() and not force:
msg.fail(
"Link '{}' already exists".format(link_name),
"To overwrite an existing link, use the --force flag",
exits=1,
)
elif link_path.is_symlink(): # does a symlink exist?
# NB: It's important to check for is_symlink here and not for exists,
# because invalid/outdated symlinks would return False otherwise.
link_path.unlink()
elif link_path.exists(): # does it exist otherwise?
# NB: Check this last because valid symlinks also "exist".
msg.fail(
"Can't overwrite symlink '{}'".format(link_name),
"This can happen if your data directory contains a directory or "
"file of the same name.",
exits=1,
)
details = "%s --> %s" % (path2str(model_path), path2str(link_path))
try:
symlink_to(link_path, model_path)
except: # noqa: E722
# This is quite dirty, but just making sure other errors are caught.
msg.fail(
"Couldn't link model to '{}'".format(link_name),
"Creating a symlink in spacy/data failed. Make sure you have the "
"required permissions and try re-running the command as admin, or "
"use a virtualenv. You can still import the model as a module and "
"call its load() method, or create the symlink manually.",
)
msg.text(details)
raise
msg.good("Linking successful", details)
msg.text("You can now load the model via spacy.load('{}')".format(link_name))


@ -1,25 +1,21 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import shutil
from pathlib import Path
from wasabi import msg, get_raw_input
import srsly
from ..compat import path2str
from .. import util
from .. import about
@plac.annotations(
input_dir=("Directory with model data", "positional", None, str),
output_dir=("Output parent directory", "positional", None, str),
meta_path=("Path to meta.json", "option", "m", str),
create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
force=("Force overwriting existing model in output directory", "flag", "f", bool),
)
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
def package(
# fmt: off
input_dir: ("Directory with model data", "positional", None, str),
output_dir: ("Output parent directory", "positional", None, str),
meta_path: ("Path to meta.json", "option", "m", str) = None,
create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False,
force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False,
# fmt: on
):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
@ -47,7 +43,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
for key in ("lang", "name", "version"):
if key not in meta or meta[key] == "":
msg.fail(
"No '{}' setting found in meta.json".format(key),
f"No '{key}' setting found in meta.json",
"This setting is required to build your package.",
exits=1,
)
@ -58,22 +54,21 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
if package_path.exists():
if force:
shutil.rmtree(path2str(package_path))
shutil.rmtree(str(package_path))
else:
msg.fail(
"Package directory already exists",
"Please delete the directory and try again, or use the "
"`--force` flag to overwrite existing "
"directories.".format(path=path2str(package_path)),
"`--force` flag to overwrite existing directories.",
exits=1,
)
Path.mkdir(package_path, parents=True)
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
shutil.copytree(str(input_path), str(package_path / model_name_v))
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
create_file(main_path / "setup.py", TEMPLATE_SETUP)
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
create_file(package_path / "__init__.py", TEMPLATE_INIT)
msg.good("Successfully created package '{}'".format(model_name_v), main_path)
msg.good(f"Successfully created package '{model_name_v}'", main_path)
msg.text("To build the package, run `python setup.py sdist` in this directory.")
@ -88,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg):
("lang", "Model language", meta.get("lang", "en")),
("name", "Model name", meta.get("name", "model")),
("version", "Model version", meta.get("version", "0.0.0")),
("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
("description", "Model description", meta.get("description", False)),
("author", "Author", meta.get("author", False)),
("email", "Author email", meta.get("email", False)),
("url", "Author website", meta.get("url", False)),
("license", "License", meta.get("license", "CC BY-SA 3.0")),
("license", "License", meta.get("license", "MIT")),
]
nlp = util.load_model_from_path(Path(model_path))
meta["spacy_version"] = util.get_model_version_range(about.__version__)
meta["pipeline"] = nlp.pipe_names
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
@ -118,9 +113,6 @@ def generate_meta(model_path, existing_meta, msg):
TEMPLATE_SETUP = """
#!/usr/bin/env python
# coding: utf8
from __future__ import unicode_literals
import io
import json
from os import path, walk
@ -176,6 +168,7 @@ def setup_package():
package_data={model_name: list_files(model_dir)},
install_requires=list_requirements(meta),
zip_safe=False,
entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]}
)
@ -190,9 +183,6 @@ include meta.json
TEMPLATE_INIT = """
# coding: utf8
from __future__ import unicode_literals
from pathlib import Path
from spacy.util import load_model_from_init_py, get_model_meta

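A sketch of the packaging flow with the new defaults (default license now MIT, `spacy_version` derived from the model version range); directory names are placeholders.

```python
from spacy.cli import package  # assumed re-export

# Writes meta.json, setup.py, MANIFEST.in and __init__.py and copies the model
# data into a new package directory under the output parent (placeholder dirs).
package("training/model-best", "packages", create_meta=True, force=True)

# As the command itself suggests, build the sdist from the created directory:
#   python setup.py sdist
```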

@ -1,107 +1,41 @@
# coding: utf8
from __future__ import print_function, unicode_literals
import plac
import random
import numpy
import time
import re
from collections import Counter
import plac
from pathlib import Path
from thinc.v2v import Affine, Maxout
from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu
from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory
from wasabi import msg
import srsly
from ..errors import Errors
from ..ml.models.multi_task import build_masked_language_model
from ..tokens import Doc
from ..attrs import ID, HEAD
from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
from .._ml import masked_language_model, get_cossim_loss
from .. import util
from .train import _load_pretrained_tok2vec
from ..gold import Example
@plac.annotations(
texts_loc=(
"Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
"key 'tokens'",
"positional",
None,
str,
),
vectors_model=("Name or path to spaCy model with vectors to learn from"),
output_dir=("Directory to write models to on each epoch", "positional", None, str),
width=("Width of CNN layers", "option", "cw", int),
conv_depth=("Depth of CNN layers", "option", "cd", int),
cnn_window=("Window size for CNN layers", "option", "cW", int),
cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int),
use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
sa_depth=("Depth of self-attention layers", "option", "sa", int),
bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
embed_rows=("Number of embedding rows", "option", "er", int),
loss_func=(
"Loss function to use for the objective. Either 'L2' or 'cosine'",
"option",
"L",
str,
),
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
dropout=("Dropout rate", "option", "d", float),
batch_size=("Number of words per training batch", "option", "bs", int),
max_length=(
"Max words per example. Longer examples are discarded",
"option",
"xw",
int,
),
min_length=(
"Min words per example. Shorter examples are discarded",
"option",
"nw",
int,
),
seed=("Seed for random number generators", "option", "s", int),
n_iter=("Number of iterations to pretrain", "option", "i", int),
n_save_every=("Save model every X batches.", "option", "se", int),
init_tok2vec=(
"Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
"option",
"t2v",
Path,
),
epoch_start=(
"The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been "
"renamed. Prevents unintended overwriting of existing weight files.",
"option",
"es",
int,
),
# fmt: off
texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str),
output_dir=("Directory to write models to on each epoch", "positional", None, Path),
config_path=("Path to config file", "positional", None, Path),
use_gpu=("Use GPU", "option", "g", int),
resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path),
epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int),
# fmt: on
)
def pretrain(
texts_loc,
vectors_model,
config_path,
output_dir,
width=96,
conv_depth=4,
bilstm_depth=0,
cnn_pieces=3,
sa_depth=0,
use_chars=False,
cnn_window=1,
embed_rows=2000,
loss_func="cosine",
use_vectors=False,
dropout=0.2,
n_iter=1000,
batch_size=3000,
max_length=500,
min_length=5,
seed=0,
n_save_every=None,
init_tok2vec=None,
epoch_start=None,
use_gpu=-1,
resume_path=None,
epoch_resume=None,
):
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
@ -115,34 +49,46 @@ def pretrain(
However, it's still quite experimental, so your mileage may vary.
To load the weights back in during 'spacy train', you need to ensure
all settings are the same between pretraining and training. The API and
errors around this need some improvement.
all settings are the same between pretraining and training. Ideally,
this is done by using the same config file for both commands.
"""
config = dict(locals())
for key in config:
if isinstance(config[key], Path):
config[key] = str(config[key])
util.fix_random_seed(seed)
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
has_gpu = prefer_gpu()
if has_gpu:
import torch
if use_gpu >= 0:
msg.info("Using GPU")
util.use_gpu(use_gpu)
else:
msg.info("Using CPU")
torch.set_default_tensor_type("torch.cuda.FloatTensor")
msg.info("Using GPU" if has_gpu else "Not using GPU")
msg.info(f"Loading config from: {config_path}")
config = util.load_config(config_path, create_objects=False)
util.fix_random_seed(config["pretraining"]["seed"])
if config["pretraining"]["use_pytorch_for_gpu_memory"]:
use_pytorch_for_gpu_memory()
output_dir = Path(output_dir)
if output_dir.exists() and [p for p in output_dir.iterdir()]:
msg.warn(
"Output directory is not empty",
"It is better to use an empty directory or refer to a new output path, "
"then the new directory will be created for you.",
)
if resume_path:
msg.warn(
"Output directory is not empty. ",
"If you're resuming a run from a previous model in this directory, "
"the old models for the consecutive epochs will be overwritten "
"with the new ones.",
)
else:
msg.warn(
"Output directory is not empty. ",
"It is better to use an empty directory or refer to a new output path, "
"then the new directory will be created for you.",
)
if not output_dir.exists():
output_dir.mkdir()
msg.good("Created output directory: {}".format(output_dir))
msg.good(f"Created output directory: {output_dir}")
srsly.write_json(output_dir / "config.json", config)
msg.good("Saved settings to config.json")
msg.good("Saved config file in the output directory")
config = util.load_config(config_path, create_objects=True)
pretrain_config = config["pretraining"]
# Load texts from file or stdin
if texts_loc != "-": # reading from a file
@ -156,64 +102,58 @@ def pretrain(
msg.good("Loaded input texts")
random.shuffle(texts)
else: # reading from stdin
msg.text("Reading input text from stdin...")
msg.info("Reading input text from stdin...")
texts = srsly.read_jsonl("-")
with msg.loading("Loading model '{}'...".format(vectors_model)):
with msg.loading(f"Loading model '{vectors_model}'..."):
nlp = util.load_model(vectors_model)
msg.good("Loaded model '{}'".format(vectors_model))
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
model = create_pretraining_model(
nlp,
Tok2Vec(
width,
embed_rows,
conv_depth=conv_depth,
pretrained_vectors=pretrained_vectors,
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
subword_features=not use_chars, # Set to False for Chinese etc
cnn_maxout_pieces=cnn_pieces, # If set to 1, use Mish activation.
),
)
# Load in pretrained weights
if init_tok2vec is not None:
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
msg.text("Loaded pretrained tok2vec for: {}".format(components))
msg.good(f"Loaded model '{vectors_model}'")
tok2vec_path = pretrain_config["tok2vec_model"]
tok2vec = config
for subpath in tok2vec_path.split("."):
tok2vec = tok2vec.get(subpath)
model = create_pretraining_model(nlp, tok2vec)
optimizer = pretrain_config["optimizer"]
# Load in pretrained weights to resume from
if resume_path is not None:
msg.info(f"Resume training tok2vec from: {resume_path}")
with resume_path.open("rb") as file_:
weights_data = file_.read()
model.get_ref("tok2vec").from_bytes(weights_data)
# Parse the epoch number from the given weight file
model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
model_name = re.search(r"model\d+\.bin", str(resume_path))
if model_name:
# Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
epoch_start = int(model_name.group(0)[5:][:-4]) + 1
epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
msg.info(f"Resuming from epoch: {epoch_resume}")
else:
if not epoch_start:
if not epoch_resume:
msg.fail(
"You have to use the '--epoch-start' argument when using a renamed weight file for "
"'--init-tok2vec'",
"You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
exits=True,
)
elif epoch_start < 0:
elif epoch_resume < 0:
msg.fail(
"The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
% epoch_start,
f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
exits=True,
)
else:
msg.info(f"Resuming from epoch: {epoch_resume}")
else:
# Without '--init-tok2vec' the '--epoch-start' argument is ignored
epoch_start = 0
# Without '--resume-path' the '--epoch-resume' argument is ignored
epoch_resume = 0
optimizer = create_default_optimizer(model.ops)
tracker = ProgressTracker(frequency=10000)
msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
def _save_model(epoch, is_temp=False):
is_temp_str = ".temp" if is_temp else ""
with model.use_params(optimizer.averages):
with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
"wb"
) as file_:
file_.write(model.tok2vec.to_bytes())
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
file_.write(model.get_ref("tok2vec").to_bytes())
log = {
"nr_word": tracker.nr_word,
"loss": tracker.loss,
@ -224,26 +164,27 @@ def pretrain(
file_.write(srsly.json_dumps(log) + "\n")
skip_counter = 0
for epoch in range(epoch_start, n_iter + epoch_start):
for batch_id, batch in enumerate(
util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
):
loss_func = pretrain_config["loss_func"]
for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
examples = [Example(doc=text) for text in texts]
batches = util.minibatch_by_words(examples, size=pretrain_config["batch_size"])
for batch_id, batch in enumerate(batches):
docs, count = make_docs(
nlp,
[text for (text, _) in batch],
max_length=max_length,
min_length=min_length,
[ex.doc for ex in batch],
max_length=pretrain_config["max_length"],
min_length=pretrain_config["min_length"],
)
skip_counter += count
loss = make_update(
model, docs, optimizer, objective=loss_func, drop=dropout
)
loss = make_update(model, docs, optimizer, distance=loss_func)
progress = tracker.update(epoch, loss, docs)
if progress:
msg.row(progress, **row_settings)
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
break
if n_save_every and (batch_id % n_save_every == 0):
if pretrain_config["n_save_every"] and (
batch_id % pretrain_config["n_save_every"] == 0
):
_save_model(epoch, is_temp=True)
_save_model(epoch)
tracker.epoch_loss = 0.0
@ -251,21 +192,21 @@ def pretrain(
# Reshuffle the texts if texts were loaded from a file
random.shuffle(texts)
if skip_counter > 0:
msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
msg.warn(f"Skipped {skip_counter} empty values")
msg.good("Successfully finished pretrain")
def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
def make_update(model, docs, optimizer, distance):
"""Perform an update over a single batch of documents.
docs (iterable): A batch of `Doc` objects.
distance (callable): The distance function used to compute the gradient and loss.
optimizer (callable): An optimizer.
RETURNS loss: A float for the loss.
"""
predictions, backprop = model.begin_update(docs, drop=drop)
loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
backprop(gradients, sgd=optimizer)
predictions, backprop = model.begin_update(docs)
loss, gradients = get_vectors_loss(model.ops, docs, predictions, distance)
backprop(gradients)
model.finish_update(optimizer)
# Don't want to return a cupy object here
# The gradients are modified in-place by the BERT MLM,
# so we get an accurate loss
@ -297,12 +238,12 @@ def make_docs(nlp, batch, min_length, max_length):
heads = numpy.asarray(heads, dtype="uint64")
heads = heads.reshape((len(doc), 1))
doc = doc.from_array([HEAD], heads)
if len(doc) >= min_length and len(doc) < max_length:
if min_length <= len(doc) < max_length:
docs.append(doc)
return docs, skip_count
def get_vectors_loss(ops, docs, prediction, objective="L2"):
def get_vectors_loss(ops, docs, prediction, distance):
"""Compute a mean-squared error loss between the documents' vectors and
the prediction.
@ -316,13 +257,7 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
if objective == "L2":
d_target = prediction - target
loss = (d_target ** 2).sum()
elif objective == "cosine":
loss, d_target = get_cossim_loss(prediction, target)
else:
raise ValueError(Errors.E142.format(loss_func=objective))
d_target, loss = distance(prediction, target)
return loss, d_target
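The `distance` argument is expected to be a callable that returns a `(gradient, loss)` pair, as Thinc's loss objects do. A minimal sketch of what gets plugged in here, assuming Thinc's `CosineDistance` (shapes and values are made up for illustration):

```python
import numpy
from thinc.api import CosineDistance  # one plausible choice for pretrain_config["loss_func"]

distance = CosineDistance(normalize=True)
prediction = numpy.random.uniform(-1, 1, (4, 300)).astype("f")  # 4 tokens, 300-dim vectors
target = numpy.random.uniform(-1, 1, (4, 300)).astype("f")
d_target, loss = distance(prediction, target)  # gradient w.r.t. prediction, scalar loss
```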
@ -331,22 +266,21 @@ def create_pretraining_model(nlp, tok2vec):
the tok2vec input model. The tok2vec input model needs to be a model that
takes a batch of Doc objects (as a list), and returns a list of arrays.
Each array in the output needs to have one row per token in the doc.
The actual tok2vec layer is stored as a reference, and only this bit will be
serialized to file and read back in when calling the 'train' command.
"""
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size)
)
# This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match
# the shape of the models' components exactly. So what we call
# "tok2vec" has to be the same set of processes as what the components do.
tok2vec = chain(tok2vec, flatten)
model = chain(tok2vec, output_layer)
model = masked_language_model(nlp.vocab, model)
model.tok2vec = tok2vec
model.output_layer = output_layer
model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
return model
model = chain(tok2vec, list2array())
model = chain(model, output_layer)
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
mlm_model = build_masked_language_model(nlp.vocab, model)
mlm_model.set_ref("tok2vec", tok2vec)
mlm_model.set_ref("output_layer", output_layer)
mlm_model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
return mlm_model
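Because only the `"tok2vec"` ref is serialized, the round trip used by `_save_model` above and by the resume path / `spacy train` boils down to the following sketch (taking the `mlm_model` returned above as input):

```python
def roundtrip_tok2vec(mlm_model):
    # Serialize just the tok2vec weights, as _save_model does
    weights_data = mlm_model.get_ref("tok2vec").to_bytes()
    # ...and load them back into a compatible model, as done when resuming
    # pretraining or when 'spacy train' reads the weights file
    mlm_model.get_ref("tok2vec").from_bytes(weights_data)
    return weights_data
```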
class ProgressTracker(object):


@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import plac
import tqdm
from pathlib import Path
import srsly
@ -9,18 +5,19 @@ import cProfile
import pstats
import sys
import itertools
import thinc.extra.datasets
import ml_datasets
from wasabi import msg
from ..util import load_model
@plac.annotations(
model=("Model to load", "positional", None, str),
inputs=("Location of input file. '-' for stdin.", "positional", None, str),
n_texts=("Maximum number of texts to use if available", "option", "n", int),
)
def profile(model, inputs=None, n_texts=10000):
def profile(
# fmt: off
model: ("Model to load", "positional", None, str),
inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None,
n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000,
# fmt: on
):
"""
Profile a spaCy pipeline, to find out which functions take the most time.
Input should be formatted as one JSON object per line with a key "text".
@ -32,13 +29,13 @@ def profile(model, inputs=None, n_texts=10000):
if inputs is None:
n_inputs = 25000
with msg.loading("Loading IMDB dataset via Thinc..."):
imdb_train, _ = thinc.extra.datasets.imdb()
imdb_train, _ = ml_datasets.imdb()
inputs, _ = zip(*imdb_train)
msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
inputs = inputs[:n_inputs]
with msg.loading("Loading model '{}'...".format(model)):
with msg.loading(f"Loading model '{model}'..."):
nlp = load_model(model)
msg.good("Loaded model '{}'".format(model))
msg.good(f"Loaded model '{model}'")
texts = list(itertools.islice(inputs, n_texts))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
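A `Profile.prof` dump like the one created above can also be inspected directly with the standard library; a minimal sketch (sort key and entry count are arbitrary):

```python
import pstats

stats = pstats.Stats("Profile.prof")
stats.strip_dirs().sort_stats("cumulative").print_stats(20)  # top 20 entries by cumulative time
```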
@ -60,7 +57,7 @@ def _read_inputs(loc, msg):
input_path = Path(loc)
if not input_path.exists() or not input_path.is_file():
msg.fail("Not a valid input data file", loc, exits=1)
msg.info("Using data from {}".format(input_path.parts[-1]))
msg.info(f"Using data from {input_path.parts[-1]}")
file_ = input_path.open()
for line in file_:
data = srsly.json_loads(line)


@ -1,770 +0,0 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import plac
import os
import tqdm
from pathlib import Path
from thinc.neural._classes.model import Model
from timeit import default_timer as timer
import shutil
import srsly
from wasabi import msg
import contextlib
import random
from .._ml import create_default_optimizer
from ..util import use_gpu as set_gpu
from ..errors import Errors
from ..gold import GoldCorpus
from ..compat import path2str
from ..lookups import Lookups
from .. import util
from .. import about
@plac.annotations(
# fmt: off
lang=("Model language", "positional", None, str),
output_path=("Output directory to store model in", "positional", None, Path),
train_path=("Location of JSON-formatted training data", "positional", None, Path),
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
base_model=("Name of model to update (optional)", "option", "b", str),
pipeline=("Comma-separated names of pipeline components", "option", "p", str),
replace_components=("Replace components from base model", "flag", "R", bool),
vectors=("Model to load vectors from", "option", "v", str),
width=("Width of CNN layers of Tok2Vec component", "option", "cw", int),
conv_depth=("Depth of CNN layers of Tok2Vec component", "option", "cd", int),
cnn_window=("Window size for CNN layers of Tok2Vec component", "option", "cW", int),
cnn_pieces=("Maxout size for CNN layers of Tok2Vec component. 1 for Mish", "option", "cP", int),
use_chars=("Whether to use character-based embedding of Tok2Vec component", "flag", "chr", bool),
bilstm_depth=("Depth of BiLSTM layers of Tok2Vec component (requires PyTorch)", "option", "lstm", int),
embed_rows=("Number of embedding rows of Tok2Vec component", "option", "er", int),
n_iter=("Number of iterations", "option", "n", int),
n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int),
n_examples=("Number of examples", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int),
version=("Model version", "option", "V", str),
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
init_tok2vec=("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
parser_multitasks=("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str),
entity_multitasks=("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str),
noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float),
eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool),
textcat_arch=("Textcat model architecture", "option", "ta", str),
textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
verbose=("Display more information for debug", "flag", "VV", bool),
debug=("Run data diagnostics before training", "flag", "D", bool),
# fmt: on
)
def train(
lang,
output_path,
train_path,
dev_path,
raw_text=None,
base_model=None,
pipeline="tagger,parser,ner",
replace_components=False,
vectors=None,
width=96,
conv_depth=4,
cnn_window=1,
cnn_pieces=3,
use_chars=False,
bilstm_depth=0,
embed_rows=2000,
n_iter=30,
n_early_stopping=None,
n_examples=0,
use_gpu=-1,
version="0.0.0",
meta_path=None,
init_tok2vec=None,
parser_multitasks="",
entity_multitasks="",
noise_level=0.0,
orth_variant_level=0.0,
eval_beam_widths="",
gold_preproc=False,
learn_tokens=False,
textcat_multilabel=False,
textcat_arch="bow",
textcat_positive_label=None,
tag_map_path=None,
omit_extra_lookups=False,
verbose=False,
debug=False,
):
"""
Train or update a spaCy model. Requires data to be formatted in spaCy's
JSON format. To convert data from other formats, use the `spacy convert`
command.
"""
util.fix_random_seed()
util.set_env_log(verbose)
# Make sure all files and paths exists if they are needed
train_path = util.ensure_path(train_path)
dev_path = util.ensure_path(dev_path)
meta_path = util.ensure_path(meta_path)
output_path = util.ensure_path(output_path)
if raw_text is not None:
raw_text = list(srsly.read_jsonl(raw_text))
if not train_path or not train_path.exists():
msg.fail("Training data not found", train_path, exits=1)
if not dev_path or not dev_path.exists():
msg.fail("Development data not found", dev_path, exits=1)
if meta_path is not None and not meta_path.exists():
msg.fail("Can't find model meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path) if meta_path else {}
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
msg.warn(
"Output directory is not empty",
"This can lead to unintended side effects when saving the model. "
"Please use an empty directory or a different path instead. If "
"the specified output path doesn't exist, the directory will be "
"created for you.",
)
if not output_path.exists():
output_path.mkdir()
msg.good("Created output directory: {}".format(output_path))
tag_map = {}
if tag_map_path is not None:
tag_map = srsly.read_json(tag_map_path)
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
# Batch size starts at 1 and grows, so that we make updates quickly
# at the beginning of training.
dropout_rates = util.decaying(
util.env_opt("dropout_from", 0.2),
util.env_opt("dropout_to", 0.2),
util.env_opt("dropout_decay", 0.0),
)
batch_sizes = util.compounding(
util.env_opt("batch_from", 100.0),
util.env_opt("batch_to", 1000.0),
util.env_opt("batch_compound", 1.001),
)
if not eval_beam_widths:
eval_beam_widths = [1]
else:
eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
if 1 not in eval_beam_widths:
eval_beam_widths.append(1)
eval_beam_widths.sort()
has_beam_widths = eval_beam_widths != [1]
# Set up the base model and pipeline. If a base model is specified, load
# the model and make sure the pipeline matches the pipeline setting. If
# training starts from a blank model, intitalize the language class.
pipeline = [p.strip() for p in pipeline.split(",")]
disabled_pipes = None
pipes_added = False
msg.text("Training pipeline: {}".format(pipeline))
if use_gpu >= 0:
activated_gpu = None
try:
activated_gpu = set_gpu(use_gpu)
except Exception as e:
msg.warn("Exception: {}".format(e))
if activated_gpu is not None:
msg.text("Using GPU: {}".format(use_gpu))
else:
msg.warn("Unable to activate GPU: {}".format(use_gpu))
msg.text("Using CPU only")
use_gpu = -1
base_components = []
if base_model:
msg.text("Starting with base model '{}'".format(base_model))
nlp = util.load_model(base_model)
if nlp.lang != lang:
msg.fail(
"Model language ('{}') doesn't match language specified as "
"`lang` argument ('{}') ".format(nlp.lang, lang),
exits=1,
)
for pipe in pipeline:
pipe_cfg = {}
if pipe == "parser":
pipe_cfg = {"learn_tokens": learn_tokens}
elif pipe == "textcat":
pipe_cfg = {
"exclusive_classes": not textcat_multilabel,
"architecture": textcat_arch,
"positive_label": textcat_positive_label,
}
if pipe not in nlp.pipe_names:
msg.text("Adding component to base model '{}'".format(pipe))
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
pipes_added = True
elif replace_components:
msg.text("Replacing component from base model '{}'".format(pipe))
nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
pipes_added = True
else:
if pipe == "textcat":
textcat_cfg = nlp.get_pipe("textcat").cfg
base_cfg = {
"exclusive_classes": textcat_cfg["exclusive_classes"],
"architecture": textcat_cfg["architecture"],
"positive_label": textcat_cfg["positive_label"],
}
if base_cfg != pipe_cfg:
msg.fail(
"The base textcat model configuration does"
"not match the provided training options. "
"Existing cfg: {}, provided cfg: {}".format(
base_cfg, pipe_cfg
),
exits=1,
)
msg.text("Extending component from base model '{}'".format(pipe))
base_components.append(pipe)
disabled_pipes = nlp.disable_pipes(
[p for p in nlp.pipe_names if p not in pipeline]
)
else:
msg.text("Starting with blank model '{}'".format(lang))
lang_cls = util.get_lang_class(lang)
nlp = lang_cls()
for pipe in pipeline:
if pipe == "parser":
pipe_cfg = {"learn_tokens": learn_tokens}
elif pipe == "textcat":
pipe_cfg = {
"exclusive_classes": not textcat_multilabel,
"architecture": textcat_arch,
"positive_label": textcat_positive_label,
}
else:
pipe_cfg = {}
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
# Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map)
# Create empty extra lexeme tables so the data from spacy-lookups-data
# isn't loaded if these features are accessed
if omit_extra_lookups:
nlp.vocab.lookups_extra = Lookups()
nlp.vocab.lookups_extra.add_table("lexeme_cluster")
nlp.vocab.lookups_extra.add_table("lexeme_prob")
nlp.vocab.lookups_extra.add_table("lexeme_settings")
if vectors:
msg.text("Loading vector from model '{}'".format(vectors))
_load_vectors(nlp, vectors)
# Multitask objectives
multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
for pipe_name, multitasks in multitask_options:
if multitasks:
if pipe_name not in pipeline:
msg.fail(
"Can't use multitask objective without '{}' in the "
"pipeline".format(pipe_name)
)
pipe = nlp.get_pipe(pipe_name)
for objective in multitasks.split(","):
pipe.add_multitask_objective(objective)
# Prepare training corpus
msg.text("Counting training words (limit={})".format(n_examples))
corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
n_train_words = corpus.count_train()
if base_model and not pipes_added:
# Start with an existing model, use default optimizer
optimizer = create_default_optimizer(Model.ops)
else:
# Start with a blank model, call begin_training
cfg = {"device": use_gpu}
cfg["conv_depth"] = conv_depth
cfg["token_vector_width"] = width
cfg["bilstm_depth"] = bilstm_depth
cfg["cnn_maxout_pieces"] = cnn_pieces
cfg["embed_size"] = embed_rows
cfg["conv_window"] = cnn_window
cfg["subword_features"] = not use_chars
optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)
nlp._optimizer = None
# Load in pretrained weights
if init_tok2vec is not None:
components = _load_pretrained_tok2vec(nlp, init_tok2vec, base_components)
msg.text("Loaded pretrained tok2vec for: {}".format(components))
# Verify textcat config
if "textcat" in pipeline:
textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
if textcat_positive_label and textcat_positive_label not in textcat_labels:
msg.fail(
"The textcat_positive_label (tpl) '{}' does not match any "
"label in the training data.".format(textcat_positive_label),
exits=1,
)
if textcat_positive_label and len(textcat_labels) != 2:
msg.fail(
"A textcat_positive_label (tpl) '{}' was provided for training "
"data that does not appear to be a binary classification "
"problem with two labels.".format(textcat_positive_label),
exits=1,
)
train_docs = corpus.train_docs(
nlp,
noise_level=noise_level,
gold_preproc=gold_preproc,
max_length=0,
ignore_misaligned=True,
)
train_labels = set()
if textcat_multilabel:
multilabel_found = False
for text, gold in train_docs:
train_labels.update(gold.cats.keys())
if list(gold.cats.values()).count(1.0) != 1:
multilabel_found = True
if not multilabel_found and not base_model:
msg.warn(
"The textcat training instances look like they have "
"mutually-exclusive classes. Remove the flag "
"'--textcat-multilabel' to train a classifier with "
"mutually-exclusive classes."
)
if not textcat_multilabel:
for text, gold in train_docs:
train_labels.update(gold.cats.keys())
if list(gold.cats.values()).count(1.0) != 1 and not base_model:
msg.warn(
"Some textcat training instances do not have exactly "
"one positive label. Modifying training options to "
"include the flag '--textcat-multilabel' for classes "
"that are not mutually exclusive."
)
nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
textcat_multilabel = True
break
if base_model and set(textcat_labels) != train_labels:
msg.fail(
"Cannot extend textcat model using data with different "
"labels. Base model labels: {}, training data labels: "
"{}.".format(textcat_labels, list(train_labels)),
exits=1,
)
if textcat_multilabel:
msg.text(
"Textcat evaluation score: ROC AUC score macro-averaged across "
"the labels '{}'".format(", ".join(textcat_labels))
)
elif textcat_positive_label and len(textcat_labels) == 2:
msg.text(
"Textcat evaluation score: F1-score for the "
"label '{}'".format(textcat_positive_label)
)
elif len(textcat_labels) > 1:
if len(textcat_labels) == 2:
msg.warn(
"If the textcat component is a binary classifier with "
"exclusive classes, provide '--textcat-positive-label' for "
"an evaluation on the positive class."
)
msg.text(
"Textcat evaluation score: F1-score macro-averaged across "
"the labels '{}'".format(", ".join(textcat_labels))
)
else:
msg.fail(
"Unsupported textcat configuration. Use `spacy debug-data` "
"for more information."
)
# fmt: off
row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
row_widths = [len(w) for w in row_head]
row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
# fmt: on
print("")
msg.row(row_head, **row_settings)
msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
try:
iter_since_best = 0
best_score = 0.0
for i in range(n_iter):
train_docs = corpus.train_docs(
nlp,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
gold_preproc=gold_preproc,
max_length=0,
ignore_misaligned=True,
)
if raw_text:
random.shuffle(raw_text)
raw_batches = util.minibatch(
(nlp.make_doc(rt["text"]) for rt in raw_text), size=8
)
words_seen = 0
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {}
for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
if not batch:
continue
docs, golds = zip(*batch)
try:
nlp.update(
docs,
golds,
sgd=optimizer,
drop=next(dropout_rates),
losses=losses,
)
except ValueError as e:
err = "Error during training"
if init_tok2vec:
err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
msg.fail(err, "Original error message: {}".format(e), exits=1)
if raw_text:
# If raw text is available, perform 'rehearsal' updates,
# which use unlabelled data to reduce overfitting.
raw_batch = list(next(raw_batches))
nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
if not int(os.environ.get("LOG_FRIENDLY", 0)):
pbar.update(sum(len(doc) for doc in docs))
words_seen += sum(len(doc) for doc in docs)
with nlp.use_params(optimizer.averages):
util.set_env_log(False)
epoch_model_path = output_path / ("model%d" % i)
nlp.to_disk(epoch_model_path)
nlp_loaded = util.load_model_from_path(epoch_model_path)
for beam_width in eval_beam_widths:
for name, component in nlp_loaded.pipeline:
if hasattr(component, "cfg"):
component.cfg["beam_width"] = beam_width
dev_docs = list(
corpus.dev_docs(
nlp_loaded,
gold_preproc=gold_preproc,
ignore_misaligned=True,
)
)
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
end_time = timer()
if use_gpu < 0:
gpu_wps = None
cpu_wps = nwords / (end_time - start_time)
else:
gpu_wps = nwords / (end_time - start_time)
# Only evaluate on CPU in the first iteration (for
# timing) if GPU is enabled
if i == 0:
with Model.use_device("cpu"):
nlp_loaded = util.load_model_from_path(epoch_model_path)
for name, component in nlp_loaded.pipeline:
if hasattr(component, "cfg"):
component.cfg["beam_width"] = beam_width
dev_docs = list(
corpus.dev_docs(
nlp_loaded,
gold_preproc=gold_preproc,
ignore_misaligned=True,
)
)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
end_time = timer()
cpu_wps = nwords / (end_time - start_time)
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
srsly.write_json(acc_loc, scorer.scores)
# Update model meta.json
meta["lang"] = nlp.lang
meta["pipeline"] = nlp.pipe_names
meta["spacy_version"] = ">=%s" % about.__version__
if beam_width == 1:
meta["speed"] = {
"nwords": nwords,
"cpu": cpu_wps,
"gpu": gpu_wps,
}
meta.setdefault("accuracy", {})
for component in nlp.pipe_names:
for metric in _get_metrics(component):
meta["accuracy"][metric] = scorer.scores[metric]
else:
meta.setdefault("beam_accuracy", {})
meta.setdefault("beam_speed", {})
for component in nlp.pipe_names:
for metric in _get_metrics(component):
meta["beam_accuracy"][metric] = scorer.scores[metric]
meta["beam_speed"][beam_width] = {
"nwords": nwords,
"cpu": cpu_wps,
"gpu": gpu_wps,
}
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors),
"keys": nlp.vocab.vectors.n_keys,
"name": nlp.vocab.vectors.name,
}
meta.setdefault("name", "model%d" % i)
meta.setdefault("version", version)
meta["labels"] = nlp.meta["labels"]
meta_loc = output_path / ("model%d" % i) / "meta.json"
srsly.write_json(meta_loc, meta)
util.set_env_log(verbose)
progress = _get_progress(
i,
losses,
scorer.scores,
output_stats,
beam_width=beam_width if has_beam_widths else None,
cpu_wps=cpu_wps,
gpu_wps=gpu_wps,
)
if i == 0 and "textcat" in pipeline:
textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
for cat, cat_score in textcats_per_cat.items():
if cat_score.get("roc_auc_score", 0) < 0:
msg.warn(
"Textcat ROC AUC score is undefined due to "
"only one value in label '{}'.".format(cat)
)
msg.row(progress, **row_settings)
# Early stopping
if n_early_stopping is not None:
current_score = _score_for_model(meta)
if current_score < best_score:
iter_since_best += 1
else:
iter_since_best = 0
best_score = current_score
if iter_since_best >= n_early_stopping:
msg.text(
"Early stopping, best iteration "
"is: {}".format(i - iter_since_best)
)
msg.text(
"Best score = {}; Final iteration "
"score = {}".format(best_score, current_score)
)
break
except Exception as e:
msg.warn(
"Aborting and saving the final best model. "
"Encountered exception: {}".format(e),
exits=1,
)
finally:
best_pipes = nlp.pipe_names
if disabled_pipes:
disabled_pipes.restore()
with nlp.use_params(optimizer.averages):
final_model_path = output_path / "model-final"
nlp.to_disk(final_model_path)
meta_loc = output_path / "model-final" / "meta.json"
final_meta = srsly.read_json(meta_loc)
final_meta.setdefault("accuracy", {})
final_meta["accuracy"].update(meta.get("accuracy", {}))
final_meta.setdefault("speed", {})
final_meta["speed"].setdefault("cpu", None)
final_meta["speed"].setdefault("gpu", None)
meta.setdefault("speed", {})
meta["speed"].setdefault("cpu", None)
meta["speed"].setdefault("gpu", None)
# combine cpu and gpu speeds with the base model speeds
if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
speed = _get_total_speed(
[final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
)
final_meta["speed"]["cpu"] = speed
if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
speed = _get_total_speed(
[final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
)
final_meta["speed"]["gpu"] = speed
# if there were no speeds to update, overwrite with meta
if (
final_meta["speed"]["cpu"] is None
and final_meta["speed"]["gpu"] is None
):
final_meta["speed"].update(meta["speed"])
# note: beam speeds are not combined with the base model
if has_beam_widths:
final_meta.setdefault("beam_accuracy", {})
final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {}))
final_meta.setdefault("beam_speed", {})
final_meta["beam_speed"].update(meta.get("beam_speed", {}))
srsly.write_json(meta_loc, final_meta)
msg.good("Saved model to output directory", final_model_path)
with msg.loading("Creating best model..."):
best_model_path = _collate_best_model(final_meta, output_path, best_pipes)
msg.good("Created best model", best_model_path)
def _score_for_model(meta):
""" Returns mean score between tasks in pipeline that can be used for early stopping. """
mean_acc = list()
pipes = meta["pipeline"]
acc = meta["accuracy"]
if "tagger" in pipes:
mean_acc.append(acc["tags_acc"])
if "parser" in pipes:
mean_acc.append((acc["uas"] + acc["las"]) / 2)
if "ner" in pipes:
mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3)
if "textcat" in pipes:
mean_acc.append(acc["textcat_score"])
return sum(mean_acc) / len(mean_acc)
@contextlib.contextmanager
def _create_progress_bar(total):
if int(os.environ.get("LOG_FRIENDLY", 0)):
yield
else:
pbar = tqdm.tqdm(total=total, leave=False)
yield pbar
def _load_vectors(nlp, vectors):
util.load_model(vectors, vocab=nlp.vocab)
def _load_pretrained_tok2vec(nlp, loc, base_components):
"""Load pretrained weights for the 'token-to-vector' part of the component
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
"""
with loc.open("rb") as file_:
weights_data = file_.read()
loaded = []
for name, component in nlp.pipeline:
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
if name in base_components:
raise ValueError(Errors.E200.format(component=name))
component.tok2vec.from_bytes(weights_data)
loaded.append(name)
return loaded
def _collate_best_model(meta, output_path, components):
bests = {}
meta.setdefault("accuracy", {})
for component in components:
bests[component] = _find_best(output_path, component)
best_dest = output_path / "model-best"
shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest))
for component, best_component_src in bests.items():
shutil.rmtree(path2str(best_dest / component))
shutil.copytree(
path2str(best_component_src / component), path2str(best_dest / component)
)
accs = srsly.read_json(best_component_src / "accuracy.json")
for metric in _get_metrics(component):
meta["accuracy"][metric] = accs[metric]
srsly.write_json(best_dest / "meta.json", meta)
return best_dest
def _find_best(experiment_dir, component):
accuracies = []
for epoch_model in experiment_dir.iterdir():
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
accs = srsly.read_json(epoch_model / "accuracy.json")
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
# remove per_type dicts from score list for max() comparison
scores = [score for score in scores if isinstance(score, float)]
accuracies.append((scores, epoch_model))
if accuracies:
return max(accuracies)[1]
else:
return None
def _get_metrics(component):
if component == "parser":
return ("las", "uas", "las_per_type", "token_acc")
elif component == "tagger":
return ("tags_acc", "token_acc")
elif component == "ner":
return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc")
elif component == "textcat":
return ("textcat_score", "token_acc")
return ("token_acc",)
def _configure_training_output(pipeline, use_gpu, has_beam_widths):
row_head = ["Itn"]
output_stats = []
for pipe in pipeline:
if pipe == "tagger":
row_head.extend(["Tag Loss ", " Tag % "])
output_stats.extend(["tag_loss", "tags_acc"])
elif pipe == "parser":
row_head.extend(["Dep Loss ", " UAS ", " LAS "])
output_stats.extend(["dep_loss", "uas", "las"])
elif pipe == "ner":
row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "])
output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"])
elif pipe == "textcat":
row_head.extend(["Textcat Loss", "Textcat"])
output_stats.extend(["textcat_loss", "textcat_score"])
row_head.extend(["Token %", "CPU WPS"])
output_stats.extend(["token_acc", "cpu_wps"])
if use_gpu >= 0:
row_head.extend(["GPU WPS"])
output_stats.extend(["gpu_wps"])
if has_beam_widths:
row_head.insert(1, "Beam W.")
return row_head, output_stats
def _get_progress(
itn, losses, dev_scores, output_stats, beam_width=None, cpu_wps=0.0, gpu_wps=0.0
):
scores = {}
for stat in output_stats:
scores[stat] = 0.0
scores["dep_loss"] = losses.get("parser", 0.0)
scores["ner_loss"] = losses.get("ner", 0.0)
scores["tag_loss"] = losses.get("tagger", 0.0)
scores["textcat_loss"] = losses.get("textcat", 0.0)
scores["cpu_wps"] = cpu_wps
scores["gpu_wps"] = gpu_wps or 0.0
scores.update(dev_scores)
formatted_scores = []
for stat in output_stats:
format_spec = "{:.3f}"
if stat.endswith("_wps"):
format_spec = "{:.0f}"
formatted_scores.append(format_spec.format(scores[stat]))
result = [itn + 1]
result.extend(formatted_scores)
if beam_width is not None:
result.insert(1, beam_width)
return result
def _get_total_speed(speeds):
seconds_per_word = 0.0
for words_per_second in speeds:
if words_per_second is None:
return None
seconds_per_word += 1.0 / words_per_second
return 1.0 / seconds_per_word
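`_get_total_speed` combines speeds via their per-word times rather than averaging them; a quick worked example:

```python
print(_get_total_speed([1000.0, 1000.0]))  # -> 500.0, i.e. 1.0 / (1/1000 + 1/1000)
print(_get_total_speed([1000.0, None]))    # -> None, unknown speeds propagate
```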


@ -0,0 +1,606 @@
from typing import Optional, Dict, List, Union, Sequence
from timeit import default_timer as timer
import srsly
from pydantic import BaseModel, FilePath
import tqdm
from pathlib import Path
from wasabi import msg
import thinc
import thinc.schedules
from thinc.api import Model, use_pytorch_for_gpu_memory
import random
from ..gold import GoldCorpus
from ..lookups import Lookups
from .. import util
from ..errors import Errors
# Don't remove - required to load the built-in architectures
from ..ml import models # noqa: F401
registry = util.registry
CONFIG_STR = """
[training]
patience = 10
eval_frequency = 10
dropout = 0.2
init_tok2vec = null
max_epochs = 100
orth_variant_level = 0.0
gold_preproc = false
max_length = 0
use_gpu = 0
scores = ["ents_p", "ents_r", "ents_f"]
score_weights = {"ents_f": 1.0}
limit = 0
[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
[optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999
[nlp]
lang = "en"
vectors = null
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.ner]
factory = "ner"
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 64
maxout_pieces = 3
[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 128
depth = 4
window_size = 1
embed_size = 10000
maxout_pieces = 3
subword_features = true
"""
class PipelineComponent(BaseModel):
factory: str
model: Model
class Config:
arbitrary_types_allowed = True
class ConfigSchema(BaseModel):
optimizer: Optional["Optimizer"]
class training(BaseModel):
patience: int = 10
eval_frequency: int = 100
dropout: float = 0.2
init_tok2vec: Optional[FilePath] = None
max_epochs: int = 100
orth_variant_level: float = 0.0
gold_preproc: bool = False
max_length: int = 0
use_gpu: int = 0
scores: List[str] = ["ents_p", "ents_r", "ents_f"]
score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0}
limit: int = 0
batch_size: Union[Sequence[int], int]
class nlp(BaseModel):
lang: str
vectors: Optional[str]
pipeline: Optional[Dict[str, PipelineComponent]]
class Config:
extra = "allow"
def train_cli(
# fmt: off
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
config_path: ("Path to config file", "positional", None, Path),
output_path: ("Output directory to store model in", "option", "o", Path) = None,
init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False,
use_gpu: ("Use GPU", "option", "g", int) = -1,
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
# fmt: on
):
"""
Train or update a spaCy model. Requires data to be formatted in spaCy's
JSON format. To convert data from other formats, use the `spacy convert`
command.
"""
util.set_env_log(verbose)
# Make sure all files and paths exists if they are needed
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
if not train_path or not train_path.exists():
msg.fail("Training data not found", train_path, exits=1)
if not dev_path or not dev_path.exists():
msg.fail("Development data not found", dev_path, exits=1)
if output_path is not None:
if not output_path.exists():
output_path.mkdir()
msg.good(f"Created output directory: {output_path}")
elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
msg.warn(
"Output directory is not empty.",
"This can lead to unintended side effects when saving the model. "
"Please use an empty directory or a different path instead. If "
"the specified output path doesn't exist, the directory will be "
"created for you.",
)
if raw_text is not None:
raw_text = list(srsly.read_jsonl(raw_text))
tag_map = {}
if tag_map_path is not None:
tag_map = srsly.read_json(tag_map_path)
weights_data = None
if init_tok2vec is not None:
if not init_tok2vec.exists():
msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
with init_tok2vec.open("rb") as file_:
weights_data = file_.read()
if use_gpu >= 0:
msg.info("Using GPU: {use_gpu}")
util.use_gpu(use_gpu)
else:
msg.info("Using CPU")
train(
config_path,
{"train": train_path, "dev": dev_path},
output_path=output_path,
raw_text=raw_text,
tag_map=tag_map,
weights_data=weights_data,
omit_extra_lookups=omit_extra_lookups,
)
def train(
config_path,
data_paths,
raw_text=None,
output_path=None,
tag_map=None,
weights_data=None,
omit_extra_lookups=False,
):
msg.info(f"Loading config from: {config_path}")
# Read the config first without creating objects, to get to the original nlp_config
config = util.load_config(config_path, create_objects=False)
util.fix_random_seed(config["training"]["seed"])
if config["training"].get("use_pytorch_for_gpu_memory"):
# It feels kind of weird to not have a default for this.
use_pytorch_for_gpu_memory()
nlp_config = config["nlp"]
config = util.load_config(config_path, create_objects=True)
training = config["training"]
msg.info("Creating nlp from config")
nlp = util.load_model_from_config(nlp_config)
optimizer = training["optimizer"]
limit = training["limit"]
msg.info("Loading training corpus")
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
# verify textcat config
if "textcat" in nlp_config["pipeline"]:
textcat_labels = set(nlp.get_pipe("textcat").labels)
textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"][
"exclusive_classes"
]
# check whether the setting 'exclusive_classes' corresponds to the provided training data
if textcat_multilabel:
multilabel_found = False
for ex in corpus.train_examples:
cats = ex.doc_annotation.cats
textcat_labels.update(cats.keys())
if list(cats.values()).count(1.0) != 1:
multilabel_found = True
if not multilabel_found:
msg.warn(
"The textcat training instances look like they have "
"mutually exclusive classes. Set 'exclusive_classes' "
"to 'true' in the config to train a classifier with "
"mutually exclusive classes more accurately."
)
else:
for ex in corpus.train_examples:
cats = ex.doc_annotation.cats
textcat_labels.update(cats.keys())
if list(cats.values()).count(1.0) != 1:
msg.fail(
"Some textcat training instances do not have exactly "
"one positive label. Set 'exclusive_classes' "
"to 'false' in the config to train a classifier with classes "
"that are not mutually exclusive."
)
msg.info(
f"Initialized textcat component for {len(textcat_labels)} unique labels"
)
nlp.get_pipe("textcat").labels = tuple(textcat_labels)
# if 'positive_label' is provided: double check whether it's in the data and the task is binary
if nlp_config["pipeline"]["textcat"].get("positive_label", None):
textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
if pos_label not in textcat_labels:
msg.fail(
f"The textcat's 'positive_label' config setting '{pos_label}' "
f"does not match any label in the training data.",
exits=1,
)
if len(textcat_labels) != 2:
msg.fail(
f"A textcat 'positive_label' '{pos_label}' was "
f"provided for training data that does not appear to be a "
f"binary classification problem with two labels.",
exits=1,
)
if training.get("resume", False):
msg.info("Resuming training")
nlp.resume_training()
else:
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
nlp.begin_training(lambda: corpus.train_examples)
# Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map)
# Create empty extra lexeme tables so the data from spacy-lookups-data
# isn't loaded if these features are accessed
if omit_extra_lookups:
nlp.vocab.lookups_extra = Lookups()
nlp.vocab.lookups_extra.add_table("lexeme_cluster")
nlp.vocab.lookups_extra.add_table("lexeme_prob")
nlp.vocab.lookups_extra.add_table("lexeme_settings")
# Load a pretrained tok2vec model - cf. CLI command 'pretrain'
if weights_data is not None:
tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None)
if tok2vec_path is None:
msg.fail(
f"To use a pretrained tok2vec model, the config needs to specify which "
f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
exits=1,
)
tok2vec = config
for subpath in tok2vec_path.split("."):
tok2vec = tok2vec.get(subpath)
if not tok2vec:
msg.fail(
f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
)
tok2vec.from_bytes(weights_data)
train_batches = create_train_batches(nlp, corpus, training)
evaluate = create_evaluation_callback(nlp, optimizer, corpus, training)
# Create iterator, which yields out info after each optimization step.
msg.info("Start training")
training_step_iterator = train_while_improving(
nlp,
optimizer,
train_batches,
evaluate,
dropout=training["dropout"],
accumulate_gradient=training["accumulate_gradient"],
patience=training.get("patience", 0),
max_steps=training.get("max_steps", 0),
eval_frequency=training["eval_frequency"],
raw_text=raw_text,
)
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
print_row = setup_printer(training, nlp)
try:
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
for batch, info, is_best_checkpoint in training_step_iterator:
progress.update(1)
if is_best_checkpoint is not None:
progress.close()
print_row(info)
if is_best_checkpoint and output_path is not None:
update_meta(training, nlp, info)
nlp.to_disk(output_path / "model-best")
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
# Clean up the objects to facilitate garbage collection.
for eg in batch:
eg.doc = None
eg.goldparse = None
eg.doc_annotation = None
eg.token_annotation = None
except Exception as e:
msg.warn(
f"Aborting and saving the final best model. "
f"Encountered exception: {str(e)}",
exits=1,
)
finally:
if output_path is not None:
final_model_path = output_path / "model-final"
if optimizer.averages:
with nlp.use_params(optimizer.averages):
nlp.to_disk(final_model_path)
else:
nlp.to_disk(final_model_path)
msg.good(f"Saved model to output directory {final_model_path}")
def create_train_batches(nlp, corpus, cfg):
epochs_todo = cfg.get("max_epochs", 0)
while True:
train_examples = list(
corpus.train_dataset(
nlp,
noise_level=0.0, # I think this is deprecated?
orth_variant_level=cfg["orth_variant_level"],
gold_preproc=cfg["gold_preproc"],
max_length=cfg["max_length"],
ignore_misaligned=True,
)
)
if len(train_examples) == 0:
raise ValueError(Errors.E988)
random.shuffle(train_examples)
batches = util.minibatch_by_words(
train_examples,
size=cfg["batch_size"],
discard_oversize=cfg["discard_oversize"],
)
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
try:
first = next(batches)
yield first
except StopIteration:
raise ValueError(Errors.E986)
for batch in batches:
yield batch
epochs_todo -= 1
# We intentionally compare exactly to 0 here, so that max_epochs < 1
# will not break.
if epochs_todo == 0:
break
def create_evaluation_callback(nlp, optimizer, corpus, cfg):
def evaluate():
dev_examples = list(
corpus.dev_dataset(
nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
)
)
n_words = sum(len(ex.doc) for ex in dev_examples)
start_time = timer()
if optimizer.averages:
with nlp.use_params(optimizer.averages):
scorer = nlp.evaluate(dev_examples, batch_size=32)
else:
scorer = nlp.evaluate(dev_examples, batch_size=32)
end_time = timer()
wps = n_words / (end_time - start_time)
scores = scorer.scores
# Calculate a weighted sum based on score_weights for the main score
weights = cfg["score_weights"]
try:
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
except KeyError as e:
raise KeyError(
Errors.E983.format(
dict_name="score_weights", key=str(e), keys=list(scores.keys())
)
)
scores["speed"] = wps
return weighted_score, scores
return evaluate
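Only the keys listed in `score_weights` contribute to the main score; a quick worked example with made-up scores:

```python
scores = {"ents_p": 0.80, "ents_r": 0.76, "ents_f": 0.78, "speed": 12000}
weights = {"ents_f": 1.0}
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
print(weighted_score)  # -> 0.78; a key missing from `scores` would raise the E983 error above
```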
def train_while_improving(
nlp,
optimizer,
train_data,
evaluate,
*,
dropout,
eval_frequency,
accumulate_gradient=1,
patience=0,
max_steps=0,
raw_text=None,
):
"""Train until an evaluation stops improving. Works as a generator,
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
where info is a dict, and is_best_checkpoint is in [True, False, None] --
None indicating that the iteration was not evaluated as a checkpoint.
The evaluation is conducted by calling the evaluate callback.
Positional arguments:
nlp: The spaCy pipeline to evaluate.
optimizer: The optimizer callable.
train_data (Iterable[Batch]): A generator of batches, with the training
data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
data iterable needs to take care of iterating over the epochs and
shuffling.
evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
The callback should take no arguments and return a tuple
`(main_score, other_scores)`. The main_score should be a float where
higher is better. other_scores can be any object.
Every iteration, the function yields out a tuple with:
* batch: A zipped sequence of Tuple[Doc, GoldParse] pairs.
* info: A dict with various information about the last update (see below).
* is_best_checkpoint: A value in None, False, True, indicating whether this
was the best evaluation so far. You should use this to save the model
checkpoints during training. If None, evaluation was not conducted on
that iteration. False means evaluation was conducted, but a previous
evaluation was better.
The info dict provides the following information:
epoch (int): How many passes over the data have been completed.
step (int): How many steps have been completed.
score (float): The main score from the last evaluation.
other_scores: The other scores from the last evaluation.
loss: The accumulated losses throughout training.
checkpoints: A list of previous results, where each result is a
(score, step, epoch) tuple.
"""
if isinstance(dropout, float):
dropouts = thinc.schedules.constant(dropout)
else:
dropouts = dropout
results = []
losses = {}
to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]
if raw_text:
random.shuffle(raw_text)
raw_batches = util.minibatch(
(nlp.make_doc(rt["text"]) for rt in raw_text), size=8
)
for step, batch in enumerate(train_data):
dropout = next(dropouts)
with nlp.select_pipes(enable=to_enable):
for subbatch in subdivide_batch(batch, accumulate_gradient):
nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
if raw_text:
# If raw text is available, perform 'rehearsal' updates,
# which use unlabelled data to reduce overfitting.
raw_batch = list(next(raw_batches))
nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
for name, proc in nlp.pipeline:
if hasattr(proc, "model"):
proc.model.finish_update(optimizer)
optimizer.step_schedules()
if not (step % eval_frequency):
score, other_scores = evaluate()
results.append((score, step))
is_best_checkpoint = score == max(results)[0]
else:
score, other_scores = (None, None)
is_best_checkpoint = None
info = {
"step": step,
"score": score,
"other_scores": other_scores,
"losses": losses,
"checkpoints": results,
}
yield batch, info, is_best_checkpoint
if is_best_checkpoint is not None:
losses = {}
# Stop if no improvement in `patience` updates (if specified)
best_score, best_step = max(results)
if patience and (step - best_step) >= patience:
break
# Stop if we've exhausted our max steps (if specified)
if max_steps and (step * accumulate_gradient) >= max_steps:
break
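Callers consume train_while_improving as a generator and typically save a checkpoint whenever `is_best_checkpoint` is True. The driver loop below is a minimal, hypothetical sketch: `nlp`, `optimizer`, `train_data`, `evaluate` and `output_path` are assumed to be set up elsewhere, and the hyperparameter values are placeholders, not spaCy defaults.

```python
# Minimal sketch of a caller: save a checkpoint whenever the evaluation improves.
# `nlp`, `optimizer`, `train_data`, `evaluate` and `output_path` are assumed
# to exist; this is not the actual spaCy training CLI.
training_loop = train_while_improving(
    nlp,
    optimizer,
    train_data,
    evaluate,
    dropout=0.1,
    eval_frequency=200,
    accumulate_gradient=1,
    patience=1600,
    max_steps=20000,
)
for batch, info, is_best_checkpoint in training_loop:
    if is_best_checkpoint:
        nlp.to_disk(output_path)  # keep only the best-scoring model
    if is_best_checkpoint is not None:
        print(info["step"], info["score"])
```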
def subdivide_batch(batch, accumulate_gradient):
batch = list(batch)
batch.sort(key=lambda eg: len(eg.doc))
sub_len = len(batch) // accumulate_gradient
start = 0
for i in range(accumulate_gradient):
subbatch = batch[start : start + sub_len]
if subbatch:
yield subbatch
start += len(subbatch)
subbatch = batch[start:]
if subbatch:
yield subbatch
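subdivide_batch sorts the examples by document length and yields `accumulate_gradient` roughly equal sub-batches plus any remainder, so gradients can be accumulated over several smaller updates. A toy illustration of the same slicing logic, using plain integers instead of Example objects:

```python
# Toy illustration of the same splitting logic, using integers instead of
# Example objects (the real function sorts by len(eg.doc)).
def subdivide(batch, accumulate_gradient):
    batch = sorted(batch)
    sub_len = len(batch) // accumulate_gradient
    start = 0
    for _ in range(accumulate_gradient):
        subbatch = batch[start : start + sub_len]
        if subbatch:
            yield subbatch
        start += len(subbatch)
    if batch[start:]:
        yield batch[start:]  # remainder when the batch doesn't divide evenly

print(list(subdivide([5, 1, 4, 2, 3, 6, 7], accumulate_gradient=3)))
# [[1, 2], [3, 4], [5, 6], [7]]
```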
def setup_printer(training, nlp):
score_cols = training["scores"]
score_widths = [max(len(col), 6) for col in score_cols]
loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
loss_widths = [max(len(col), 8) for col in loss_cols]
table_header = ["#"] + loss_cols + score_cols + ["Score"]
table_header = [col.upper() for col in table_header]
table_widths = [6] + loss_widths + score_widths + [6]
table_aligns = ["r" for _ in table_widths]
msg.row(table_header, widths=table_widths)
msg.row(["-" * width for width in table_widths])
def print_row(info):
try:
losses = [
"{0:.2f}".format(float(info["losses"][pipe_name]))
for pipe_name in nlp.pipe_names
]
except KeyError as e:
raise KeyError(
Errors.E983.format(
dict_name="scores (losses)",
key=str(e),
keys=list(info["losses"].keys()),
)
)
try:
scores = [
"{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols
]
except KeyError as e:
raise KeyError(
Errors.E983.format(
dict_name="scores (other)",
key=str(e),
keys=list(info["other_scores"].keys()),
)
)
data = (
[info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]
)
msg.row(data, widths=table_widths, aligns=table_aligns)
return print_row
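setup_printer prints the table header once and returns print_row, which formats one line of losses and scores per evaluation. For reference, a standalone sketch of the same wasabi calls, with invented column names and numbers:

```python
# Standalone sketch of the wasabi table calls used above, with made-up data.
from wasabi import msg

widths = [6, 8, 6, 6]
aligns = ["r", "r", "r", "r"]
msg.row(["#", "LOSS NER", "ENTS_F", "SCORE"], widths=widths)
msg.row(["-" * w for w in widths])
msg.row(["200", "12.34", "0.81", "0.79"], widths=widths, aligns=aligns)
```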
def update_meta(training, nlp, info):
score_cols = training["scores"]
nlp.meta["performance"] = {}
for metric in score_cols:
nlp.meta["performance"][metric] = info["other_scores"][metric]
for pipe_name in nlp.pipe_names:
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]

View File

@ -1,15 +1,11 @@
# coding: utf8
from __future__ import unicode_literals, print_function
from pathlib import Path
import sys
import requests
import srsly
from wasabi import msg
from ..compat import path2str
from ..util import get_data_path
from .. import about
from ..util import get_package_version, get_installed_models, get_base_version
from ..util import get_package_path, get_model_meta, is_compatible_version
def validate():
@ -17,51 +13,30 @@ def validate():
Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`.
"""
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
msg.fail(
"Server error ({})".format(r.status_code),
"Couldn't fetch compatibility table.",
exits=1,
)
msg.good("Loaded compatibility table")
compat = r.json()["spacy"]
version = about.__version__
version = version.rsplit(".dev", 1)[0]
current_compat = compat.get(version)
model_pkgs, compat = get_model_pkgs()
spacy_version = get_base_version(about.__version__)
current_compat = compat.get(spacy_version, {})
if not current_compat:
msg.fail(
"Can't find spaCy v{} in compatibility table".format(version),
about.__compatibility__,
exits=1,
)
all_models = set()
for spacy_v, models in dict(compat).items():
all_models.update(models.keys())
for model, model_vs in models.items():
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
model_links = get_model_links(current_compat)
model_pkgs = get_model_pkgs(current_compat, all_models)
incompat_links = {l for l, d in model_links.items() if not d["compat"]}
msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
incompat_models.update(
[d["name"] for _, d in model_links.items() if not d["compat"]]
)
na_models = [m for m in incompat_models if m not in current_compat]
update_models = [m for m in incompat_models if m in current_compat]
spacy_dir = Path(__file__).parent.parent
msg.divider("Installed models (spaCy v{})".format(about.__version__))
msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
msg.divider(f"Installed models (spaCy v{about.__version__})")
msg.info(f"spaCy installation: {spacy_dir}")
if model_links or model_pkgs:
header = ("TYPE", "NAME", "MODEL", "VERSION", "")
if model_pkgs:
header = ("NAME", "SPACY", "VERSION", "")
rows = []
for name, data in model_pkgs.items():
rows.append(get_model_row(current_compat, name, data, msg))
for name, data in model_links.items():
rows.append(get_model_row(current_compat, name, data, msg, "link"))
if data["compat"]:
comp = msg.text("", color="green", icon="good", no_print=True)
version = msg.text(data["version"], color="green", no_print=True)
else:
version = msg.text(data["version"], color="red", no_print=True)
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
rows.append((data["name"], data["spacy"], version, comp))
msg.table(rows, header=header)
else:
msg.text("No models found in your current environment.", exits=0)
@ -71,75 +46,51 @@ def validate():
cmd = "python -m spacy download {}"
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
if na_models:
msg.text(
"The following models are not available for spaCy "
"v{}: {}".format(about.__version__, ", ".join(na_models))
msg.info(
f"The following models are custom spaCy models or not "
f"available for spaCy v{about.__version__}:",
", ".join(na_models),
)
if incompat_links:
msg.text(
"You may also want to overwrite the incompatible links using the "
"`python -m spacy link` command with `--force`, or remove them "
"from the data directory. "
"Data path: {path}".format(path=path2str(get_data_path()))
)
if incompat_models or incompat_links:
if incompat_models:
sys.exit(1)
def get_model_links(compat):
links = {}
data_path = get_data_path()
if data_path:
models = [p for p in data_path.iterdir() if is_model_path(p)]
for model in models:
meta_path = Path(model) / "meta.json"
if not meta_path.exists():
continue
meta = srsly.read_json(meta_path)
link = model.parts[-1]
name = meta["lang"] + "_" + meta["name"]
links[link] = {
"name": name,
"version": meta["version"],
"compat": is_compat(compat, name, meta["version"]),
}
return links
def get_model_pkgs(compat, all_models):
import pkg_resources
def get_model_pkgs():
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
msg.fail(
f"Server error ({r.status_code})",
"Couldn't fetch compatibility table.",
exits=1,
)
msg.good("Loaded compatibility table")
compat = r.json()["spacy"]
all_models = set()
installed_models = get_installed_models()
for spacy_v, models in dict(compat).items():
all_models.update(models.keys())
for model, model_vs in models.items():
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
pkgs = {}
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
for pkg_name in installed_models:
package = pkg_name.replace("-", "_")
if package in all_models:
version = pkg_data.version
pkgs[pkg_name] = {
"name": package,
"version": version,
"compat": is_compat(compat, package, version),
}
return pkgs
def get_model_row(compat, name, data, msg, model_type="package"):
if data["compat"]:
comp = msg.text("", color="green", icon="good", no_print=True)
version = msg.text(data["version"], color="green", no_print=True)
else:
version = msg.text(data["version"], color="red", no_print=True)
comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
return (model_type, name, data["name"], version, comp)
def is_model_path(model_path):
exclude = ["cache", "pycache", "__pycache__"]
name = model_path.parts[-1]
return model_path.is_dir() and name not in exclude and not name.startswith(".")
def is_compat(compat, name, version):
return name in compat and version in compat[name]
version = get_package_version(pkg_name)
if package in compat:
is_compat = version in compat[package]
spacy_version = about.__version__
else:
model_path = get_package_path(package)
model_meta = get_model_meta(model_path)
spacy_version = model_meta.get("spacy_version", "n/a")
is_compat = is_compatible_version(about.__version__, spacy_version)
pkgs[pkg_name] = {
"name": package,
"version": version,
"spacy": spacy_version,
"compat": is_compat,
}
return pkgs, compat
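For packages not listed in the compatibility table, the new code falls back to checking the model's `spacy_version` requirement from meta.json against the running spaCy version via `is_compatible_version`. The helper below is only an illustration of what such a check could look like, built on the `packaging` library rather than spaCy's own implementation:

```python
# Hedged sketch: how a spacy_version specifier from meta.json might be checked
# against the installed spaCy version. Not spaCy's is_compatible_version.
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version

def versions_are_compatible(spacy_version, constraint):
    try:
        spec = SpecifierSet(constraint)
    except InvalidSpecifier:
        return False
    # Pre-releases should still match range pins like >=3.0.0,<3.1.0.
    spec.prereleases = True
    return Version(spacy_version) in spec

print(versions_are_compatible("3.0.0", ">=3.0.0,<3.1.0"))  # True
print(versions_are_compatible("3.1.2", ">=3.0.0,<3.1.0"))  # False
```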
def reformat_version(version):

View File

@ -1,4 +1,3 @@
# coding: utf8
"""
Helpers for Python and platform compatibility. To distinguish them from
the builtin functions, replacement functions are suffixed with an underscore,
@ -6,15 +5,9 @@ e.g. `unicode_`.
DOCS: https://spacy.io/api/top-level#compat
"""
from __future__ import unicode_literals
import os
import sys
import itertools
import ast
import types
from thinc.neural.util import copy_array
from thinc.util import copy_array
try:
import cPickle as pickle
@ -36,91 +29,23 @@ try:
except ImportError:
cupy = None
try:
from thinc.neural.optimizers import Optimizer # noqa: F401
except ImportError:
from thinc.neural.optimizers import Adam as Optimizer # noqa: F401
from thinc.api import Optimizer # noqa: F401
pickle = pickle
copy_reg = copy_reg
CudaStream = CudaStream
cupy = cupy
copy_array = copy_array
izip = getattr(itertools, "izip", zip)
is_windows = sys.platform.startswith("win")
is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin"
# See: https://github.com/benjaminp/six/blob/master/six.py
is_python2 = sys.version_info[0] == 2
is_python3 = sys.version_info[0] == 3
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)
if is_python2:
bytes_ = str
unicode_ = unicode # noqa: F821
basestring_ = basestring # noqa: F821
input_ = raw_input # noqa: F821
path2str = lambda path: str(path).decode("utf8")
class_types = (type, types.ClassType)
elif is_python3:
bytes_ = bytes
unicode_ = str
basestring_ = str
input_ = input
path2str = lambda path: str(path)
class_types = (type, types.ClassType) if is_python_pre_3_5 else type
def b_to_str(b_str):
"""Convert a bytes object to a string.
b_str (bytes): The object to convert.
RETURNS (unicode): The converted string.
"""
if is_python2:
return b_str
# Important: if no encoding is set, string becomes "b'...'"
return str(b_str, encoding="utf8")
def symlink_to(orig, dest):
"""Create a symlink. Used for model shortcut links.
orig (unicode / Path): The origin path.
dest (unicode / Path): The destination path of the symlink.
"""
if is_windows:
import subprocess
subprocess.check_call(
["mklink", "/d", path2str(orig), path2str(dest)], shell=True
)
else:
orig.symlink_to(dest)
def symlink_remove(link):
"""Remove a symlink. Used for model shortcut links.
link (unicode / Path): The path to the symlink.
"""
# https://stackoverflow.com/q/26554135/6400719
if os.path.isdir(path2str(link)) and is_windows:
# this should only be on Py2.7 and windows
os.rmdir(path2str(link))
else:
os.unlink(path2str(link))
def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
def is_config(windows=None, linux=None, osx=None, **kwargs):
"""Check if a specific configuration of Python version and operating system
matches the user's setup. Mostly used to display targeted error messages.
python2 (bool): spaCy is executed with Python 2.x.
python3 (bool): spaCy is executed with Python 3.x.
windows (bool): spaCy is executed on Windows.
linux (bool): spaCy is executed on Linux.
osx (bool): spaCy is executed on OS X or macOS.
@ -129,53 +54,7 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
DOCS: https://spacy.io/api/top-level#compat.is_config
"""
return (
python2 in (None, is_python2)
and python3 in (None, is_python3)
and windows in (None, is_windows)
windows in (None, is_windows)
and linux in (None, is_linux)
and osx in (None, is_osx)
)
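With the Python 2 branches removed, is_config only matches on the operating system; the `**kwargs` in the signature keeps old calls that still pass python2/python3 flags from raising a TypeError. A minimal usage sketch:

```python
# Minimal usage sketch of the Python 3-only is_config helper.
from spacy.compat import is_config

if is_config(windows=True):
    print("Running spaCy on Windows.")
elif is_config(linux=True) or is_config(osx=True):
    print("Running spaCy on Linux or macOS.")
```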
def import_file(name, loc):
"""Import module from a file. Used to load models from a directory.
name (unicode): Name of module to load.
loc (unicode / Path): Path to the file.
RETURNS: The loaded module.
"""
loc = path2str(loc)
if is_python_pre_3_5:
import imp
return imp.load_source(name, loc)
else:
import importlib.util
spec = importlib.util.spec_from_file_location(name, str(loc))
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module
def unescape_unicode(string):
"""Python2.7's re module chokes when compiling patterns that have ranges
between escaped unicode codepoints if the two codepoints are unrecognised
in the unicode database. For instance:
re.compile('[\\uAA77-\\uAA79]').findall("hello")
Ends up matching every character (on Python 2). This problem doesn't occur
if we're dealing with unicode literals.
"""
if string is None:
return string
# We only want to unescape the unicode, so we first must protect the other
# backslashes.
string = string.replace("\\", "\\\\")
# Now we remove that protection for the unicode.
string = string.replace("\\\\u", "\\u")
string = string.replace("\\\\U", "\\U")
# Now we unescape by evaling the string with the AST. This can't execute
# code -- it only does the representational level.
return ast.literal_eval("u'''" + string + "'''")

View File

@ -1,17 +1,13 @@
# coding: utf8
"""
spaCy's built in visualization suite for dependencies and named entities.
DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers
"""
from __future__ import unicode_literals
import warnings
from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc, Span
from ..compat import b_to_str
from ..errors import Errors, Warnings
from ..util import is_in_jupyter
@ -26,13 +22,13 @@ def render(
"""Render displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
style (unicode): Visualisation style, 'dep' or 'ent'.
style (str): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
jupyter (bool): Override Jupyter auto-detection.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
RETURNS (unicode): Rendered HTML markup.
RETURNS (str): Rendered HTML markup.
DOCS: https://spacy.io/api/top-level#displacy.render
USAGE: https://spacy.io/usage/visualizers
@ -77,13 +73,13 @@ def serve(
"""Serve displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
style (unicode): Visualisation style, 'dep' or 'ent'.
style (str): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
port (int): Port to serve visualisation.
host (unicode): Host to serve visualisation.
host (str): Host to serve visualisation.
DOCS: https://spacy.io/api/top-level#displacy.serve
USAGE: https://spacy.io/usage/visualizers
@ -95,20 +91,20 @@ def serve(
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server(host, port, app)
print("\nUsing the '{}' visualizer".format(style))
print("Serving on http://{}:{} ...\n".format(host, port))
print(f"\nUsing the '{style}' visualizer")
print(f"Serving on http://{host}:{port} ...\n")
try:
httpd.serve_forever()
except KeyboardInterrupt:
print("Shutting down server on port {}.".format(port))
print(f"Shutting down server on port {port}.")
finally:
httpd.server_close()
def app(environ, start_response):
# Headers and status need to be bytes in Python 2, see #1227
headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))]
start_response(b_to_str(b"200 OK"), headers)
headers = [("Content-type", "text/html; charset=utf-8")]
start_response("200 OK", headers)
res = _html["parsed"].encode(encoding="utf-8")
return [res]
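The WSGI app now uses plain str headers, and render/serve use f-strings. For reference, a small usage sketch in manual mode so no trained model is needed; the text and entity offsets are invented:

```python
# Usage sketch for displacy in manual mode (no trained model required).
# The text and entity offsets are invented for illustration.
from spacy import displacy

doc_data = {
    "text": "Ines Montani founded Explosion in Berlin.",
    "ents": [
        {"start": 0, "end": 12, "label": "PERSON"},
        {"start": 21, "end": 30, "label": "ORG"},
        {"start": 34, "end": 40, "label": "GPE"},
    ],
    "title": None,
}
html = displacy.render([doc_data], style="ent", manual=True, page=True)
# displacy.serve([doc_data], style="ent", manual=True, port=5000)  # http://localhost:5000
```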

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
import uuid
from .templates import (
@ -50,7 +47,7 @@ class DependencyRenderer(object):
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (unicode): Rendered SVG or HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
"""
# Create a random ID prefix to make sure parses don't receive the
# same ID, even if they're identical
@ -61,7 +58,7 @@ class DependencyRenderer(object):
settings = p.get("settings", {})
self.direction = settings.get("direction", DEFAULT_DIR)
self.lang = settings.get("lang", DEFAULT_LANG)
render_id = "{}-{}".format(id_prefix, i)
render_id = f"{id_prefix}-{i}"
svg = self.render_svg(render_id, p["words"], p["arcs"])
rendered.append(svg)
if page:
@ -81,7 +78,7 @@ class DependencyRenderer(object):
render_id (int): Unique ID, typically index of document.
words (list): Individual words and their tags.
arcs (list): Individual arcs and their start, end, direction and label.
RETURNS (unicode): Rendered SVG markup.
RETURNS (str): Rendered SVG markup.
"""
self.levels = self.get_levels(arcs)
self.highest_level = len(self.levels)
@ -115,10 +112,10 @@ class DependencyRenderer(object):
):
"""Render individual word.
text (unicode): Word text.
tag (unicode): Part-of-speech tag.
text (str): Word text.
tag (str): Part-of-speech tag.
i (int): Unique ID, typically word index.
RETURNS (unicode): Rendered SVG markup.
RETURNS (str): Rendered SVG markup.
"""
y = self.offset_y + self.word_spacing
x = self.offset_x + i * self.distance
@ -134,12 +131,12 @@ class DependencyRenderer(object):
def render_arrow(self, label, start, end, direction, i):
"""Render individual arrow.
label (unicode): Dependency label.
label (str): Dependency label.
start (int): Index of start word.
end (int): Index of end word.
direction (unicode): Arrow direction, 'left' or 'right'.
direction (str): Arrow direction, 'left' or 'right'.
i (int): Unique ID, typically arrow index.
RETURNS (unicode): Rendered SVG markup.
RETURNS (str): Rendered SVG markup.
"""
if start < 0 or end < 0:
error_args = dict(start=start, end=end, label=label, dir=direction)
@ -182,7 +179,7 @@ class DependencyRenderer(object):
y (int): Y-coordinate of arrow start and end point.
y_curve (int): Y-coordinate of Cubic Bézier y_curve point.
x_end (int): X-coordinate of arrow end point.
RETURNS (unicode): Definition of the arc path ('d' attribute).
RETURNS (str): Definition of the arc path ('d' attribute).
"""
template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
if self.compact:
@ -192,11 +189,11 @@ class DependencyRenderer(object):
def get_arrowhead(self, direction, x, y, end):
"""Render individual arrow head.
direction (unicode): Arrow direction, 'left' or 'right'.
direction (str): Arrow direction, 'left' or 'right'.
x (int): X-coordinate of arrow start point.
y (int): Y-coordinate of arrow start and end point.
end (int): X-coordinate of arrow end point.
RETURNS (unicode): Definition of the arrow head path ('d' attribute).
RETURNS (str): Definition of the arrow head path ('d' attribute).
"""
if direction == "left":
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
@ -282,7 +279,7 @@ class EntityRenderer(object):
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (unicode): Rendered HTML markup.
RETURNS (str): Rendered HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
@ -303,9 +300,9 @@ class EntityRenderer(object):
def render_ents(self, text, spans, title):
"""Render entities in text.
text (unicode): Original text.
text (str): Original text.
spans (list): Individual entity spans and their start, end and label.
title (unicode or None): Document title set in Doc.user_data['title'].
title (str / None): Document title set in Doc.user_data['title'].
"""
markup = ""
offset = 0

View File

@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
# Setting explicit height and max-width: none on the SVG is required for
# Jupyter to render it properly in a cell

View File

@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
def add_codes(err_cls):
"""Add error codes to string messages via class attribute names."""
@ -93,7 +89,7 @@ class Warnings(object):
"lemmatization rules or data. This means that the trained model "
"may not be able to lemmatize correctly. If this is intentional "
"or the language you're using doesn't have lemmatization data, "
"please ignore this warning. If this is surprising, make sure you "
"you can ignore this warning. If this is surprising, make sure you "
"have the spacy-lookups-data package installed.")
W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
"'n_process' will be set to 1.")
@ -135,6 +131,31 @@ class Warnings(object):
"package installed. The languages with lexeme normalization tables "
"are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
# TODO: fix numbering after merging develop into master
W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
"spaCy version requirement: {version}. This can lead to compatibility "
"problems with older versions, or as new spaCy versions are "
"released, because the model may say it's compatible when it's "
'not. Consider changing the "spacy_version" in your meta.json to a '
"version range, with a lower and upper pin. For example: {example}")
W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is "
"incompatible with the current version ({current}). This may lead "
"to unexpected results or runtime errors. To resolve this, "
"download a newer compatible model or retrain your custom model "
"with the current spaCy version. For more details and available "
"updates, run: python -m spacy validate")
W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' "
"instead.")
W097 = ("No Model config was provided to create the '{name}' component, "
"and no default configuration could be found either.")
W098 = ("No Model config was provided to create the '{name}' component, "
"so a default configuration was used.")
W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
"but got '{type}' instead, so ignoring it.")
W100 = ("Skipping unsupported morphological feature(s): {feature}. "
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
"string \"Field1=Value1,Value2|Field2=Value3\".")
@add_codes
class Errors(object):
@ -156,7 +177,7 @@ class Errors(object):
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
E008 = ("Some current components would be lost when restoring previous "
"pipeline state. If you added components after calling "
"`nlp.disable_pipes()`, you should remove them explicitly with "
"`nlp.select_pipes()`, you should remove them explicitly with "
"`nlp.remove_pipe()` before the pipeline is restored. Names of "
"the new components: {names}")
E009 = ("The `update` method expects same number of docs and golds, but "
@ -217,7 +238,7 @@ class Errors(object):
"the documentation:\nhttps://spacy.io/usage/models")
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
"nlp.add_pipe(nlp.create_pipe('sentencizer')). "
"Alternatively, add the dependency parser, or set sentence "
"boundaries by setting doc[i].is_sent_start.")
E031 = ("Invalid token: empty string ('') at position {i}.")
@ -253,15 +274,10 @@ class Errors(object):
E047 = ("Can't assign a value to unregistered extension attribute "
"'{name}'. Did you forget to call the `set_extension` method?")
E048 = ("Can't import language {lang} from spacy.lang: {err}")
E049 = ("Can't find spaCy data directory: '{path}'. Check your "
"installation and permissions, or use spacy.util.set_data_path "
"to customise the location if necessary.")
E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut "
"link, a Python package or a valid path to a data directory.")
E051 = ("Cant' load '{name}'. If you're using a shortcut link, make sure "
"it points to a valid package (not just a data directory).")
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
"package or a valid path to a data directory.")
E052 = ("Can't find model directory: {path}")
E053 = ("Could not read meta.json from {path}")
E053 = ("Could not read {name} from {path}")
E054 = ("No valid '{setting}' setting found in model meta.json.")
E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}")
E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
@ -379,8 +395,8 @@ class Errors(object):
E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
"in favor of the pipe name `sentencizer`, which does the same "
"thing. For example, use `nlp.create_pipeline('sentencizer')`")
E109 = ("Model for component '{name}' not initialized. Did you forget to "
"load a model, or forget to call begin_training()?")
E109 = ("Component '{name}' could not be run. Did you forget to "
"call begin_training()?")
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
E111 = ("Pickling a token is not supported, because tokens are only views "
"of the parent Doc and can't exist on their own. A pickled token "
@ -450,8 +466,6 @@ class Errors(object):
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
"`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
E136 = ("This additional feature requires the jsonschema library to be "
"installed:\npip install jsonschema")
E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
"to provide a valid JSON object as input with either the `text` "
"or `tokens` key. For more info, see the docs:\n"
@ -459,14 +473,11 @@ class Errors(object):
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
"includes either the `text` or `tokens` key. For more info, see "
"the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
E139 = ("Knowledge Base for component '{name}' not initialized. Did you "
"forget to call set_kb()?")
E139 = ("Knowledge Base for component '{name}' is empty.")
E140 = ("The list of entities, prior probabilities and entity vectors "
"should be of equal length.")
E141 = ("Entity vectors should be of length {required} instead of the "
"provided {found}.")
E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or "
"'cosine'.")
E143 = ("Labels for component '{name}' not initialized. Did you forget to "
"call add_label()?")
E144 = ("Could not find parameter `{param}` when building the entity "
@ -590,6 +601,47 @@ class Errors(object):
E200 = ("Specifying a base model with a pretrained component '{component}' "
"can not be combined with adding a pretrained Tok2Vec layer.")
# TODO: fix numbering after merging develop into master
E983 = ("Invalid key for '{dict_name}': {key}. Available keys: "
"{keys}")
E984 = ("Could not parse the {input} - double check the data is written "
"in the correct format as expected by spaCy.")
E985 = ("The pipeline component '{component}' is already available in the base "
"model. The settings in the component block in the config file are "
"being ignored. If you want to replace this component instead, set "
"'replace' to True in the training configuration.")
E986 = ("Could not create any training batches: check your input. "
"Perhaps discard_oversize should be set to False ?")
E987 = ("The text of an example training instance is either a Doc or "
"a string, but found {type} instead.")
E988 = ("Could not parse any training examples. Ensure the data is "
"formatted correctly.")
E989 = ("'nlp.update()' was called with two positional arguments. This "
"may be due to a backwards-incompatible change to the format "
"of the training data in spaCy 3.0 onwards. The 'update' "
"function should now be called with a batch of 'Example' "
"objects, instead of (text, annotation) tuples. ")
E990 = ("An entity linking component needs to be initialized with a "
"KnowledgeBase object, but found {type} instead.")
E991 = ("The function 'select_pipes' should be called with either a "
"'disable' argument to list the names of the pipe components "
"that should be disabled, or with an 'enable' argument that "
"specifies which pipes should not be disabled.")
E992 = ("The function `select_pipes` was called with `enable`={enable} "
"and `disable`={disable} but that information is conflicting "
"for the `nlp` pipeline with components {names}.")
E993 = ("The config for 'nlp' should include either a key 'name' to "
"refer to an existing model by name or path, or a key 'lang' "
"to create a new blank model.")
E996 = ("Could not parse {file}: {msg}")
E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.")
E998 = ("To create GoldParse objects from Example objects without a "
"Doc, get_gold_parses() should be called with a Vocab object.")
E999 = ("Encountered an unexpected format for the dictionary holding "
"gold annotations: {gold_dict}")
@add_codes
class TempErrors(object):
@ -610,14 +662,14 @@ class MatchPatternError(ValueError):
def __init__(self, key, errors):
"""Custom error for validating match patterns.
key (unicode): The name of the matcher rule.
key (str): The name of the matcher rule.
errors (dict): Validation errors (sequence of strings) mapped to pattern
ID, i.e. the index of the added pattern.
"""
msg = "Invalid token patterns for matcher rule '{}'\n".format(key)
msg = f"Invalid token patterns for matcher rule '{key}'\n"
for pattern_idx, error_msgs in errors.items():
pattern_errors = "\n".join(["- {}".format(e) for e in error_msgs])
msg += "\nPattern {}:\n{}\n".format(pattern_idx, pattern_errors)
pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
ValueError.__init__(self, msg)
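The Errors and Warnings classes hold plain message templates, and add_codes wraps attribute access so that each message comes back prefixed with its code while remaining an ordinary string, which is why calls like `Errors.E983.format(...)` work. A simplified illustration of that pattern (not necessarily spaCy's exact implementation):

```python
# Simplified illustration of the add_codes pattern: attribute access returns
# the message prefixed with its error code, and the result is still a normal
# str, so .format() works as usual.
def add_codes(err_cls):
    class ErrorsWithCodes(err_cls):
        def __getattribute__(self, code):
            msg = super().__getattribute__(code)
            if code.startswith("__"):  # leave dunder attributes untouched
                return msg
            return f"[{code}] {msg}"
    return ErrorsWithCodes()

@add_codes
class Errors:
    E983 = ("Invalid key for '{dict_name}': {key}. Available keys: {keys}")

print(Errors.E983.format(dict_name="score_weights", key="'speed'", keys=["las", "uas"]))
# [E983] Invalid key for 'score_weights': 'speed'. Available keys: ['las', 'uas']
```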

View File

@ -1,12 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
def explain(term):
"""Get a description for a given POS tag, dependency label or entity type.
term (unicode): The term to explain.
RETURNS (unicode): The explanation, or `None` if not found in the glossary.
term (str): The term to explain.
RETURNS (str): The explanation, or `None` if not found in the glossary.
EXAMPLE:
>>> spacy.explain(u'NORP')

View File

@ -1,9 +1,10 @@
from cymem.cymem cimport Pool
from .structs cimport TokenC
from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition
from .tokens import Doc
cdef struct GoldParseC:
int* tags
@ -19,23 +20,49 @@ cdef class GoldParse:
cdef Pool mem
cdef GoldParseC c
cdef readonly TokenAnnotation orig
cdef int length
cdef public int loss
cdef public list words
cdef public list tags
cdef public list morphology
cdef public list pos
cdef public list morphs
cdef public list lemmas
cdef public list sent_starts
cdef public list heads
cdef public list labels
cdef public dict orths
cdef public list ner
cdef public list ents
cdef public dict brackets
cdef public object cats
cdef public dict cats
cdef public dict links
cdef readonly list cand_to_gold
cdef readonly list gold_to_cand
cdef readonly list orig_annot
cdef class TokenAnnotation:
cdef public list ids
cdef public list words
cdef public list tags
cdef public list pos
cdef public list morphs
cdef public list lemmas
cdef public list heads
cdef public list deps
cdef public list entities
cdef public list sent_starts
cdef public dict brackets_by_start
cdef class DocAnnotation:
cdef public object cats
cdef public object links
cdef class Example:
cdef public object doc
cdef public TokenAnnotation token_annotation
cdef public DocAnnotation doc_annotation
cdef public object goldparse

File diff suppressed because it is too large

View File

@ -1,15 +1,15 @@
"""Knowledge-base for entity or concept linking."""
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
from .vocab cimport Vocab
from .typedefs cimport hash_t
from .structs cimport KBEntryC, AliasC
ctypedef vector[KBEntryC] entry_vec
ctypedef vector[AliasC] alias_vec
ctypedef vector[float] float_vec

View File

@ -1,6 +1,4 @@
# cython: infer_types=True
# cython: profile=True
# coding: utf8
# cython: infer_types=True, profile=True
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
@ -8,12 +6,11 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
from libc.stdint cimport int32_t, int64_t
from libcpp.vector cimport vector
from pathlib import Path
import warnings
from os import path
from pathlib import Path
from .typedefs cimport hash_t
from .errors import Errors, Warnings
@ -41,7 +38,7 @@ cdef class Candidate:
@property
def entity_(self):
"""RETURNS (unicode): ID/name of this entity in the KB"""
"""RETURNS (str): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_hash]
@property
@ -51,7 +48,7 @@ cdef class Candidate:
@property
def alias_(self):
"""RETURNS (unicode): ID of the original alias"""
"""RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash]
@property
@ -445,6 +442,8 @@ cdef class KnowledgeBase:
cdef class Writer:
def __init__(self, object loc):
if path.exists(loc):
assert not path.isdir(loc), f"{loc} is directory"
if isinstance(loc, Path):
loc = bytes(loc)
if path.exists(loc):
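The Candidate.entity_ and Candidate.alias_ properties earlier in this file resolve stored hashes back to strings. The sketch below shows how candidates might be queried; the KnowledgeBase calls (add_entity, add_alias, get_candidates) are assumed from this spaCy version's API, and the IDs, frequencies, vectors and probabilities are invented:

```python
# Hedged sketch of querying candidates; entity IDs, frequencies and vectors
# are invented, and the KnowledgeBase API shown here is assumed, not verified.
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=42, entity_vector=[0.1, 0.2, 0.3])
kb.add_entity(entity="Q5", freq=7, entity_vector=[0.4, 0.5, 0.6])
kb.add_alias(alias="Douglas", entities=["Q42", "Q5"], probabilities=[0.8, 0.2])

for cand in kb.get_candidates("Douglas"):
    # entity_ and alias_ are the string views of the stored hashes
    print(cand.entity_, cand.alias_, cand.prior_prob)
```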

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG

View File

@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-af
STOP_WORDS = set(

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.

View File

@ -1,5 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = set(

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import UNITS, ALPHA_UPPER

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
STOP_WORDS = set(
"""
من

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.

View File

@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
# Source: https://github.com/Alir3z4/stop-words
STOP_WORDS = set(

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .tag_map import TAG_MAP

View File

@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import LEMMA, PRON_LEMMA

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS

View File

@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
STOP_WORDS = set(
"""
অতএব অথচ অথব অন অন অন অন অনতত অবধি অবশ অর অন অন অরধভ

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM
@ -14,8 +11,8 @@ TAG_MAP = {
'""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
":": {POS: PUNCT},
"": {POS: SYM, "Other": {"SymType": "currency"}},
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
"": {POS: SYM, "SymType": "currency"},
"#": {POS: SYM, "SymType": "numbersign"},
"AFX": {POS: ADJ, "Hyph": "yes"},
"CC": {POS: CONJ, "ConjType": "coor"},
"CD": {POS: NUM, "NumType": "card"},

View File

@ -1,6 +1,3 @@
# coding=utf-8
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

View File

@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA

View File

@ -1,7 +1,3 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set(
"""
a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò

View File

@ -1,28 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
TAG_MAP = {
"ADV": {POS: ADV},
"NOUN": {POS: NOUN},
"ADP": {POS: ADP},
"PRON": {POS: PRON},
"SCONJ": {POS: SCONJ},
"PROPN": {POS: PROPN},
"DET": {POS: DET},
"SYM": {POS: SYM},
"INTJ": {POS: INTJ},
"PUNCT": {POS: PUNCT},
"NUM": {POS: NUM},
"AUX": {POS: AUX},
"X": {POS: X},
"CONJ": {POS: CONJ},
"CCONJ": {POS: CCONJ},
"ADJ": {POS: ADJ},
"VERB": {POS: VERB},
"PART": {POS: PART},
"SP": {POS: SPACE},
}

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA
@ -33,9 +30,9 @@ _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}]
for h in range(1, 12 + 1):
for period in ["a.m.", "am"]:
_exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "a.m."}]
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}]
for period in ["p.m.", "pm"]:
_exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "p.m."}]
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]
TOKENIZER_EXCEPTIONS = _exc
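For reference, the f-string form generates the same entries as the old %-formatting; the snippet below rebuilds the table and checks two of them:

```python
# Quick check of what the f-string loop above generates (same entries as the
# old %-formatting); ORTH and LEMMA come from spacy.symbols.
from spacy.symbols import ORTH, LEMMA

_exc = {}
for h in range(1, 13):
    for period in ["a.m.", "am"]:
        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}]
    for period in ["p.m.", "pm"]:
        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]

assert _exc["3am"] == [{ORTH: "3"}, {ORTH: "am", LEMMA: "a.m."}]
assert _exc["12p.m."] == [{ORTH: "12"}, {ORTH: "p.m.", LEMMA: "p.m."}]
```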

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
split_chars = lambda char: list(char.strip().split(" "))
merge_chars = lambda char: char.strip().replace(" ", "|")
group_chars = lambda char: char.strip().replace(" ", "")
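These helpers turn a space-separated character listing into the three shapes the punctuation rules use: a list, a regex alternation, and a plain character group. A quick example with a made-up listing:

```python
# What the three helpers above produce for a small, made-up character listing.
split_chars = lambda char: list(char.strip().split(" "))
merge_chars = lambda char: char.strip().replace(" ", "|")
group_chars = lambda char: char.strip().replace(" ", "")

chars = " a b c "
print(split_chars(chars))  # ['a', 'b', 'c']
print(merge_chars(chars))  # a|b|c
print(group_chars(chars))  # abc
```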

Some files were not shown because too many files have changed in this diff